Initial Julia implementation
This commit is contained in:
parent
25e021caa3
commit
b3efa6af67
17
.github/workflows/main.yaml
vendored
17
.github/workflows/main.yaml
vendored
@ -3,6 +3,23 @@ on: [push, pull_request]
|
|||||||
|
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
|
test-julia:
  runs-on: ubuntu-18.04
  # Project.toml and src/ live under JuliaStream.jl/, so every `run` step has to
  # start from that directory for `--project` and the src/ paths to resolve.
  defaults:
    run:
      working-directory: ./JuliaStream.jl
  steps:
    - uses: actions/checkout@v2
    - name: Setup project
      run: julia --project -e 'import Pkg; Pkg.instantiate()'
    - name: Test run PlainStream.jl
      run: julia --project src/PlainStream.jl --arraysize 100
    - name: Test run ThreadedStream.jl
      run: julia --threads 2 --project src/ThreadedStream.jl --arraysize 100
    - name: Test run DistributedStream.jl
      run: julia -p2 --project src/DistributedStream.jl --arraysize 100
    # No GPU on the runner: only exercise device listing for the GPU variants
    - name: Test run CUDAStream.jl
      run: julia --project src/CUDAStream.jl --list
    - name: Test run AMDGPUStream.jl
      run: julia --project src/AMDGPUStream.jl --list
|
||||||
|
|
||||||
test:
|
test:
|
||||||
runs-on: ubuntu-18.04
|
runs-on: ubuntu-18.04
|
||||||
steps:
|
steps:
|
||||||
|
|||||||
2
JuliaStream.jl/.JuliaFormatter.toml
Normal file
2
JuliaStream.jl/.JuliaFormatter.toml
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
indent = 2
|
||||||
|
margin = 100
|
||||||
5
JuliaStream.jl/.gitignore
vendored
Normal file
5
JuliaStream.jl/.gitignore
vendored
Normal file
@ -0,0 +1,5 @@
|
|||||||
|
*.jl.cov
|
||||||
|
*.jl.*.cov
|
||||||
|
*.jl.mem
|
||||||
|
/docs/build/
|
||||||
|
/docs/Manifest.toml
|
||||||
411
JuliaStream.jl/Manifest.toml
Normal file
411
JuliaStream.jl/Manifest.toml
Normal file
@ -0,0 +1,411 @@
|
|||||||
|
# This file is machine-generated - editing it directly is not advised
|
||||||
|
|
||||||
|
[[AMDGPU]]
|
||||||
|
deps = ["AbstractFFTs", "Adapt", "BinaryProvider", "CEnum", "GPUArrays", "GPUCompiler", "LLVM", "Libdl", "LinearAlgebra", "MacroTools", "Printf", "Random", "Requires", "Setfield", "hsa_rocr_jll", "hsakmt_roct_jll"]
|
||||||
|
git-tree-sha1 = "04fdb3923ac6f55fa7347dce0f0f6f10e321e2e9"
|
||||||
|
uuid = "21141c5a-9bdb-4563-92ae-f87d6854732e"
|
||||||
|
version = "0.2.7"
|
||||||
|
|
||||||
|
[[AbstractFFTs]]
|
||||||
|
deps = ["LinearAlgebra"]
|
||||||
|
git-tree-sha1 = "485ee0867925449198280d4af84bdb46a2a404d0"
|
||||||
|
uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c"
|
||||||
|
version = "1.0.1"
|
||||||
|
|
||||||
|
[[Adapt]]
|
||||||
|
deps = ["LinearAlgebra"]
|
||||||
|
git-tree-sha1 = "84918055d15b3114ede17ac6a7182f68870c16f7"
|
||||||
|
uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
|
||||||
|
version = "3.3.1"
|
||||||
|
|
||||||
|
[[ArgParse]]
|
||||||
|
deps = ["Logging", "TextWrap"]
|
||||||
|
git-tree-sha1 = "3102bce13da501c9104df33549f511cd25264d7d"
|
||||||
|
uuid = "c7e460c6-2fb9-53a9-8c5b-16f535851c63"
|
||||||
|
version = "1.1.4"
|
||||||
|
|
||||||
|
[[ArgTools]]
|
||||||
|
uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f"
|
||||||
|
|
||||||
|
[[Artifacts]]
|
||||||
|
uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33"
|
||||||
|
|
||||||
|
[[BFloat16s]]
|
||||||
|
deps = ["LinearAlgebra", "Test"]
|
||||||
|
git-tree-sha1 = "4af69e205efc343068dc8722b8dfec1ade89254a"
|
||||||
|
uuid = "ab4f0b2a-ad5b-11e8-123f-65d77653426b"
|
||||||
|
version = "0.1.0"
|
||||||
|
|
||||||
|
[[Base64]]
|
||||||
|
uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f"
|
||||||
|
|
||||||
|
[[BinaryProvider]]
|
||||||
|
deps = ["Libdl", "Logging", "SHA"]
|
||||||
|
git-tree-sha1 = "ecdec412a9abc8db54c0efc5548c64dfce072058"
|
||||||
|
uuid = "b99e7846-7c00-51b0-8f62-c81ae34c0232"
|
||||||
|
version = "0.5.10"
|
||||||
|
|
||||||
|
[[Bzip2_jll]]
|
||||||
|
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
|
||||||
|
git-tree-sha1 = "c3598e525718abcc440f69cc6d5f60dda0a1b61e"
|
||||||
|
uuid = "6e34b625-4abd-537c-b88f-471c36dfa7a0"
|
||||||
|
version = "1.0.6+5"
|
||||||
|
|
||||||
|
[[CEnum]]
|
||||||
|
git-tree-sha1 = "215a9aa4a1f23fbd05b92769fdd62559488d70e9"
|
||||||
|
uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82"
|
||||||
|
version = "0.4.1"
|
||||||
|
|
||||||
|
[[CUDA]]
|
||||||
|
deps = ["AbstractFFTs", "Adapt", "BFloat16s", "CEnum", "CompilerSupportLibraries_jll", "DataStructures", "ExprTools", "GPUArrays", "GPUCompiler", "LLVM", "LazyArtifacts", "Libdl", "LinearAlgebra", "Logging", "MacroTools", "Memoize", "Printf", "Random", "Random123", "RandomNumbers", "Reexport", "Requires", "SparseArrays", "SpecialFunctions", "TimerOutputs"]
|
||||||
|
git-tree-sha1 = "364179416eabc34c9ca32126a6bdb431680c3bad"
|
||||||
|
uuid = "052768ef-5323-5732-b1bb-66c8b64840ba"
|
||||||
|
version = "3.2.1"
|
||||||
|
|
||||||
|
[[ChainRulesCore]]
|
||||||
|
deps = ["Compat", "LinearAlgebra", "SparseArrays"]
|
||||||
|
git-tree-sha1 = "8b31cc69cbc38c5c826aaa1c890c694be3622d99"
|
||||||
|
uuid = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
|
||||||
|
version = "0.10.3"
|
||||||
|
|
||||||
|
[[Compat]]
|
||||||
|
deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "SHA", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"]
|
||||||
|
git-tree-sha1 = "e4e2b39db08f967cc1360951f01e8a75ec441cab"
|
||||||
|
uuid = "34da2185-b29b-5c13-b0c7-acf172513d20"
|
||||||
|
version = "3.30.0"
|
||||||
|
|
||||||
|
[[CompilerSupportLibraries_jll]]
|
||||||
|
deps = ["Artifacts", "Libdl"]
|
||||||
|
uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae"
|
||||||
|
|
||||||
|
[[ConstructionBase]]
|
||||||
|
deps = ["LinearAlgebra"]
|
||||||
|
git-tree-sha1 = "1dc43957fb9a1574fa1b7a449e101bd1fd3a9fb7"
|
||||||
|
uuid = "187b0558-2788-49d3-abe0-74a17ed4e7c9"
|
||||||
|
version = "1.2.1"
|
||||||
|
|
||||||
|
[[DataStructures]]
|
||||||
|
deps = ["Compat", "InteractiveUtils", "OrderedCollections"]
|
||||||
|
git-tree-sha1 = "4437b64df1e0adccc3e5d1adbc3ac741095e4677"
|
||||||
|
uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
|
||||||
|
version = "0.18.9"
|
||||||
|
|
||||||
|
[[Dates]]
|
||||||
|
deps = ["Printf"]
|
||||||
|
uuid = "ade2ca70-3891-5945-98fb-dc099432e06a"
|
||||||
|
|
||||||
|
[[DelimitedFiles]]
|
||||||
|
deps = ["Mmap"]
|
||||||
|
uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab"
|
||||||
|
|
||||||
|
[[Distributed]]
|
||||||
|
deps = ["Random", "Serialization", "Sockets"]
|
||||||
|
uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b"
|
||||||
|
|
||||||
|
[[DocStringExtensions]]
|
||||||
|
deps = ["LibGit2"]
|
||||||
|
git-tree-sha1 = "a32185f5428d3986f47c2ab78b1f216d5e6cc96f"
|
||||||
|
uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae"
|
||||||
|
version = "0.8.5"
|
||||||
|
|
||||||
|
[[Downloads]]
|
||||||
|
deps = ["ArgTools", "LibCURL", "NetworkOptions"]
|
||||||
|
uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6"
|
||||||
|
|
||||||
|
[[Elfutils_jll]]
|
||||||
|
deps = ["Artifacts", "Bzip2_jll", "JLLWrappers", "Libdl", "Pkg", "XZ_jll", "Zlib_jll", "argp_standalone_jll", "fts_jll", "obstack_jll"]
|
||||||
|
git-tree-sha1 = "76cbf1134983cfb371ad77117bb2659600ed64d6"
|
||||||
|
uuid = "ab5a07f8-06af-567f-a878-e8bb879eba5a"
|
||||||
|
version = "0.179.0+0"
|
||||||
|
|
||||||
|
[[ExprTools]]
|
||||||
|
git-tree-sha1 = "10407a39b87f29d47ebaca8edbc75d7c302ff93e"
|
||||||
|
uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04"
|
||||||
|
version = "0.1.3"
|
||||||
|
|
||||||
|
[[Future]]
|
||||||
|
deps = ["Random"]
|
||||||
|
uuid = "9fa8497b-333b-5362-9e8d-4d0656e87820"
|
||||||
|
|
||||||
|
[[GPUArrays]]
|
||||||
|
deps = ["AbstractFFTs", "Adapt", "LinearAlgebra", "Printf", "Random", "Serialization", "Statistics"]
|
||||||
|
git-tree-sha1 = "df5b8569904c5c10e84c640984cfff054b18c086"
|
||||||
|
uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
|
||||||
|
version = "6.4.1"
|
||||||
|
|
||||||
|
[[GPUCompiler]]
|
||||||
|
deps = ["DataStructures", "ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "Scratch", "Serialization", "TimerOutputs", "UUIDs"]
|
||||||
|
git-tree-sha1 = "42d635f6d87af125b86288df3819f805fb4d851a"
|
||||||
|
uuid = "61eb1bfa-7361-4325-ad38-22787b887f55"
|
||||||
|
version = "0.11.5"
|
||||||
|
|
||||||
|
[[InteractiveUtils]]
|
||||||
|
deps = ["Markdown"]
|
||||||
|
uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
|
||||||
|
|
||||||
|
[[JLLWrappers]]
|
||||||
|
deps = ["Preferences"]
|
||||||
|
git-tree-sha1 = "642a199af8b68253517b80bd3bfd17eb4e84df6e"
|
||||||
|
uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210"
|
||||||
|
version = "1.3.0"
|
||||||
|
|
||||||
|
[[LLVM]]
|
||||||
|
deps = ["CEnum", "Libdl", "Printf", "Unicode"]
|
||||||
|
git-tree-sha1 = "b499c68a45249b0385585c62f4a9b62b5db8e691"
|
||||||
|
uuid = "929cbde3-209d-540e-8aea-75f648917ca0"
|
||||||
|
version = "3.7.1"
|
||||||
|
|
||||||
|
[[LazyArtifacts]]
|
||||||
|
deps = ["Artifacts", "Pkg"]
|
||||||
|
uuid = "4af54fe1-eca0-43a8-85a7-787d91b784e3"
|
||||||
|
|
||||||
|
[[LibCURL]]
|
||||||
|
deps = ["LibCURL_jll", "MozillaCACerts_jll"]
|
||||||
|
uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21"
|
||||||
|
|
||||||
|
[[LibCURL_jll]]
|
||||||
|
deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"]
|
||||||
|
uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0"
|
||||||
|
|
||||||
|
[[LibGit2]]
|
||||||
|
deps = ["Base64", "NetworkOptions", "Printf", "SHA"]
|
||||||
|
uuid = "76f85450-5226-5b5a-8eaa-529ad045b433"
|
||||||
|
|
||||||
|
[[LibSSH2_jll]]
|
||||||
|
deps = ["Artifacts", "Libdl", "MbedTLS_jll"]
|
||||||
|
uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8"
|
||||||
|
|
||||||
|
[[Libdl]]
|
||||||
|
uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
|
||||||
|
|
||||||
|
[[LinearAlgebra]]
|
||||||
|
deps = ["Libdl"]
|
||||||
|
uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
|
||||||
|
|
||||||
|
[[LogExpFunctions]]
|
||||||
|
deps = ["DocStringExtensions", "LinearAlgebra"]
|
||||||
|
git-tree-sha1 = "1ba664552f1ef15325e68dc4c05c3ef8c2d5d885"
|
||||||
|
uuid = "2ab3a3ac-af41-5b50-aa03-7779005ae688"
|
||||||
|
version = "0.2.4"
|
||||||
|
|
||||||
|
[[Logging]]
|
||||||
|
uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"
|
||||||
|
|
||||||
|
[[MacroTools]]
|
||||||
|
deps = ["Markdown", "Random"]
|
||||||
|
git-tree-sha1 = "6a8a2a625ab0dea913aba95c11370589e0239ff0"
|
||||||
|
uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
|
||||||
|
version = "0.5.6"
|
||||||
|
|
||||||
|
[[Markdown]]
|
||||||
|
deps = ["Base64"]
|
||||||
|
uuid = "d6f4376e-aef5-505a-96c1-9c027394607a"
|
||||||
|
|
||||||
|
[[MbedTLS_jll]]
|
||||||
|
deps = ["Artifacts", "Libdl"]
|
||||||
|
uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1"
|
||||||
|
|
||||||
|
[[Memoize]]
|
||||||
|
deps = ["MacroTools"]
|
||||||
|
git-tree-sha1 = "2b1dfcba103de714d31c033b5dacc2e4a12c7caa"
|
||||||
|
uuid = "c03570c3-d221-55d1-a50c-7939bbd78826"
|
||||||
|
version = "0.4.4"
|
||||||
|
|
||||||
|
[[Mmap]]
|
||||||
|
uuid = "a63ad114-7e13-5084-954f-fe012c677804"
|
||||||
|
|
||||||
|
[[MozillaCACerts_jll]]
|
||||||
|
uuid = "14a3606d-f60d-562e-9121-12d972cd8159"
|
||||||
|
|
||||||
|
[[NUMA_jll]]
|
||||||
|
deps = ["Libdl", "Pkg"]
|
||||||
|
git-tree-sha1 = "778f9bd14400cff2c32ed357e12766ac0e3d766e"
|
||||||
|
uuid = "7f51dc2b-bb24-59f8-b771-bb1490e4195d"
|
||||||
|
version = "2.0.13+1"
|
||||||
|
|
||||||
|
[[NetworkOptions]]
|
||||||
|
uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908"
|
||||||
|
|
||||||
|
[[OpenSpecFun_jll]]
|
||||||
|
deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "Libdl", "Pkg"]
|
||||||
|
git-tree-sha1 = "13652491f6856acfd2db29360e1bbcd4565d04f1"
|
||||||
|
uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e"
|
||||||
|
version = "0.5.5+0"
|
||||||
|
|
||||||
|
[[OrderedCollections]]
|
||||||
|
git-tree-sha1 = "85f8e6578bf1f9ee0d11e7bb1b1456435479d47c"
|
||||||
|
uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
|
||||||
|
version = "1.4.1"
|
||||||
|
|
||||||
|
[[Parameters]]
|
||||||
|
deps = ["OrderedCollections", "UnPack"]
|
||||||
|
git-tree-sha1 = "2276ac65f1e236e0a6ea70baff3f62ad4c625345"
|
||||||
|
uuid = "d96e819e-fc66-5662-9728-84c9c7592b0a"
|
||||||
|
version = "0.12.2"
|
||||||
|
|
||||||
|
[[Pkg]]
|
||||||
|
deps = ["Artifacts", "Dates", "Downloads", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"]
|
||||||
|
uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
|
||||||
|
|
||||||
|
[[Preferences]]
|
||||||
|
deps = ["TOML"]
|
||||||
|
git-tree-sha1 = "00cfd92944ca9c760982747e9a1d0d5d86ab1e5a"
|
||||||
|
uuid = "21216c6a-2e73-6563-6e65-726566657250"
|
||||||
|
version = "1.2.2"
|
||||||
|
|
||||||
|
[[Printf]]
|
||||||
|
deps = ["Unicode"]
|
||||||
|
uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7"
|
||||||
|
|
||||||
|
[[REPL]]
|
||||||
|
deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"]
|
||||||
|
uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb"
|
||||||
|
|
||||||
|
[[Random]]
|
||||||
|
deps = ["Serialization"]
|
||||||
|
uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
|
||||||
|
|
||||||
|
[[Random123]]
|
||||||
|
deps = ["Libdl", "Random", "RandomNumbers"]
|
||||||
|
git-tree-sha1 = "7c6710c8198fd4444b5eb6a3840b7d47bd3593c5"
|
||||||
|
uuid = "74087812-796a-5b5d-8853-05524746bad3"
|
||||||
|
version = "1.3.1"
|
||||||
|
|
||||||
|
[[RandomNumbers]]
|
||||||
|
deps = ["Random", "Requires"]
|
||||||
|
git-tree-sha1 = "441e6fc35597524ada7f85e13df1f4e10137d16f"
|
||||||
|
uuid = "e6cf234a-135c-5ec9-84dd-332b85af5143"
|
||||||
|
version = "1.4.0"
|
||||||
|
|
||||||
|
[[Reexport]]
|
||||||
|
git-tree-sha1 = "5f6c21241f0f655da3952fd60aa18477cf96c220"
|
||||||
|
uuid = "189a3867-3050-52da-a836-e630ba90ab69"
|
||||||
|
version = "1.1.0"
|
||||||
|
|
||||||
|
[[Requires]]
|
||||||
|
deps = ["UUIDs"]
|
||||||
|
git-tree-sha1 = "4036a3bd08ac7e968e27c203d45f5fff15020621"
|
||||||
|
uuid = "ae029012-a4dd-5104-9daa-d747884805df"
|
||||||
|
version = "1.1.3"
|
||||||
|
|
||||||
|
[[SHA]]
|
||||||
|
uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce"
|
||||||
|
|
||||||
|
[[Scratch]]
|
||||||
|
deps = ["Dates"]
|
||||||
|
git-tree-sha1 = "0b4b7f1393cff97c33891da2a0bf69c6ed241fda"
|
||||||
|
uuid = "6c6a2e73-6563-6170-7368-637461726353"
|
||||||
|
version = "1.1.0"
|
||||||
|
|
||||||
|
[[Serialization]]
|
||||||
|
uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
|
||||||
|
|
||||||
|
[[Setfield]]
|
||||||
|
deps = ["ConstructionBase", "Future", "MacroTools", "Requires"]
|
||||||
|
git-tree-sha1 = "d5640fc570fb1b6c54512f0bd3853866bd298b3e"
|
||||||
|
uuid = "efcf1570-3423-57d1-acb7-fd33fddbac46"
|
||||||
|
version = "0.7.0"
|
||||||
|
|
||||||
|
[[SharedArrays]]
|
||||||
|
deps = ["Distributed", "Mmap", "Random", "Serialization"]
|
||||||
|
uuid = "1a1011a3-84de-559e-8e89-a11a2f7dc383"
|
||||||
|
|
||||||
|
[[Sockets]]
|
||||||
|
uuid = "6462fe0b-24de-5631-8697-dd941f90decc"
|
||||||
|
|
||||||
|
[[SparseArrays]]
|
||||||
|
deps = ["LinearAlgebra", "Random"]
|
||||||
|
uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
|
||||||
|
|
||||||
|
[[SpecialFunctions]]
|
||||||
|
deps = ["ChainRulesCore", "LogExpFunctions", "OpenSpecFun_jll"]
|
||||||
|
git-tree-sha1 = "a50550fa3164a8c46747e62063b4d774ac1bcf49"
|
||||||
|
uuid = "276daf66-3868-5448-9aa4-cd146d93841b"
|
||||||
|
version = "1.5.1"
|
||||||
|
|
||||||
|
[[Statistics]]
|
||||||
|
deps = ["LinearAlgebra", "SparseArrays"]
|
||||||
|
uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
|
||||||
|
|
||||||
|
[[TOML]]
|
||||||
|
deps = ["Dates"]
|
||||||
|
uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76"
|
||||||
|
|
||||||
|
[[Tar]]
|
||||||
|
deps = ["ArgTools", "SHA"]
|
||||||
|
uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e"
|
||||||
|
|
||||||
|
[[Test]]
|
||||||
|
deps = ["InteractiveUtils", "Logging", "Random", "Serialization"]
|
||||||
|
uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
|
||||||
|
|
||||||
|
[[TextWrap]]
|
||||||
|
git-tree-sha1 = "9250ef9b01b66667380cf3275b3f7488d0e25faf"
|
||||||
|
uuid = "b718987f-49a8-5099-9789-dcd902bef87d"
|
||||||
|
version = "1.0.1"
|
||||||
|
|
||||||
|
[[TimerOutputs]]
|
||||||
|
deps = ["ExprTools", "Printf"]
|
||||||
|
git-tree-sha1 = "bf8aacc899a1bd16522d0350e1e2310510d77236"
|
||||||
|
uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
|
||||||
|
version = "0.5.9"
|
||||||
|
|
||||||
|
[[UUIDs]]
|
||||||
|
deps = ["Random", "SHA"]
|
||||||
|
uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
|
||||||
|
|
||||||
|
[[UnPack]]
|
||||||
|
git-tree-sha1 = "387c1f73762231e86e0c9c5443ce3b4a0a9a0c2b"
|
||||||
|
uuid = "3a884ed6-31ef-47d7-9d2a-63182c4928ed"
|
||||||
|
version = "1.0.2"
|
||||||
|
|
||||||
|
[[Unicode]]
|
||||||
|
uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
|
||||||
|
|
||||||
|
[[XZ_jll]]
|
||||||
|
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
|
||||||
|
git-tree-sha1 = "9f76853ea2ba894054e24640abfb73d73e5a4cb5"
|
||||||
|
uuid = "ffd25f8a-64ca-5728-b0f7-c24cf3aae800"
|
||||||
|
version = "5.2.5+0"
|
||||||
|
|
||||||
|
[[Zlib_jll]]
|
||||||
|
deps = ["Libdl"]
|
||||||
|
uuid = "83775a58-1f1d-513f-b197-d71354ab007a"
|
||||||
|
|
||||||
|
[[argp_standalone_jll]]
|
||||||
|
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
|
||||||
|
git-tree-sha1 = "c4fa3457046fc93249b63e8319e743b6c8590609"
|
||||||
|
uuid = "c53206cc-00f7-50bf-ad1e-3ae1f6e49bc3"
|
||||||
|
version = "1.3.0+0"
|
||||||
|
|
||||||
|
[[fts_jll]]
|
||||||
|
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
|
||||||
|
git-tree-sha1 = "78732b942383d2cb521df8a1a0814911144e663d"
|
||||||
|
uuid = "d65627f6-89bd-53e8-8ab5-8b75ff535eee"
|
||||||
|
version = "1.2.7+1"
|
||||||
|
|
||||||
|
[[hsa_rocr_jll]]
|
||||||
|
deps = ["Artifacts", "Elfutils_jll", "JLLWrappers", "Libdl", "NUMA_jll", "Pkg", "Zlib_jll", "hsakmt_roct_jll"]
|
||||||
|
git-tree-sha1 = "42189f176d6ae4f37c0c0e652fec339bb0bfab5d"
|
||||||
|
uuid = "dd59ff1a-a01a-568d-8b29-0669330f116a"
|
||||||
|
version = "3.7.0+1"
|
||||||
|
|
||||||
|
[[hsakmt_roct_jll]]
|
||||||
|
deps = ["Artifacts", "JLLWrappers", "Libdl", "NUMA_jll", "Pkg"]
|
||||||
|
git-tree-sha1 = "8a9ee6c091e952e4ea6585d15131d43f789ae041"
|
||||||
|
uuid = "1cecccd7-a9b6-5045-9cdc-a44c19b16d76"
|
||||||
|
version = "3.8.0+0"
|
||||||
|
|
||||||
|
[[nghttp2_jll]]
|
||||||
|
deps = ["Artifacts", "Libdl"]
|
||||||
|
uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d"
|
||||||
|
|
||||||
|
[[obstack_jll]]
|
||||||
|
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
|
||||||
|
git-tree-sha1 = "1c4a6b66e934fc6db4649cb2910c72f53bbfea7e"
|
||||||
|
uuid = "c88a4935-d25e-5644-aacc-5db6f1b8ef79"
|
||||||
|
version = "1.2.2+0"
|
||||||
|
|
||||||
|
[[p7zip_jll]]
|
||||||
|
deps = ["Artifacts", "Libdl"]
|
||||||
|
uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0"
|
||||||
14
JuliaStream.jl/Project.toml
Normal file
14
JuliaStream.jl/Project.toml
Normal file
@ -0,0 +1,14 @@
|
|||||||
|
name = "JuliaStream"
|
||||||
|
uuid = "1bdcc9b7-f5ed-4705-bc7b-be1b748ec681"
|
||||||
|
authors = ["Wei-Chen Lin <wl14928@bristol.ac.uk>"]
|
||||||
|
version = "3.4.0"
|
||||||
|
|
||||||
|
[deps]
|
||||||
|
AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
|
||||||
|
ArgParse = "c7e460c6-2fb9-53a9-8c5b-16f535851c63"
|
||||||
|
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
|
||||||
|
Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b"
|
||||||
|
Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a"
|
||||||
|
|
||||||
|
[compat]
|
||||||
|
julia = "1.6"
|
||||||
30
JuliaStream.jl/README.md
Normal file
30
JuliaStream.jl/README.md
Normal file
@ -0,0 +1,30 @@
|
|||||||
|
JuliaStream.jl
|
||||||
|
==============
|
||||||
|
|
||||||
|
This is an implementation of BabelStream in Julia which contains the following variants:
|
||||||
|
|
||||||
|
* `PlainStream.jl` - Single-threaded `for` loop
|
||||||
|
* `ThreadedStream.jl` - Threaded implementation with `Threads.@threads` macros
|
||||||
|
* `DistributedStream.jl` - Process based parallelism with `@distributed` macros
|
||||||
|
* `CUDAStream.jl` - Direct port of BabelStream's native CUDA implementation using [CUDA.jl](https://github.com/JuliaGPU/CUDA.jl)
|
||||||
|
* `AMDGPUStream.jl` - Direct port of BabelStream's native HIP implementation using [AMDGPU.jl](https://github.com/JuliaGPU/AMDGPU.jl)
|
||||||
|
|
||||||
|
### Build & Run
|
||||||
|
|
||||||
|
Prerequisites
|
||||||
|
|
||||||
|
* Julia 1.6+
|
||||||
|
|
||||||
|
With Julia on path, run the benchmark with:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
> cd JuliaStream.jl
|
||||||
|
> julia --project -e 'import Pkg; Pkg.instantiate()' # only required on first run
|
||||||
|
> julia --project src/<IMPL>Stream.jl
|
||||||
|
```
|
||||||
|
|
||||||
|
**Important:**
|
||||||
|
* Julia is 1-indexed, so N >= 1 in `--device N` (the first device is `--device 1`)
|
||||||
|
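* Thread count for `ThreadedStream` must be set explicitly, either via the `JULIA_NUM_THREADS` environment variable (e.g. `export JULIA_NUM_THREADS=$(nproc)`) or with the `--threads N` flag (e.g. `julia --threads $(nproc) --project src/ThreadedStream.jl`); otherwise it defaults to 1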
|
||||||
|
* For `DistributedStream`, you must pass the number of worker processes to `julia` itself with `-p`, *before* the script path, e.g. `julia -p$(nproc) --project src/DistributedStream.jl`
|
||||||
|
* Certain implementations such as CUDA and AMDGPU will do hardware detection at runtime and may download and/or compile further software packages for the platform.
|
||||||
178
JuliaStream.jl/src/AMDGPUStream.jl
Normal file
178
JuliaStream.jl/src/AMDGPUStream.jl
Normal file
@ -0,0 +1,178 @@
|
|||||||
|
# AMDGPU.jl doesn't support CPU agents, so this isn't a feature-complete ROCmStream, only AMD GPUs
|
||||||
|
include("Stream.jl")
|
||||||
|
using AMDGPU
|
||||||
|
|
||||||
|
# StreamData parameterised with ROCArray storage for element type T
const ROCData = StreamData{T,ROCArray{T}} where {T}
# Work-items per workgroup for every kernel launch; arraysize must be a multiple of this
const TBSize = 1024
# Number of workgroups (and partial sums) used by the dot-product reduction
const DotBlocks = 256
|
||||||
|
|
||||||
|
# AMDGPU.agents()'s internal iteration order isn't stable, so the GPU agents are
# returned sorted by their `repr` string to give device indices a fixed meaning.
function gpu_agents_in_repr_order()
  # XXX if we select anything other than :gpu, we get
  # HSA_STATUS_ERROR_INVALID_AGENT on the first kernel submission
  gpu_agents = AMDGPU.get_agents(:gpu)
  return sort(gpu_agents; by = repr)
end
|
||||||
|
|
||||||
|
# Return the `repr` string of every GPU agent, in the order produced by
# gpu_agents_in_repr_order(). If querying the agents throws, an empty list is
# returned instead of propagating the error.
function devices()
  try
    return map(repr, gpu_agents_in_repr_order())
  catch
    # probably unsupported; use a typed empty vector so both paths yield strings
    return String[]
  end
end
|
||||||
|
|
||||||
|
# Grid size passed to `@roc` for the element-wise kernels: the array length (`data.size`).
function gridsize(data::ROCData{T})::Int where {T}
  total_items = data.size
  return total_items
end
|
||||||
|
|
||||||
|
# Build the AMDGPU stream state.
#
# - `arraysize`: number of elements per array; must be a multiple of TBSize
# - `scalar`:    scalar stored in the returned data
# - `device`:    1-based index into gpu_agents_in_repr_order()
# - `silent`:    when true, the device/kernel banner is not printed
#
# Returns a ROCData{T} holding three uninitialised ROCArray{T}s of length `arraysize`.
# Raises an error if `arraysize` is not divisible by TBSize.
function make_stream(
  arraysize::Int,
  scalar::T,
  device::Int,
  silent::Bool,
)::ROCData{T} where {T}

  # kernels index one element per work-item without a bounds guard, so the
  # array length has to be a whole number of workgroups
  arraysize % TBSize == 0 ||
    error("arraysize ($(arraysize)) must be divisible by $(TBSize)!")

  # XXX AMDGPU doesn't expose an API for setting the default like CUDA.device!()
  # but AMDGPU.get_default_agent returns DEFAULT_AGENT so we can do it by hand
  AMDGPU.DEFAULT_AGENT[] = gpu_agents_in_repr_order()[device]

  new_array() = ROCArray{T}(undef, arraysize)
  data = ROCData{T}(new_array(), new_array(), new_array(), scalar, arraysize)

  selected = AMDGPU.get_default_agent()
  if !silent
    println("Using GPU HSA device: $(AMDGPU.get_name(selected)) ($(repr(selected)))")
    println("Kernel parameters   : <<<$(gridsize(data)),$(TBSize)>>>")
  end
  return data
end
|
||||||
|
|
||||||
|
# Wait on a launched kernel with `soft = false`;
# soft wait causes HSA_REFCOUNT overflow issues
hard_wait(kernel) = AMDGPU.wait(kernel; soft = false)
|
||||||
|
|
||||||
|
# Fill `data.a`, `data.b` and `data.c` with `init[1]`, `init[2]` and `init[3]` respectively.
function init_arrays!(data::ROCData{T}, init::Tuple{T,T,T}) where {T}
  a0, b0, c0 = init
  AMDGPU.fill!(data.a, a0)
  AMDGPU.fill!(data.b, b0)
  return AMDGPU.fill!(data.c, c0)
end
|
||||||
|
|
||||||
|
# Copy kernel: c[i] = a[i] for every element, then wait on the launch via hard_wait.
function copy!(data::ROCData{T}) where {T}
  function copy_kernel(src, dst)
    # 1-based global index of this work-item; no bounds guard because
    # make_stream rejects array sizes that are not a multiple of TBSize
    idx = workitemIdx().x + (workgroupIdx().x - 1) * workgroupDim().x
    @inbounds dst[idx] = src[idx]
    return
  end
  launched = @roc groupsize = TBSize gridsize = gridsize(data) copy_kernel(data.a, data.c)
  return hard_wait(launched)
end
|
||||||
|
|
||||||
|
# Mul kernel: b[i] = scalar * c[i] for every element, then wait on the launch via hard_wait.
function mul!(data::ROCData{T}) where {T}
  function mul_kernel(dst, src, factor)
    # 1-based global index of this work-item
    idx = workitemIdx().x + (workgroupIdx().x - 1) * workgroupDim().x
    @inbounds dst[idx] = factor * src[idx]
    return
  end
  launched = @roc groupsize = TBSize gridsize = gridsize(data) mul_kernel(
    data.b,
    data.c,
    data.scalar,
  )
  return hard_wait(launched)
end
|
||||||
|
|
||||||
|
# Add kernel: c[i] = a[i] + b[i] for every element, then wait on the launch via hard_wait.
function add!(data::ROCData{T}) where {T}
  function add_kernel(lhs, rhs, out)
    # 1-based global index of this work-item
    idx = workitemIdx().x + (workgroupIdx().x - 1) * workgroupDim().x
    @inbounds out[idx] = lhs[idx] + rhs[idx]
    return
  end
  launched = @roc groupsize = TBSize gridsize = gridsize(data) add_kernel(
    data.a,
    data.b,
    data.c,
  )
  return hard_wait(launched)
end
|
||||||
|
|
||||||
|
# Triad kernel: a[i] = b[i] + scalar * c[i] for every element, then wait via hard_wait.
function triad!(data::ROCData{T}) where {T}
  function triad_kernel(out, x, y, factor)
    # 1-based global index of this work-item
    idx = workitemIdx().x + (workgroupIdx().x - 1) * workgroupDim().x
    @inbounds out[idx] = x[idx] + factor * y[idx]
    return
  end
  launched = @roc groupsize = TBSize gridsize = gridsize(data) triad_kernel(
    data.a,
    data.b,
    data.c,
    data.scalar,
  )
  return hard_wait(launched)
end
|
||||||
|
|
||||||
|
# Nstream kernel: a[i] += b[i] + scalar * c[i] for every element, then wait via hard_wait.
function nstream!(data::ROCData{T}) where {T}
  function nstream_kernel(acc, x, y, factor)
    # 1-based global index of this work-item
    idx = workitemIdx().x + (workgroupIdx().x - 1) * workgroupDim().x
    @inbounds acc[idx] = acc[idx] + (x[idx] + factor * y[idx])
    return
  end
  launched = @roc groupsize = TBSize gridsize = gridsize(data) nstream_kernel(
    data.a,
    data.b,
    data.c,
    data.scalar,
  )
  return hard_wait(launched)
end
|
||||||
|
|
||||||
|
# Dot product of `data.a` and `data.b`.
#
# Launches DotBlocks workgroups of TBSize work-items. Each work-item accumulates
# a strided partial sum, each workgroup reduces its TBSize partials to one value,
# and the DotBlocks per-workgroup results are summed with `sum` at the end.
function dot(data::ROCData{T}) where {T}
  function kernel(a, b, size, partial)
    # scratch buffer obtained from alloc_local, one slot per work-item of the workgroup
    tb_sum = ROCDeviceArray((TBSize,), alloc_local(:reduce, T, TBSize))
    local_i = workitemIdx().x
    # zero(T) keeps the initial value in the element type instead of a Float64 literal
    @inbounds tb_sum[local_i] = zero(T)

    # do dot first
    i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x
    while i <= size
      @inbounds tb_sum[local_i] += a[i] * b[i]
      i += TBSize * DotBlocks # XXX don't use (workgroupDim().x * gridDimWG().x) here
    end

    # then tree reduction: halve the active range each step, syncing the
    # workgroup before every read of another work-item's slot
    offset = workgroupDim().x ÷ 2
    while offset > 0
      sync_workgroup()
      if (local_i - 1) < offset
        @inbounds tb_sum[local_i] += tb_sum[local_i+offset]
      end
      offset ÷= 2
    end

    # the first work-item publishes this workgroup's total
    if (local_i == 1)
      @inbounds partial[workgroupIdx().x] = tb_sum[local_i]
    end

    return
  end
  partial_sum = ROCArray{T}(undef, DotBlocks)
  hard_wait(
    @roc groupsize = TBSize gridsize = TBSize * DotBlocks kernel(
      data.a,
      data.b,
      data.size,
      partial_sum,
    )
  )
  return sum(partial_sum)
end
|
||||||
|
|
||||||
|
# Package the three arrays, the scalar and the size of `data` into a VectorData{T}.
# NOTE(review): presumably the VectorData{T} constructor converts the device
# arrays to host vectors — confirm against its definition.
function read_data(data::ROCData{T})::VectorData{T} where {T}
  a, b, c = data.a, data.b, data.c
  return VectorData{T}(a, b, c, data.scalar, data.size)
end
|
||||||
|
|
||||||
|
main()
|
||||||
146
JuliaStream.jl/src/CUDAStream.jl
Normal file
146
JuliaStream.jl/src/CUDAStream.jl
Normal file
@ -0,0 +1,146 @@
|
|||||||
|
include("Stream.jl")
|
||||||
|
using CUDA
|
||||||
|
|
||||||
|
# StreamData parameterised with CuArray storage for element type T
const CuData = StreamData{T,CuArray{T}} where {T}
# Threads per block for every kernel launch; arraysize must be a multiple of this
const TBSize = 1024
# Number of blocks (and partial sums) used by the dot-product reduction
const DotBlocks = 256
|
||||||
|
|
||||||
|
# Return one "<name> (<repr>)" string per CUDA device, or an empty list when
# CUDA is not functional.
function devices()
  # typed empty vector so both paths yield a vector of strings
  CUDA.functional(false) || return String[]
  return map(d -> "$(CUDA.name(d)) ($(repr(d)))", CUDA.devices())
end
|
||||||
|
|
||||||
|
# Number of thread blocks for the element-wise kernels: array length divided by TBSize.
function blocks(data::CuData{T})::Int where {T}
  return div(data.size, TBSize)
end
|
||||||
|
|
||||||
|
# Build the CUDA stream state.
#
# - `arraysize`: number of elements per array; must be a multiple of TBSize
# - `scalar`:    scalar stored in the returned data
# - `device`:    1-based device index (converted to CUDA's 0-based index)
# - `silent`:    when true, the device/kernel banner is not printed
#
# Returns a CuData{T} holding three uninitialised CuArray{T}s of length `arraysize`.
# Raises an error if `arraysize` is not divisible by TBSize or CUDA is not functional.
function make_stream(
  arraysize::Int,
  scalar::T,
  device::Int,
  silent::Bool,
)::CuData{T} where {T}

  if arraysize % TBSize != 0
    error("arraysize ($(arraysize)) must be divisible by $(TBSize)!")
  end

  # show_reason is set to true here so it dumps CUDA info
  # for us regardless of whether it's functional.
  # This runs before any device call so that a broken installation is reported
  # with this message rather than whatever CUDA.device! would raise first.
  if !CUDA.functional(true)
    error("Non-functional CUDA configuration")
  end

  # so CUDA's device is 0 indexed, so -1 from Julia
  CUDA.device!(device - 1)
  selected = CUDA.device()

  data = CuData{T}(
    CuArray{T}(undef, arraysize),
    CuArray{T}(undef, arraysize),
    CuArray{T}(undef, arraysize),
    scalar,
    arraysize,
  )
  if !silent
    println("Using CUDA device: $(CUDA.name(selected)) ($(repr(selected)))")
    println("Kernel parameters: <<<$(blocks(data)),$(TBSize)>>>")
  end
  return data
end
|
||||||
|
|
||||||
|
# Fill `data.a`, `data.b` and `data.c` with `init[1]`, `init[2]` and `init[3]` respectively.
function init_arrays!(data::CuData{T}, init::Tuple{T,T,T}) where {T}
  a0, b0, c0 = init
  CUDA.fill!(data.a, a0)
  CUDA.fill!(data.b, b0)
  return CUDA.fill!(data.c, c0)
end
|
||||||
|
|
||||||
|
# Copy kernel: c[i] = a[i] for every element, followed by CUDA.synchronize().
function copy!(data::CuData{T}) where {T}
  function copy_kernel(src, dst)
    # 1-based global thread index; no bounds guard because make_stream
    # rejects array sizes that are not a multiple of TBSize
    idx = threadIdx().x + (blockIdx().x - 1) * blockDim().x
    @inbounds dst[idx] = src[idx]
    return
  end
  @cuda threads = TBSize blocks = blocks(data) copy_kernel(data.a, data.c)
  return CUDA.synchronize()
end
|
||||||
|
|
||||||
|
# Mul kernel: b[i] = scalar * c[i] for every element, followed by CUDA.synchronize().
function mul!(data::CuData{T}) where {T}
  function mul_kernel(dst, src, factor)
    # 1-based global thread index
    idx = threadIdx().x + (blockIdx().x - 1) * blockDim().x
    @inbounds dst[idx] = factor * src[idx]
    return
  end
  @cuda threads = TBSize blocks = blocks(data) mul_kernel(data.b, data.c, data.scalar)
  return CUDA.synchronize()
end
|
||||||
|
|
||||||
|
# Add kernel: c[i] = a[i] + b[i] for every element, followed by CUDA.synchronize().
function add!(data::CuData{T}) where {T}
  function add_kernel(lhs, rhs, out)
    # 1-based global thread index
    idx = threadIdx().x + (blockIdx().x - 1) * blockDim().x
    @inbounds out[idx] = lhs[idx] + rhs[idx]
    return
  end
  @cuda threads = TBSize blocks = blocks(data) add_kernel(data.a, data.b, data.c)
  return CUDA.synchronize()
end
|
||||||
|
|
||||||
|
# Triad kernel: a[i] = b[i] + scalar * c[i] for every element, followed by CUDA.synchronize().
function triad!(data::CuData{T}) where {T}
  function triad_kernel(out, x, y, factor)
    # 1-based global thread index
    idx = threadIdx().x + (blockIdx().x - 1) * blockDim().x
    @inbounds out[idx] = x[idx] + factor * y[idx]
    return
  end
  @cuda threads = TBSize blocks = blocks(data) triad_kernel(
    data.a,
    data.b,
    data.c,
    data.scalar,
  )
  return CUDA.synchronize()
end
|
||||||
|
|
||||||
|
# Nstream kernel: a[i] += b[i] + scalar * c[i] for every element, followed by CUDA.synchronize().
function nstream!(data::CuData{T}) where {T}
  function nstream_kernel(acc, x, y, factor)
    # 1-based global thread index
    idx = threadIdx().x + (blockIdx().x - 1) * blockDim().x
    @inbounds acc[idx] = acc[idx] + (x[idx] + factor * y[idx])
    return
  end
  @cuda threads = TBSize blocks = blocks(data) nstream_kernel(
    data.a,
    data.b,
    data.c,
    data.scalar,
  )
  return CUDA.synchronize()
end
|
||||||
|
|
||||||
|
# Dot product of `data.a` and `data.b`.
#
# Launches DotBlocks blocks of TBSize threads. Each thread accumulates a strided
# partial sum, each block reduces its TBSize partials to one value, and the
# DotBlocks per-block results are summed with `sum` at the end.
function dot(data::CuData{T}) where {T}
  # direct port of the reduction in CUDAStream.cu
  function kernel(a, b, size, partial)
    # static shared-memory buffer, one slot per thread of the block
    tb_sum = @cuStaticSharedMem(T, TBSize)
    local_i = threadIdx().x
    # zero(T) keeps the initial value in the element type instead of a Float64 literal
    @inbounds tb_sum[local_i] = zero(T)

    # do dot first
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    while i <= size
      @inbounds tb_sum[local_i] += a[i] * b[i]
      i += blockDim().x * gridDim().x
    end

    # then tree reduction: halve the active range each step, syncing the
    # block before every read of another thread's slot
    offset = blockDim().x ÷ 2
    while offset > 0
      sync_threads()
      if (local_i - 1) < offset
        @inbounds tb_sum[local_i] += tb_sum[local_i+offset]
      end
      offset ÷= 2
    end

    # the first thread publishes this block's total
    if (local_i == 1)
      @inbounds partial[blockIdx().x] = tb_sum[local_i]
    end

    return
  end
  partial_sum = CuArray{T}(undef, DotBlocks)
  @cuda blocks = DotBlocks threads = TBSize kernel(data.a, data.b, data.size, partial_sum)
  CUDA.synchronize()
  return sum(partial_sum)
end
|
||||||
|
|
||||||
|
"""
    read_data(data::CuData)

Build a `VectorData` from the three arrays, the scalar and the size held in
`data`, so the results can be validated.
"""
function read_data(data::CuData{T})::VectorData{T} where {T}
  host = VectorData{T}(data.a, data.b, data.c, data.scalar, data.size)
  return host
end
|
||||||
|
|
||||||
|
main()
|
||||||
84
JuliaStream.jl/src/DistributedStream.jl
Normal file
84
JuliaStream.jl/src/DistributedStream.jl
Normal file
@ -0,0 +1,84 @@
|
|||||||
|
using Distributed
|
||||||
|
|
||||||
|
include("Stream.jl")
|
||||||
|
|
||||||
|
@everywhere include("StreamData.jl")
|
||||||
|
@everywhere using SharedArrays
|
||||||
|
@everywhere const SharedArrayData = StreamData{T,SharedArray{T}} where {T}
|
||||||
|
|
||||||
|
"""
    devices()

Return the names of the selectable devices; this implementation only offers
the local CPU.
"""
devices() = ["CPU (localhost)"]
|
||||||
|
|
||||||
|
"""
    make_stream(arraysize, scalar, device, silent)

Create the benchmark state backed by three `SharedArray`s of `arraysize`
elements. `device` must be `1`; unless `silent` is set, the number of worker
processes is printed.
"""
function make_stream(
  arraysize::Int,
  scalar::T,
  device::Int,
  silent::Bool,
)::SharedArrayData{T} where {T}
  device == 1 || error("Only CPU device is supported")

  silent || println("Using max $(nworkers()) process(es) + 1 master")

  a = SharedArray{T}(arraysize)
  b = SharedArray{T}(arraysize)
  c = SharedArray{T}(arraysize)
  return SharedArrayData{T}(a, b, c, scalar, arraysize)
end
|
||||||
|
|
||||||
|
"""
    init_arrays!(data, init)

Fill `a`, `b` and `c` with the three values of `init`, splitting the index
range across the worker processes and waiting for all of them.
"""
function init_arrays!(data::SharedArrayData{T}, init::Tuple{T,T,T}) where {T}
  a, b, c = data.a, data.b, data.c
  (a0, b0, c0) = init
  @sync @distributed for idx = 1:data.size
    @inbounds a[idx] = a0
    @inbounds b[idx] = b0
    @inbounds c[idx] = c0
  end
end
|
||||||
|
|
||||||
|
"""
    copy!(data)

Copy kernel: `c[i] = a[i]` over `1:data.size`, distributed across workers.
"""
function copy!(data::SharedArrayData{T}) where {T}
  src, dst = data.a, data.c
  @sync @distributed for idx = 1:data.size
    @inbounds dst[idx] = src[idx]
  end
end
|
||||||
|
|
||||||
|
"""
    mul!(data)

Mul kernel: `b[i] = scalar * c[i]` over `1:data.size`, distributed across
workers.
"""
function mul!(data::SharedArrayData{T}) where {T}
  b, c = data.b, data.c
  factor = data.scalar
  @sync @distributed for idx = 1:data.size
    @inbounds b[idx] = factor * c[idx]
  end
end
|
||||||
|
|
||||||
|
"""
    add!(data)

Add kernel: `c[i] = a[i] + b[i]` over `1:data.size`, distributed across
workers.
"""
function add!(data::SharedArrayData{T}) where {T}
  a, b, c = data.a, data.b, data.c
  @sync @distributed for idx = 1:data.size
    @inbounds c[idx] = a[idx] + b[idx]
  end
end
|
||||||
|
|
||||||
|
"""
    triad!(data)

Triad kernel: `a[i] = b[i] + scalar * c[i]` over `1:data.size`, distributed
across workers.
"""
function triad!(data::SharedArrayData{T}) where {T}
  a, b, c = data.a, data.b, data.c
  factor = data.scalar
  @sync @distributed for idx = 1:data.size
    @inbounds a[idx] = b[idx] + (factor * c[idx])
  end
end
|
||||||
|
|
||||||
|
"""
    nstream!(data)

Nstream kernel: `a[i] += b[i] + scalar * c[i]` over `1:data.size`,
distributed across workers.
"""
function nstream!(data::SharedArrayData{T}) where {T}
  a, b, c = data.a, data.b, data.c
  factor = data.scalar
  @sync @distributed for idx = 1:data.size
    @inbounds a[idx] += b[idx] + factor * c[idx]
  end
end
|
||||||
|
|
||||||
|
"""
    dot(data)

Dot product of `a` and `b` over `1:data.size`, computed as a distributed
`(+)` reduction of the element-wise products.
"""
function dot(data::SharedArrayData{T}) where {T}
  a, b = data.a, data.b
  total = @distributed (+) for idx = 1:data.size
    @inbounds a[idx] * b[idx]
  end
  return total
end
|
||||||
|
|
||||||
|
"""
    read_data(data)

Build a `VectorData` from the three shared arrays, the scalar and the size
held in `data`, so the results can be validated.
"""
function read_data(data::SharedArrayData{T})::VectorData{T} where {T}
  local_copy = VectorData{T}(data.a, data.b, data.c, data.scalar, data.size)
  return local_copy
end
|
||||||
|
|
||||||
|
main()
|
||||||
4
JuliaStream.jl/src/JuliaStream.jl
Normal file
4
JuliaStream.jl/src/JuliaStream.jl
Normal file
@ -0,0 +1,4 @@
|
|||||||
|
# Empty package module; loading this file only prints a notice pointing at the
# runnable scripts.
module JuliaStream
end

println("Please run benchmarks directly via `julia --project src/<IMPL>Stream.jl`")
|
||||||
64
JuliaStream.jl/src/PlainStream.jl
Normal file
64
JuliaStream.jl/src/PlainStream.jl
Normal file
@ -0,0 +1,64 @@
|
|||||||
|
include("Stream.jl")
|
||||||
|
|
||||||
|
"""
    devices()

Return the names of the selectable devices; this implementation only offers
the CPU.
"""
devices() = ["CPU"]
|
||||||
|
|
||||||
|
"""
    make_stream(arraysize, scalar, device, silent)

Create the benchmark state backed by three plain `Vector`s of `arraysize`
elements. `device` must be `1`. `silent` is accepted for interface parity with
the other implementations and is not used here.

The vectors are allocated uninitialised: their contents are only defined after
`init_arrays!` has run. Converting `1:arraysize` (as done previously) wrote
`arraysize` throw-away values into every array.
"""
function make_stream(arraysize::Int, scalar::T, device::Int, silent::Bool)::VectorData{T} where {T}
  if device != 1
    error("Only CPU device is supported")
  end
  return VectorData{T}(
    Vector{T}(undef, arraysize),
    Vector{T}(undef, arraysize),
    Vector{T}(undef, arraysize),
    scalar,
    arraysize,
  )
end
|
||||||
|
|
||||||
|
"""
    init_arrays!(data, init)

Fill `a`, `b` and `c` with the three values of `init` over `1:data.size`.
"""
function init_arrays!(data::VectorData{T}, init::Tuple{T,T,T}) where {T}
  a, b, c = data.a, data.b, data.c
  (a0, b0, c0) = init
  for idx in 1:data.size
    @inbounds a[idx] = a0
    @inbounds b[idx] = b0
    @inbounds c[idx] = c0
  end
end
|
||||||
|
|
||||||
|
"""
    copy!(data)

Copy kernel: `c[i] = a[i]` over `1:data.size`.
"""
function copy!(data::VectorData{T}) where {T}
  src, dst = data.a, data.c
  for idx in 1:data.size
    @inbounds dst[idx] = src[idx]
  end
end
|
||||||
|
|
||||||
|
"""
    mul!(data)

Mul kernel: `b[i] = scalar * c[i]` over `1:data.size`.
"""
function mul!(data::VectorData{T}) where {T}
  b, c = data.b, data.c
  factor = data.scalar
  for idx in 1:data.size
    @inbounds b[idx] = factor * c[idx]
  end
end
|
||||||
|
|
||||||
|
"""
    add!(data)

Add kernel: `c[i] = a[i] + b[i]` over `1:data.size`.
"""
function add!(data::VectorData{T}) where {T}
  a, b, c = data.a, data.b, data.c
  for idx in 1:data.size
    @inbounds c[idx] = a[idx] + b[idx]
  end
end
|
||||||
|
|
||||||
|
"""
    triad!(data)

Triad kernel: `a[i] = b[i] + scalar * c[i]` over `1:data.size`.
"""
function triad!(data::VectorData{T}) where {T}
  a, b, c = data.a, data.b, data.c
  factor = data.scalar
  for idx in 1:data.size
    @inbounds a[idx] = b[idx] + (factor * c[idx])
  end
end
|
||||||
|
|
||||||
|
"""
    nstream!(data)

Nstream kernel: `a[i] += b[i] + scalar * c[i]` over `1:data.size`.
"""
function nstream!(data::VectorData{T}) where {T}
  a, b, c = data.a, data.b, data.c
  factor = data.scalar
  for idx in 1:data.size
    @inbounds a[idx] += b[idx] + factor * c[idx]
  end
end
|
||||||
|
|
||||||
|
"""
    dot(data)

Dot product of `a` and `b` over `1:data.size`, accumulated left to right
starting from `zero(T)`.
"""
function dot(data::VectorData{T}) where {T}
  a, b = data.a, data.b
  acc = zero(T)
  for idx in 1:data.size
    @inbounds acc += a[idx] * b[idx]
  end
  return acc
end
|
||||||
|
|
||||||
|
"""
    read_data(data)

Return `data` itself; the arrays are already ordinary `Vector`s, so nothing
has to be converted.
"""
function read_data(data::VectorData{T})::VectorData{T} where {T}
  data
end
|
||||||
|
|
||||||
|
main()
|
||||||
292
JuliaStream.jl/src/Stream.jl
Normal file
292
JuliaStream.jl/src/Stream.jl
Normal file
@ -0,0 +1,292 @@
|
|||||||
|
using ArgParse
|
||||||
|
using Parameters
|
||||||
|
using Printf
|
||||||
|
using Base: Float64, Int
|
||||||
|
|
||||||
|
include("StreamData.jl")
|
||||||
|
|
||||||
|
const VectorData = StreamData{T,Vector{T}} where {T}
|
||||||
|
|
||||||
|
# Per-kernel wall-clock timings, one entry per benchmark iteration.
struct Timings
  copy::Vector{Float64}
  mul::Vector{Float64}
  add::Vector{Float64}
  triad::Vector{Float64}
  dot::Vector{Float64}

  # Build five independent zero-filled vectors of length `n`.
  function Timings(n)
    blank() = zeros(n)
    return new(blank(), blank(), blank(), blank(), blank())
  end
end
|
||||||
|
|
||||||
|
@enum Benchmark All Triad Nstream
|
||||||
|
|
||||||
|
"""
    run_all!(data, times)

Run copy, mul, add, triad and dot `times` times in that order, timing every
call individually. Return the collected `Timings` together with the dot result
of the last iteration (`0` converted to `T` when `times` is zero).
"""
function run_all!(data::StreamData{T,C}, times::Int)::Tuple{Timings,T} where {T,C}
  record = Timings(times)
  last_dot::T = 0
  for iter in 1:times
    @inbounds record.copy[iter] = @elapsed copy!(data)
    @inbounds record.mul[iter] = @elapsed mul!(data)
    @inbounds record.add[iter] = @elapsed add!(data)
    @inbounds record.triad[iter] = @elapsed triad!(data)
    @inbounds record.dot[iter] = @elapsed last_dot = dot(data)
  end
  return (record, last_dot)
end
|
||||||
|
|
||||||
|
"""
    run_triad!(data, times)

Run the triad kernel `times` times back to back and return the total elapsed
time in seconds for the whole loop.
"""
function run_triad!(data::StreamData{T,C}, times::Int)::Float64 where {T,C}
  elapsed = @elapsed begin
    for _ in 1:times
      triad!(data)
    end
  end
  return elapsed
end
|
||||||
|
|
||||||
|
"""
    run_nstream!(data, times)

Run the nstream kernel `times` times and return the elapsed seconds of every
individual call.
"""
function run_nstream!(data::StreamData{T,C}, times::Int)::Vector{Float64} where {T,C}
  samples = Vector{Float64}(undef, times)
  for iter in 1:times
    @inbounds samples[iter] = @elapsed nstream!(data)
  end
  return samples
end
|
||||||
|
|
||||||
|
"""
    check_solutions(data, times, init, benchmark, dot)

Validate the arrays in `data` (and optionally the dot result) against values
recomputed on scalars.

The expected ("gold") values start from `init` and are advanced `times` times
with the same arithmetic as the selected `benchmark`. Each array passes when
its mean absolute deviation from the gold value is at most `100 * eps(T)`.
When `dot` is not `nothing`, its relative error against
`gold_a * gold_b * data.size` must be at most `1.0e-8`.

Prints a message for every failed check and returns `true` only if all checks
passed. Throws an `ErrorException` for an unknown `benchmark`.
"""
function check_solutions(
  data::StreamData{T,C},
  times::Int,
  init::Tuple{T,T,T},
  benchmark::Benchmark,
  dot::Union{T,Nothing},
) where {T,C}
  (gold_a, gold_b, gold_c) = init
  for _ = 1:times
    if benchmark == All
      gold_c = gold_a
      gold_b = data.scalar * gold_c
      gold_c = gold_a + gold_b
      gold_a = gold_b + data.scalar * gold_c
    elseif benchmark == Triad
      gold_a = gold_b + data.scalar * gold_c
    elseif benchmark == Nstream
      gold_a += gold_b + data.scalar * gold_c
    else
      # No local in this function may be called `error`: an assignment anywhere
      # in the body would make the name local and break this call.
      error("Unknown benchmark $(benchmark)")
    end
  end

  tolerance = eps(T) * 100

  # Mean absolute deviation of `xs` from the expected value `from`.
  function validate_xs(name::String, xs::AbstractArray{T}, from::T)
    avg_err = sum(abs.(xs .- from)) / length(xs)
    xs_failed = avg_err > tolerance
    if xs_failed
      println("Validation failed on $name. Average error $avg_err")
    end
    return !xs_failed
  end

  a_valid = validate_xs("a", data.a, gold_a)
  b_valid = validate_xs("b", data.b, gold_b)
  c_valid = validate_xs("c", data.c, gold_c)

  dot_valid = true
  if dot !== nothing
    gold_sum = gold_a * gold_b * data.size
    rel_err = abs((dot - gold_sum) / gold_sum)
    sum_failed = rel_err > 1.0e-8
    if sum_failed
      println(
        "Validation failed on sum. Error $rel_err \nSum was $dot but should be $gold_sum",
      )
    end
    dot_valid = !sum_failed
  end

  return a_valid && b_valid && c_valid && dot_valid
end
|
||||||
|
|
||||||
|
# Run-time options; every field except `impl` is overwritten by the command
# line option of the same name in `parse_options`.
@with_kw mutable struct Config
  list::Bool = false # list the available devices and exit
  impl::String = "threaded" # implementation name printed in the banner
  device::Int = 1 # 1-based device index
  numtimes::Int = 100 # number of benchmark iterations
  arraysize::Int = 33554432 # elements per array
  float::Bool = false # Float32 when set, Float64 otherwise
  triad_only::Bool = false # run only the triad kernel
  nstream_only::Bool = false # run only the nstream kernel
  csv::Bool = false # print results as a CSV table
  mibibytes::Bool = false # scale bandwidth by 2^20 instead of 10^6
end
|
||||||
|
|
||||||
|
"""
    parse_options(given::Config)

Parse the process command line and store every parsed option in the field of
`given` that has the same name. The current values of `given.device`,
`given.numtimes` and `given.arraysize` are used as the defaults of their
options.
"""
function parse_options(given::Config)
  settings = ArgParseSettings()
  @add_arg_table settings begin
    "--list"
    help = "List available devices"
    action = :store_true

    "--device", "-d"
    help = "Select device at DEVICE, NOTE: Julia is 1-indexed"
    arg_type = Int
    default = given.device

    "--numtimes", "-n"
    help = "Run the test NUMTIMES times (NUM >= 2)"
    arg_type = Int
    default = given.numtimes

    "--arraysize", "-s"
    help = "Use ARRAYSIZE elements in the array"
    arg_type = Int
    default = given.arraysize

    "--float"
    help = "Use floats (rather than doubles)"
    action = :store_true

    "--triad_only"
    help = "Only run triad"
    action = :store_true

    "--nstream_only"
    help = "Only run nstream"
    action = :store_true

    "--csv"
    help = "Output as csv table"
    action = :store_true

    "--mibibytes"
    help = "Use MiB=2^20 for bandwidth calculation (default MB=10^6)"
    action = :store_true
  end
  parsed = parse_args(settings)
  # every parsed key is written to the Config field of the same name
  for (key, value) in parsed
    setproperty!(given, Symbol(key), value)
  end
end
|
||||||
|
|
||||||
|
const DefaultInit = (0.1, 0.2, 0.0)
|
||||||
|
const DefaultScalar = 0.4
|
||||||
|
const Version = "3.4.0"
|
||||||
|
|
||||||
|
"""
    main()

Entry point shared by all implementations: parse the command line, optionally
list the devices, create and initialise the stream data through the
implementation's `make_stream` / `init_arrays!`, run the selected benchmark,
validate the results and print a report.

Per-kernel statistics skip the first (warm-up) iteration whenever more than
one iteration was recorded. Exits with status 1 when validation fails and
throws an `ErrorException` for an out-of-range device or when both
`--triad_only` and `--nstream_only` are given.
"""
function main()
  config::Config = Config()
  parse_options(config)

  ds = devices()
  if config.list
    for (i, device) in enumerate(ds)
      println("[$i] $(device)")
    end
    exit(0)
  end

  if config.device < 1 || config.device > length(ds)
    error(
      "Device $(config.device) out of range (1..$(length(ds))), NOTE: Julia is 1-indexed",
    )
  end

  elty = config.float ? Float32 : Float64

  if config.nstream_only && config.triad_only
    error("Both triad and nstream are enabled, pick one or omit both to run all benchmarks")
  end
  benchmark = config.nstream_only ? Nstream : (config.triad_only ? Triad : All)

  array_bytes = config.arraysize * sizeof(elty)
  total_bytes = array_bytes * 3
  (mega_scale, mega_suffix, giga_scale, giga_suffix) =
    !config.mibibytes ? (1.0e-6, "MB", 1.0e-9, "GB") : (2.0^-20, "MiB", 2.0^-30, "GiB")

  if !config.csv
    println("BabelStream")
    println("Version: $Version")
    println("Implementation: Julia; $(config.impl)")
    println("Running kernels $(config.numtimes) times")
    if benchmark == Triad
      println("Number of elements: $(config.arraysize)")
    end
    println("Precision: $(config.float ? "float" : "double")")
    r1 = n -> round(n; digits = 1)
    println(
      "Array size: $(r1(mega_scale * array_bytes)) $mega_suffix(=$(r1(giga_scale * array_bytes)) $giga_suffix)",
    )
    println(
      "Total size: $(r1(mega_scale * total_bytes)) $mega_suffix(=$(r1(giga_scale * total_bytes)) $giga_suffix)",
    )
  end

  # One table row (label/value pairs) for the kernel `name`; `kernel_bytes` is
  # the number of bytes the kernel moves per iteration.
  function mk_row(xs::Vector{Float64}, name::String, kernel_bytes::Int)
    # Drop the first (warm-up) sample; a single-sample run keeps that sample
    # so that `--numtimes 1` still produces a row.
    tail = length(xs) > 1 ? xs[2:end] : xs
    fastest = minimum(tail)
    slowest = maximum(tail)
    average = sum(tail) / length(tail)
    mbps = mega_scale * kernel_bytes / fastest
    if config.csv
      return Tuple{String,Any}[
        ("function", name),
        ("num_times", config.numtimes),
        ("n_elements", config.arraysize),
        ("sizeof", kernel_bytes),
        ("max_m$( config.mibibytes ? "i" : "")bytes_per_sec", mbps),
        ("min_runtime", fastest),
        ("max_runtime", slowest),
        ("avg_runtime", average),
      ]
    else
      return Tuple{String,Any}[
        ("Function", name),
        ("M$(config.mibibytes ? "i" : "")Bytes/sec", round(mbps; digits = 3)),
        ("Min (sec)", round(fastest; digits = 5)),
        ("Max", round(slowest; digits = 5)),
        ("Average", round(average; digits = 5)),
      ]
    end
  end

  # Print the labels of the first row as header, then the values of every row.
  function tabulate(rows::Vector{Tuple{String,Any}}...)
    header = first(rows)
    padding = config.csv ? 0 : 12
    sep = config.csv ? "," : ""
    println(join(map(cell -> rpad(cell[1], padding), header), sep))
    for row in rows
      println(join(map(cell -> rpad(cell[2], padding), row), sep))
    end
  end

  init = convert(Tuple{elty,elty,elty}, DefaultInit)
  scalar = convert(elty, DefaultScalar)

  data = make_stream(config.arraysize, scalar, config.device, config.csv)

  init_arrays!(data, init)
  if benchmark == All
    (timings, dot_sum) = run_all!(data, config.numtimes)
    valid = check_solutions(read_data(data), config.numtimes, init, benchmark, dot_sum)
    tabulate(
      mk_row(timings.copy, "Copy", 2 * array_bytes),
      mk_row(timings.mul, "Mul", 2 * array_bytes),
      mk_row(timings.add, "Add", 3 * array_bytes),
      mk_row(timings.triad, "Triad", 3 * array_bytes),
      mk_row(timings.dot, "Dot", 2 * array_bytes),
    )
  elseif benchmark == Nstream
    timings = run_nstream!(data, config.numtimes)
    valid = check_solutions(read_data(data), config.numtimes, init, benchmark, nothing)
    tabulate(mk_row(timings, "Nstream", 4 * array_bytes))
  elseif benchmark == Triad
    elapsed = run_triad!(data, config.numtimes)
    valid = check_solutions(read_data(data), config.numtimes, init, benchmark, nothing)
    moved_bytes = 3 * array_bytes * config.numtimes
    # The label uses the giga suffix, so the value must use the giga scale.
    bandwidth = giga_scale * (moved_bytes / elapsed)
    println("Runtime (seconds): $(round(elapsed; digits=5))")
    println("Bandwidth ($giga_suffix/s): $(round(bandwidth; digits=3)) ")
  else
    error("Bad benchmark $(benchmark)")
  end

  if !valid
    exit(1)
  end
end
|
||||||
8
JuliaStream.jl/src/StreamData.jl
Normal file
8
JuliaStream.jl/src/StreamData.jl
Normal file
@ -0,0 +1,8 @@
|
|||||||
|
|
||||||
|
# Benchmark state shared by all implementations: three arrays of the same
# container type `C` with element type `T`, the scalar used by the mul, triad
# and nstream kernels, and the number of elements the kernels iterate over.
struct StreamData{T,C<:AbstractArray{T}}
  a::C
  b::C
  c::C
  scalar::T
  size::Int
end
|
||||||
72
JuliaStream.jl/src/ThreadedStream.jl
Normal file
72
JuliaStream.jl/src/ThreadedStream.jl
Normal file
@ -0,0 +1,72 @@
|
|||||||
|
include("Stream.jl")
|
||||||
|
|
||||||
|
"""
    devices()

Return the names of the selectable devices; this implementation only offers
the CPU.
"""
devices() = ["CPU"]
|
||||||
|
|
||||||
|
"""
    make_stream(arraysize, scalar, device, silent)

Create the benchmark state backed by three plain `Vector`s of `arraysize`
elements. `device` must be `1`; unless `silent` is set, the number of threads
is printed.

The vectors are allocated uninitialised: their contents are only defined after
`init_arrays!` has run. Converting `1:arraysize` (as done previously) made the
calling thread write every element serially before the threaded
initialisation got to touch the memory.
"""
function make_stream(
  arraysize::Int,
  scalar::T,
  device::Int,
  silent::Bool,
)::VectorData{T} where {T}
  if device != 1
    error("Only CPU device is supported")
  end
  if !silent
    println("Using max $(Threads.nthreads()) threads")
  end
  return VectorData{T}(
    Vector{T}(undef, arraysize),
    Vector{T}(undef, arraysize),
    Vector{T}(undef, arraysize),
    scalar,
    arraysize,
  )
end
|
||||||
|
|
||||||
|
"""
    init_arrays!(data, init)

Fill `a`, `b` and `c` with the three values of `init` over `1:data.size`,
using a threaded loop.
"""
function init_arrays!(data::VectorData{T}, init::Tuple{T,T,T}) where {T}
  a, b, c = data.a, data.b, data.c
  (a0, b0, c0) = init
  Threads.@threads for idx = 1:data.size
    @inbounds a[idx] = a0
    @inbounds b[idx] = b0
    @inbounds c[idx] = c0
  end
end
|
||||||
|
|
||||||
|
"""
    copy!(data)

Copy kernel: `c[i] = a[i]` over `1:data.size`, using a threaded loop.
"""
function copy!(data::VectorData{T}) where {T}
  src, dst = data.a, data.c
  Threads.@threads for idx = 1:data.size
    @inbounds dst[idx] = src[idx]
  end
end
|
||||||
|
|
||||||
|
"""
    mul!(data)

Mul kernel: `b[i] = scalar * c[i]` over `1:data.size`, using a threaded loop.
"""
function mul!(data::VectorData{T}) where {T}
  b, c = data.b, data.c
  factor = data.scalar
  Threads.@threads for idx = 1:data.size
    @inbounds b[idx] = factor * c[idx]
  end
end
|
||||||
|
|
||||||
|
"""
    add!(data)

Add kernel: `c[i] = a[i] + b[i]` over `1:data.size`, using a threaded loop.
"""
function add!(data::VectorData{T}) where {T}
  a, b, c = data.a, data.b, data.c
  Threads.@threads for idx = 1:data.size
    @inbounds c[idx] = a[idx] + b[idx]
  end
end
|
||||||
|
|
||||||
|
"""
    triad!(data)

Triad kernel: `a[i] = b[i] + scalar * c[i]` over `1:data.size`, using a
threaded loop.
"""
function triad!(data::VectorData{T}) where {T}
  a, b, c = data.a, data.b, data.c
  factor = data.scalar
  Threads.@threads for idx = 1:data.size
    @inbounds a[idx] = b[idx] + (factor * c[idx])
  end
end
|
||||||
|
|
||||||
|
"""
    nstream!(data)

Nstream kernel: `a[i] += b[i] + scalar * c[i]` over `1:data.size`, using a
threaded loop.
"""
function nstream!(data::VectorData{T}) where {T}
  a, b, c = data.a, data.b, data.c
  factor = data.scalar
  Threads.@threads for idx = 1:data.size
    @inbounds a[idx] += b[idx] + factor * c[idx]
  end
end
|
||||||
|
|
||||||
|
"""
    dot(data)

Dot product of `a` and `b` over `1:data.size`, computed in parallel.

The index range is split into `Threads.nthreads()` contiguous chunks. Each
chunk is summed into a local accumulator and stored in its own slot of
`partial`, which is then summed. Slots are indexed by chunk number, not by
`Threads.threadid()`: a slot is therefore written by exactly one loop
iteration, which stays correct however the iterations are scheduled onto
threads, and the shared array is written once per chunk instead of once per
element.
"""
function dot(data::VectorData{T}) where {T}
  nchunks = Threads.nthreads()
  partial = zeros(T, nchunks)
  Threads.@threads for chunk = 1:nchunks
    # contiguous, non-overlapping bounds that together cover 1:data.size
    lo = div((chunk - 1) * data.size, nchunks) + 1
    hi = div(chunk * data.size, nchunks)
    acc = zero(T)
    for i = lo:hi
      @inbounds acc += data.a[i] * data.b[i]
    end
    @inbounds partial[chunk] = acc
  end
  return sum(partial)
end
|
||||||
|
|
||||||
|
"""
    read_data(data)

Return `data` itself; the arrays are already ordinary `Vector`s, so nothing
has to be converted.
"""
function read_data(data::VectorData{T})::VectorData{T} where {T}
  data
end
|
||||||
|
|
||||||
|
main()
|
||||||
Loading…
Reference in New Issue
Block a user