From b3efa6af67cc7fed7cbba3b0e2dbc50f6010f99d Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Thu, 10 Jun 2021 04:20:40 +0100 Subject: [PATCH 01/17] Initial Julia implementation --- .github/workflows/main.yaml | 17 + JuliaStream.jl/.JuliaFormatter.toml | 2 + JuliaStream.jl/.gitignore | 5 + JuliaStream.jl/Manifest.toml | 411 ++++++++++++++++++++++++ JuliaStream.jl/Project.toml | 14 + JuliaStream.jl/README.md | 30 ++ JuliaStream.jl/src/AMDGPUStream.jl | 178 ++++++++++ JuliaStream.jl/src/CUDAStream.jl | 146 +++++++++ JuliaStream.jl/src/DistributedStream.jl | 84 +++++ JuliaStream.jl/src/JuliaStream.jl | 4 + JuliaStream.jl/src/PlainStream.jl | 64 ++++ JuliaStream.jl/src/Stream.jl | 292 +++++++++++++++++ JuliaStream.jl/src/StreamData.jl | 8 + JuliaStream.jl/src/ThreadedStream.jl | 72 +++++ 14 files changed, 1327 insertions(+) create mode 100644 JuliaStream.jl/.JuliaFormatter.toml create mode 100644 JuliaStream.jl/.gitignore create mode 100644 JuliaStream.jl/Manifest.toml create mode 100644 JuliaStream.jl/Project.toml create mode 100644 JuliaStream.jl/README.md create mode 100644 JuliaStream.jl/src/AMDGPUStream.jl create mode 100644 JuliaStream.jl/src/CUDAStream.jl create mode 100644 JuliaStream.jl/src/DistributedStream.jl create mode 100644 JuliaStream.jl/src/JuliaStream.jl create mode 100644 JuliaStream.jl/src/PlainStream.jl create mode 100644 JuliaStream.jl/src/Stream.jl create mode 100644 JuliaStream.jl/src/StreamData.jl create mode 100644 JuliaStream.jl/src/ThreadedStream.jl diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml index 20e1034..83bcd9d 100644 --- a/.github/workflows/main.yaml +++ b/.github/workflows/main.yaml @@ -3,6 +3,23 @@ on: [push, pull_request] jobs: + test-julia: + runs-on: ubuntu-18.04 + steps: + - uses: actions/checkout@v2 + - name: Setup project + run: cd JuliaStream.jl && julia --project -e 'import Pkg; Pkg.instantiate()' # Project.toml lives in JuliaStream.jl/, not the repo root + - name: Test run PlainStream.jl + run: cd JuliaStream.jl && julia --project src/PlainStream.jl --arraysize 100 + - name: Test run ThreadedStream.jl 
+ run: cd JuliaStream.jl && julia --threads 2 --project src/ThreadedStream.jl --arraysize 100 + - name: Test run DistributedStream.jl + run: cd JuliaStream.jl && julia -p2 --project src/DistributedStream.jl --arraysize 100 + - name: Test run CUDAStream.jl + run: cd JuliaStream.jl && julia --project src/CUDAStream.jl --list + - name: Test run AMDGPUStream.jl + run: cd JuliaStream.jl && julia --project src/AMDGPUStream.jl --list + test: runs-on: ubuntu-18.04 steps: diff --git a/JuliaStream.jl/.JuliaFormatter.toml b/JuliaStream.jl/.JuliaFormatter.toml new file mode 100644 index 0000000..ac95ddd --- /dev/null +++ b/JuliaStream.jl/.JuliaFormatter.toml @@ -0,0 +1,2 @@ +indent = 2 +margin = 100 \ No newline at end of file diff --git a/JuliaStream.jl/.gitignore b/JuliaStream.jl/.gitignore new file mode 100644 index 0000000..12b143b --- /dev/null +++ b/JuliaStream.jl/.gitignore @@ -0,0 +1,5 @@ +*.jl.cov +*.jl.*.cov +*.jl.mem +/docs/build/ +/docs/Manifest.toml \ No newline at end of file diff --git a/JuliaStream.jl/Manifest.toml b/JuliaStream.jl/Manifest.toml new file mode 100644 index 0000000..c60d77f --- /dev/null +++ b/JuliaStream.jl/Manifest.toml @@ -0,0 +1,411 @@ +# This file is machine-generated - editing it directly is not advised + +[[AMDGPU]] +deps = ["AbstractFFTs", "Adapt", "BinaryProvider", "CEnum", "GPUArrays", "GPUCompiler", "LLVM", "Libdl", "LinearAlgebra", "MacroTools", "Printf", "Random", "Requires", "Setfield", "hsa_rocr_jll", "hsakmt_roct_jll"] +git-tree-sha1 = "04fdb3923ac6f55fa7347dce0f0f6f10e321e2e9" +uuid = "21141c5a-9bdb-4563-92ae-f87d6854732e" +version = "0.2.7" + +[[AbstractFFTs]] +deps = ["LinearAlgebra"] +git-tree-sha1 = "485ee0867925449198280d4af84bdb46a2a404d0" +uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c" +version = "1.0.1" + +[[Adapt]] +deps = ["LinearAlgebra"] +git-tree-sha1 = "84918055d15b3114ede17ac6a7182f68870c16f7" +uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" +version = "3.3.1" + +[[ArgParse]] +deps = ["Logging", "TextWrap"] +git-tree-sha1 = "3102bce13da501c9104df33549f511cd25264d7d" +uuid = 
"c7e460c6-2fb9-53a9-8c5b-16f535851c63" +version = "1.1.4" + +[[ArgTools]] +uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f" + +[[Artifacts]] +uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" + +[[BFloat16s]] +deps = ["LinearAlgebra", "Test"] +git-tree-sha1 = "4af69e205efc343068dc8722b8dfec1ade89254a" +uuid = "ab4f0b2a-ad5b-11e8-123f-65d77653426b" +version = "0.1.0" + +[[Base64]] +uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" + +[[BinaryProvider]] +deps = ["Libdl", "Logging", "SHA"] +git-tree-sha1 = "ecdec412a9abc8db54c0efc5548c64dfce072058" +uuid = "b99e7846-7c00-51b0-8f62-c81ae34c0232" +version = "0.5.10" + +[[Bzip2_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "c3598e525718abcc440f69cc6d5f60dda0a1b61e" +uuid = "6e34b625-4abd-537c-b88f-471c36dfa7a0" +version = "1.0.6+5" + +[[CEnum]] +git-tree-sha1 = "215a9aa4a1f23fbd05b92769fdd62559488d70e9" +uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82" +version = "0.4.1" + +[[CUDA]] +deps = ["AbstractFFTs", "Adapt", "BFloat16s", "CEnum", "CompilerSupportLibraries_jll", "DataStructures", "ExprTools", "GPUArrays", "GPUCompiler", "LLVM", "LazyArtifacts", "Libdl", "LinearAlgebra", "Logging", "MacroTools", "Memoize", "Printf", "Random", "Random123", "RandomNumbers", "Reexport", "Requires", "SparseArrays", "SpecialFunctions", "TimerOutputs"] +git-tree-sha1 = "364179416eabc34c9ca32126a6bdb431680c3bad" +uuid = "052768ef-5323-5732-b1bb-66c8b64840ba" +version = "3.2.1" + +[[ChainRulesCore]] +deps = ["Compat", "LinearAlgebra", "SparseArrays"] +git-tree-sha1 = "8b31cc69cbc38c5c826aaa1c890c694be3622d99" +uuid = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" +version = "0.10.3" + +[[Compat]] +deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "SHA", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"] +git-tree-sha1 = "e4e2b39db08f967cc1360951f01e8a75ec441cab" 
+uuid = "34da2185-b29b-5c13-b0c7-acf172513d20" +version = "3.30.0" + +[[CompilerSupportLibraries_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae" + +[[ConstructionBase]] +deps = ["LinearAlgebra"] +git-tree-sha1 = "1dc43957fb9a1574fa1b7a449e101bd1fd3a9fb7" +uuid = "187b0558-2788-49d3-abe0-74a17ed4e7c9" +version = "1.2.1" + +[[DataStructures]] +deps = ["Compat", "InteractiveUtils", "OrderedCollections"] +git-tree-sha1 = "4437b64df1e0adccc3e5d1adbc3ac741095e4677" +uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" +version = "0.18.9" + +[[Dates]] +deps = ["Printf"] +uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" + +[[DelimitedFiles]] +deps = ["Mmap"] +uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab" + +[[Distributed]] +deps = ["Random", "Serialization", "Sockets"] +uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b" + +[[DocStringExtensions]] +deps = ["LibGit2"] +git-tree-sha1 = "a32185f5428d3986f47c2ab78b1f216d5e6cc96f" +uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae" +version = "0.8.5" + +[[Downloads]] +deps = ["ArgTools", "LibCURL", "NetworkOptions"] +uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6" + +[[Elfutils_jll]] +deps = ["Artifacts", "Bzip2_jll", "JLLWrappers", "Libdl", "Pkg", "XZ_jll", "Zlib_jll", "argp_standalone_jll", "fts_jll", "obstack_jll"] +git-tree-sha1 = "76cbf1134983cfb371ad77117bb2659600ed64d6" +uuid = "ab5a07f8-06af-567f-a878-e8bb879eba5a" +version = "0.179.0+0" + +[[ExprTools]] +git-tree-sha1 = "10407a39b87f29d47ebaca8edbc75d7c302ff93e" +uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04" +version = "0.1.3" + +[[Future]] +deps = ["Random"] +uuid = "9fa8497b-333b-5362-9e8d-4d0656e87820" + +[[GPUArrays]] +deps = ["AbstractFFTs", "Adapt", "LinearAlgebra", "Printf", "Random", "Serialization", "Statistics"] +git-tree-sha1 = "df5b8569904c5c10e84c640984cfff054b18c086" +uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" +version = "6.4.1" + +[[GPUCompiler]] +deps = ["DataStructures", "ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", 
"Scratch", "Serialization", "TimerOutputs", "UUIDs"] +git-tree-sha1 = "42d635f6d87af125b86288df3819f805fb4d851a" +uuid = "61eb1bfa-7361-4325-ad38-22787b887f55" +version = "0.11.5" + +[[InteractiveUtils]] +deps = ["Markdown"] +uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" + +[[JLLWrappers]] +deps = ["Preferences"] +git-tree-sha1 = "642a199af8b68253517b80bd3bfd17eb4e84df6e" +uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210" +version = "1.3.0" + +[[LLVM]] +deps = ["CEnum", "Libdl", "Printf", "Unicode"] +git-tree-sha1 = "b499c68a45249b0385585c62f4a9b62b5db8e691" +uuid = "929cbde3-209d-540e-8aea-75f648917ca0" +version = "3.7.1" + +[[LazyArtifacts]] +deps = ["Artifacts", "Pkg"] +uuid = "4af54fe1-eca0-43a8-85a7-787d91b784e3" + +[[LibCURL]] +deps = ["LibCURL_jll", "MozillaCACerts_jll"] +uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21" + +[[LibCURL_jll]] +deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"] +uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0" + +[[LibGit2]] +deps = ["Base64", "NetworkOptions", "Printf", "SHA"] +uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" + +[[LibSSH2_jll]] +deps = ["Artifacts", "Libdl", "MbedTLS_jll"] +uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8" + +[[Libdl]] +uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" + +[[LinearAlgebra]] +deps = ["Libdl"] +uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" + +[[LogExpFunctions]] +deps = ["DocStringExtensions", "LinearAlgebra"] +git-tree-sha1 = "1ba664552f1ef15325e68dc4c05c3ef8c2d5d885" +uuid = "2ab3a3ac-af41-5b50-aa03-7779005ae688" +version = "0.2.4" + +[[Logging]] +uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" + +[[MacroTools]] +deps = ["Markdown", "Random"] +git-tree-sha1 = "6a8a2a625ab0dea913aba95c11370589e0239ff0" +uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" +version = "0.5.6" + +[[Markdown]] +deps = ["Base64"] +uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" + +[[MbedTLS_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1" + +[[Memoize]] +deps = 
["MacroTools"] +git-tree-sha1 = "2b1dfcba103de714d31c033b5dacc2e4a12c7caa" +uuid = "c03570c3-d221-55d1-a50c-7939bbd78826" +version = "0.4.4" + +[[Mmap]] +uuid = "a63ad114-7e13-5084-954f-fe012c677804" + +[[MozillaCACerts_jll]] +uuid = "14a3606d-f60d-562e-9121-12d972cd8159" + +[[NUMA_jll]] +deps = ["Libdl", "Pkg"] +git-tree-sha1 = "778f9bd14400cff2c32ed357e12766ac0e3d766e" +uuid = "7f51dc2b-bb24-59f8-b771-bb1490e4195d" +version = "2.0.13+1" + +[[NetworkOptions]] +uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908" + +[[OpenSpecFun_jll]] +deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "13652491f6856acfd2db29360e1bbcd4565d04f1" +uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e" +version = "0.5.5+0" + +[[OrderedCollections]] +git-tree-sha1 = "85f8e6578bf1f9ee0d11e7bb1b1456435479d47c" +uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" +version = "1.4.1" + +[[Parameters]] +deps = ["OrderedCollections", "UnPack"] +git-tree-sha1 = "2276ac65f1e236e0a6ea70baff3f62ad4c625345" +uuid = "d96e819e-fc66-5662-9728-84c9c7592b0a" +version = "0.12.2" + +[[Pkg]] +deps = ["Artifacts", "Dates", "Downloads", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"] +uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" + +[[Preferences]] +deps = ["TOML"] +git-tree-sha1 = "00cfd92944ca9c760982747e9a1d0d5d86ab1e5a" +uuid = "21216c6a-2e73-6563-6e65-726566657250" +version = "1.2.2" + +[[Printf]] +deps = ["Unicode"] +uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" + +[[REPL]] +deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"] +uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" + +[[Random]] +deps = ["Serialization"] +uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" + +[[Random123]] +deps = ["Libdl", "Random", "RandomNumbers"] +git-tree-sha1 = "7c6710c8198fd4444b5eb6a3840b7d47bd3593c5" +uuid = "74087812-796a-5b5d-8853-05524746bad3" +version = "1.3.1" + +[[RandomNumbers]] +deps = ["Random", 
"Requires"] +git-tree-sha1 = "441e6fc35597524ada7f85e13df1f4e10137d16f" +uuid = "e6cf234a-135c-5ec9-84dd-332b85af5143" +version = "1.4.0" + +[[Reexport]] +git-tree-sha1 = "5f6c21241f0f655da3952fd60aa18477cf96c220" +uuid = "189a3867-3050-52da-a836-e630ba90ab69" +version = "1.1.0" + +[[Requires]] +deps = ["UUIDs"] +git-tree-sha1 = "4036a3bd08ac7e968e27c203d45f5fff15020621" +uuid = "ae029012-a4dd-5104-9daa-d747884805df" +version = "1.1.3" + +[[SHA]] +uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" + +[[Scratch]] +deps = ["Dates"] +git-tree-sha1 = "0b4b7f1393cff97c33891da2a0bf69c6ed241fda" +uuid = "6c6a2e73-6563-6170-7368-637461726353" +version = "1.1.0" + +[[Serialization]] +uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" + +[[Setfield]] +deps = ["ConstructionBase", "Future", "MacroTools", "Requires"] +git-tree-sha1 = "d5640fc570fb1b6c54512f0bd3853866bd298b3e" +uuid = "efcf1570-3423-57d1-acb7-fd33fddbac46" +version = "0.7.0" + +[[SharedArrays]] +deps = ["Distributed", "Mmap", "Random", "Serialization"] +uuid = "1a1011a3-84de-559e-8e89-a11a2f7dc383" + +[[Sockets]] +uuid = "6462fe0b-24de-5631-8697-dd941f90decc" + +[[SparseArrays]] +deps = ["LinearAlgebra", "Random"] +uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" + +[[SpecialFunctions]] +deps = ["ChainRulesCore", "LogExpFunctions", "OpenSpecFun_jll"] +git-tree-sha1 = "a50550fa3164a8c46747e62063b4d774ac1bcf49" +uuid = "276daf66-3868-5448-9aa4-cd146d93841b" +version = "1.5.1" + +[[Statistics]] +deps = ["LinearAlgebra", "SparseArrays"] +uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" + +[[TOML]] +deps = ["Dates"] +uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76" + +[[Tar]] +deps = ["ArgTools", "SHA"] +uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e" + +[[Test]] +deps = ["InteractiveUtils", "Logging", "Random", "Serialization"] +uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" + +[[TextWrap]] +git-tree-sha1 = "9250ef9b01b66667380cf3275b3f7488d0e25faf" +uuid = "b718987f-49a8-5099-9789-dcd902bef87d" +version = "1.0.1" + +[[TimerOutputs]] 
+deps = ["ExprTools", "Printf"] +git-tree-sha1 = "bf8aacc899a1bd16522d0350e1e2310510d77236" +uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" +version = "0.5.9" + +[[UUIDs]] +deps = ["Random", "SHA"] +uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" + +[[UnPack]] +git-tree-sha1 = "387c1f73762231e86e0c9c5443ce3b4a0a9a0c2b" +uuid = "3a884ed6-31ef-47d7-9d2a-63182c4928ed" +version = "1.0.2" + +[[Unicode]] +uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" + +[[XZ_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "9f76853ea2ba894054e24640abfb73d73e5a4cb5" +uuid = "ffd25f8a-64ca-5728-b0f7-c24cf3aae800" +version = "5.2.5+0" + +[[Zlib_jll]] +deps = ["Libdl"] +uuid = "83775a58-1f1d-513f-b197-d71354ab007a" + +[[argp_standalone_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "c4fa3457046fc93249b63e8319e743b6c8590609" +uuid = "c53206cc-00f7-50bf-ad1e-3ae1f6e49bc3" +version = "1.3.0+0" + +[[fts_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "78732b942383d2cb521df8a1a0814911144e663d" +uuid = "d65627f6-89bd-53e8-8ab5-8b75ff535eee" +version = "1.2.7+1" + +[[hsa_rocr_jll]] +deps = ["Artifacts", "Elfutils_jll", "JLLWrappers", "Libdl", "NUMA_jll", "Pkg", "Zlib_jll", "hsakmt_roct_jll"] +git-tree-sha1 = "42189f176d6ae4f37c0c0e652fec339bb0bfab5d" +uuid = "dd59ff1a-a01a-568d-8b29-0669330f116a" +version = "3.7.0+1" + +[[hsakmt_roct_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "NUMA_jll", "Pkg"] +git-tree-sha1 = "8a9ee6c091e952e4ea6585d15131d43f789ae041" +uuid = "1cecccd7-a9b6-5045-9cdc-a44c19b16d76" +version = "3.8.0+0" + +[[nghttp2_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d" + +[[obstack_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "1c4a6b66e934fc6db4649cb2910c72f53bbfea7e" +uuid = "c88a4935-d25e-5644-aacc-5db6f1b8ef79" +version = "1.2.2+0" + +[[p7zip_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0" diff 
--git a/JuliaStream.jl/Project.toml b/JuliaStream.jl/Project.toml new file mode 100644 index 0000000..0afa7d0 --- /dev/null +++ b/JuliaStream.jl/Project.toml @@ -0,0 +1,14 @@ +name = "JuliaStream" +uuid = "1bdcc9b7-f5ed-4705-bc7b-be1b748ec681" +authors = ["Wei-Chen Lin "] +version = "3.4.0" + +[deps] +AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" +ArgParse = "c7e460c6-2fb9-53a9-8c5b-16f535851c63" +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" +Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b" +Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a" + +[compat] +julia = "1.6" diff --git a/JuliaStream.jl/README.md b/JuliaStream.jl/README.md new file mode 100644 index 0000000..7d94a34 --- /dev/null +++ b/JuliaStream.jl/README.md @@ -0,0 +1,30 @@ +JuliaStream.jl +============== + +This is an implementation of BabelStream in Julia which contains the following variants: + + * `PlainStream.jl` - Single threaded `for` + * `ThreadedStream.jl` - Threaded implementation with `Threads.@threads` macros + * `DistributedStream.jl` - Process based parallelism with `@distributed` macros + * `CUDAStream.jl` - Direct port of BabelStream's native CUDA implementation using [CUDA.jl](https://github.com/JuliaGPU/CUDA.jl) + * `AMDGPUStream.jl` - Direct port of BabelStream's native HIP implementation using [AMDGPU.jl](https://github.com/JuliaGPU/AMDGPU.jl) + +### Build & Run + +Prerequisites + + * Julia 1.6+ + +With Julia on path, run the benchmark with: + +```shell +> cd JuliaStream.jl +> julia --project -e 'import Pkg; Pkg.instantiate()' # only required on first run +> julia --project src/Stream.jl +``` + +**Important:** + * Julia is 1-indexed, so N >= 1 in `--device N` + * Thread count for `ThreadedStream` must be set via the `JULIA_NUM_THREADS` environment variable (e.g. `export JULIA_NUM_THREADS=$(nproc)`), otherwise it defaults to 1 + * You must *prepend* the number of processes needed for `DistributedStream`, e.g. `julia -p$(nproc) --project src/DistributedStream.jl` + * Certain 
implementations such as CUDA and AMDGPU will do hardware detection at runtime and may download and/or compile further software packages for the platform. diff --git a/JuliaStream.jl/src/AMDGPUStream.jl b/JuliaStream.jl/src/AMDGPUStream.jl new file mode 100644 index 0000000..80f69b4 --- /dev/null +++ b/JuliaStream.jl/src/AMDGPUStream.jl @@ -0,0 +1,178 @@ +# AMDGPU.jl doesn't support CPU agents, so this isn't a feature-complete ROCmStream, only AMD GPUs +include("Stream.jl") +using AMDGPU + +const ROCData = StreamData{T,ROCArray{T}} where {T} +const TBSize = 1024::Int +const DotBlocks = 256::Int + +# AMDGPU.agents()'s internal iteration order isn't stable +function gpu_agents_in_repr_order() + # XXX if we select anything other than :gpu, we get + # HSA_STATUS_ERROR_INVALID_AGENT on the first kernel submission + sort(AMDGPU.get_agents(:gpu), by = repr) +end + +function devices() + try + map(repr, gpu_agents_in_repr_order()) + catch + # probably unsupported + [] + end +end + +function gridsize(data::ROCData{T})::Int where {T} + return data.size +end + +function make_stream( + arraysize::Int, + scalar::T, + device::Int, + silent::Bool, +)::ROCData{T} where {T} + + if arraysize % TBSize != 0 + error("arraysize ($(arraysize)) must be divisible by $(TBSize)!") + end + + # XXX AMDGPU doesn't expose an API for setting the default like CUDA.device!() + # but AMDGPU.get_default_agent returns DEFAULT_AGENT so we can do it by hand + AMDGPU.DEFAULT_AGENT[] = gpu_agents_in_repr_order()[device] + + data = ROCData{T}( + ROCArray{T}(undef, arraysize), + ROCArray{T}(undef, arraysize), + ROCArray{T}(undef, arraysize), + scalar, + arraysize, + ) + selected = AMDGPU.get_default_agent() + if !silent + println("Using GPU HSA device: $(AMDGPU.get_name(selected)) ($(repr(selected)))") + println("Kernel parameters : <<<$(gridsize(data)),$(TBSize)>>>") + end + return data +end + +function hard_wait(kernel) + # soft wait causes HSA_REFCOUNT overflow issues + AMDGPU.wait(kernel, soft = false) 
+end + +function init_arrays!(data::ROCData{T}, init::Tuple{T,T,T}) where {T} + AMDGPU.fill!(data.a, init[1]) + AMDGPU.fill!(data.b, init[2]) + AMDGPU.fill!(data.c, init[3]) +end + +function copy!(data::ROCData{T}) where {T} + function kernel(a, c) + i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x + @inbounds c[i] = a[i] + return + end + hard_wait(@roc groupsize = TBSize gridsize = gridsize(data) kernel(data.a, data.c)) +end + +function mul!(data::ROCData{T}) where {T} + function kernel(b, c, scalar) + i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x + @inbounds b[i] = scalar * c[i] + return + end + hard_wait( + @roc groupsize = TBSize gridsize = gridsize(data) kernel(data.b, data.c, data.scalar) + ) +end + +function add!(data::ROCData{T}) where {T} + function kernel(a, b, c) + i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x + @inbounds c[i] = a[i] + b[i] + return + end + hard_wait( + @roc groupsize = TBSize gridsize = gridsize(data) kernel(data.a, data.b, data.c) + ) +end + +function triad!(data::ROCData{T}) where {T} + function kernel(a, b, c, scalar) + i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x + @inbounds a[i] = b[i] + (scalar * c[i]) + return + end + hard_wait( + @roc groupsize = TBSize gridsize = gridsize(data) kernel( + data.a, + data.b, + data.c, + data.scalar, + ) + ) +end + +function nstream!(data::ROCData{T}) where {T} + function kernel(a, b, c, scalar) + i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x + @inbounds a[i] += b[i] + scalar * c[i] + return + end + hard_wait( + @roc groupsize = TBSize gridsize = gridsize(data) kernel( + data.a, + data.b, + data.c, + data.scalar, + ) + ) +end + +function dot(data::ROCData{T}) where {T} + function kernel(a, b, size, partial) + tb_sum = ROCDeviceArray((TBSize,), alloc_local(:reduce, T, TBSize)) + local_i = workitemIdx().x + @inbounds tb_sum[local_i] = 0.0 + + # do dot first + i = (workgroupIdx().x - 1) * workgroupDim().x + 
workitemIdx().x + while i <= size + @inbounds tb_sum[local_i] += a[i] * b[i] + i += TBSize * DotBlocks # XXX don't use (workgroupDim().x * gridDimWG().x) here + end + + # then tree reduction + offset = workgroupDim().x ÷ 2 + while offset > 0 + sync_workgroup() + if (local_i - 1) < offset + @inbounds tb_sum[local_i] += tb_sum[local_i+offset] + end + offset ÷= 2 + end + + if (local_i == 1) + @inbounds partial[workgroupIdx().x] = tb_sum[local_i] + end + + return + end + partial_sum = ROCArray{T}(undef, DotBlocks) + hard_wait( + @roc groupsize = TBSize gridsize = TBSize * DotBlocks kernel( + data.a, + data.b, + data.size, + partial_sum, + ) + ) + return sum(partial_sum) +end + +function read_data(data::ROCData{T})::VectorData{T} where {T} + return VectorData{T}(data.a, data.b, data.c, data.scalar, data.size) +end + +main() \ No newline at end of file diff --git a/JuliaStream.jl/src/CUDAStream.jl b/JuliaStream.jl/src/CUDAStream.jl new file mode 100644 index 0000000..c4d0510 --- /dev/null +++ b/JuliaStream.jl/src/CUDAStream.jl @@ -0,0 +1,146 @@ +include("Stream.jl") +using CUDA + +const CuData = StreamData{T,CuArray{T}} where {T} +const TBSize = 1024::Int +const DotBlocks = 256::Int + +function devices() + return !CUDA.functional(false) ? 
[] : + map(d -> "$(CUDA.name(d)) ($(repr(d)))", CUDA.devices()) +end + +function blocks(data::CuData{T})::Int where {T} + return data.size ÷ TBSize +end + +function make_stream( + arraysize::Int, + scalar::T, + device::Int, + silent::Bool, +)::CuData{T} where {T} + + if arraysize % TBSize != 0 + error("arraysize ($(arraysize)) must be divisible by $(TBSize)!") + end + + # so CUDA's device is 0 indexed, so -1 from Julia + CUDA.device!(device - 1) + selected = CUDA.device() + # show_reason is set to true here so it dumps CUDA info + # for us regardless of whether it's functional + if !CUDA.functional(true) + error("Non-functional CUDA configuration") + end + data = CuData{T}( + CuArray{T}(undef, arraysize), + CuArray{T}(undef, arraysize), + CuArray{T}(undef, arraysize), + scalar, + arraysize, + ) + if !silent + println("Using CUDA device: $(CUDA.name(selected)) ($(repr(selected)))") + println("Kernel parameters: <<<$(blocks(data)),$(TBSize)>>>") + end + return data +end + +function init_arrays!(data::CuData{T}, init::Tuple{T,T,T}) where {T} + CUDA.fill!(data.a, init[1]) + CUDA.fill!(data.b, init[2]) + CUDA.fill!(data.c, init[3]) +end + +function copy!(data::CuData{T}) where {T} + function kernel(a, c) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + @inbounds c[i] = a[i] + return + end + @cuda blocks = blocks(data) threads = TBSize kernel(data.a, data.c) + CUDA.synchronize() +end + +function mul!(data::CuData{T}) where {T} + function kernel(b, c, scalar) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + @inbounds b[i] = scalar * c[i] + return + end + @cuda blocks = blocks(data) threads = TBSize kernel(data.b, data.c, data.scalar) + CUDA.synchronize() +end + +function add!(data::CuData{T}) where {T} + function kernel(a, b, c) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + @inbounds c[i] = a[i] + b[i] + return + end + @cuda blocks = blocks(data) threads = TBSize kernel(data.a, data.b, data.c) + CUDA.synchronize() +end + +function 
triad!(data::CuData{T}) where {T} + function kernel(a, b, c, scalar) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + @inbounds a[i] = b[i] + (scalar * c[i]) + return + end + @cuda blocks = blocks(data) threads = TBSize kernel(data.a, data.b, data.c, data.scalar) + CUDA.synchronize() +end + +function nstream!(data::CuData{T}) where {T} + function kernel(a, b, c, scalar) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + @inbounds a[i] += b[i] + scalar * c[i] + return + end + @cuda blocks = blocks(data) threads = TBSize kernel(data.a, data.b, data.c, data.scalar) + CUDA.synchronize() +end + +function dot(data::CuData{T}) where {T} + # direct port of the reduction in CUDAStream.cu + function kernel(a, b, size, partial) + tb_sum = @cuStaticSharedMem(T, TBSize) + local_i = threadIdx().x + @inbounds tb_sum[local_i] = 0.0 + + # do dot first + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + while i <= size + @inbounds tb_sum[local_i] += a[i] * b[i] + i += blockDim().x * gridDim().x + end + + # then tree reduction + offset = blockDim().x ÷ 2 + while offset > 0 + sync_threads() + if (local_i - 1) < offset + @inbounds tb_sum[local_i] += tb_sum[local_i+offset] + end + offset ÷= 2 + end + + if (local_i == 1) + @inbounds partial[blockIdx().x] = tb_sum[local_i] + end + + return + end + partial_sum = CuArray{T}(undef, DotBlocks) + @cuda blocks = DotBlocks threads = TBSize kernel(data.a, data.b, data.size, partial_sum) + CUDA.synchronize() + return sum(partial_sum) +end + +function read_data(data::CuData{T})::VectorData{T} where {T} + return VectorData{T}(data.a, data.b, data.c, data.scalar, data.size) +end + +main() \ No newline at end of file diff --git a/JuliaStream.jl/src/DistributedStream.jl b/JuliaStream.jl/src/DistributedStream.jl new file mode 100644 index 0000000..970c699 --- /dev/null +++ b/JuliaStream.jl/src/DistributedStream.jl @@ -0,0 +1,84 @@ +using Distributed + +include("Stream.jl") + +@everywhere include("StreamData.jl") +@everywhere using 
SharedArrays +@everywhere const SharedArrayData = StreamData{T,SharedArray{T}} where {T} + +function devices() + return ["CPU (localhost)"] +end + +function make_stream( + arraysize::Int, + scalar::T, + device::Int, + silent::Bool, +)::SharedArrayData{T} where {T} + if device != 1 + error("Only CPU device is supported") + end + + if !silent + println("Using max $(nworkers()) process(es) + 1 master") + end + return SharedArrayData{T}( + SharedArray{T}(arraysize), + SharedArray{T}(arraysize), + SharedArray{T}(arraysize), + scalar, + arraysize, + ) +end + +function init_arrays!(data::SharedArrayData{T}, init::Tuple{T,T,T}) where {T} + + @sync @distributed for i = 1:data.size + @inbounds data.a[i] = init[1] + @inbounds data.b[i] = init[2] + @inbounds data.c[i] = init[3] + end +end + +function copy!(data::SharedArrayData{T}) where {T} + @sync @distributed for i = 1:data.size + @inbounds data.c[i] = data.a[i] + end +end + +function mul!(data::SharedArrayData{T}) where {T} + @sync @distributed for i = 1:data.size + @inbounds data.b[i] = data.scalar * data.c[i] + end +end + +function add!(data::SharedArrayData{T}) where {T} + @sync @distributed for i = 1:data.size + @inbounds data.c[i] = data.a[i] + data.b[i] + end +end + +function triad!(data::SharedArrayData{T}) where {T} + @sync @distributed for i = 1:data.size + @inbounds data.a[i] = data.b[i] + (data.scalar * data.c[i]) + end +end + +function nstream!(data::SharedArrayData{T}) where {T} + @sync @distributed for i = 1:data.size + @inbounds data.a[i] += data.b[i] + data.scalar * data.c[i] + end +end + +function dot(data::SharedArrayData{T}) where {T} + return @distributed (+) for i = 1:data.size + @inbounds data.a[i] * data.b[i] + end +end + +function read_data(data::SharedArrayData{T})::VectorData{T} where {T} + return VectorData{T}(data.a, data.b, data.c, data.scalar, data.size) +end + +main() \ No newline at end of file diff --git a/JuliaStream.jl/src/JuliaStream.jl b/JuliaStream.jl/src/JuliaStream.jl new file mode 
100644 index 0000000..e01d46d --- /dev/null +++ b/JuliaStream.jl/src/JuliaStream.jl @@ -0,0 +1,4 @@ +module JuliaStream +end + +println("Please run benchmarks directly via `julia --project src/Stream.jl`") \ No newline at end of file diff --git a/JuliaStream.jl/src/PlainStream.jl b/JuliaStream.jl/src/PlainStream.jl new file mode 100644 index 0000000..259a9b7 --- /dev/null +++ b/JuliaStream.jl/src/PlainStream.jl @@ -0,0 +1,64 @@ +include("Stream.jl") + +function devices() + return ["CPU"] +end + +function make_stream(arraysize::Int, scalar::T, device::Int, silent::Bool)::VectorData{T} where {T} + if device != 1 + error("Only CPU device is supported") + end + return VectorData{T}(1:arraysize, 1:arraysize, 1:arraysize, scalar, arraysize) +end + +function init_arrays!(data::VectorData{T}, init::Tuple{T,T,T}) where {T} + for i = 1:data.size + @inbounds data.a[i] = init[1] + @inbounds data.b[i] = init[2] + @inbounds data.c[i] = init[3] + end +end + +function copy!(data::VectorData{T}) where {T} + for i = 1:data.size + @inbounds data.c[i] = data.a[i] + end +end + +function mul!(data::VectorData{T}) where {T} + for i = 1:data.size + @inbounds data.b[i] = data.scalar * data.c[i] + end +end + +function add!(data::VectorData{T}) where {T} + for i = 1:data.size + @inbounds data.c[i] = data.a[i] + data.b[i] + end +end + +function triad!(data::VectorData{T}) where {T} + for i = 1:data.size + @inbounds data.a[i] = data.b[i] + (data.scalar * data.c[i]) + end +end + +function nstream!(data::VectorData{T}) where {T} + for i = 1:data.size + @inbounds data.a[i] += data.b[i] + data.scalar * data.c[i] + end +end + +function dot(data::VectorData{T}) where {T} + sum = zero(T) + for i = 1:data.size + @inbounds sum += data.a[i] * data.b[i] + end + return sum +end + +function read_data(data::VectorData{T})::VectorData{T} where {T} + return data +end + +main() \ No newline at end of file diff --git a/JuliaStream.jl/src/Stream.jl b/JuliaStream.jl/src/Stream.jl new file mode 100644 index 
0000000..cce846b --- /dev/null +++ b/JuliaStream.jl/src/Stream.jl @@ -0,0 +1,292 @@ +using ArgParse +using Parameters +using Printf +using Base: Float64, Int + +include("StreamData.jl") + +const VectorData = StreamData{T,Vector{T}} where {T} + +struct Timings + copy::Vector{Float64} + mul::Vector{Float64} + add::Vector{Float64} + triad::Vector{Float64} + dot::Vector{Float64} + Timings(n) = new(zeros(n), zeros(n), zeros(n), zeros(n), zeros(n)) +end + +@enum Benchmark All Triad Nstream + +function run_all!(data::StreamData{T,C}, times::Int)::Tuple{Timings,T} where {T,C} + timings = Timings(times) + lastSum::T = 0 + for i = 1:times + @inbounds timings.copy[i] = @elapsed copy!(data) + @inbounds timings.mul[i] = @elapsed mul!(data) + @inbounds timings.add[i] = @elapsed add!(data) + @inbounds timings.triad[i] = @elapsed triad!(data) + @inbounds timings.dot[i] = @elapsed lastSum = dot(data) + end + return (timings, lastSum) +end + +function run_triad!(data::StreamData{T,C}, times::Int)::Float64 where {T,C} + return @elapsed for _ = 1:times + triad!(data) + end +end + +function run_nstream!(data::StreamData{T,C}, times::Int)::Vector{Float64} where {T,C} + timings::Vector{Float64} = zeros(times) + for i = 1:times + @inbounds timings[i] = @elapsed nstream!(data) + end + return timings +end + +function check_solutions( + data::StreamData{T,C}, + times::Int, + init::Tuple{T,T,T}, + benchmark::Benchmark, + dot::Union{T,Nothing}, +) where {T,C} + (gold_a, gold_b, gold_c) = init # replay the kernels on scalars to obtain the expected per-element values + for _ = 1:times + if benchmark == All + gold_c = gold_a + gold_b = data.scalar * gold_c + gold_c = gold_a + gold_b + gold_a = gold_b + data.scalar * gold_c + elseif benchmark == Triad + gold_a = gold_b + data.scalar * gold_c + elseif benchmark == Nstream + gold_a += gold_b + data.scalar * gold_c + else + error("Unknown benchmark: ", benchmark) # must resolve to Base.error, so no local in this function may be named `error` + end + end + + tolerance = eps(T) * 100 + function validate_xs(name::String, xs::AbstractArray{T}, from::T) + avg_error = (map(x -> abs(x - from), xs) |> sum) / 
length(xs) # mean absolute deviation from the expected value + failed = avg_error > tolerance + if failed + println("Validation failed on $name. Average error $avg_error") + end + !failed + end + a_valid = validate_xs("a", data.a, gold_a) + b_valid = validate_xs("b", data.b, gold_b) + c_valid = validate_xs("c", data.c, gold_c) + dot_valid = + dot !== nothing ? + begin + gold_sum = gold_a * gold_b * data.size + sum_error = abs((dot - gold_sum) / gold_sum) # relative error; distinct names keep these locals from being shared with validate_xs + sum_failed = sum_error > 1.0e-8 + if sum_failed + println( + "Validation failed on sum. Error $sum_error \nSum was $dot but should be $gold_sum", + ) + end + !sum_failed + end : true + + a_valid && b_valid && c_valid && dot_valid +end + +@with_kw mutable struct Config + list::Bool = false + impl::String = "threaded" + device::Int = 1 + numtimes::Int = 100 + arraysize::Int = 33554432 + float::Bool = false + triad_only::Bool = false + nstream_only::Bool = false + csv::Bool = false + mibibytes::Bool = false +end + +function parse_options(given::Config) + s = ArgParseSettings() + @add_arg_table s begin + "--list" + help = "List available devices" + action = :store_true + "--device", "-d" + help = "Select device at DEVICE, NOTE: Julia is 1-indexed" + arg_type = Int + default = given.device + "--numtimes", "-n" + help = "Run the test NUMTIMES times (NUM >= 2)" + arg_type = Int + default = given.numtimes + "--arraysize", "-s" + help = "Use ARRAYSIZE elements in the array" + arg_type = Int + default = given.arraysize + "--float" + help = "Use floats (rather than doubles)" + action = :store_true + "--triad_only" + help = "Only run triad" + action = :store_true + "--nstream_only" + help = "Only run nstream" + action = :store_true + "--csv" + help = "Output as csv table" + action = :store_true + "--mibibytes" + help = "Use MiB=2^20 for bandwidth calculation (default MB=10^6)" + action = :store_true + end + args = parse_args(s) + # surely there's a better way than doing this: + for (arg, val) in args + setproperty!(given, Symbol(arg), val) + end +end + +const DefaultInit = (0.1, 0.2, 0.0) +const 
DefaultScalar = 0.4 +const Version = "3.4.0" + +function main() + + config::Config = Config() + parse_options(config) + + if config.list + ds = devices() + for (i, device) in enumerate(ds) + println("[$i] $(device)") + end + exit(0) + end + + ds = devices() + if config.device < 1 || config.device > length(ds) + error( + "Device $(config.device) out of range (1..$(length(ds))), NOTE: Julia is 1-indexed", + ) + end + + if config.float + type = Float32 + else + type = Float64 + end + + if config.nstream_only && !config.triad_only + benchmark = Nstream + elseif !config.nstream_only && config.triad_only + benchmark = Triad + elseif !config.nstream_only && !config.triad_only + benchmark = All + elseif config.nstream_only && config.triad_only + error("Both triad and nstream are enabled, pick one or omit both to run all benchmarks") + else + error("Invalid config: $(repr(config))") + end + + array_bytes = config.arraysize * sizeof(type) + total_bytes = array_bytes * 3 + (mega_scale, mega_suffix, giga_scale, giga_suffix) = + !config.mibibytes ? (1.0e-6, "MB", 1.0e-9, "GB") : (2^-20, "MiB", 2^-30, "GiB") + + if !config.csv + println("""BabelStream + Version: $Version + Implementation: Julia; $(config.impl)""") + println("Running kernels $(config.numtimes) times") + if benchmark == Triad + println("Number of elements: $(config.arraysize)") + end + println("Precision: $(config.float ? 
"float" : "double")") + r1 = n -> round(n; digits = 1) + println( + "Array size: $(r1(mega_scale * array_bytes)) $mega_suffix(=$(r1(giga_scale * array_bytes)) $giga_suffix)", + ) + println( + "Total size: $(r1(mega_scale * total_bytes)) $mega_suffix(=$(r1(giga_scale * total_bytes)) $giga_suffix)", + ) + end + + function mk_row(xs::Vector{Float64}, name::String, total_bytes::Int) + tail = length(xs) > 1 ? xs[2:end] : xs # skip the first (warm-up) run; Base.rest(xs) kept every sample + min = Iterators.minimum(tail) + max = Iterators.maximum(tail) + avg = Iterators.sum(tail) / Iterators.length(tail) + mbps = mega_scale * total_bytes / min + if config.csv + return [ + ("function", name), + ("num_times", config.numtimes), + ("n_elements", config.arraysize), + ("sizeof", total_bytes), + ("max_m$( config.mibibytes ? "i" : "")bytes_per_sec", mbps), + ("min_runtime", min), + ("max_runtime", max), + ("avg_runtime", avg), + ] + else + return [ + ("Function", name), + ("M$(config.mibibytes ? "i" : "")Bytes/sec", round(mbps; digits = 3)), + ("Min (sec)", round(min; digits = 5)), + ("Max", round(max; digits = 5)), + ("Average", round(avg; digits = 5)), + ] + end + end + + function tabulate(rows::Vector{Tuple{String,Any}}...) + header = Base.first(rows) + padding = config.csv ? 0 : 12 + sep = config.csv ? 
"," : "" + map(x -> rpad(x[1], padding), header) |> x -> join(x, sep) |> println + for row in rows + map(x -> rpad(x[2], padding), row) |> x -> join(x, sep) |> println + end + end + + init::Tuple{type,type,type} = DefaultInit + scalar::type = DefaultScalar + + data = make_stream(config.arraysize, scalar, config.device, config.csv) + + init_arrays!(data, init) + if benchmark == All + (timings, sum) = run_all!(data, config.numtimes) + valid = check_solutions(read_data(data), config.numtimes, init, benchmark, sum) + tabulate( + mk_row(timings.copy, "Copy", 2 * array_bytes), + mk_row(timings.mul, "Mul", 2 * array_bytes), + mk_row(timings.add, "Add", 3 * array_bytes), + mk_row(timings.triad, "Triad", 3 * array_bytes), + mk_row(timings.dot, "Dot", 2 * array_bytes), + ) + elseif benchmark == Nstream + timings = run_nstream!(data, config.numtimes) + valid = check_solutions(read_data(data), config.numtimes, init, benchmark, nothing) + tabulate(mk_row(timings, "Nstream", 4 * array_bytes)) + elseif benchmark == Triad + elapsed = run_triad!(data, config.numtimes) + valid = check_solutions(read_data(data), config.numtimes, init, benchmark, nothing) + total_bytes = 3 * array_bytes * config.numtimes + bandwidth = giga_scale * (total_bytes / elapsed) # label below is GB/s (GiB/s), so use the giga scale, not mega + println("Runtime (seconds): $(round(elapsed; digits=5))") + println("Bandwidth ($giga_suffix/s): $(round(bandwidth; digits=3)) ") + else + error("Bad benchmark $(benchmark)") + end + + if !valid + exit(1) + end + +end diff --git a/JuliaStream.jl/src/StreamData.jl b/JuliaStream.jl/src/StreamData.jl new file mode 100644 index 0000000..07498fe --- /dev/null +++ b/JuliaStream.jl/src/StreamData.jl @@ -0,0 +1,8 @@ + +struct StreamData{T,C<:AbstractArray{T}} + a::C + b::C + c::C + scalar::T + size::Int + end \ No newline at end of file diff --git a/JuliaStream.jl/src/ThreadedStream.jl b/JuliaStream.jl/src/ThreadedStream.jl new file mode 100644 index 0000000..fb995e6 --- /dev/null 
+++ b/JuliaStream.jl/src/ThreadedStream.jl @@ -0,0 +1,72 @@ 
+include("Stream.jl") + +function devices() + return ["CPU"] +end + +function make_stream( + arraysize::Int, + scalar::T, + device::Int, + silent::Bool, +)::VectorData{T} where {T} + if device != 1 + error("Only CPU device is supported") + end + if !silent + println("Using max $(Threads.nthreads()) threads") + end + return VectorData{T}(1:arraysize, 1:arraysize, 1:arraysize, scalar, arraysize) +end + +function init_arrays!(data::VectorData{T}, init::Tuple{T,T,T}) where {T} + Threads.@threads for i = 1:data.size + @inbounds data.a[i] = init[1] + @inbounds data.b[i] = init[2] + @inbounds data.c[i] = init[3] + end +end + +function copy!(data::VectorData{T}) where {T} + Threads.@threads for i = 1:data.size + @inbounds data.c[i] = data.a[i] + end +end + +function mul!(data::VectorData{T}) where {T} + Threads.@threads for i = 1:data.size + @inbounds data.b[i] = data.scalar * data.c[i] + end +end + +function add!(data::VectorData{T}) where {T} + Threads.@threads for i = 1:data.size + @inbounds data.c[i] = data.a[i] + data.b[i] + end +end + +function triad!(data::VectorData{T}) where {T} + Threads.@threads for i = 1:data.size + @inbounds data.a[i] = data.b[i] + (data.scalar * data.c[i]) + end +end + +function nstream!(data::VectorData{T}) where {T} + Threads.@threads for i = 1:data.size + @inbounds data.a[i] += data.b[i] + data.scalar * data.c[i] + end +end + +function dot(data::VectorData{T}) where {T} + partial = zeros(T, Threads.nthreads()) + Threads.@threads for i = 1:data.size + @inbounds partial[Threads.threadid()] += data.a[i] * data.b[i] + end + return sum(partial) +end + +function read_data(data::VectorData{T})::VectorData{T} where {T} + return data +end + +main() \ No newline at end of file From 63f471f8800dfb8d9bcb2861d3b0f6ec1231f47a Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Thu, 10 Jun 2021 04:33:12 +0100 Subject: [PATCH 02/17] set pwd to JuliaStream.jl for CI run --- .github/workflows/main.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git 
a/.github/workflows/main.yaml b/.github/workflows/main.yaml index 83bcd9d..3c90b37 100644 --- a/.github/workflows/main.yaml +++ b/.github/workflows/main.yaml @@ -5,6 +5,9 @@ on: [push, pull_request] jobs: test-julia: runs-on: ubuntu-18.04 + defaults: + run: + working-directory: ./JuliaStream.jl steps: - uses: actions/checkout@v2 - name: Setup project From 2cf8ca5f8cdd3b2efb5653dde4da8e5c61143cf1 Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Thu, 10 Jun 2021 04:57:52 +0100 Subject: [PATCH 03/17] Use addprocs() for DistributedStream --- .github/workflows/main.yaml | 5 +++++ JuliaStream.jl/README.md | 2 +- JuliaStream.jl/src/DistributedStream.jl | 2 ++ 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml index 3c90b37..f666150 100644 --- a/.github/workflows/main.yaml +++ b/.github/workflows/main.yaml @@ -13,14 +13,19 @@ jobs: - name: Setup project run: julia --project -e 'import Pkg; Pkg.instantiate()' - name: Test run PlainStream.jl + if: ${{ ! cancelled() }} run: julia --project src/PlainStream.jl --arraysize 100 - name: Test run ThreadedStream.jl + if: ${{ ! cancelled() }} run: julia --threads 2 --project src/ThreadedStream.jl --arraysize 100 - name: Test run DistributedStream.jl + if: ${{ ! cancelled() }} run: julia -p2 --project src/DistributedStream.jl --arraysize 100 - name: Test run CUDAStream.jl + if: ${{ ! cancelled() }} run: julia --project src/CUDAStream.jl --list - name: Test run AMDGPUStream.jl + if: ${{ ! 
cancelled() }} run: julia --project src/AMDGPUStream.jl --list test: diff --git a/JuliaStream.jl/README.md b/JuliaStream.jl/README.md index 7d94a34..9126167 100644 --- a/JuliaStream.jl/README.md +++ b/JuliaStream.jl/README.md @@ -26,5 +26,5 @@ With Julia on path, run the benchmark with: **Important:** * Julia is 1-indexed, so N > 1 in `--device N` * Thread count for `ThreadedStream` must be set via the `JULIA_NUM_THREADS` environment variable (e.g `export JULIA_NUM_THREADS=$(nproc)`) otherwise it defaults to 1 - * You must *prepend* the number of processes needed for `DistributedStream`, e.g `julia -p$(nproc) --project src/DistributedStream.jl` + * `DistributedStream` uses `addprocs()` call directly which defaults to `$(nproc)`, **do not use the `-p ` flag** as per the [documentation](https://docs.julialang.org/en/v1/manual/distributed-computing). * Certain implementations such as CUDA and AMDGPU will do hardware detection at runtime and may download and/or compile further software packages for the platform. 
diff --git a/JuliaStream.jl/src/DistributedStream.jl b/JuliaStream.jl/src/DistributedStream.jl index 970c699..361c737 100644 --- a/JuliaStream.jl/src/DistributedStream.jl +++ b/JuliaStream.jl/src/DistributedStream.jl @@ -2,6 +2,8 @@ using Distributed include("Stream.jl") +addprocs() + @everywhere include("StreamData.jl") @everywhere using SharedArrays @everywhere const SharedArrayData = StreamData{T,SharedArray{T}} where {T} From c5ad3f34d984c7d481c1a5d3d33303de5079207f Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Thu, 10 Jun 2021 05:01:24 +0100 Subject: [PATCH 04/17] Drop -p N for DistributedStream.jl CI --- .github/workflows/main.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml index f666150..0d7bfd1 100644 --- a/.github/workflows/main.yaml +++ b/.github/workflows/main.yaml @@ -20,7 +20,7 @@ jobs: run: julia --threads 2 --project src/ThreadedStream.jl --arraysize 100 - name: Test run DistributedStream.jl if: ${{ ! cancelled() }} - run: julia -p2 --project src/DistributedStream.jl --arraysize 100 + run: julia --project src/DistributedStream.jl --arraysize 100 - name: Test run CUDAStream.jl if: ${{ ! cancelled() }} run: julia --project src/CUDAStream.jl --list From d799535c966364a7209ec40c4ca8d2f3093e936b Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Thu, 10 Jun 2021 05:06:48 +0100 Subject: [PATCH 05/17] Larger arraysize for CI --- .github/workflows/main.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml index 0d7bfd1..d54850e 100644 --- a/.github/workflows/main.yaml +++ b/.github/workflows/main.yaml @@ -14,13 +14,13 @@ jobs: run: julia --project -e 'import Pkg; Pkg.instantiate()' - name: Test run PlainStream.jl if: ${{ ! cancelled() }} - run: julia --project src/PlainStream.jl --arraysize 100 + run: julia --project src/PlainStream.jl --arraysize 2048 - name: Test run ThreadedStream.jl if: ${{ ! 
cancelled() }} - run: julia --threads 2 --project src/ThreadedStream.jl --arraysize 100 + run: julia --threads 2 --project src/ThreadedStream.jl --arraysize 2048 - name: Test run DistributedStream.jl if: ${{ ! cancelled() }} - run: julia --project src/DistributedStream.jl --arraysize 100 + run: julia --project src/DistributedStream.jl --arraysize 2048 - name: Test run CUDAStream.jl if: ${{ ! cancelled() }} run: julia --project src/CUDAStream.jl --list From 4e6c56729bfa77e1839b3069c2075e4fa3cfab98 Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Wed, 30 Jun 2021 18:09:54 +0100 Subject: [PATCH 06/17] Inline AMDGPU's hard_wait Show the selected implementation and not a constant "threaded" --- JuliaStream.jl/src/AMDGPUStream.jl | 25 ++++++++++++++----------- JuliaStream.jl/src/Stream.jl | 3 +-- 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/JuliaStream.jl/src/AMDGPUStream.jl b/JuliaStream.jl/src/AMDGPUStream.jl index 80f69b4..cb54904 100644 --- a/JuliaStream.jl/src/AMDGPUStream.jl +++ b/JuliaStream.jl/src/AMDGPUStream.jl @@ -56,11 +56,6 @@ function make_stream( return data end -function hard_wait(kernel) - # soft wait causes HSA_REFCOUNT overflow issues - AMDGPU.wait(kernel, soft = false) -end - function init_arrays!(data::ROCData{T}, init::Tuple{T,T,T}) where {T} AMDGPU.fill!(data.a, init[1]) AMDGPU.fill!(data.b, init[2]) @@ -73,7 +68,10 @@ function copy!(data::ROCData{T}) where {T} @inbounds c[i] = a[i] return end - hard_wait(@roc groupsize = TBSize gridsize = gridsize(data) kernel(data.a, data.c)) + AMDGPU.wait( + soft = false, # soft wait causes HSA_REFCOUNT overflow issues + @roc groupsize = TBSize gridsize = gridsize(data) kernel(data.a, data.c) + ) end function mul!(data::ROCData{T}) where {T} @@ -82,7 +80,8 @@ function mul!(data::ROCData{T}) where {T} @inbounds b[i] = scalar * c[i] return end - hard_wait( + AMDGPU.wait( + soft = false, # soft wait causes HSA_REFCOUNT overflow issues @roc groupsize = TBSize gridsize = gridsize(data) kernel(data.b, 
data.c, data.scalar) ) end @@ -93,7 +92,8 @@ function add!(data::ROCData{T}) where {T} @inbounds c[i] = a[i] + b[i] return end - hard_wait( + AMDGPU.wait( + soft = false, # soft wait causes HSA_REFCOUNT overflow issues @roc groupsize = TBSize gridsize = gridsize(data) kernel(data.a, data.b, data.c) ) end @@ -104,7 +104,8 @@ function triad!(data::ROCData{T}) where {T} @inbounds a[i] = b[i] + (scalar * c[i]) return end - hard_wait( + AMDGPU.wait( + soft = false, # soft wait causes HSA_REFCOUNT overflow issues @roc groupsize = TBSize gridsize = gridsize(data) kernel( data.a, data.b, @@ -120,7 +121,8 @@ function nstream!(data::ROCData{T}) where {T} @inbounds a[i] += b[i] + scalar * c[i] return end - hard_wait( + AMDGPU.wait( + soft = false, # soft wait causes HSA_REFCOUNT overflow issues @roc groupsize = TBSize gridsize = gridsize(data) kernel( data.a, data.b, @@ -160,7 +162,8 @@ function dot(data::ROCData{T}) where {T} return end partial_sum = ROCArray{T}(undef, DotBlocks) - hard_wait( + AMDGPU.wait( + soft = false, # soft wait causes HSA_REFCOUNT overflow issues @roc groupsize = TBSize gridsize = TBSize * DotBlocks kernel( data.a, data.b, diff --git a/JuliaStream.jl/src/Stream.jl b/JuliaStream.jl/src/Stream.jl index cce846b..590ab2d 100644 --- a/JuliaStream.jl/src/Stream.jl +++ b/JuliaStream.jl/src/Stream.jl @@ -99,7 +99,6 @@ end @with_kw mutable struct Config list::Bool = false - impl::String = "threaded" device::Int = 1 numtimes::Int = 100 arraysize::Int = 33554432 @@ -201,7 +200,7 @@ function main() if !config.csv println("""BabelStream Version: $Version - Implementation: Julia; $(config.impl)""") + Implementation: Julia; $(PROGRAM_FILE)""") println("Running kernels $(config.numtimes) times") if benchmark == Triad println("Number of elements: $(config.arraysize)") From d675875dcd169e1a0748234eac6013b1975886b8 Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Wed, 30 Jun 2021 19:03:39 +0100 Subject: [PATCH 07/17] Switch back to -p for DistributedStream --- 
JuliaStream.jl/README.md | 8 ++++---- JuliaStream.jl/src/DistributedStream.jl | 7 +++---- JuliaStream.jl/src/StreamData.jl | 13 ++++++------- 3 files changed, 13 insertions(+), 15 deletions(-) diff --git a/JuliaStream.jl/README.md b/JuliaStream.jl/README.md index 9126167..851a59d 100644 --- a/JuliaStream.jl/README.md +++ b/JuliaStream.jl/README.md @@ -13,7 +13,7 @@ This is an implementation of BabelStream in Julia which contains the following v Prerequisites - * Julia 1.6+ + * Julia >= 1.6 With Julia on path, run the benchmark with: @@ -24,7 +24,7 @@ With Julia on path, run the benchmark with: ``` **Important:** - * Julia is 1-indexed, so N > 1 in `--device N` - * Thread count for `ThreadedStream` must be set via the `JULIA_NUM_THREADS` environment variable (e.g `export JULIA_NUM_THREADS=$(nproc)`) otherwise it defaults to 1 - * `DistributedStream` uses `addprocs()` call directly which defaults to `$(nproc)`, **do not use the `-p ` flag** as per the [documentation](https://docs.julialang.org/en/v1/manual/distributed-computing). + * Julia is 1-indexed, so N >= 1 in `--device N`. + * Thread count for `ThreadedStream` must be set via the `JULIA_NUM_THREADS` environment variable (e.g. `export JULIA_NUM_THREADS=$(nproc)`) otherwise it defaults to 1. + * Worker count for `DistributedStream` is set with `-p <N>` as per the [documentation](https://docs.julialang.org/en/v1/manual/distributed-computing). * Certain implementations such as CUDA and AMDGPU will do hardware detection at runtime and may download and/or compile further software packages for the platform. 
diff --git a/JuliaStream.jl/src/DistributedStream.jl b/JuliaStream.jl/src/DistributedStream.jl index 361c737..2aa7ae7 100644 --- a/JuliaStream.jl/src/DistributedStream.jl +++ b/JuliaStream.jl/src/DistributedStream.jl @@ -1,10 +1,9 @@ using Distributed -include("Stream.jl") - -addprocs() - +@everywhere using Pkg +@everywhere Pkg.activate("."; io=devnull) # don't spam `Activating environment at...` @everywhere include("StreamData.jl") +@everywhere include("Stream.jl") @everywhere using SharedArrays @everywhere const SharedArrayData = StreamData{T,SharedArray{T}} where {T} diff --git a/JuliaStream.jl/src/StreamData.jl b/JuliaStream.jl/src/StreamData.jl index 07498fe..55e055a 100644 --- a/JuliaStream.jl/src/StreamData.jl +++ b/JuliaStream.jl/src/StreamData.jl @@ -1,8 +1,7 @@ - struct StreamData{T,C<:AbstractArray{T}} - a::C - b::C - c::C - scalar::T - size::Int - end \ No newline at end of file + a::C + b::C + c::C + scalar::T + size::Int +end From 418315543ccaf46d70eb5999902fc4ef3c06e9cc Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Wed, 30 Jun 2021 19:09:37 +0100 Subject: [PATCH 08/17] Use -p 2 and no arg for JuliaStream in CI --- .github/workflows/main.yaml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml index d54850e..2427ed1 100644 --- a/.github/workflows/main.yaml +++ b/.github/workflows/main.yaml @@ -18,9 +18,12 @@ jobs: - name: Test run ThreadedStream.jl if: ${{ ! cancelled() }} run: julia --threads 2 --project src/ThreadedStream.jl --arraysize 2048 - - name: Test run DistributedStream.jl + - name: Test run DistributedStream.jl (no flag) if: ${{ ! cancelled() }} run: julia --project src/DistributedStream.jl --arraysize 2048 + - name: Test run DistributedStream.jl (-p 2) + if: ${{ ! cancelled() }} + run: julia -p 2 --project src/DistributedStream.jl --arraysize 2048 - name: Test run CUDAStream.jl if: ${{ ! 
cancelled() }} run: julia --project src/CUDAStream.jl --list From 2e957d3f604b2aa3109baada0f132f024dca906e Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Wed, 30 Jun 2021 19:20:37 +0100 Subject: [PATCH 09/17] Inline blocks in CUDAStream --- JuliaStream.jl/src/CUDAStream.jl | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/JuliaStream.jl/src/CUDAStream.jl b/JuliaStream.jl/src/CUDAStream.jl index c4d0510..dbf21d7 100644 --- a/JuliaStream.jl/src/CUDAStream.jl +++ b/JuliaStream.jl/src/CUDAStream.jl @@ -10,10 +10,6 @@ function devices() map(d -> "$(CUDA.name(d)) ($(repr(d)))", CUDA.devices()) end -function blocks(data::CuData{T})::Int where {T} - return data.size ÷ TBSize -end - function make_stream( arraysize::Int, scalar::T, @@ -42,7 +38,7 @@ function make_stream( ) if !silent println("Using CUDA device: $(CUDA.name(selected)) ($(repr(selected)))") - println("Kernel parameters: <<<$(blocks(data)),$(TBSize)>>>") + println("Kernel parameters: <<<$(data.size ÷ TBSize),$(TBSize)>>>") end return data end @@ -59,7 +55,7 @@ function copy!(data::CuData{T}) where {T} @inbounds c[i] = a[i] return end - @cuda blocks = blocks(data) threads = TBSize kernel(data.a, data.c) + @cuda blocks = data.size ÷ TBSize threads = TBSize kernel(data.a, data.c) CUDA.synchronize() end @@ -69,7 +65,7 @@ function mul!(data::CuData{T}) where {T} @inbounds b[i] = scalar * c[i] return end - @cuda blocks = blocks(data) threads = TBSize kernel(data.b, data.c, data.scalar) + @cuda blocks = data.size ÷ TBSize threads = TBSize kernel(data.b, data.c, data.scalar) CUDA.synchronize() end @@ -79,7 +75,7 @@ function add!(data::CuData{T}) where {T} @inbounds c[i] = a[i] + b[i] return end - @cuda blocks = blocks(data) threads = TBSize kernel(data.a, data.b, data.c) + @cuda blocks = data.size ÷ TBSize threads = TBSize kernel(data.a, data.b, data.c) CUDA.synchronize() end @@ -89,7 +85,12 @@ function triad!(data::CuData{T}) where {T} @inbounds a[i] = b[i] + (scalar * c[i]) 
return end - @cuda blocks = blocks(data) threads = TBSize kernel(data.a, data.b, data.c, data.scalar) + @cuda blocks = data.size ÷ TBSize threads = TBSize kernel( + data.a, + data.b, + data.c, + data.scalar, + ) CUDA.synchronize() end @@ -99,7 +100,12 @@ function nstream!(data::CuData{T}) where {T} @inbounds a[i] += b[i] + scalar * c[i] return end - @cuda blocks = blocks(data) threads = TBSize kernel(data.a, data.b, data.c, data.scalar) + @cuda blocks = data.size ÷ TBSize threads = TBSize kernel( + data.a, + data.b, + data.c, + data.scalar, + ) CUDA.synchronize() end From 7c1e04a42b9b03b0e5c5d0b07c0ef9f4bdd59353 Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Wed, 30 Jun 2021 19:31:42 +0100 Subject: [PATCH 10/17] Add comment about blockIdx/workgroupIdx in Julia --- JuliaStream.jl/src/AMDGPUStream.jl | 12 ++++++------ JuliaStream.jl/src/CUDAStream.jl | 12 ++++++------ 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/JuliaStream.jl/src/AMDGPUStream.jl b/JuliaStream.jl/src/AMDGPUStream.jl index cb54904..9a9cd9a 100644 --- a/JuliaStream.jl/src/AMDGPUStream.jl +++ b/JuliaStream.jl/src/AMDGPUStream.jl @@ -64,7 +64,7 @@ end function copy!(data::ROCData{T}) where {T} function kernel(a, c) - i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x + i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x # only workgroupIdx starts at 1 @inbounds c[i] = a[i] return end @@ -76,7 +76,7 @@ end function mul!(data::ROCData{T}) where {T} function kernel(b, c, scalar) - i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x + i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x # only workgroupIdx starts at 1 @inbounds b[i] = scalar * c[i] return end @@ -88,7 +88,7 @@ end function add!(data::ROCData{T}) where {T} function kernel(a, b, c) - i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x + i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x # only workgroupIdx starts at 1 @inbounds c[i] = a[i] + b[i] return 
end @@ -100,7 +100,7 @@ end function triad!(data::ROCData{T}) where {T} function kernel(a, b, c, scalar) - i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x + i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x # only workgroupIdx starts at 1 @inbounds a[i] = b[i] + (scalar * c[i]) return end @@ -117,7 +117,7 @@ end function nstream!(data::ROCData{T}) where {T} function kernel(a, b, c, scalar) - i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x + i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x # only workgroupIdx starts at 1 @inbounds a[i] += b[i] + scalar * c[i] return end @@ -139,7 +139,7 @@ function dot(data::ROCData{T}) where {T} @inbounds tb_sum[local_i] = 0.0 # do dot first - i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x + i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x # only workgroupIdx starts at 1 while i <= size @inbounds tb_sum[local_i] += a[i] * b[i] i += TBSize * DotBlocks # XXX don't use (workgroupDim().x * gridDimWG().x) here diff --git a/JuliaStream.jl/src/CUDAStream.jl b/JuliaStream.jl/src/CUDAStream.jl index dbf21d7..7d671a5 100644 --- a/JuliaStream.jl/src/CUDAStream.jl +++ b/JuliaStream.jl/src/CUDAStream.jl @@ -51,7 +51,7 @@ end function copy!(data::CuData{T}) where {T} function kernel(a, c) - i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x # only blockIdx starts at 1 @inbounds c[i] = a[i] return end @@ -61,7 +61,7 @@ end function mul!(data::CuData{T}) where {T} function kernel(b, c, scalar) - i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x # only blockIdx starts at 1 @inbounds b[i] = scalar * c[i] return end @@ -71,7 +71,7 @@ end function add!(data::CuData{T}) where {T} function kernel(a, b, c) - i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x # only blockIdx starts at 1 
@inbounds c[i] = a[i] + b[i] return end @@ -81,7 +81,7 @@ end function triad!(data::CuData{T}) where {T} function kernel(a, b, c, scalar) - i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x # only blockIdx starts at 1 @inbounds a[i] = b[i] + (scalar * c[i]) return end @@ -96,7 +96,7 @@ end function nstream!(data::CuData{T}) where {T} function kernel(a, b, c, scalar) - i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x # only blockIdx starts at 1 @inbounds a[i] += b[i] + scalar * c[i] return end @@ -117,7 +117,7 @@ function dot(data::CuData{T}) where {T} @inbounds tb_sum[local_i] = 0.0 # do dot first - i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x # only blockIdx starts at 1 while i <= size @inbounds tb_sum[local_i] += a[i] * b[i] i += blockDim().x * gridDim().x From a26699c5b50dbf307ce44325b6e05491f7ad5376 Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Tue, 17 Aug 2021 14:28:47 +0100 Subject: [PATCH 11/17] Add oneAPI and KA implementation Isolate projects to avoid transitive dependency Add parameter for passing devices Incorporate further reviews Update all dependencies --- JuliaStream.jl/AMDGPU/Manifest.toml | 415 +++++++++++++ JuliaStream.jl/AMDGPU/Project.toml | 7 + JuliaStream.jl/CUDA/Manifest.toml | 316 ++++++++++ JuliaStream.jl/CUDA/Project.toml | 7 + .../KernelAbstractions/Manifest.toml | 547 ++++++++++++++++++ .../KernelAbstractions/Project.toml | 11 + JuliaStream.jl/Manifest.toml | 152 +++-- JuliaStream.jl/Project.toml | 5 + JuliaStream.jl/README.md | 43 +- JuliaStream.jl/Threaded/Manifest.toml | 31 + JuliaStream.jl/Threaded/Project.toml | 6 + JuliaStream.jl/oneAPI/Manifest.toml | 319 ++++++++++ JuliaStream.jl/oneAPI/Project.toml | 7 + JuliaStream.jl/src/AMDGPUStream.jl | 72 +-- JuliaStream.jl/src/CUDAStream.jl | 48 +- JuliaStream.jl/src/DistributedStream.jl | 44 +- 
.../src/KernelAbstractionsStream.jl | 255 ++++++++ JuliaStream.jl/src/Stream.jl | 58 +- JuliaStream.jl/src/ThreadedStream.jl | 38 +- JuliaStream.jl/src/oneAPIStream.jl | 170 ++++++ JuliaStream.jl/update_all.sh | 7 + 21 files changed, 2393 insertions(+), 165 deletions(-) create mode 100644 JuliaStream.jl/AMDGPU/Manifest.toml create mode 100644 JuliaStream.jl/AMDGPU/Project.toml create mode 100644 JuliaStream.jl/CUDA/Manifest.toml create mode 100644 JuliaStream.jl/CUDA/Project.toml create mode 100644 JuliaStream.jl/KernelAbstractions/Manifest.toml create mode 100644 JuliaStream.jl/KernelAbstractions/Project.toml create mode 100644 JuliaStream.jl/Threaded/Manifest.toml create mode 100644 JuliaStream.jl/Threaded/Project.toml create mode 100644 JuliaStream.jl/oneAPI/Manifest.toml create mode 100644 JuliaStream.jl/oneAPI/Project.toml create mode 100644 JuliaStream.jl/src/KernelAbstractionsStream.jl create mode 100644 JuliaStream.jl/src/oneAPIStream.jl create mode 100755 JuliaStream.jl/update_all.sh diff --git a/JuliaStream.jl/AMDGPU/Manifest.toml b/JuliaStream.jl/AMDGPU/Manifest.toml new file mode 100644 index 0000000..6e27f0a --- /dev/null +++ b/JuliaStream.jl/AMDGPU/Manifest.toml @@ -0,0 +1,415 @@ +# This file is machine-generated - editing it directly is not advised + +[[AMDGPU]] +deps = ["AbstractFFTs", "Adapt", "BinaryProvider", "CEnum", "GPUArrays", "GPUCompiler", "HIP_jll", "LLVM", "Libdl", "LinearAlgebra", "MacroTools", "Pkg", "Printf", "ROCmDeviceLibs_jll", "Random", "Requires", "Setfield", "hsa_rocr_jll"] +git-tree-sha1 = "d64c97447a753cfbf0158d6c7be513f34526d559" +uuid = "21141c5a-9bdb-4563-92ae-f87d6854732e" +version = "0.2.12" + +[[AbstractFFTs]] +deps = ["LinearAlgebra"] +git-tree-sha1 = "485ee0867925449198280d4af84bdb46a2a404d0" +uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c" +version = "1.0.1" + +[[Adapt]] +deps = ["LinearAlgebra"] +git-tree-sha1 = "84918055d15b3114ede17ac6a7182f68870c16f7" +uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" +version = "3.3.1" 
+ +[[ArgParse]] +deps = ["Logging", "TextWrap"] +git-tree-sha1 = "3102bce13da501c9104df33549f511cd25264d7d" +uuid = "c7e460c6-2fb9-53a9-8c5b-16f535851c63" +version = "1.1.4" + +[[ArgTools]] +uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f" + +[[Artifacts]] +uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" + +[[Base64]] +uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" + +[[BinaryProvider]] +deps = ["Libdl", "Logging", "SHA"] +git-tree-sha1 = "ecdec412a9abc8db54c0efc5548c64dfce072058" +uuid = "b99e7846-7c00-51b0-8f62-c81ae34c0232" +version = "0.5.10" + +[[Bzip2_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "19a35467a82e236ff51bc17a3a44b69ef35185a2" +uuid = "6e34b625-4abd-537c-b88f-471c36dfa7a0" +version = "1.0.8+0" + +[[CEnum]] +git-tree-sha1 = "215a9aa4a1f23fbd05b92769fdd62559488d70e9" +uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82" +version = "0.4.1" + +[[ConstructionBase]] +deps = ["LinearAlgebra"] +git-tree-sha1 = "f74e9d5388b8620b4cee35d4c5a618dd4dc547f4" +uuid = "187b0558-2788-49d3-abe0-74a17ed4e7c9" +version = "1.3.0" + +[[Dates]] +deps = ["Printf"] +uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" + +[[Downloads]] +deps = ["ArgTools", "LibCURL", "NetworkOptions"] +uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6" + +[[Elfutils_jll]] +deps = ["Artifacts", "Bzip2_jll", "JLLWrappers", "Libdl", "Pkg", "XZ_jll", "Zlib_jll", "argp_standalone_jll", "fts_jll", "obstack_jll"] +git-tree-sha1 = "8f9fcde6d89b0a3ca51cb2028beab462705c5436" +uuid = "ab5a07f8-06af-567f-a878-e8bb879eba5a" +version = "0.182.0+0" + +[[ExprTools]] +git-tree-sha1 = "b7e3d17636b348f005f11040025ae8c6f645fe92" +uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04" +version = "0.1.6" + +[[Future]] +deps = ["Random"] +uuid = "9fa8497b-333b-5362-9e8d-4d0656e87820" + +[[GPUArrays]] +deps = ["AbstractFFTs", "Adapt", "LinearAlgebra", "Printf", "Random", "Serialization", "Statistics"] +git-tree-sha1 = "ececbf05f8904c92814bdbd0aafd5540b0bf2e9a" +uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" +version = 
"7.0.1" + +[[GPUCompiler]] +deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "TimerOutputs", "UUIDs"] +git-tree-sha1 = "4ed2616d5e656c8716736b64da86755467f26cf5" +uuid = "61eb1bfa-7361-4325-ad38-22787b887f55" +version = "0.12.9" + +[[HIP_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "ROCmCompilerSupport_jll", "ROCmDeviceLibs_jll", "ROCmOpenCLRuntime_jll", "hsa_rocr_jll"] +git-tree-sha1 = "5097d8f7b6842156ab0928371b3d03fefd8decab" +uuid = "2696aab5-0948-5276-aa9a-2a86a37016b8" +version = "4.0.0+1" + +[[InteractiveUtils]] +deps = ["Markdown"] +uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" + +[[JLLWrappers]] +deps = ["Preferences"] +git-tree-sha1 = "642a199af8b68253517b80bd3bfd17eb4e84df6e" +uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210" +version = "1.3.0" + +[[LLVM]] +deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Printf", "Unicode"] +git-tree-sha1 = "d6041ad706cf458b2c9f3e501152488a26451e9c" +uuid = "929cbde3-209d-540e-8aea-75f648917ca0" +version = "4.2.0" + +[[LLVMExtra_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "a9b1130c4728b0e462a1c28772954650039eb847" +uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab" +version = "0.0.7+0" + +[[LibCURL]] +deps = ["LibCURL_jll", "MozillaCACerts_jll"] +uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21" + +[[LibCURL_jll]] +deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"] +uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0" + +[[LibGit2]] +deps = ["Base64", "NetworkOptions", "Printf", "SHA"] +uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" + +[[LibSSH2_jll]] +deps = ["Artifacts", "Libdl", "MbedTLS_jll"] +uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8" + +[[Libdl]] +uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" + +[[Libgcrypt_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Libgpg_error_jll", "Pkg"] +git-tree-sha1 = "64613c82a59c120435c067c2b809fc61cf5166ae" +uuid = "d4300ac3-e22c-5743-9152-c294e39db1e4" +version = "1.8.7+0" + +[[Libglvnd_jll]] +deps 
= ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libX11_jll", "Xorg_libXext_jll"] +git-tree-sha1 = "7739f837d6447403596a75d19ed01fd08d6f56bf" +uuid = "7e76a0d4-f3c7-5321-8279-8d96eeed0f29" +version = "1.3.0+3" + +[[Libgpg_error_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "c333716e46366857753e273ce6a69ee0945a6db9" +uuid = "7add5ba3-2f88-524e-9cd5-f83b8a55f7b8" +version = "1.42.0+0" + +[[Libiconv_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "42b62845d70a619f063a7da093d995ec8e15e778" +uuid = "94ce4f54-9a6c-5748-9c1c-f9c7231a4531" +version = "1.16.1+1" + +[[LinearAlgebra]] +deps = ["Libdl"] +uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" + +[[Logging]] +uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" + +[[MacroTools]] +deps = ["Markdown", "Random"] +git-tree-sha1 = "0fb723cd8c45858c22169b2e42269e53271a6df7" +uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" +version = "0.5.7" + +[[Markdown]] +deps = ["Base64"] +uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" + +[[MbedTLS_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1" + +[[MozillaCACerts_jll]] +uuid = "14a3606d-f60d-562e-9121-12d972cd8159" + +[[NUMA_jll]] +deps = ["Libdl", "Pkg"] +git-tree-sha1 = "778f9bd14400cff2c32ed357e12766ac0e3d766e" +uuid = "7f51dc2b-bb24-59f8-b771-bb1490e4195d" +version = "2.0.13+1" + +[[NetworkOptions]] +uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908" + +[[OrderedCollections]] +git-tree-sha1 = "85f8e6578bf1f9ee0d11e7bb1b1456435479d47c" +uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" +version = "1.4.1" + +[[Parameters]] +deps = ["OrderedCollections", "UnPack"] +git-tree-sha1 = "2276ac65f1e236e0a6ea70baff3f62ad4c625345" +uuid = "d96e819e-fc66-5662-9728-84c9c7592b0a" +version = "0.12.2" + +[[Pkg]] +deps = ["Artifacts", "Dates", "Downloads", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"] +uuid = 
"44cfe95a-1eb2-52ea-b672-e2afdf69b78f" + +[[Preferences]] +deps = ["TOML"] +git-tree-sha1 = "00cfd92944ca9c760982747e9a1d0d5d86ab1e5a" +uuid = "21216c6a-2e73-6563-6e65-726566657250" +version = "1.2.2" + +[[Printf]] +deps = ["Unicode"] +uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" + +[[REPL]] +deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"] +uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" + +[[ROCmCompilerSupport_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "ROCmDeviceLibs_jll", "hsa_rocr_jll"] +git-tree-sha1 = "56ddcfb5d8b60c9f8c1bc619886f8d363fd1926d" +uuid = "8fbdd1d2-db62-5cd0-981e-905da1486e17" +version = "4.0.0+1" + +[[ROCmDeviceLibs_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Zlib_jll"] +git-tree-sha1 = "d764f0f28b5af89aa004871a6a38e5d061f77257" +uuid = "873c0968-716b-5aa7-bb8d-d1e2e2aeff2d" +version = "4.0.0+0" + +[[ROCmOpenCLRuntime_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Libglvnd_jll", "Pkg", "ROCmCompilerSupport_jll", "ROCmDeviceLibs_jll", "Xorg_libX11_jll", "Xorg_xorgproto_jll", "hsa_rocr_jll"] +git-tree-sha1 = "f9e3e2cb40a7990535efa7da9b9dd0e0b458a973" +uuid = "10ae2a08-2eea-53f8-8c20-eec175020e9f" +version = "4.0.0+1" + +[[Random]] +deps = ["Serialization"] +uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" + +[[Requires]] +deps = ["UUIDs"] +git-tree-sha1 = "4036a3bd08ac7e968e27c203d45f5fff15020621" +uuid = "ae029012-a4dd-5104-9daa-d747884805df" +version = "1.1.3" + +[[SHA]] +uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" + +[[Serialization]] +uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" + +[[Setfield]] +deps = ["ConstructionBase", "Future", "MacroTools", "Requires"] +git-tree-sha1 = "fca29e68c5062722b5b4435594c3d1ba557072a3" +uuid = "efcf1570-3423-57d1-acb7-fd33fddbac46" +version = "0.7.1" + +[[Sockets]] +uuid = "6462fe0b-24de-5631-8697-dd941f90decc" + +[[SparseArrays]] +deps = ["LinearAlgebra", "Random"] +uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" + +[[Statistics]] +deps = ["LinearAlgebra", 
"SparseArrays"] +uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" + +[[TOML]] +deps = ["Dates"] +uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76" + +[[Tar]] +deps = ["ArgTools", "SHA"] +uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e" + +[[TextWrap]] +git-tree-sha1 = "9250ef9b01b66667380cf3275b3f7488d0e25faf" +uuid = "b718987f-49a8-5099-9789-dcd902bef87d" +version = "1.0.1" + +[[TimerOutputs]] +deps = ["ExprTools", "Printf"] +git-tree-sha1 = "209a8326c4f955e2442c07b56029e88bb48299c7" +uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" +version = "0.5.12" + +[[UUIDs]] +deps = ["Random", "SHA"] +uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" + +[[UnPack]] +git-tree-sha1 = "387c1f73762231e86e0c9c5443ce3b4a0a9a0c2b" +uuid = "3a884ed6-31ef-47d7-9d2a-63182c4928ed" +version = "1.0.2" + +[[Unicode]] +uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" + +[[XML2_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Libiconv_jll", "Pkg", "Zlib_jll"] +git-tree-sha1 = "1acf5bdf07aa0907e0a37d3718bb88d4b687b74a" +uuid = "02c8fc9c-b97f-50b9-bbe4-9be30ff0a78a" +version = "2.9.12+0" + +[[XSLT_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Libgcrypt_jll", "Libgpg_error_jll", "Libiconv_jll", "Pkg", "XML2_jll", "Zlib_jll"] +git-tree-sha1 = "91844873c4085240b95e795f692c4cec4d805f8a" +uuid = "aed1982a-8fda-507f-9586-7b0439959a61" +version = "1.1.34+0" + +[[XZ_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "a921669cd9a45c23031fd4eb904f5cc3d20de415" +uuid = "ffd25f8a-64ca-5728-b0f7-c24cf3aae800" +version = "5.2.5+2" + +[[Xorg_libX11_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libxcb_jll", "Xorg_xtrans_jll"] +git-tree-sha1 = "5be649d550f3f4b95308bf0183b82e2582876527" +uuid = "4f6342f7-b3d2-589e-9d20-edeb45f2b2bc" +version = "1.6.9+4" + +[[Xorg_libXau_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "4e490d5c960c314f33885790ed410ff3a94ce67e" +uuid = "0c0b7dd1-d40b-584c-a123-a41640f87eec" +version = "1.0.9+4" + 
+[[Xorg_libXdmcp_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "4fe47bd2247248125c428978740e18a681372dd4" +uuid = "a3789734-cfe1-5b06-b2d0-1dd0d9d62d05" +version = "1.1.3+4" + +[[Xorg_libXext_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libX11_jll"] +git-tree-sha1 = "b7c0aa8c376b31e4852b360222848637f481f8c3" +uuid = "1082639a-0dae-5f34-9b06-72781eeb8cb3" +version = "1.3.4+4" + +[[Xorg_libpthread_stubs_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "6783737e45d3c59a4a4c4091f5f88cdcf0908cbb" +uuid = "14d82f49-176c-5ed1-bb49-ad3f5cbd8c74" +version = "0.1.0+3" + +[[Xorg_libxcb_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "XSLT_jll", "Xorg_libXau_jll", "Xorg_libXdmcp_jll", "Xorg_libpthread_stubs_jll"] +git-tree-sha1 = "daf17f441228e7a3833846cd048892861cff16d6" +uuid = "c7cfdc94-dc32-55de-ac96-5a1b8d977c5b" +version = "1.13.0+3" + +[[Xorg_xorgproto_jll]] +deps = ["Libdl", "Pkg"] +git-tree-sha1 = "9a9eb8ce756fe0bca01b4be16da770e18d264972" +uuid = "c4d99508-4286-5418-9131-c86396af500b" +version = "2019.2.0+2" + +[[Xorg_xtrans_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "79c31e7844f6ecf779705fbc12146eb190b7d845" +uuid = "c5fb5394-a638-5e4d-96e5-b29de1b5cf10" +version = "1.4.0+3" + +[[Zlib_jll]] +deps = ["Libdl"] +uuid = "83775a58-1f1d-513f-b197-d71354ab007a" + +[[argp_standalone_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "feaf9f6293003c2bf53056fd6930d677ed340b34" +uuid = "c53206cc-00f7-50bf-ad1e-3ae1f6e49bc3" +version = "1.3.1+0" + +[[fts_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "78732b942383d2cb521df8a1a0814911144e663d" +uuid = "d65627f6-89bd-53e8-8ab5-8b75ff535eee" +version = "1.2.7+1" + +[[hsa_rocr_jll]] +deps = ["Artifacts", "Elfutils_jll", "JLLWrappers", "Libdl", "NUMA_jll", "Pkg", "Zlib_jll", "hsakmt_roct_jll"] +git-tree-sha1 = "df8d73efec8b1e53ad527d208f5343c0368f0fcd" +uuid = 
"dd59ff1a-a01a-568d-8b29-0669330f116a" +version = "4.0.0+0" + +[[hsakmt_roct_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "NUMA_jll", "Pkg"] +git-tree-sha1 = "80e0c9940e15cfd6f1f1e9d9f3953ec4d48d3d4a" +uuid = "1cecccd7-a9b6-5045-9cdc-a44c19b16d76" +version = "4.0.0+0" + +[[nghttp2_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d" + +[[obstack_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "1c4a6b66e934fc6db4649cb2910c72f53bbfea7e" +uuid = "c88a4935-d25e-5644-aacc-5db6f1b8ef79" +version = "1.2.2+0" + +[[p7zip_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0" diff --git a/JuliaStream.jl/AMDGPU/Project.toml b/JuliaStream.jl/AMDGPU/Project.toml new file mode 100644 index 0000000..5ab8447 --- /dev/null +++ b/JuliaStream.jl/AMDGPU/Project.toml @@ -0,0 +1,7 @@ +[deps] +AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" +ArgParse = "c7e460c6-2fb9-53a9-8c5b-16f535851c63" +Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a" + +[compat] +julia = "1.6" diff --git a/JuliaStream.jl/CUDA/Manifest.toml b/JuliaStream.jl/CUDA/Manifest.toml new file mode 100644 index 0000000..7330228 --- /dev/null +++ b/JuliaStream.jl/CUDA/Manifest.toml @@ -0,0 +1,316 @@ +# This file is machine-generated - editing it directly is not advised + +[[AbstractFFTs]] +deps = ["LinearAlgebra"] +git-tree-sha1 = "485ee0867925449198280d4af84bdb46a2a404d0" +uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c" +version = "1.0.1" + +[[Adapt]] +deps = ["LinearAlgebra"] +git-tree-sha1 = "84918055d15b3114ede17ac6a7182f68870c16f7" +uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" +version = "3.3.1" + +[[ArgParse]] +deps = ["Logging", "TextWrap"] +git-tree-sha1 = "3102bce13da501c9104df33549f511cd25264d7d" +uuid = "c7e460c6-2fb9-53a9-8c5b-16f535851c63" +version = "1.1.4" + +[[ArgTools]] +uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f" + +[[Artifacts]] +uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" + +[[BFloat16s]] +deps = 
["LinearAlgebra", "Test"] +git-tree-sha1 = "4af69e205efc343068dc8722b8dfec1ade89254a" +uuid = "ab4f0b2a-ad5b-11e8-123f-65d77653426b" +version = "0.1.0" + +[[Base64]] +uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" + +[[CEnum]] +git-tree-sha1 = "215a9aa4a1f23fbd05b92769fdd62559488d70e9" +uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82" +version = "0.4.1" + +[[CUDA]] +deps = ["AbstractFFTs", "Adapt", "BFloat16s", "CEnum", "CompilerSupportLibraries_jll", "ExprTools", "GPUArrays", "GPUCompiler", "LLVM", "LazyArtifacts", "Libdl", "LinearAlgebra", "Logging", "Printf", "Random", "Random123", "RandomNumbers", "Reexport", "Requires", "SparseArrays", "SpecialFunctions", "TimerOutputs"] +git-tree-sha1 = "9303b20dfa74e4bcb4da425d351d551fbb5850be" +uuid = "052768ef-5323-5732-b1bb-66c8b64840ba" +version = "3.4.0" + +[[ChainRulesCore]] +deps = ["Compat", "LinearAlgebra", "SparseArrays"] +git-tree-sha1 = "bdc0937269321858ab2a4f288486cb258b9a0af7" +uuid = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" +version = "1.3.0" + +[[Compat]] +deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "SHA", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"] +git-tree-sha1 = "79b9563ef3f2cc5fc6d3046a5ee1a57c9de52495" +uuid = "34da2185-b29b-5c13-b0c7-acf172513d20" +version = "3.33.0" + +[[CompilerSupportLibraries_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae" + +[[Dates]] +deps = ["Printf"] +uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" + +[[DelimitedFiles]] +deps = ["Mmap"] +uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab" + +[[Distributed]] +deps = ["Random", "Serialization", "Sockets"] +uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b" + +[[DocStringExtensions]] +deps = ["LibGit2"] +git-tree-sha1 = "a32185f5428d3986f47c2ab78b1f216d5e6cc96f" +uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae" +version = "0.8.5" + 
+[[Downloads]] +deps = ["ArgTools", "LibCURL", "NetworkOptions"] +uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6" + +[[ExprTools]] +git-tree-sha1 = "b7e3d17636b348f005f11040025ae8c6f645fe92" +uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04" +version = "0.1.6" + +[[GPUArrays]] +deps = ["Adapt", "LinearAlgebra", "Printf", "Random", "Serialization", "Statistics"] +git-tree-sha1 = "8fac1cf7d6ce0f2249c7acaf25d22e1e85c4a07f" +uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" +version = "8.0.2" + +[[GPUCompiler]] +deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "TimerOutputs", "UUIDs"] +git-tree-sha1 = "4ed2616d5e656c8716736b64da86755467f26cf5" +uuid = "61eb1bfa-7361-4325-ad38-22787b887f55" +version = "0.12.9" + +[[InteractiveUtils]] +deps = ["Markdown"] +uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" + +[[IrrationalConstants]] +git-tree-sha1 = "f76424439413893a832026ca355fe273e93bce94" +uuid = "92d709cd-6900-40b7-9082-c6be49f344b6" +version = "0.1.0" + +[[JLLWrappers]] +deps = ["Preferences"] +git-tree-sha1 = "642a199af8b68253517b80bd3bfd17eb4e84df6e" +uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210" +version = "1.3.0" + +[[LLVM]] +deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Printf", "Unicode"] +git-tree-sha1 = "d6041ad706cf458b2c9f3e501152488a26451e9c" +uuid = "929cbde3-209d-540e-8aea-75f648917ca0" +version = "4.2.0" + +[[LLVMExtra_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "a9b1130c4728b0e462a1c28772954650039eb847" +uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab" +version = "0.0.7+0" + +[[LazyArtifacts]] +deps = ["Artifacts", "Pkg"] +uuid = "4af54fe1-eca0-43a8-85a7-787d91b784e3" + +[[LibCURL]] +deps = ["LibCURL_jll", "MozillaCACerts_jll"] +uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21" + +[[LibCURL_jll]] +deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"] +uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0" + +[[LibGit2]] +deps = ["Base64", "NetworkOptions", "Printf", "SHA"] +uuid = 
"76f85450-5226-5b5a-8eaa-529ad045b433" + +[[LibSSH2_jll]] +deps = ["Artifacts", "Libdl", "MbedTLS_jll"] +uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8" + +[[Libdl]] +uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" + +[[LinearAlgebra]] +deps = ["Libdl"] +uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" + +[[LogExpFunctions]] +deps = ["DocStringExtensions", "IrrationalConstants", "LinearAlgebra"] +git-tree-sha1 = "3d682c07e6dd250ed082f883dc88aee7996bf2cc" +uuid = "2ab3a3ac-af41-5b50-aa03-7779005ae688" +version = "0.3.0" + +[[Logging]] +uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" + +[[Markdown]] +deps = ["Base64"] +uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" + +[[MbedTLS_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1" + +[[Mmap]] +uuid = "a63ad114-7e13-5084-954f-fe012c677804" + +[[MozillaCACerts_jll]] +uuid = "14a3606d-f60d-562e-9121-12d972cd8159" + +[[NetworkOptions]] +uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908" + +[[OpenSpecFun_jll]] +deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "13652491f6856acfd2db29360e1bbcd4565d04f1" +uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e" +version = "0.5.5+0" + +[[OrderedCollections]] +git-tree-sha1 = "85f8e6578bf1f9ee0d11e7bb1b1456435479d47c" +uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" +version = "1.4.1" + +[[Parameters]] +deps = ["OrderedCollections", "UnPack"] +git-tree-sha1 = "2276ac65f1e236e0a6ea70baff3f62ad4c625345" +uuid = "d96e819e-fc66-5662-9728-84c9c7592b0a" +version = "0.12.2" + +[[Pkg]] +deps = ["Artifacts", "Dates", "Downloads", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"] +uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" + +[[Preferences]] +deps = ["TOML"] +git-tree-sha1 = "00cfd92944ca9c760982747e9a1d0d5d86ab1e5a" +uuid = "21216c6a-2e73-6563-6e65-726566657250" +version = "1.2.2" + +[[Printf]] +deps = ["Unicode"] +uuid = 
"de0858da-6303-5e67-8744-51eddeeeb8d7" + +[[REPL]] +deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"] +uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" + +[[Random]] +deps = ["Serialization"] +uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" + +[[Random123]] +deps = ["Libdl", "Random", "RandomNumbers"] +git-tree-sha1 = "0e8b146557ad1c6deb1367655e052276690e71a3" +uuid = "74087812-796a-5b5d-8853-05524746bad3" +version = "1.4.2" + +[[RandomNumbers]] +deps = ["Random", "Requires"] +git-tree-sha1 = "043da614cc7e95c703498a491e2c21f58a2b8111" +uuid = "e6cf234a-135c-5ec9-84dd-332b85af5143" +version = "1.5.3" + +[[Reexport]] +git-tree-sha1 = "5f6c21241f0f655da3952fd60aa18477cf96c220" +uuid = "189a3867-3050-52da-a836-e630ba90ab69" +version = "1.1.0" + +[[Requires]] +deps = ["UUIDs"] +git-tree-sha1 = "4036a3bd08ac7e968e27c203d45f5fff15020621" +uuid = "ae029012-a4dd-5104-9daa-d747884805df" +version = "1.1.3" + +[[SHA]] +uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" + +[[Serialization]] +uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" + +[[SharedArrays]] +deps = ["Distributed", "Mmap", "Random", "Serialization"] +uuid = "1a1011a3-84de-559e-8e89-a11a2f7dc383" + +[[Sockets]] +uuid = "6462fe0b-24de-5631-8697-dd941f90decc" + +[[SparseArrays]] +deps = ["LinearAlgebra", "Random"] +uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" + +[[SpecialFunctions]] +deps = ["ChainRulesCore", "LogExpFunctions", "OpenSpecFun_jll"] +git-tree-sha1 = "a322a9493e49c5f3a10b50df3aedaf1cdb3244b7" +uuid = "276daf66-3868-5448-9aa4-cd146d93841b" +version = "1.6.1" + +[[Statistics]] +deps = ["LinearAlgebra", "SparseArrays"] +uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" + +[[TOML]] +deps = ["Dates"] +uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76" + +[[Tar]] +deps = ["ArgTools", "SHA"] +uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e" + +[[Test]] +deps = ["InteractiveUtils", "Logging", "Random", "Serialization"] +uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" + +[[TextWrap]] +git-tree-sha1 = 
"9250ef9b01b66667380cf3275b3f7488d0e25faf" +uuid = "b718987f-49a8-5099-9789-dcd902bef87d" +version = "1.0.1" + +[[TimerOutputs]] +deps = ["ExprTools", "Printf"] +git-tree-sha1 = "209a8326c4f955e2442c07b56029e88bb48299c7" +uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" +version = "0.5.12" + +[[UUIDs]] +deps = ["Random", "SHA"] +uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" + +[[UnPack]] +git-tree-sha1 = "387c1f73762231e86e0c9c5443ce3b4a0a9a0c2b" +uuid = "3a884ed6-31ef-47d7-9d2a-63182c4928ed" +version = "1.0.2" + +[[Unicode]] +uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" + +[[Zlib_jll]] +deps = ["Libdl"] +uuid = "83775a58-1f1d-513f-b197-d71354ab007a" + +[[nghttp2_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d" + +[[p7zip_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0" diff --git a/JuliaStream.jl/CUDA/Project.toml b/JuliaStream.jl/CUDA/Project.toml new file mode 100644 index 0000000..e50582e --- /dev/null +++ b/JuliaStream.jl/CUDA/Project.toml @@ -0,0 +1,7 @@ +[deps] +ArgParse = "c7e460c6-2fb9-53a9-8c5b-16f535851c63" +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" +Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a" + +[compat] +julia = "1.6" diff --git a/JuliaStream.jl/KernelAbstractions/Manifest.toml b/JuliaStream.jl/KernelAbstractions/Manifest.toml new file mode 100644 index 0000000..5c24cf5 --- /dev/null +++ b/JuliaStream.jl/KernelAbstractions/Manifest.toml @@ -0,0 +1,547 @@ +# This file is machine-generated - editing it directly is not advised + +[[AMDGPU]] +deps = ["AbstractFFTs", "Adapt", "BinaryProvider", "CEnum", "GPUArrays", "GPUCompiler", "HIP_jll", "LLVM", "Libdl", "LinearAlgebra", "MacroTools", "Pkg", "Printf", "ROCmDeviceLibs_jll", "Random", "Requires", "Setfield", "hsa_rocr_jll"] +git-tree-sha1 = "d64c97447a753cfbf0158d6c7be513f34526d559" +uuid = "21141c5a-9bdb-4563-92ae-f87d6854732e" +version = "0.2.12" + +[[AbstractFFTs]] +deps = ["LinearAlgebra"] +git-tree-sha1 = 
"485ee0867925449198280d4af84bdb46a2a404d0" +uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c" +version = "1.0.1" + +[[Adapt]] +deps = ["LinearAlgebra"] +git-tree-sha1 = "84918055d15b3114ede17ac6a7182f68870c16f7" +uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" +version = "3.3.1" + +[[ArgParse]] +deps = ["Logging", "TextWrap"] +git-tree-sha1 = "3102bce13da501c9104df33549f511cd25264d7d" +uuid = "c7e460c6-2fb9-53a9-8c5b-16f535851c63" +version = "1.1.4" + +[[ArgTools]] +uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f" + +[[Artifacts]] +uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" + +[[BFloat16s]] +deps = ["LinearAlgebra", "Test"] +git-tree-sha1 = "4af69e205efc343068dc8722b8dfec1ade89254a" +uuid = "ab4f0b2a-ad5b-11e8-123f-65d77653426b" +version = "0.1.0" + +[[Base64]] +uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" + +[[BinaryProvider]] +deps = ["Libdl", "Logging", "SHA"] +git-tree-sha1 = "ecdec412a9abc8db54c0efc5548c64dfce072058" +uuid = "b99e7846-7c00-51b0-8f62-c81ae34c0232" +version = "0.5.10" + +[[Bzip2_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "19a35467a82e236ff51bc17a3a44b69ef35185a2" +uuid = "6e34b625-4abd-537c-b88f-471c36dfa7a0" +version = "1.0.8+0" + +[[CEnum]] +git-tree-sha1 = "215a9aa4a1f23fbd05b92769fdd62559488d70e9" +uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82" +version = "0.4.1" + +[[CUDA]] +deps = ["AbstractFFTs", "Adapt", "BFloat16s", "CEnum", "CompilerSupportLibraries_jll", "DataStructures", "ExprTools", "GPUArrays", "GPUCompiler", "LLVM", "LazyArtifacts", "Libdl", "LinearAlgebra", "Logging", "Printf", "Random", "Random123", "RandomNumbers", "Reexport", "Requires", "SparseArrays", "SpecialFunctions", "TimerOutputs"] +git-tree-sha1 = "5e696e37e51b01ae07bd9f700afe6cbd55250bce" +uuid = "052768ef-5323-5732-b1bb-66c8b64840ba" +version = "3.3.4" + +[[CUDAKernels]] +deps = ["Adapt", "CUDA", "Cassette", "KernelAbstractions", "SpecialFunctions", "StaticArrays"] +git-tree-sha1 = "81f76297b63c67723b1d60f5e7e002ae3393974b" +uuid = 
"72cfdca4-0801-4ab0-bf6a-d52aa10adc57" +version = "0.3.0" + +[[Cassette]] +git-tree-sha1 = "087e76b8d48c014112ba890892c33be42ad10504" +uuid = "7057c7e9-c182-5462-911a-8362d720325c" +version = "0.3.7" + +[[ChainRulesCore]] +deps = ["Compat", "LinearAlgebra", "SparseArrays"] +git-tree-sha1 = "bdc0937269321858ab2a4f288486cb258b9a0af7" +uuid = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" +version = "1.3.0" + +[[Compat]] +deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "SHA", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"] +git-tree-sha1 = "79b9563ef3f2cc5fc6d3046a5ee1a57c9de52495" +uuid = "34da2185-b29b-5c13-b0c7-acf172513d20" +version = "3.33.0" + +[[CompilerSupportLibraries_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae" + +[[ConstructionBase]] +deps = ["LinearAlgebra"] +git-tree-sha1 = "f74e9d5388b8620b4cee35d4c5a618dd4dc547f4" +uuid = "187b0558-2788-49d3-abe0-74a17ed4e7c9" +version = "1.3.0" + +[[DataStructures]] +deps = ["Compat", "InteractiveUtils", "OrderedCollections"] +git-tree-sha1 = "7d9d316f04214f7efdbb6398d545446e246eff02" +uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" +version = "0.18.10" + +[[Dates]] +deps = ["Printf"] +uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" + +[[DelimitedFiles]] +deps = ["Mmap"] +uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab" + +[[Distributed]] +deps = ["Random", "Serialization", "Sockets"] +uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b" + +[[DocStringExtensions]] +deps = ["LibGit2"] +git-tree-sha1 = "a32185f5428d3986f47c2ab78b1f216d5e6cc96f" +uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae" +version = "0.8.5" + +[[Downloads]] +deps = ["ArgTools", "LibCURL", "NetworkOptions"] +uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6" + +[[Elfutils_jll]] +deps = ["Artifacts", "Bzip2_jll", "JLLWrappers", "Libdl", "Pkg", "XZ_jll", "Zlib_jll", 
"argp_standalone_jll", "fts_jll", "obstack_jll"] +git-tree-sha1 = "8f9fcde6d89b0a3ca51cb2028beab462705c5436" +uuid = "ab5a07f8-06af-567f-a878-e8bb879eba5a" +version = "0.182.0+0" + +[[ExprTools]] +git-tree-sha1 = "b7e3d17636b348f005f11040025ae8c6f645fe92" +uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04" +version = "0.1.6" + +[[Future]] +deps = ["Random"] +uuid = "9fa8497b-333b-5362-9e8d-4d0656e87820" + +[[GPUArrays]] +deps = ["AbstractFFTs", "Adapt", "LinearAlgebra", "Printf", "Random", "Serialization", "Statistics"] +git-tree-sha1 = "ececbf05f8904c92814bdbd0aafd5540b0bf2e9a" +uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" +version = "7.0.1" + +[[GPUCompiler]] +deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "TimerOutputs", "UUIDs"] +git-tree-sha1 = "4ed2616d5e656c8716736b64da86755467f26cf5" +uuid = "61eb1bfa-7361-4325-ad38-22787b887f55" +version = "0.12.9" + +[[HIP_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "ROCmCompilerSupport_jll", "ROCmDeviceLibs_jll", "ROCmOpenCLRuntime_jll", "hsa_rocr_jll"] +git-tree-sha1 = "5097d8f7b6842156ab0928371b3d03fefd8decab" +uuid = "2696aab5-0948-5276-aa9a-2a86a37016b8" +version = "4.0.0+1" + +[[InteractiveUtils]] +deps = ["Markdown"] +uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" + +[[IrrationalConstants]] +git-tree-sha1 = "f76424439413893a832026ca355fe273e93bce94" +uuid = "92d709cd-6900-40b7-9082-c6be49f344b6" +version = "0.1.0" + +[[JLLWrappers]] +deps = ["Preferences"] +git-tree-sha1 = "642a199af8b68253517b80bd3bfd17eb4e84df6e" +uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210" +version = "1.3.0" + +[[KernelAbstractions]] +deps = ["Adapt", "Cassette", "InteractiveUtils", "MacroTools", "SpecialFunctions", "StaticArrays", "UUIDs"] +git-tree-sha1 = "5e6c70389c1b1e40adb81664ca8cea6ce8127afc" +uuid = "63c18a36-062a-441e-b654-da1e3ab1ce7c" +version = "0.7.0" + +[[LLVM]] +deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Printf", "Unicode"] +git-tree-sha1 = "d6041ad706cf458b2c9f3e501152488a26451e9c" +uuid = 
"929cbde3-209d-540e-8aea-75f648917ca0" +version = "4.2.0" + +[[LLVMExtra_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "a9b1130c4728b0e462a1c28772954650039eb847" +uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab" +version = "0.0.7+0" + +[[LazyArtifacts]] +deps = ["Artifacts", "Pkg"] +uuid = "4af54fe1-eca0-43a8-85a7-787d91b784e3" + +[[LibCURL]] +deps = ["LibCURL_jll", "MozillaCACerts_jll"] +uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21" + +[[LibCURL_jll]] +deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"] +uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0" + +[[LibGit2]] +deps = ["Base64", "NetworkOptions", "Printf", "SHA"] +uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" + +[[LibSSH2_jll]] +deps = ["Artifacts", "Libdl", "MbedTLS_jll"] +uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8" + +[[Libdl]] +uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" + +[[Libgcrypt_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Libgpg_error_jll", "Pkg"] +git-tree-sha1 = "64613c82a59c120435c067c2b809fc61cf5166ae" +uuid = "d4300ac3-e22c-5743-9152-c294e39db1e4" +version = "1.8.7+0" + +[[Libglvnd_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libX11_jll", "Xorg_libXext_jll"] +git-tree-sha1 = "7739f837d6447403596a75d19ed01fd08d6f56bf" +uuid = "7e76a0d4-f3c7-5321-8279-8d96eeed0f29" +version = "1.3.0+3" + +[[Libgpg_error_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "c333716e46366857753e273ce6a69ee0945a6db9" +uuid = "7add5ba3-2f88-524e-9cd5-f83b8a55f7b8" +version = "1.42.0+0" + +[[Libiconv_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "42b62845d70a619f063a7da093d995ec8e15e778" +uuid = "94ce4f54-9a6c-5748-9c1c-f9c7231a4531" +version = "1.16.1+1" + +[[LinearAlgebra]] +deps = ["Libdl"] +uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" + +[[LogExpFunctions]] +deps = ["DocStringExtensions", "IrrationalConstants", "LinearAlgebra"] +git-tree-sha1 = 
"3d682c07e6dd250ed082f883dc88aee7996bf2cc" +uuid = "2ab3a3ac-af41-5b50-aa03-7779005ae688" +version = "0.3.0" + +[[Logging]] +uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" + +[[MacroTools]] +deps = ["Markdown", "Random"] +git-tree-sha1 = "0fb723cd8c45858c22169b2e42269e53271a6df7" +uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" +version = "0.5.7" + +[[Markdown]] +deps = ["Base64"] +uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" + +[[MbedTLS_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1" + +[[Mmap]] +uuid = "a63ad114-7e13-5084-954f-fe012c677804" + +[[MozillaCACerts_jll]] +uuid = "14a3606d-f60d-562e-9121-12d972cd8159" + +[[NUMA_jll]] +deps = ["Libdl", "Pkg"] +git-tree-sha1 = "778f9bd14400cff2c32ed357e12766ac0e3d766e" +uuid = "7f51dc2b-bb24-59f8-b771-bb1490e4195d" +version = "2.0.13+1" + +[[NetworkOptions]] +uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908" + +[[OpenSpecFun_jll]] +deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "13652491f6856acfd2db29360e1bbcd4565d04f1" +uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e" +version = "0.5.5+0" + +[[OrderedCollections]] +git-tree-sha1 = "85f8e6578bf1f9ee0d11e7bb1b1456435479d47c" +uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" +version = "1.4.1" + +[[Parameters]] +deps = ["OrderedCollections", "UnPack"] +git-tree-sha1 = "2276ac65f1e236e0a6ea70baff3f62ad4c625345" +uuid = "d96e819e-fc66-5662-9728-84c9c7592b0a" +version = "0.12.2" + +[[Pkg]] +deps = ["Artifacts", "Dates", "Downloads", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"] +uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" + +[[Preferences]] +deps = ["TOML"] +git-tree-sha1 = "00cfd92944ca9c760982747e9a1d0d5d86ab1e5a" +uuid = "21216c6a-2e73-6563-6e65-726566657250" +version = "1.2.2" + +[[Printf]] +deps = ["Unicode"] +uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" + +[[REPL]] +deps = ["InteractiveUtils", "Markdown", 
"Sockets", "Unicode"] +uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" + +[[ROCKernels]] +deps = ["AMDGPU", "Adapt", "Cassette", "KernelAbstractions", "SpecialFunctions", "StaticArrays"] +git-tree-sha1 = "41105b861342637dde17797bdd9aaa537aca646b" +uuid = "7eb9e9f0-4bd3-4c4c-8bef-26bd9629d9b9" +version = "0.2.0" + +[[ROCmCompilerSupport_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "ROCmDeviceLibs_jll", "hsa_rocr_jll"] +git-tree-sha1 = "56ddcfb5d8b60c9f8c1bc619886f8d363fd1926d" +uuid = "8fbdd1d2-db62-5cd0-981e-905da1486e17" +version = "4.0.0+1" + +[[ROCmDeviceLibs_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Zlib_jll"] +git-tree-sha1 = "d764f0f28b5af89aa004871a6a38e5d061f77257" +uuid = "873c0968-716b-5aa7-bb8d-d1e2e2aeff2d" +version = "4.0.0+0" + +[[ROCmOpenCLRuntime_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Libglvnd_jll", "Pkg", "ROCmCompilerSupport_jll", "ROCmDeviceLibs_jll", "Xorg_libX11_jll", "Xorg_xorgproto_jll", "hsa_rocr_jll"] +git-tree-sha1 = "f9e3e2cb40a7990535efa7da9b9dd0e0b458a973" +uuid = "10ae2a08-2eea-53f8-8c20-eec175020e9f" +version = "4.0.0+1" + +[[Random]] +deps = ["Serialization"] +uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" + +[[Random123]] +deps = ["Libdl", "Random", "RandomNumbers"] +git-tree-sha1 = "0e8b146557ad1c6deb1367655e052276690e71a3" +uuid = "74087812-796a-5b5d-8853-05524746bad3" +version = "1.4.2" + +[[RandomNumbers]] +deps = ["Random", "Requires"] +git-tree-sha1 = "043da614cc7e95c703498a491e2c21f58a2b8111" +uuid = "e6cf234a-135c-5ec9-84dd-332b85af5143" +version = "1.5.3" + +[[Reexport]] +git-tree-sha1 = "5f6c21241f0f655da3952fd60aa18477cf96c220" +uuid = "189a3867-3050-52da-a836-e630ba90ab69" +version = "1.1.0" + +[[Requires]] +deps = ["UUIDs"] +git-tree-sha1 = "4036a3bd08ac7e968e27c203d45f5fff15020621" +uuid = "ae029012-a4dd-5104-9daa-d747884805df" +version = "1.1.3" + +[[SHA]] +uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" + +[[Serialization]] +uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" + 
+[[Setfield]] +deps = ["ConstructionBase", "Future", "MacroTools", "Requires"] +git-tree-sha1 = "fca29e68c5062722b5b4435594c3d1ba557072a3" +uuid = "efcf1570-3423-57d1-acb7-fd33fddbac46" +version = "0.7.1" + +[[SharedArrays]] +deps = ["Distributed", "Mmap", "Random", "Serialization"] +uuid = "1a1011a3-84de-559e-8e89-a11a2f7dc383" + +[[Sockets]] +uuid = "6462fe0b-24de-5631-8697-dd941f90decc" + +[[SparseArrays]] +deps = ["LinearAlgebra", "Random"] +uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" + +[[SpecialFunctions]] +deps = ["ChainRulesCore", "LogExpFunctions", "OpenSpecFun_jll"] +git-tree-sha1 = "a322a9493e49c5f3a10b50df3aedaf1cdb3244b7" +uuid = "276daf66-3868-5448-9aa4-cd146d93841b" +version = "1.6.1" + +[[StaticArrays]] +deps = ["LinearAlgebra", "Random", "Statistics"] +git-tree-sha1 = "3240808c6d463ac46f1c1cd7638375cd22abbccb" +uuid = "90137ffa-7385-5640-81b9-e52037218182" +version = "1.2.12" + +[[Statistics]] +deps = ["LinearAlgebra", "SparseArrays"] +uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" + +[[TOML]] +deps = ["Dates"] +uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76" + +[[Tar]] +deps = ["ArgTools", "SHA"] +uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e" + +[[Test]] +deps = ["InteractiveUtils", "Logging", "Random", "Serialization"] +uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" + +[[TextWrap]] +git-tree-sha1 = "9250ef9b01b66667380cf3275b3f7488d0e25faf" +uuid = "b718987f-49a8-5099-9789-dcd902bef87d" +version = "1.0.1" + +[[TimerOutputs]] +deps = ["ExprTools", "Printf"] +git-tree-sha1 = "209a8326c4f955e2442c07b56029e88bb48299c7" +uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" +version = "0.5.12" + +[[UUIDs]] +deps = ["Random", "SHA"] +uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" + +[[UnPack]] +git-tree-sha1 = "387c1f73762231e86e0c9c5443ce3b4a0a9a0c2b" +uuid = "3a884ed6-31ef-47d7-9d2a-63182c4928ed" +version = "1.0.2" + +[[Unicode]] +uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" + +[[XML2_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Libiconv_jll", "Pkg", 
"Zlib_jll"] +git-tree-sha1 = "1acf5bdf07aa0907e0a37d3718bb88d4b687b74a" +uuid = "02c8fc9c-b97f-50b9-bbe4-9be30ff0a78a" +version = "2.9.12+0" + +[[XSLT_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Libgcrypt_jll", "Libgpg_error_jll", "Libiconv_jll", "Pkg", "XML2_jll", "Zlib_jll"] +git-tree-sha1 = "91844873c4085240b95e795f692c4cec4d805f8a" +uuid = "aed1982a-8fda-507f-9586-7b0439959a61" +version = "1.1.34+0" + +[[XZ_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "a921669cd9a45c23031fd4eb904f5cc3d20de415" +uuid = "ffd25f8a-64ca-5728-b0f7-c24cf3aae800" +version = "5.2.5+2" + +[[Xorg_libX11_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libxcb_jll", "Xorg_xtrans_jll"] +git-tree-sha1 = "5be649d550f3f4b95308bf0183b82e2582876527" +uuid = "4f6342f7-b3d2-589e-9d20-edeb45f2b2bc" +version = "1.6.9+4" + +[[Xorg_libXau_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "4e490d5c960c314f33885790ed410ff3a94ce67e" +uuid = "0c0b7dd1-d40b-584c-a123-a41640f87eec" +version = "1.0.9+4" + +[[Xorg_libXdmcp_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "4fe47bd2247248125c428978740e18a681372dd4" +uuid = "a3789734-cfe1-5b06-b2d0-1dd0d9d62d05" +version = "1.1.3+4" + +[[Xorg_libXext_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libX11_jll"] +git-tree-sha1 = "b7c0aa8c376b31e4852b360222848637f481f8c3" +uuid = "1082639a-0dae-5f34-9b06-72781eeb8cb3" +version = "1.3.4+4" + +[[Xorg_libpthread_stubs_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "6783737e45d3c59a4a4c4091f5f88cdcf0908cbb" +uuid = "14d82f49-176c-5ed1-bb49-ad3f5cbd8c74" +version = "0.1.0+3" + +[[Xorg_libxcb_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "XSLT_jll", "Xorg_libXau_jll", "Xorg_libXdmcp_jll", "Xorg_libpthread_stubs_jll"] +git-tree-sha1 = "daf17f441228e7a3833846cd048892861cff16d6" +uuid = "c7cfdc94-dc32-55de-ac96-5a1b8d977c5b" +version = "1.13.0+3" + 
+[[Xorg_xorgproto_jll]] +deps = ["Libdl", "Pkg"] +git-tree-sha1 = "9a9eb8ce756fe0bca01b4be16da770e18d264972" +uuid = "c4d99508-4286-5418-9131-c86396af500b" +version = "2019.2.0+2" + +[[Xorg_xtrans_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "79c31e7844f6ecf779705fbc12146eb190b7d845" +uuid = "c5fb5394-a638-5e4d-96e5-b29de1b5cf10" +version = "1.4.0+3" + +[[Zlib_jll]] +deps = ["Libdl"] +uuid = "83775a58-1f1d-513f-b197-d71354ab007a" + +[[argp_standalone_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "feaf9f6293003c2bf53056fd6930d677ed340b34" +uuid = "c53206cc-00f7-50bf-ad1e-3ae1f6e49bc3" +version = "1.3.1+0" + +[[fts_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "78732b942383d2cb521df8a1a0814911144e663d" +uuid = "d65627f6-89bd-53e8-8ab5-8b75ff535eee" +version = "1.2.7+1" + +[[hsa_rocr_jll]] +deps = ["Artifacts", "Elfutils_jll", "JLLWrappers", "Libdl", "NUMA_jll", "Pkg", "Zlib_jll", "hsakmt_roct_jll"] +git-tree-sha1 = "df8d73efec8b1e53ad527d208f5343c0368f0fcd" +uuid = "dd59ff1a-a01a-568d-8b29-0669330f116a" +version = "4.0.0+0" + +[[hsakmt_roct_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "NUMA_jll", "Pkg"] +git-tree-sha1 = "80e0c9940e15cfd6f1f1e9d9f3953ec4d48d3d4a" +uuid = "1cecccd7-a9b6-5045-9cdc-a44c19b16d76" +version = "4.0.0+0" + +[[nghttp2_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d" + +[[obstack_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "1c4a6b66e934fc6db4649cb2910c72f53bbfea7e" +uuid = "c88a4935-d25e-5644-aacc-5db6f1b8ef79" +version = "1.2.2+0" + +[[p7zip_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0" diff --git a/JuliaStream.jl/KernelAbstractions/Project.toml b/JuliaStream.jl/KernelAbstractions/Project.toml new file mode 100644 index 0000000..71715ff --- /dev/null +++ b/JuliaStream.jl/KernelAbstractions/Project.toml @@ -0,0 +1,11 @@ +[deps] +AMDGPU = 
"21141c5a-9bdb-4563-92ae-f87d6854732e" +ArgParse = "c7e460c6-2fb9-53a9-8c5b-16f535851c63" +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" +CUDAKernels = "72cfdca4-0801-4ab0-bf6a-d52aa10adc57" +KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" +Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a" +ROCKernels = "7eb9e9f0-4bd3-4c4c-8bef-26bd9629d9b9" + +[compat] +julia = "1.6" diff --git a/JuliaStream.jl/Manifest.toml b/JuliaStream.jl/Manifest.toml index c60d77f..14f2029 100644 --- a/JuliaStream.jl/Manifest.toml +++ b/JuliaStream.jl/Manifest.toml @@ -47,9 +47,9 @@ version = "0.5.10" [[Bzip2_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "c3598e525718abcc440f69cc6d5f60dda0a1b61e" +git-tree-sha1 = "19a35467a82e236ff51bc17a3a44b69ef35185a2" uuid = "6e34b625-4abd-537c-b88f-471c36dfa7a0" -version = "1.0.6+5" +version = "1.0.8+0" [[CEnum]] git-tree-sha1 = "215a9aa4a1f23fbd05b92769fdd62559488d70e9" @@ -62,17 +62,28 @@ git-tree-sha1 = "364179416eabc34c9ca32126a6bdb431680c3bad" uuid = "052768ef-5323-5732-b1bb-66c8b64840ba" version = "3.2.1" +[[CUDAKernels]] +deps = ["Adapt", "CUDA", "Cassette", "KernelAbstractions", "SpecialFunctions", "StaticArrays"] +git-tree-sha1 = "81f76297b63c67723b1d60f5e7e002ae3393974b" +uuid = "72cfdca4-0801-4ab0-bf6a-d52aa10adc57" +version = "0.3.0" + +[[Cassette]] +git-tree-sha1 = "087e76b8d48c014112ba890892c33be42ad10504" +uuid = "7057c7e9-c182-5462-911a-8362d720325c" +version = "0.3.7" + [[ChainRulesCore]] deps = ["Compat", "LinearAlgebra", "SparseArrays"] -git-tree-sha1 = "8b31cc69cbc38c5c826aaa1c890c694be3622d99" +git-tree-sha1 = "bdc0937269321858ab2a4f288486cb258b9a0af7" uuid = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" -version = "0.10.3" +version = "1.3.0" [[Compat]] deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "SHA", "Serialization", "SharedArrays", "Sockets", "SparseArrays", 
"Statistics", "Test", "UUIDs", "Unicode"] -git-tree-sha1 = "e4e2b39db08f967cc1360951f01e8a75ec441cab" +git-tree-sha1 = "79b9563ef3f2cc5fc6d3046a5ee1a57c9de52495" uuid = "34da2185-b29b-5c13-b0c7-acf172513d20" -version = "3.30.0" +version = "3.33.0" [[CompilerSupportLibraries_jll]] deps = ["Artifacts", "Libdl"] @@ -80,15 +91,15 @@ uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae" [[ConstructionBase]] deps = ["LinearAlgebra"] -git-tree-sha1 = "1dc43957fb9a1574fa1b7a449e101bd1fd3a9fb7" +git-tree-sha1 = "f74e9d5388b8620b4cee35d4c5a618dd4dc547f4" uuid = "187b0558-2788-49d3-abe0-74a17ed4e7c9" -version = "1.2.1" +version = "1.3.0" [[DataStructures]] deps = ["Compat", "InteractiveUtils", "OrderedCollections"] -git-tree-sha1 = "4437b64df1e0adccc3e5d1adbc3ac741095e4677" +git-tree-sha1 = "7d9d316f04214f7efdbb6398d545446e246eff02" uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" -version = "0.18.9" +version = "0.18.10" [[Dates]] deps = ["Printf"] @@ -114,14 +125,14 @@ uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6" [[Elfutils_jll]] deps = ["Artifacts", "Bzip2_jll", "JLLWrappers", "Libdl", "Pkg", "XZ_jll", "Zlib_jll", "argp_standalone_jll", "fts_jll", "obstack_jll"] -git-tree-sha1 = "76cbf1134983cfb371ad77117bb2659600ed64d6" +git-tree-sha1 = "8f9fcde6d89b0a3ca51cb2028beab462705c5436" uuid = "ab5a07f8-06af-567f-a878-e8bb879eba5a" -version = "0.179.0+0" +version = "0.182.0+0" [[ExprTools]] -git-tree-sha1 = "10407a39b87f29d47ebaca8edbc75d7c302ff93e" +git-tree-sha1 = "b7e3d17636b348f005f11040025ae8c6f645fe92" uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04" -version = "0.1.3" +version = "0.1.6" [[Future]] deps = ["Random"] @@ -143,17 +154,28 @@ version = "0.11.5" deps = ["Markdown"] uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" +[[IrrationalConstants]] +git-tree-sha1 = "f76424439413893a832026ca355fe273e93bce94" +uuid = "92d709cd-6900-40b7-9082-c6be49f344b6" +version = "0.1.0" + [[JLLWrappers]] deps = ["Preferences"] git-tree-sha1 = "642a199af8b68253517b80bd3bfd17eb4e84df6e" uuid = 
"692b3bcd-3c85-4b1f-b108-f13ce0eb3210" version = "1.3.0" +[[KernelAbstractions]] +deps = ["Adapt", "Cassette", "InteractiveUtils", "MacroTools", "SpecialFunctions", "StaticArrays", "UUIDs"] +git-tree-sha1 = "5e6c70389c1b1e40adb81664ca8cea6ce8127afc" +uuid = "63c18a36-062a-441e-b654-da1e3ab1ce7c" +version = "0.7.0" + [[LLVM]] deps = ["CEnum", "Libdl", "Printf", "Unicode"] -git-tree-sha1 = "b499c68a45249b0385585c62f4a9b62b5db8e691" +git-tree-sha1 = "f57ac3fd2045b50d3db081663837ac5b4096947e" uuid = "929cbde3-209d-540e-8aea-75f648917ca0" -version = "3.7.1" +version = "3.9.0" [[LazyArtifacts]] deps = ["Artifacts", "Pkg"] @@ -183,19 +205,19 @@ deps = ["Libdl"] uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" [[LogExpFunctions]] -deps = ["DocStringExtensions", "LinearAlgebra"] -git-tree-sha1 = "1ba664552f1ef15325e68dc4c05c3ef8c2d5d885" +deps = ["DocStringExtensions", "IrrationalConstants", "LinearAlgebra"] +git-tree-sha1 = "3d682c07e6dd250ed082f883dc88aee7996bf2cc" uuid = "2ab3a3ac-af41-5b50-aa03-7779005ae688" -version = "0.2.4" +version = "0.3.0" [[Logging]] uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" [[MacroTools]] deps = ["Markdown", "Random"] -git-tree-sha1 = "6a8a2a625ab0dea913aba95c11370589e0239ff0" +git-tree-sha1 = "0fb723cd8c45858c22169b2e42269e53271a6df7" uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" -version = "0.5.6" +version = "0.5.7" [[Markdown]] deps = ["Base64"] @@ -217,6 +239,12 @@ uuid = "a63ad114-7e13-5084-954f-fe012c677804" [[MozillaCACerts_jll]] uuid = "14a3606d-f60d-562e-9121-12d972cd8159" +[[NEO_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "gmmlib_jll", "libigc_jll", "oneAPI_Level_Zero_Headers_jll"] +git-tree-sha1 = "c753dd029eb0837658bf8eaee041c19e4ce5bb8c" +uuid = "700fe977-ac61-5f37-bbc8-c6c4b2b6a9fd" +version = "21.12.19358+0" + [[NUMA_jll]] deps = ["Libdl", "Pkg"] git-tree-sha1 = "778f9bd14400cff2c32ed357e12766ac0e3d766e" @@ -261,21 +289,27 @@ uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" deps = ["InteractiveUtils", "Markdown", 
"Sockets", "Unicode"] uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" +[[ROCKernels]] +deps = ["AMDGPU", "Adapt", "Cassette", "KernelAbstractions", "SpecialFunctions", "StaticArrays"] +git-tree-sha1 = "41105b861342637dde17797bdd9aaa537aca646b" +uuid = "7eb9e9f0-4bd3-4c4c-8bef-26bd9629d9b9" +version = "0.2.0" + [[Random]] deps = ["Serialization"] uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" [[Random123]] deps = ["Libdl", "Random", "RandomNumbers"] -git-tree-sha1 = "7c6710c8198fd4444b5eb6a3840b7d47bd3593c5" +git-tree-sha1 = "0e8b146557ad1c6deb1367655e052276690e71a3" uuid = "74087812-796a-5b5d-8853-05524746bad3" -version = "1.3.1" +version = "1.4.2" [[RandomNumbers]] deps = ["Random", "Requires"] -git-tree-sha1 = "441e6fc35597524ada7f85e13df1f4e10137d16f" +git-tree-sha1 = "043da614cc7e95c703498a491e2c21f58a2b8111" uuid = "e6cf234a-135c-5ec9-84dd-332b85af5143" -version = "1.4.0" +version = "1.5.3" [[Reexport]] git-tree-sha1 = "5f6c21241f0f655da3952fd60aa18477cf96c220" @@ -291,6 +325,18 @@ version = "1.1.3" [[SHA]] uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" +[[SPIRV_LLVM_Translator_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "8cca87d57f6ddf19373cc9791fddc741406c8fbf" +uuid = "4a5d46fc-d8cf-5151-a261-86b458210efb" +version = "11.0.0+2" + +[[SPIRV_Tools_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "c0324b7e07bc4649f755bfe7e00f7c6ed6aa353f" +uuid = "6ac6d60f-d740-5983-97d7-a4482c0689f4" +version = "2021.2.0+0" + [[Scratch]] deps = ["Dates"] git-tree-sha1 = "0b4b7f1393cff97c33891da2a0bf69c6ed241fda" @@ -302,9 +348,9 @@ uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" [[Setfield]] deps = ["ConstructionBase", "Future", "MacroTools", "Requires"] -git-tree-sha1 = "d5640fc570fb1b6c54512f0bd3853866bd298b3e" +git-tree-sha1 = "fca29e68c5062722b5b4435594c3d1ba557072a3" uuid = "efcf1570-3423-57d1-acb7-fd33fddbac46" -version = "0.7.0" +version = "0.7.1" [[SharedArrays]] deps = ["Distributed", "Mmap", "Random", 
"Serialization"] @@ -319,9 +365,15 @@ uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" [[SpecialFunctions]] deps = ["ChainRulesCore", "LogExpFunctions", "OpenSpecFun_jll"] -git-tree-sha1 = "a50550fa3164a8c46747e62063b4d774ac1bcf49" +git-tree-sha1 = "a322a9493e49c5f3a10b50df3aedaf1cdb3244b7" uuid = "276daf66-3868-5448-9aa4-cd146d93841b" -version = "1.5.1" +version = "1.6.1" + +[[StaticArrays]] +deps = ["LinearAlgebra", "Random", "Statistics"] +git-tree-sha1 = "3240808c6d463ac46f1c1cd7638375cd22abbccb" +uuid = "90137ffa-7385-5640-81b9-e52037218182" +version = "1.2.12" [[Statistics]] deps = ["LinearAlgebra", "SparseArrays"] @@ -346,9 +398,9 @@ version = "1.0.1" [[TimerOutputs]] deps = ["ExprTools", "Printf"] -git-tree-sha1 = "bf8aacc899a1bd16522d0350e1e2310510d77236" +git-tree-sha1 = "209a8326c4f955e2442c07b56029e88bb48299c7" uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" -version = "0.5.9" +version = "0.5.12" [[UUIDs]] deps = ["Random", "SHA"] @@ -364,9 +416,9 @@ uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" [[XZ_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "9f76853ea2ba894054e24640abfb73d73e5a4cb5" +git-tree-sha1 = "a921669cd9a45c23031fd4eb904f5cc3d20de415" uuid = "ffd25f8a-64ca-5728-b0f7-c24cf3aae800" -version = "5.2.5+0" +version = "5.2.5+2" [[Zlib_jll]] deps = ["Libdl"] @@ -374,9 +426,9 @@ uuid = "83775a58-1f1d-513f-b197-d71354ab007a" [[argp_standalone_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "c4fa3457046fc93249b63e8319e743b6c8590609" +git-tree-sha1 = "feaf9f6293003c2bf53056fd6930d677ed340b34" uuid = "c53206cc-00f7-50bf-ad1e-3ae1f6e49bc3" -version = "1.3.0+0" +version = "1.3.1+0" [[fts_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] @@ -384,6 +436,12 @@ git-tree-sha1 = "78732b942383d2cb521df8a1a0814911144e663d" uuid = "d65627f6-89bd-53e8-8ab5-8b75ff535eee" version = "1.2.7+1" +[[gmmlib_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = 
"4067ef455d4fa67febe26efc3f9565a9bb7ba911" +uuid = "09858cae-167c-5acb-9302-fddc6874d481" +version = "20.3.2+0" + [[hsa_rocr_jll]] deps = ["Artifacts", "Elfutils_jll", "JLLWrappers", "Libdl", "NUMA_jll", "Pkg", "Zlib_jll", "hsakmt_roct_jll"] git-tree-sha1 = "42189f176d6ae4f37c0c0e652fec339bb0bfab5d" @@ -396,6 +454,12 @@ git-tree-sha1 = "8a9ee6c091e952e4ea6585d15131d43f789ae041" uuid = "1cecccd7-a9b6-5045-9cdc-a44c19b16d76" version = "3.8.0+0" +[[libigc_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "6140dbf267f7ab57fb791b49f2114374218b5c20" +uuid = "94295238-5935-5bd7-bb0f-b00942e9bdd5" +version = "1.0.6712+0" + [[nghttp2_jll]] deps = ["Artifacts", "Libdl"] uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d" @@ -406,6 +470,24 @@ git-tree-sha1 = "1c4a6b66e934fc6db4649cb2910c72f53bbfea7e" uuid = "c88a4935-d25e-5644-aacc-5db6f1b8ef79" version = "1.2.2+0" +[[oneAPI]] +deps = ["Adapt", "CEnum", "ExprTools", "GPUArrays", "GPUCompiler", "LLVM", "LinearAlgebra", "NEO_jll", "Printf", "Random", "SPIRV_LLVM_Translator_jll", "SPIRV_Tools_jll", "SpecialFunctions", "oneAPI_Level_Zero_Loader_jll"] +git-tree-sha1 = "b4a4b84c864e75fe885a1643525f0c97ce310dd9" +uuid = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b" +version = "0.1.3" + +[[oneAPI_Level_Zero_Headers_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "48982fbfd2f3d0a30d644563dcf96892d252b395" +uuid = "f4bc562b-d309-54f8-9efb-476e56f0410d" +version = "1.1.2+1" + +[[oneAPI_Level_Zero_Loader_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "oneAPI_Level_Zero_Headers_jll"] +git-tree-sha1 = "1fa53dfdd32a732f09c254c86403e1abab653fb2" +uuid = "13eca655-d68d-5b81-8367-6d99d727ab01" +version = "1.3.6+0" + [[p7zip_jll]] deps = ["Artifacts", "Libdl"] uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0" diff --git a/JuliaStream.jl/Project.toml b/JuliaStream.jl/Project.toml index 0afa7d0..9c7d49d 100644 --- a/JuliaStream.jl/Project.toml +++ b/JuliaStream.jl/Project.toml @@ -7,8 +7,13 @@ 
version = "3.4.0" AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" ArgParse = "c7e460c6-2fb9-53a9-8c5b-16f535851c63" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" +CUDAKernels = "72cfdca4-0801-4ab0-bf6a-d52aa10adc57" Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b" +ExprTools = "e2ba6199-217a-4e67-a87a-7c52f15ade04" +KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a" +ROCKernels = "7eb9e9f0-4bd3-4c4c-8bef-26bd9629d9b9" +oneAPI = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b" [compat] julia = "1.6" diff --git a/JuliaStream.jl/README.md b/JuliaStream.jl/README.md index 851a59d..6204da7 100644 --- a/JuliaStream.jl/README.md +++ b/JuliaStream.jl/README.md @@ -8,6 +8,8 @@ This is an implementation of BabelStream in Julia which contains the following v * `DistributedStream.jl` - Process based parallelism with `@distributed` macros * `CUDAStream.jl` - Direct port of BabelStream's native CUDA implementation using [CUDA.jl](https://github.com/JuliaGPU/CUDA.jl) * `AMDGPUStream.jl` - Direct port of BabelStream's native HIP implementation using [AMDGPU.jl](https://github.com/JuliaGPU/AMDGPU.jl) + * `oneAPIStream.jl` - Direct port of BabelStream's native SYCL implementation using [oneAPI.jl](https://github.com/JuliaGPU/oneAPI.jl) + * `KernelAbstractionsStream.jl` - Direct port of BabelStream's native CUDA implementation using [KernelAbstractions.jl](https://github.com/JuliaGPU/KernelAbstractions.jl) ### Build & Run @@ -15,12 +17,35 @@ Prerequisites * Julia >= 1.6+ -With Julia on path, run the benchmark with: +A set of reduced-dependency projects is available for the following backends and implementations: + + * `AMDGPU` supports: + - `AMDGPUStream.jl` + * `CUDA` supports: + - `CUDAStream.jl` + * `oneAPI` supports: + - `oneAPIStream.jl` + * `KernelAbstractions` supports: + - `KernelAbstractionsStream.jl` + * `Threaded` supports: + - `PlainStream.jl` + - `ThreadedStream.jl` + - `DistributedStream.jl` + +With Julia on path, run your 
selected benchmark with: ```shell > cd JuliaStream.jl -> julia --project -e 'import Pkg; Pkg.instantiate()' # only required on first run -> julia --project src/Stream.jl +> julia --project=<BACKEND> -e 'import Pkg; Pkg.instantiate()' # only required on first run +> julia --project=<BACKEND> src/<IMPL>Stream.jl +``` + +For example, to run the CUDA implementation: + +```shell +> cd JuliaStream.jl +> julia --project=CUDA -e 'import Pkg; Pkg.instantiate()' +> julia --project=CUDA src/CUDAStream.jl ``` **Important:** @@ -28,3 +53,15 @@ With Julia on path, run the benchmark with: * Thread count for `ThreadedStream` must be set via the `JULIA_NUM_THREADS` environment variable (e.g `export JULIA_NUM_THREADS=$(nproc)`) otherwise it defaults to 1. * Worker count for `DistributedStream` is set with `-p ` as per the [documentation](https://docs.julialang.org/en/v1/manual/distributed-computing). * Certain implementations such as CUDA and AMDGPU will do hardware detection at runtime and may download and/or compile further software packages for the platform. + +*** + +Alternatively, the top-level project `Project.toml` contains all dependencies needed to run all implementations in `src`. +There may be instances where some packages are locked to an older version because of transitive dependency requirements. 
+ +To run the benchmark using the top-level project, run the benchmark with: +```shell +> cd JuliaStream.jl +> julia --project -e 'import Pkg; Pkg.instantiate()' +> julia --project src/Stream.jl +``` \ No newline at end of file diff --git a/JuliaStream.jl/Threaded/Manifest.toml b/JuliaStream.jl/Threaded/Manifest.toml new file mode 100644 index 0000000..608e2da --- /dev/null +++ b/JuliaStream.jl/Threaded/Manifest.toml @@ -0,0 +1,31 @@ +# This file is machine-generated - editing it directly is not advised + +[[ArgParse]] +deps = ["Logging", "TextWrap"] +git-tree-sha1 = "3102bce13da501c9104df33549f511cd25264d7d" +uuid = "c7e460c6-2fb9-53a9-8c5b-16f535851c63" +version = "1.1.4" + +[[Logging]] +uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" + +[[OrderedCollections]] +git-tree-sha1 = "85f8e6578bf1f9ee0d11e7bb1b1456435479d47c" +uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" +version = "1.4.1" + +[[Parameters]] +deps = ["OrderedCollections", "UnPack"] +git-tree-sha1 = "2276ac65f1e236e0a6ea70baff3f62ad4c625345" +uuid = "d96e819e-fc66-5662-9728-84c9c7592b0a" +version = "0.12.2" + +[[TextWrap]] +git-tree-sha1 = "9250ef9b01b66667380cf3275b3f7488d0e25faf" +uuid = "b718987f-49a8-5099-9789-dcd902bef87d" +version = "1.0.1" + +[[UnPack]] +git-tree-sha1 = "387c1f73762231e86e0c9c5443ce3b4a0a9a0c2b" +uuid = "3a884ed6-31ef-47d7-9d2a-63182c4928ed" +version = "1.0.2" diff --git a/JuliaStream.jl/Threaded/Project.toml b/JuliaStream.jl/Threaded/Project.toml new file mode 100644 index 0000000..b65bdf5 --- /dev/null +++ b/JuliaStream.jl/Threaded/Project.toml @@ -0,0 +1,6 @@ +[deps] +ArgParse = "c7e460c6-2fb9-53a9-8c5b-16f535851c63" +Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a" + +[compat] +julia = "1.6" diff --git a/JuliaStream.jl/oneAPI/Manifest.toml b/JuliaStream.jl/oneAPI/Manifest.toml new file mode 100644 index 0000000..ca932aa --- /dev/null +++ b/JuliaStream.jl/oneAPI/Manifest.toml @@ -0,0 +1,319 @@ +# This file is machine-generated - editing it directly is not advised + +[[Adapt]] 
+deps = ["LinearAlgebra"] +git-tree-sha1 = "84918055d15b3114ede17ac6a7182f68870c16f7" +uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" +version = "3.3.1" + +[[ArgParse]] +deps = ["Logging", "TextWrap"] +git-tree-sha1 = "3102bce13da501c9104df33549f511cd25264d7d" +uuid = "c7e460c6-2fb9-53a9-8c5b-16f535851c63" +version = "1.1.4" + +[[ArgTools]] +uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f" + +[[Artifacts]] +uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" + +[[Base64]] +uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" + +[[CEnum]] +git-tree-sha1 = "215a9aa4a1f23fbd05b92769fdd62559488d70e9" +uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82" +version = "0.4.1" + +[[ChainRulesCore]] +deps = ["Compat", "LinearAlgebra", "SparseArrays"] +git-tree-sha1 = "bdc0937269321858ab2a4f288486cb258b9a0af7" +uuid = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" +version = "1.3.0" + +[[Compat]] +deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "SHA", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"] +git-tree-sha1 = "79b9563ef3f2cc5fc6d3046a5ee1a57c9de52495" +uuid = "34da2185-b29b-5c13-b0c7-acf172513d20" +version = "3.33.0" + +[[CompilerSupportLibraries_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae" + +[[Dates]] +deps = ["Printf"] +uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" + +[[DelimitedFiles]] +deps = ["Mmap"] +uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab" + +[[Distributed]] +deps = ["Random", "Serialization", "Sockets"] +uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b" + +[[DocStringExtensions]] +deps = ["LibGit2"] +git-tree-sha1 = "a32185f5428d3986f47c2ab78b1f216d5e6cc96f" +uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae" +version = "0.8.5" + +[[Downloads]] +deps = ["ArgTools", "LibCURL", "NetworkOptions"] +uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6" + +[[ExprTools]] +git-tree-sha1 = 
"b7e3d17636b348f005f11040025ae8c6f645fe92" +uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04" +version = "0.1.6" + +[[GPUArrays]] +deps = ["Adapt", "LinearAlgebra", "Printf", "Random", "Serialization", "Statistics"] +git-tree-sha1 = "8fac1cf7d6ce0f2249c7acaf25d22e1e85c4a07f" +uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" +version = "8.0.2" + +[[GPUCompiler]] +deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "TimerOutputs", "UUIDs"] +git-tree-sha1 = "4ed2616d5e656c8716736b64da86755467f26cf5" +uuid = "61eb1bfa-7361-4325-ad38-22787b887f55" +version = "0.12.9" + +[[InteractiveUtils]] +deps = ["Markdown"] +uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" + +[[IrrationalConstants]] +git-tree-sha1 = "f76424439413893a832026ca355fe273e93bce94" +uuid = "92d709cd-6900-40b7-9082-c6be49f344b6" +version = "0.1.0" + +[[JLLWrappers]] +deps = ["Preferences"] +git-tree-sha1 = "642a199af8b68253517b80bd3bfd17eb4e84df6e" +uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210" +version = "1.3.0" + +[[LLVM]] +deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Printf", "Unicode"] +git-tree-sha1 = "d6041ad706cf458b2c9f3e501152488a26451e9c" +uuid = "929cbde3-209d-540e-8aea-75f648917ca0" +version = "4.2.0" + +[[LLVMExtra_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "a9b1130c4728b0e462a1c28772954650039eb847" +uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab" +version = "0.0.7+0" + +[[LibCURL]] +deps = ["LibCURL_jll", "MozillaCACerts_jll"] +uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21" + +[[LibCURL_jll]] +deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"] +uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0" + +[[LibGit2]] +deps = ["Base64", "NetworkOptions", "Printf", "SHA"] +uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" + +[[LibSSH2_jll]] +deps = ["Artifacts", "Libdl", "MbedTLS_jll"] +uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8" + +[[Libdl]] +uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" + +[[LinearAlgebra]] +deps = ["Libdl"] +uuid = 
"37e2e46d-f89d-539d-b4ee-838fcccc9c8e" + +[[LogExpFunctions]] +deps = ["DocStringExtensions", "IrrationalConstants", "LinearAlgebra"] +git-tree-sha1 = "3d682c07e6dd250ed082f883dc88aee7996bf2cc" +uuid = "2ab3a3ac-af41-5b50-aa03-7779005ae688" +version = "0.3.0" + +[[Logging]] +uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" + +[[Markdown]] +deps = ["Base64"] +uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" + +[[MbedTLS_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1" + +[[Mmap]] +uuid = "a63ad114-7e13-5084-954f-fe012c677804" + +[[MozillaCACerts_jll]] +uuid = "14a3606d-f60d-562e-9121-12d972cd8159" + +[[NEO_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "gmmlib_jll", "libigc_jll", "oneAPI_Level_Zero_Headers_jll"] +git-tree-sha1 = "2bfc354b5684821dcc88f1e477cefd0dd03c60b5" +uuid = "700fe977-ac61-5f37-bbc8-c6c4b2b6a9fd" +version = "21.31.20514+0" + +[[NetworkOptions]] +uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908" + +[[OpenSpecFun_jll]] +deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "13652491f6856acfd2db29360e1bbcd4565d04f1" +uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e" +version = "0.5.5+0" + +[[OrderedCollections]] +git-tree-sha1 = "85f8e6578bf1f9ee0d11e7bb1b1456435479d47c" +uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" +version = "1.4.1" + +[[Parameters]] +deps = ["OrderedCollections", "UnPack"] +git-tree-sha1 = "2276ac65f1e236e0a6ea70baff3f62ad4c625345" +uuid = "d96e819e-fc66-5662-9728-84c9c7592b0a" +version = "0.12.2" + +[[Pkg]] +deps = ["Artifacts", "Dates", "Downloads", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"] +uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" + +[[Preferences]] +deps = ["TOML"] +git-tree-sha1 = "00cfd92944ca9c760982747e9a1d0d5d86ab1e5a" +uuid = "21216c6a-2e73-6563-6e65-726566657250" +version = "1.2.2" + +[[Printf]] +deps = ["Unicode"] +uuid = 
"de0858da-6303-5e67-8744-51eddeeeb8d7" + +[[REPL]] +deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"] +uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" + +[[Random]] +deps = ["Serialization"] +uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" + +[[SHA]] +uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" + +[[SPIRV_LLVM_Translator_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "8cca87d57f6ddf19373cc9791fddc741406c8fbf" +uuid = "4a5d46fc-d8cf-5151-a261-86b458210efb" +version = "11.0.0+2" + +[[SPIRV_Tools_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "c0324b7e07bc4649f755bfe7e00f7c6ed6aa353f" +uuid = "6ac6d60f-d740-5983-97d7-a4482c0689f4" +version = "2021.2.0+0" + +[[Serialization]] +uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" + +[[SharedArrays]] +deps = ["Distributed", "Mmap", "Random", "Serialization"] +uuid = "1a1011a3-84de-559e-8e89-a11a2f7dc383" + +[[Sockets]] +uuid = "6462fe0b-24de-5631-8697-dd941f90decc" + +[[SparseArrays]] +deps = ["LinearAlgebra", "Random"] +uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" + +[[SpecialFunctions]] +deps = ["ChainRulesCore", "LogExpFunctions", "OpenSpecFun_jll"] +git-tree-sha1 = "a322a9493e49c5f3a10b50df3aedaf1cdb3244b7" +uuid = "276daf66-3868-5448-9aa4-cd146d93841b" +version = "1.6.1" + +[[Statistics]] +deps = ["LinearAlgebra", "SparseArrays"] +uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" + +[[TOML]] +deps = ["Dates"] +uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76" + +[[Tar]] +deps = ["ArgTools", "SHA"] +uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e" + +[[Test]] +deps = ["InteractiveUtils", "Logging", "Random", "Serialization"] +uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" + +[[TextWrap]] +git-tree-sha1 = "9250ef9b01b66667380cf3275b3f7488d0e25faf" +uuid = "b718987f-49a8-5099-9789-dcd902bef87d" +version = "1.0.1" + +[[TimerOutputs]] +deps = ["ExprTools", "Printf"] +git-tree-sha1 = "209a8326c4f955e2442c07b56029e88bb48299c7" +uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" 
+version = "0.5.12" + +[[UUIDs]] +deps = ["Random", "SHA"] +uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" + +[[UnPack]] +git-tree-sha1 = "387c1f73762231e86e0c9c5443ce3b4a0a9a0c2b" +uuid = "3a884ed6-31ef-47d7-9d2a-63182c4928ed" +version = "1.0.2" + +[[Unicode]] +uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" + +[[Zlib_jll]] +deps = ["Libdl"] +uuid = "83775a58-1f1d-513f-b197-d71354ab007a" + +[[gmmlib_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "0d5e5461d21b14853b4c332045c57d2601c403bd" +uuid = "09858cae-167c-5acb-9302-fddc6874d481" +version = "21.2.1+0" + +[[libigc_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "b30a895e7ea52991a3f984ab0302c42858d766c0" +uuid = "94295238-5935-5bd7-bb0f-b00942e9bdd5" +version = "1.0.8173+0" + +[[nghttp2_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d" + +[[oneAPI]] +deps = ["Adapt", "CEnum", "ExprTools", "GPUArrays", "GPUCompiler", "LLVM", "LinearAlgebra", "NEO_jll", "Printf", "Random", "SPIRV_LLVM_Translator_jll", "SPIRV_Tools_jll", "SpecialFunctions", "oneAPI_Level_Zero_Headers_jll", "oneAPI_Level_Zero_Loader_jll"] +git-tree-sha1 = "92e8eefdd4694597994590230ab329545804bdb3" +uuid = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b" +version = "0.2.0" + +[[oneAPI_Level_Zero_Headers_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "e1d123ff9ada6c469a1eaf57e33a74c3cb26a5a4" +uuid = "f4bc562b-d309-54f8-9efb-476e56f0410d" +version = "1.2.13+0" + +[[oneAPI_Level_Zero_Loader_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "oneAPI_Level_Zero_Headers_jll"] +git-tree-sha1 = "50124857f7e87420655929a9c8ca86749826af11" +uuid = "13eca655-d68d-5b81-8367-6d99d727ab01" +version = "1.4.1+0" + +[[p7zip_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0" diff --git a/JuliaStream.jl/oneAPI/Project.toml b/JuliaStream.jl/oneAPI/Project.toml new file mode 100644 index 0000000..9f89f82 --- /dev/null +++ 
b/JuliaStream.jl/oneAPI/Project.toml @@ -0,0 +1,7 @@ +[deps] +ArgParse = "c7e460c6-2fb9-53a9-8c5b-16f535851c63" +Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a" +oneAPI = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b" + +[compat] +julia = "1.6" diff --git a/JuliaStream.jl/src/AMDGPUStream.jl b/JuliaStream.jl/src/AMDGPUStream.jl index 9a9cd9a..3ed9748 100644 --- a/JuliaStream.jl/src/AMDGPUStream.jl +++ b/JuliaStream.jl/src/AMDGPUStream.jl @@ -6,32 +6,23 @@ const ROCData = StreamData{T,ROCArray{T}} where {T} const TBSize = 1024::Int const DotBlocks = 256::Int -# AMDGPU.agents()'s internal iteration order isn't stable -function gpu_agents_in_repr_order() - # XXX if we select anything other than :gpu, we get - # HSA_STATUS_ERROR_INVALID_AGENT on the first kernel submission - sort(AMDGPU.get_agents(:gpu), by = repr) -end - -function devices() +function devices()::Vector{DeviceWithRepr} try - map(repr, gpu_agents_in_repr_order()) + # AMDGPU.agents()'s internal iteration order isn't stable + sorted = sort(AMDGPU.get_agents(:gpu), by = repr) + map(x -> (x, repr(x), "AMDGPU.jl"), sorted) catch # probably unsupported - [] + String[] end end -function gridsize(data::ROCData{T})::Int where {T} - return data.size -end - function make_stream( arraysize::Int, scalar::T, - device::Int, + device::DeviceWithRepr, silent::Bool, -)::ROCData{T} where {T} +)::Tuple{ROCData{T},Nothing} where {T} if arraysize % TBSize != 0 error("arraysize ($(arraysize)) must be divisible by $(TBSize)!") @@ -39,30 +30,31 @@ function make_stream( # XXX AMDGPU doesn't expose an API for setting the default like CUDA.device!() # but AMDGPU.get_default_agent returns DEFAULT_AGENT so we can do it by hand - AMDGPU.DEFAULT_AGENT[] = gpu_agents_in_repr_order()[device] - - data = ROCData{T}( - ROCArray{T}(undef, arraysize), - ROCArray{T}(undef, arraysize), - ROCArray{T}(undef, arraysize), - scalar, - arraysize, - ) + AMDGPU.DEFAULT_AGENT[] = device[1] selected = AMDGPU.get_default_agent() if !silent println("Using GPU 
HSA device: $(AMDGPU.get_name(selected)) ($(repr(selected)))") - println("Kernel parameters : <<<$(gridsize(data)),$(TBSize)>>>") + println("Kernel parameters : <<<$(arraysize),$(TBSize)>>>") end - return data + return ( + ROCData{T}( + ROCArray{T}(undef, arraysize), + ROCArray{T}(undef, arraysize), + ROCArray{T}(undef, arraysize), + scalar, + arraysize, + ), + nothing, + ) end -function init_arrays!(data::ROCData{T}, init::Tuple{T,T,T}) where {T} +function init_arrays!(data::ROCData{T}, _, init::Tuple{T,T,T}) where {T} AMDGPU.fill!(data.a, init[1]) AMDGPU.fill!(data.b, init[2]) AMDGPU.fill!(data.c, init[3]) end -function copy!(data::ROCData{T}) where {T} +function copy!(data::ROCData{T}, _) where {T} function kernel(a, c) i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x # only workgroupIdx starts at 1 @inbounds c[i] = a[i] @@ -70,11 +62,11 @@ function copy!(data::ROCData{T}) where {T} end AMDGPU.wait( soft = false, # soft wait causes HSA_REFCOUNT overflow issues - @roc groupsize = TBSize gridsize = gridsize(data) kernel(data.a, data.c) + @roc groupsize = TBSize gridsize = data.size kernel(data.a, data.c) ) end -function mul!(data::ROCData{T}) where {T} +function mul!(data::ROCData{T}, _) where {T} function kernel(b, c, scalar) i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x # only workgroupIdx starts at 1 @inbounds b[i] = scalar * c[i] @@ -82,11 +74,11 @@ function mul!(data::ROCData{T}) where {T} end AMDGPU.wait( soft = false, # soft wait causes HSA_REFCOUNT overflow issues - @roc groupsize = TBSize gridsize = gridsize(data) kernel(data.b, data.c, data.scalar) + @roc groupsize = TBSize gridsize = data.size kernel(data.b, data.c, data.scalar) ) end -function add!(data::ROCData{T}) where {T} +function add!(data::ROCData{T}, _) where {T} function kernel(a, b, c) i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x # only workgroupIdx starts at 1 @inbounds c[i] = a[i] + b[i] @@ -94,11 +86,11 @@ function add!(data::ROCData{T}) 
where {T} end AMDGPU.wait( soft = false, # soft wait causes HSA_REFCOUNT overflow issues - @roc groupsize = TBSize gridsize = gridsize(data) kernel(data.a, data.b, data.c) + @roc groupsize = TBSize gridsize = data.size kernel(data.a, data.b, data.c) ) end -function triad!(data::ROCData{T}) where {T} +function triad!(data::ROCData{T}, _) where {T} function kernel(a, b, c, scalar) i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x # only workgroupIdx starts at 1 @inbounds a[i] = b[i] + (scalar * c[i]) @@ -106,7 +98,7 @@ function triad!(data::ROCData{T}) where {T} end AMDGPU.wait( soft = false, # soft wait causes HSA_REFCOUNT overflow issues - @roc groupsize = TBSize gridsize = gridsize(data) kernel( + @roc groupsize = TBSize gridsize = data.size kernel( data.a, data.b, data.c, @@ -115,7 +107,7 @@ function triad!(data::ROCData{T}) where {T} ) end -function nstream!(data::ROCData{T}) where {T} +function nstream!(data::ROCData{T}, _) where {T} function kernel(a, b, c, scalar) i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x # only workgroupIdx starts at 1 @inbounds a[i] += b[i] + scalar * c[i] @@ -123,7 +115,7 @@ function nstream!(data::ROCData{T}) where {T} end AMDGPU.wait( soft = false, # soft wait causes HSA_REFCOUNT overflow issues - @roc groupsize = TBSize gridsize = gridsize(data) kernel( + @roc groupsize = TBSize gridsize = data.size kernel( data.a, data.b, data.c, @@ -132,7 +124,7 @@ function nstream!(data::ROCData{T}) where {T} ) end -function dot(data::ROCData{T}) where {T} +function dot(data::ROCData{T}, _) where {T} function kernel(a, b, size, partial) tb_sum = ROCDeviceArray((TBSize,), alloc_local(:reduce, T, TBSize)) local_i = workitemIdx().x @@ -174,7 +166,7 @@ function dot(data::ROCData{T}) where {T} return sum(partial_sum) end -function read_data(data::ROCData{T})::VectorData{T} where {T} +function read_data(data::ROCData{T}, _)::VectorData{T} where {T} return VectorData{T}(data.a, data.b, data.c, data.scalar, data.size) end 
diff --git a/JuliaStream.jl/src/CUDAStream.jl b/JuliaStream.jl/src/CUDAStream.jl index 7d671a5..dd4fc44 100644 --- a/JuliaStream.jl/src/CUDAStream.jl +++ b/JuliaStream.jl/src/CUDAStream.jl @@ -5,51 +5,53 @@ const CuData = StreamData{T,CuArray{T}} where {T} const TBSize = 1024::Int const DotBlocks = 256::Int -function devices() - return !CUDA.functional(false) ? [] : - map(d -> "$(CUDA.name(d)) ($(repr(d)))", CUDA.devices()) +function devices()::Vector{DeviceWithRepr} + return !CUDA.functional(false) ? String[] : + map(d -> (d, "$(CUDA.name(d)) ($(repr(d)))", "CUDA.jl"), CUDA.devices()) end function make_stream( arraysize::Int, scalar::T, - device::Int, + device::DeviceWithRepr, silent::Bool, -)::CuData{T} where {T} +)::Tuple{CuData{T},Nothing} where {T} if arraysize % TBSize != 0 error("arraysize ($(arraysize)) must be divisible by $(TBSize)!") end # so CUDA's device is 0 indexed, so -1 from Julia - CUDA.device!(device - 1) + CUDA.device!(device[1]) selected = CUDA.device() # show_reason is set to true here so it dumps CUDA info # for us regardless of whether it's functional if !CUDA.functional(true) error("Non-functional CUDA configuration") end - data = CuData{T}( - CuArray{T}(undef, arraysize), - CuArray{T}(undef, arraysize), - CuArray{T}(undef, arraysize), - scalar, - arraysize, - ) if !silent println("Using CUDA device: $(CUDA.name(selected)) ($(repr(selected)))") - println("Kernel parameters: <<<$(data.size ÷ TBSize),$(TBSize)>>>") + println("Kernel parameters: <<<$(arraysize ÷ TBSize),$(TBSize)>>>") end - return data + return ( + CuData{T}( + CuArray{T}(undef, arraysize), + CuArray{T}(undef, arraysize), + CuArray{T}(undef, arraysize), + scalar, + arraysize, + ), + nothing, + ) end -function init_arrays!(data::CuData{T}, init::Tuple{T,T,T}) where {T} +function init_arrays!(data::CuData{T}, _, init::Tuple{T,T,T}) where {T} CUDA.fill!(data.a, init[1]) CUDA.fill!(data.b, init[2]) CUDA.fill!(data.c, init[3]) end -function copy!(data::CuData{T}) where {T} 
+function copy!(data::CuData{T}, _) where {T} function kernel(a, c) i = (blockIdx().x - 1) * blockDim().x + threadIdx().x # only blockIdx starts at 1 @inbounds c[i] = a[i] @@ -59,7 +61,7 @@ function copy!(data::CuData{T}) where {T} CUDA.synchronize() end -function mul!(data::CuData{T}) where {T} +function mul!(data::CuData{T}, _) where {T} function kernel(b, c, scalar) i = (blockIdx().x - 1) * blockDim().x + threadIdx().x # only blockIdx starts at 1 @inbounds b[i] = scalar * c[i] @@ -69,7 +71,7 @@ function mul!(data::CuData{T}) where {T} CUDA.synchronize() end -function add!(data::CuData{T}) where {T} +function add!(data::CuData{T}, _) where {T} function kernel(a, b, c) i = (blockIdx().x - 1) * blockDim().x + threadIdx().x # only blockIdx starts at 1 @inbounds c[i] = a[i] + b[i] @@ -79,7 +81,7 @@ function add!(data::CuData{T}) where {T} CUDA.synchronize() end -function triad!(data::CuData{T}) where {T} +function triad!(data::CuData{T}, _) where {T} function kernel(a, b, c, scalar) i = (blockIdx().x - 1) * blockDim().x + threadIdx().x # only blockIdx starts at 1 @inbounds a[i] = b[i] + (scalar * c[i]) @@ -94,7 +96,7 @@ function triad!(data::CuData{T}) where {T} CUDA.synchronize() end -function nstream!(data::CuData{T}) where {T} +function nstream!(data::CuData{T}, _) where {T} function kernel(a, b, c, scalar) i = (blockIdx().x - 1) * blockDim().x + threadIdx().x # only blockIdx starts at 1 @inbounds a[i] += b[i] + scalar * c[i] @@ -109,7 +111,7 @@ function nstream!(data::CuData{T}) where {T} CUDA.synchronize() end -function dot(data::CuData{T}) where {T} +function dot(data::CuData{T}, _) where {T} # direct port of the reduction in CUDAStream.cu function kernel(a, b, size, partial) tb_sum = @cuStaticSharedMem(T, TBSize) @@ -145,7 +147,7 @@ function dot(data::CuData{T}) where {T} return sum(partial_sum) end -function read_data(data::CuData{T})::VectorData{T} where {T} +function read_data(data::CuData{T}, _)::VectorData{T} where {T} return VectorData{T}(data.a, data.b, 
data.c, data.scalar, data.size) end diff --git a/JuliaStream.jl/src/DistributedStream.jl b/JuliaStream.jl/src/DistributedStream.jl index 2aa7ae7..2e80168 100644 --- a/JuliaStream.jl/src/DistributedStream.jl +++ b/JuliaStream.jl/src/DistributedStream.jl @@ -1,39 +1,39 @@ using Distributed @everywhere using Pkg -@everywhere Pkg.activate("."; io=devnull) # don't spam `Activating environment at...` +@everywhere Pkg.activate("."; io = devnull) # don't spam `Activating environment at...` @everywhere include("StreamData.jl") @everywhere include("Stream.jl") @everywhere using SharedArrays @everywhere const SharedArrayData = StreamData{T,SharedArray{T}} where {T} -function devices() - return ["CPU (localhost)"] +function devices()::Vector{DeviceWithRepr} + return [(undef, "CPU (localhost) $(nworkers())P", "Distributed.jl")] end function make_stream( arraysize::Int, scalar::T, - device::Int, + _::DeviceWithRepr, silent::Bool, -)::SharedArrayData{T} where {T} - if device != 1 - error("Only CPU device is supported") - end +)::Tuple{SharedArrayData{T},Nothing} where {T} if !silent println("Using max $(nworkers()) process(es) + 1 master") end - return SharedArrayData{T}( - SharedArray{T}(arraysize), - SharedArray{T}(arraysize), - SharedArray{T}(arraysize), - scalar, - arraysize, + return ( + SharedArrayData{T}( + SharedArray{T}(arraysize), + SharedArray{T}(arraysize), + SharedArray{T}(arraysize), + scalar, + arraysize, + ), + nothing, ) end -function init_arrays!(data::SharedArrayData{T}, init::Tuple{T,T,T}) where {T} +function init_arrays!(data::SharedArrayData{T}, _, init::Tuple{T,T,T}) where {T} @sync @distributed for i = 1:data.size @inbounds data.a[i] = init[1] @@ -42,43 +42,43 @@ function init_arrays!(data::SharedArrayData{T}, init::Tuple{T,T,T}) where {T} end end -function copy!(data::SharedArrayData{T}) where {T} +function copy!(data::SharedArrayData{T}, _) where {T} @sync @distributed for i = 1:data.size @inbounds data.c[i] = data.a[i] end end -function 
mul!(data::SharedArrayData{T}) where {T} +function mul!(data::SharedArrayData{T}, _) where {T} @sync @distributed for i = 1:data.size @inbounds data.b[i] = data.scalar * data.c[i] end end -function add!(data::SharedArrayData{T}) where {T} +function add!(data::SharedArrayData{T}, _) where {T} @sync @distributed for i = 1:data.size @inbounds data.c[i] = data.a[i] + data.b[i] end end -function triad!(data::SharedArrayData{T}) where {T} +function triad!(data::SharedArrayData{T}, _) where {T} @sync @distributed for i = 1:data.size @inbounds data.a[i] = data.b[i] + (data.scalar * data.c[i]) end end -function nstream!(data::SharedArrayData{T}) where {T} +function nstream!(data::SharedArrayData{T}, _) where {T} @sync @distributed for i = 1:data.size @inbounds data.a[i] += data.b[i] + data.scalar * data.c[i] end end -function dot(data::SharedArrayData{T}) where {T} +function dot(data::SharedArrayData{T}, _) where {T} return @distributed (+) for i = 1:data.size @inbounds data.a[i] * data.b[i] end end -function read_data(data::SharedArrayData{T})::VectorData{T} where {T} +function read_data(data::SharedArrayData{T}, _)::VectorData{T} where {T} return VectorData{T}(data.a, data.b, data.c, data.scalar, data.size) end
diff --git a/JuliaStream.jl/src/KernelAbstractionsStream.jl b/JuliaStream.jl/src/KernelAbstractionsStream.jl
new file mode 100644
index 0000000..8cc3699
--- /dev/null
+++ b/JuliaStream.jl/src/KernelAbstractionsStream.jl
@@ -0,0 +1,270 @@
+using ROCKernels, CUDAKernels, KernelAbstractions, CUDA, AMDGPU
+include("Stream.jl")
+
+const CuData = StreamData{T,CUDA.CuArray{T}} where {T}
+const ROCData = StreamData{T,AMDGPU.ROCArray{T}} where {T}
+
+const TBSize = 1024::Int
+const DotBlocks = 256::Int
+
+@enum Backend cuda rocm cpu
+
+# Execution context returned by make_stream and passed to every kernel launcher:
+# the selected backend tag plus the KernelAbstractions device used to launch kernels.
+struct Context
+  backend::Backend
+  device::Device
+end
+
+# List ROCm GPU agents as (agent, repr, rocm) tuples; empty if querying AMDGPU throws.
+function list_rocm_devices()::Vector{DeviceWithRepr}
+  try
+    # AMDGPU.agents()'s internal iteration order isn't stable
+    sorted = sort(AMDGPU.get_agents(:gpu), by = repr)
+    map(x -> (x, repr(x), rocm), sorted)
+  catch
+    # probably unsupported
+    DeviceWithRepr[]
+  end
+end
+
+# List CUDA devices as (device, "name (repr)", cuda) tuples; empty if CUDA is not functional.
+function list_cuda_devices()::Vector{DeviceWithRepr}
+  return !CUDA.functional(false) ? DeviceWithRepr[] :
+         map(d -> (d, "$(CUDA.name(d)) ($(repr(d)))", cuda), CUDA.devices())
+end
+
+# All selectable devices: the host CPU first, then CUDA devices, then ROCm agents.
+function devices()::Vector{DeviceWithRepr}
+  cudas = list_cuda_devices()
+  rocms = list_rocm_devices()
+  cpus = DeviceWithRepr[(undef, "$(Sys.cpu_info()[1].model) ($(Threads.nthreads())T)", cpu)]
+  vcat(cpus, cudas, rocms)
+end
+
+# Allocate the three benchmark arrays on the backend named by `device` and return
+# (data, context). Errors if arraysize is not a multiple of TBSize or if the requested
+# CUDA/HSA device does not end up selected.
+function make_stream(
+  arraysize::Int,
+  scalar::T,
+  device::DeviceWithRepr,
+  silent::Bool,
+) where {T}
+
+  if arraysize % TBSize != 0
+    error("arraysize ($(arraysize)) must be divisible by $(TBSize)!")
+  end
+
+  (selected, _, backend) = device
+  if backend == cpu
+    if !silent
+      println("Using CPU with max $(Threads.nthreads()) threads")
+    end
+    data = VectorData{T}(
+      Vector{T}(undef, arraysize),
+      Vector{T}(undef, arraysize),
+      Vector{T}(undef, arraysize),
+      scalar,
+      arraysize,
+    )
+    backenddevice = CPU()
+  elseif backend == cuda
+    CUDA.device!(selected)
+    if CUDA.device() != selected
+      error("Cannot select CUDA device, expecting $selected, but got $(CUDA.device())")
+    end
+    if !CUDA.functional(true)
+      error("Non-functional CUDA configuration")
+    end
+    if !silent
+      println("Using CUDA device: $(CUDA.name(selected)) ($(repr(selected)))")
+    end
+    data = CuData{T}(
+      CuArray{T}(undef, arraysize),
+      CuArray{T}(undef, arraysize),
+      CuArray{T}(undef, arraysize),
+      scalar,
+      arraysize,
+    )
+    backenddevice = CUDADevice()
+  elseif backend == rocm
+    AMDGPU.DEFAULT_AGENT[] = selected
+    if AMDGPU.get_default_agent() != selected
+      error(
+        "Cannot select HSA device, expecting $selected, but got $(AMDGPU.get_default_agent())",
+      )
+    end
+    if !silent
+      println("Using GPU HSA device: $(AMDGPU.get_name(selected)) ($(repr(selected)))")
+    end
+    data = ROCData{T}(
+      ROCArray{T}(undef, arraysize),
+      ROCArray{T}(undef, arraysize),
+      ROCArray{T}(undef, arraysize),
+      scalar,
+      arraysize,
+    )
+    backenddevice = ROCDevice()
+  else
+    error("unsupported backend $(backend)")
+  end
+
+  if !silent
+    println("Kernel parameters : <<<$(data.size),$(TBSize)>>>")
+  end
+  return (data, Context(backend, backenddevice))
+end
+
+# Fill a, b and c with init[1], init[2] and init[3] respectively on the active backend.
+function init_arrays!(
+  data::StreamData{T,C},
+  context::Context,
+  init::Tuple{T,T,T},
+) where {T,C}
+  if context.backend == cpu
+    Threads.@threads for i = 1:data.size
+      @inbounds data.a[i] = init[1]
+      @inbounds data.b[i] = init[2]
+      @inbounds data.c[i] = init[3]
+    end
+  elseif context.backend == cuda
+    CUDA.fill!(data.a, init[1])
+    CUDA.fill!(data.b, init[2])
+    CUDA.fill!(data.c, init[3])
+  elseif context.backend == rocm
+    AMDGPU.fill!(data.a, init[1])
+    AMDGPU.fill!(data.b, init[2])
+    AMDGPU.fill!(data.c, init[3])
+  else
+    # there is no local `backend` in this function; read it from the context
+    error("unsupported backend $(context.backend)")
+  end
+end
+
+# c[i] = a[i]
+function copy!(data::StreamData{T,C}, context::Context) where {T,C}
+  @kernel function kernel(@Const(a), c)
+    i = @index(Global)
+    @inbounds c[i] = a[i]
+  end
+  wait(kernel(context.device, TBSize)(data.a, data.c, ndrange = data.size))
+end
+
+# b[i] = scalar * c[i]
+function mul!(data::StreamData{T,C}, context::Context) where {T,C}
+  @kernel function kernel(b, @Const(c), scalar)
+    i = @index(Global)
+    @inbounds b[i] = scalar * c[i]
+  end
+  wait(kernel(context.device, TBSize)(data.b, data.c, data.scalar, ndrange = data.size))
+end
+
+# c[i] = a[i] + b[i]
+function add!(data::StreamData{T,C}, context::Context) where {T,C}
+  @kernel function kernel(@Const(a), @Const(b), c)
+    i = @index(Global)
+    @inbounds c[i] = a[i] + b[i]
+  end
+  wait(kernel(context.device, TBSize)(data.a, data.b, data.c, ndrange = data.size))
+end
+
+# a[i] = b[i] + scalar * c[i]
+function triad!(data::StreamData{T,C}, context::Context) where {T,C}
+  @kernel function kernel(a, @Const(b), @Const(c), scalar)
+    i = @index(Global)
+    @inbounds a[i] = b[i] + (scalar * c[i])
+  end
+  wait(
+    kernel(context.device, TBSize)(
+      data.a,
+      data.b,
+      data.c,
+      data.scalar,
+      ndrange = data.size,
+    ),
+  )
+end
+
+# a[i] += b[i] + scalar * c[i]
+function nstream!(data::StreamData{T,C}, context::Context) where {T,C}
+  @kernel function kernel(a, @Const(b), @Const(c), scalar)
+    i = @index(Global)
+    @inbounds a[i] += b[i] + scalar * c[i]
+  end
+  wait(
+    kernel(context.device, TBSize)(
+      data.a,
+      data.b,
+      data.c,
+      data.scalar,
+      ndrange = data.size,
+    ),
+  )
+end
+
+# Dot product of a and b: each work-group writes one entry of the partial array
+# (partial[group_i]); the partial array is then reduced with sum().
+function dot(data::StreamData{T,C}, context::Context) where {T,C}
+  @kernel function kernel(@Const(a), @Const(b), size, partial)
+    local_i = @index(Local)
+    group_i = @index(Group)
+    tb_sum = @localmem T TBSize
+    @inbounds tb_sum[local_i] = zero(T)
+
+    # do dot first
+    i = @index(Global)
+    while i <= size
+      @inbounds tb_sum[local_i] += a[i] * b[i]
+      i += TBSize * DotBlocks
+    end
+
+    # then tree reduction
+    # FIXME this does not compile when targeting CPUs:
+    # see https://github.com/JuliaGPU/KernelAbstractions.jl/issues/262
+    offset = @private Int64 (1,)
+    @inbounds begin
+      offset[1] = @groupsize()[1] ÷ 2
+      while offset[1] > 0
+        @synchronize
+        if (local_i - 1) < offset[1]
+          tb_sum[local_i] += tb_sum[local_i+offset[1]]
+        end
+        offset[1] ÷= 2
+      end
+    end
+
+    if (local_i == 1)
+      @inbounds partial[group_i] = tb_sum[local_i]
+    end
+  end
+
+  if context.backend == cpu
+    partial_sum = Vector{T}(undef, DotBlocks)
+  elseif context.backend == cuda
+    partial_sum = CuArray{T}(undef, DotBlocks)
+  elseif context.backend == rocm
+    partial_sum = ROCArray{T}(undef, DotBlocks)
+  else
+    error("unsupported backend $(context.backend)")
+  end
+
+  wait(
+    kernel(context.device, TBSize)(
+      data.a,
+      data.b,
+      data.size,
+      partial_sum,
+      ndrange = TBSize * DotBlocks,
+    ),
+  )
+
+  return sum(partial_sum)
+end
+
+# Rebuild the data as a VectorData{T}; main() hands the result to check_solutions.
+function read_data(data::StreamData{T,C}, _::Context)::VectorData{T} where {T,C}
+  return VectorData{T}(data.a, data.b, data.c, data.scalar, data.size)
+end
+
+main()
diff --git a/JuliaStream.jl/src/Stream.jl b/JuliaStream.jl/src/Stream.jl index 590ab2d..b7fc940 100644 --- a/JuliaStream.jl/src/Stream.jl +++ 
b/JuliaStream.jl/src/Stream.jl @@ -7,6 +7,8 @@ include("StreamData.jl") const VectorData = StreamData{T,Vector{T}} where {T} +const DeviceWithRepr = Tuple{Any,String,Any} + struct Timings copy::Vector{Float64} mul::Vector{Float64} @@ -18,29 +20,33 @@ end @enum Benchmark All Triad Nstream -function run_all!(data::StreamData{T,C}, times::Int)::Tuple{Timings,T} where {T,C} +function run_all!(data::StreamData{T,C}, context, times::Int)::Tuple{Timings,T} where {T,C} timings = Timings(times) lastSum::T = 0 for i = 1:times - @inbounds timings.copy[i] = @elapsed copy!(data) - @inbounds timings.mul[i] = @elapsed mul!(data) - @inbounds timings.add[i] = @elapsed add!(data) - @inbounds timings.triad[i] = @elapsed triad!(data) - @inbounds timings.dot[i] = @elapsed lastSum = dot(data) + @inbounds timings.copy[i] = @elapsed copy!(data, context) + @inbounds timings.mul[i] = @elapsed mul!(data, context) + @inbounds timings.add[i] = @elapsed add!(data, context) + @inbounds timings.triad[i] = @elapsed triad!(data, context) + @inbounds timings.dot[i] = @elapsed lastSum = dot(data, context) end return (timings, lastSum) end -function run_triad!(data::StreamData{T,C}, times::Int)::Float64 where {T,C} +function run_triad!(data::StreamData{T,C}, context, times::Int)::Float64 where {T,C} return @elapsed for _ = 1:times - triad!(data) + triad!(data, context) end end -function run_nstream!(data::StreamData{T,C}, times::Int)::Vector{Float64} where {T,C} +function run_nstream!( + data::StreamData{T,C}, + context, + times::Int, +)::Vector{Float64} where {T,C} timings::Vector{Float64} = zeros(times) for i = 1:times - @inbounds timings[i] = @elapsed nstream!(data) + @inbounds timings[i] = @elapsed nstream!(data, context) end return timings end @@ -160,25 +166,23 @@ function main() parse_options(config) if config.list - ds = devices() - for (i, device) in enumerate(ds) - println("[$i] $(device)") + for (i, (_,repr, impl)) in enumerate(devices()) + println("[$i] ($impl) $repr") end exit(0) end ds = 
devices() + # TODO implement substring device match if config.device < 1 || config.device > length(ds) error( "Device $(config.device) out of range (1..$(length(ds))), NOTE: Julia is 1-indexed", ) + else + device = ds[config.device] end - if config.float - type = Float32 - else - type = Float64 - end + type = config.float ? Float32 : Float64 if config.nstream_only && !config.triad_only benchmark = Nstream @@ -256,12 +260,12 @@ function main() init::Tuple{type,type,type} = DefaultInit scalar::type = DefaultScalar - data = make_stream(config.arraysize, scalar, config.device, config.csv) + (data, context) = make_stream(config.arraysize, scalar, device, config.csv) - init_arrays!(data, init) + init_arrays!(data, context, init) if benchmark == All - (timings, sum) = run_all!(data, config.numtimes) - valid = check_solutions(read_data(data), config.numtimes, init, benchmark, sum) + (timings, sum) = run_all!(data, context, config.numtimes) + valid = check_solutions(read_data(data, context), config.numtimes, init, benchmark, sum) tabulate( mk_row(timings.copy, "Copy", 2 * array_bytes), mk_row(timings.mul, "Mul", 2 * array_bytes), @@ -270,12 +274,14 @@ function main() mk_row(timings.dot, "Dot", 2 * array_bytes), ) elseif benchmark == Nstream - timings = run_nstream!(data, config.numtimes) - valid = check_solutions(read_data(data), config.numtimes, init, benchmark, nothing) + timings = run_nstream!(data, context, config.numtimes) + valid = + check_solutions(read_data(data, context), config.numtimes, init, benchmark, nothing) tabulate(mk_row(timings, "Nstream", 4 * array_bytes)) elseif benchmark == Triad - elapsed = run_triad!(data, config.numtimes) - valid = check_solutions(read_data(data), config.numtimes, init, benchmark, nothing) + elapsed = run_triad!(data, context, config.numtimes) + valid = + check_solutions(read_data(data, context), config.numtimes, init, benchmark, nothing) total_bytes = 3 * array_bytes * config.numtimes bandwidth = mega_scale * (total_bytes / 
elapsed) println("Runtime (seconds): $(round(elapsed; digits=5))") diff --git a/JuliaStream.jl/src/ThreadedStream.jl b/JuliaStream.jl/src/ThreadedStream.jl index fb995e6..4422e66 100644 --- a/JuliaStream.jl/src/ThreadedStream.jl +++ b/JuliaStream.jl/src/ThreadedStream.jl @@ -1,25 +1,31 @@ include("Stream.jl") -function devices() - return ["CPU"] +function devices()::Vector{DeviceWithRepr} + return [(undef, "$(Sys.cpu_info()[1].model) ($(Threads.nthreads())T)", "Threaded")] end function make_stream( arraysize::Int, scalar::T, - device::Int, + _::DeviceWithRepr, silent::Bool, -)::VectorData{T} where {T} - if device != 1 - error("Only CPU device is supported") - end +)::Tuple{VectorData{T},Nothing} where {T} if !silent println("Using max $(Threads.nthreads()) threads") end - return VectorData{T}(1:arraysize, 1:arraysize, 1:arraysize, scalar, arraysize) + return ( + VectorData{T}( + Vector{T}(undef, arraysize), + Vector{T}(undef, arraysize), + Vector{T}(undef, arraysize), + scalar, + arraysize, + ), + nothing + ) end -function init_arrays!(data::VectorData{T}, init::Tuple{T,T,T}) where {T} +function init_arrays!(data::VectorData{T}, _, init::Tuple{T,T,T}) where {T} Threads.@threads for i = 1:data.size @inbounds data.a[i] = init[1] @inbounds data.b[i] = init[2] @@ -27,37 +33,37 @@ function init_arrays!(data::VectorData{T}, init::Tuple{T,T,T}) where {T} end end -function copy!(data::VectorData{T}) where {T} +function copy!(data::VectorData{T}, _) where {T} Threads.@threads for i = 1:data.size @inbounds data.c[i] = data.a[i] end end -function mul!(data::VectorData{T}) where {T} +function mul!(data::VectorData{T}, _) where {T} Threads.@threads for i = 1:data.size @inbounds data.b[i] = data.scalar * data.c[i] end end -function add!(data::VectorData{T}) where {T} +function add!(data::VectorData{T}, _) where {T} Threads.@threads for i = 1:data.size @inbounds data.c[i] = data.a[i] + data.b[i] end end -function triad!(data::VectorData{T}) where {T} +function 
triad!(data::VectorData{T}, _) where {T} Threads.@threads for i = 1:data.size @inbounds data.a[i] = data.b[i] + (data.scalar * data.c[i]) end end -function nstream!(data::VectorData{T}) where {T} +function nstream!(data::VectorData{T}, _) where {T} Threads.@threads for i = 1:data.size @inbounds data.a[i] += data.b[i] + data.scalar * data.c[i] end end -function dot(data::VectorData{T}) where {T} +function dot(data::VectorData{T}, _) where {T} partial = zeros(T, Threads.nthreads()) Threads.@threads for i = 1:data.size @inbounds partial[Threads.threadid()] += data.a[i] * data.b[i] @@ -65,7 +71,7 @@ function dot(data::VectorData{T}) where {T} return sum(partial) end -function read_data(data::VectorData{T})::VectorData{T} where {T} +function read_data(data::VectorData{T}, _)::VectorData{T} where {T} return data end
diff --git a/JuliaStream.jl/src/oneAPIStream.jl b/JuliaStream.jl/src/oneAPIStream.jl
new file mode 100644
index 0000000..1bc319d
--- /dev/null
+++ b/JuliaStream.jl/src/oneAPIStream.jl
@@ -0,0 +1,190 @@
+using Base.Iterators: println
+using Printf: Iterators
+
+include("Stream.jl")
+using oneAPI
+
+const oneData = StreamData{T,oneArray{T}} where {T}
+const DotWGSize = 256::Int
+
+# List every Level Zero device of every driver as (device, repr, "oneAPI.jl") tuples.
+function devices()::Vector{DeviceWithRepr}
+  found = map(oneL0.devices, oneL0.drivers()) |> Iterators.flatten |> Iterators.collect
+  map(dev -> (dev, repr("text/plain", dev), "oneAPI.jl"), found)
+end
+
+# Select `device`, derive the work-group size from its compute properties and allocate
+# the three benchmark arrays. Returns (data, groupsize); groupsize is the context value
+# handed to every kernel launcher below.
+function make_stream(
+  arraysize::Int,
+  scalar::T,
+  device::DeviceWithRepr,
+  silent::Bool,
+)::Tuple{oneData{T},Int} where {T}
+
+  if arraysize < 1
+    error("arraysize ($(arraysize)) must be positive!")
+  end
+
+  oneAPI.allowscalar(false)
+  oneAPI.device!(device[1])
+
+  props = oneL0.compute_properties(oneAPI.device())
+  groupsize = min(props.maxTotalGroupSize, arraysize)
+
+  if arraysize % groupsize != 0
+    error("arraysize ($(arraysize)) must be divisible by $(groupsize)!")
+  end
+
+  if !silent
+    println("Using L0 device: $(repr("text/plain",device[1]))")
+    println("Kernel parameters : <<<$(arraysize),$(groupsize)>>>")
+  end
+  return (
+    oneData{T}(
+      oneArray{T}(undef, arraysize),
+      oneArray{T}(undef, arraysize),
+      oneArray{T}(undef, arraysize),
+      scalar,
+      arraysize,
+    ),
+    groupsize,
+  )
+end
+
+# Fill a, b and c with init[1], init[2] and init[3] respectively.
+function init_arrays!(data::oneData{T}, _, init::Tuple{T,T,T}) where {T}
+  oneAPI.fill!(data.a, init[1])
+  oneAPI.fill!(data.b, init[2])
+  oneAPI.fill!(data.c, init[3])
+end
+
+# c[i] = a[i]
+function copy!(data::oneData{T}, groupsize::Int) where {T}
+  function kernel(a, c)
+    i = get_global_id()
+    @inbounds c[i] = a[i]
+    return
+  end
+  @oneapi items = groupsize groups = data.size ÷ groupsize kernel( #
+    data.a,
+    data.c,
+  )
+  oneAPI.synchronize()
+end
+
+# b[i] = scalar * c[i]
+function mul!(data::oneData{T}, groupsize::Int) where {T}
+  function kernel(b, c, scalar)
+    i = get_global_id()
+    @inbounds b[i] = scalar * c[i]
+    return
+  end
+  @oneapi items = groupsize groups = data.size ÷ groupsize kernel( #
+    data.b,
+    data.c,
+    data.scalar,
+  )
+  oneAPI.synchronize()
+end
+
+# c[i] = a[i] + b[i]
+function add!(data::oneData{T}, groupsize::Int) where {T}
+  function kernel(a, b, c)
+    i = get_global_id()
+    @inbounds c[i] = a[i] + b[i]
+    return
+  end
+  @oneapi items = groupsize groups = data.size ÷ groupsize kernel( #
+    data.a,
+    data.b,
+    data.c,
+  )
+  oneAPI.synchronize()
+end
+
+# a[i] = b[i] + scalar * c[i]
+function triad!(data::oneData{T}, groupsize::Int) where {T}
+  function kernel(a, b, c, scalar)
+    i = get_global_id()
+    @inbounds a[i] = b[i] + (scalar * c[i])
+    return
+  end
+  @oneapi items = groupsize groups = data.size ÷ groupsize kernel( #
+    data.a,
+    data.b,
+    data.c,
+    data.scalar,
+  )
+  oneAPI.synchronize()
+end
+
+# a[i] += b[i] + scalar * c[i]
+function nstream!(data::oneData{T}, groupsize::Int) where {T}
+  function kernel(a, b, c, scalar)
+    i = get_global_id()
+    @inbounds a[i] += b[i] + scalar * c[i]
+    return
+  end
+  @oneapi items = groupsize groups = data.size ÷ groupsize kernel( #
+    data.a,
+    data.b,
+    data.c,
+    data.scalar,
+  )
+  oneAPI.synchronize()
+end
+
+# Dot product of a and b. Each work-group accumulates into wg_sum, halves it down to
+# one value and writes that to partial[group id]; the partials are then summed.
+function dot(data::oneData{T}, groupsize::Int) where {T}
+  function kernel(a, b, size, partial)
+    wg_sum = @LocalMemory(T, (DotWGSize,))
+    li = get_local_id()
+    @inbounds wg_sum[li] = zero(T)
+
+    # do dot first
+    i = get_global_id()
+    while i <= size
+      @inbounds wg_sum[li] += a[i] * b[i]
+      i += get_global_size()
+    end
+
+    # then tree reduction
+    offset = get_local_size() ÷ 2
+    while offset > 0
+      barrier()
+      if li <= offset
+        @inbounds wg_sum[li] += wg_sum[li+offset]
+      end
+      offset ÷= 2
+    end
+
+    if li == 1
+      @inbounds partial[get_group_id()] = wg_sum[li]
+    end
+
+    return
+  end
+  # The work-group size has to fit wg_sum (DotWGSize slots), stay within the size chosen
+  # for this device, and be a power of two so the halving reduction visits every slot.
+  wgsize = prevpow(2, min(groupsize, DotWGSize))
+  ngroups = DotWGSize
+  partial_sum = oneArray{T}(undef, ngroups) # one slot per work-group
+  @oneapi items = wgsize groups = ngroups kernel(
+    data.a,
+    data.b,
+    data.size,
+    partial_sum,
+  )
+  oneAPI.synchronize()
+  return sum(partial_sum)
+end
+
+# Rebuild the data as a VectorData{T}; main() hands the result to check_solutions.
+function read_data(data::oneData{T}, _)::VectorData{T} where {T}
+  return VectorData{T}(data.a, data.b, data.c, data.scalar, data.size)
+end
+
+main()
\ No newline at end of file
diff --git a/JuliaStream.jl/update_all.sh b/JuliaStream.jl/update_all.sh new file mode 100755 index 0000000..ad6c2ee --- /dev/null +++ b/JuliaStream.jl/update_all.sh @@ -0,0 +1,7 @@ +#!/bin/bash +# shellcheck disable=SC2034 disable=SC2153 + +for BACKEND in "." 
"AMDGPU" "CUDA" "oneAPI" "Threaded" "KernelAbstractions" +do + julia --project="$BACKEND" -e 'import Pkg; Pkg.resolve(); Pkg.instantiate(); Pkg.update(); Pkg.gc();' +done \ No newline at end of file From bb271dd046643e7ac83fd3891a261ffb00f4abc5 Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Wed, 18 Aug 2021 01:59:06 +0100 Subject: [PATCH 12/17] Update PlainStream with context --- JuliaStream.jl/src/PlainStream.jl | 41 ++++++++++++++++++++----------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/JuliaStream.jl/src/PlainStream.jl b/JuliaStream.jl/src/PlainStream.jl index 259a9b7..654d6eb 100644 --- a/JuliaStream.jl/src/PlainStream.jl +++ b/JuliaStream.jl/src/PlainStream.jl @@ -1,17 +1,28 @@ include("Stream.jl") -function devices() - return ["CPU"] +function devices()::Vector{DeviceWithRepr} + return [(undef, "CPU", "Palin")] end -function make_stream(arraysize::Int, scalar::T, device::Int, silent::Bool)::VectorData{T} where {T} - if device != 1 - error("Only CPU device is supported") - end - return VectorData{T}(1:arraysize, 1:arraysize, 1:arraysize, scalar, arraysize) +function make_stream( + arraysize::Int, + scalar::T, + _::DeviceWithRepr, + silent::Bool, +)::Tuple{VectorData{T},Nothing} where {T} + return ( + VectorData{T}( + Vector{T}(undef, arraysize), + Vector{T}(undef, arraysize), + Vector{T}(undef, arraysize), + scalar, + arraysize, + ), + nothing + ) end -function init_arrays!(data::VectorData{T}, init::Tuple{T,T,T}) where {T} +function init_arrays!(data::VectorData{T}, _, init::Tuple{T,T,T}) where {T} for i = 1:data.size @inbounds data.a[i] = init[1] @inbounds data.b[i] = init[2] @@ -19,37 +30,37 @@ function init_arrays!(data::VectorData{T}, init::Tuple{T,T,T}) where {T} end end -function copy!(data::VectorData{T}) where {T} +function copy!(data::VectorData{T}, _) where {T} for i = 1:data.size @inbounds data.c[i] = data.a[i] end end -function mul!(data::VectorData{T}) where {T} +function mul!(data::VectorData{T}, _) where {T} for i = 
1:data.size @inbounds data.b[i] = data.scalar * data.c[i] end end -function add!(data::VectorData{T}) where {T} +function add!(data::VectorData{T}, _) where {T} for i = 1:data.size @inbounds data.c[i] = data.a[i] + data.b[i] end end -function triad!(data::VectorData{T}) where {T} +function triad!(data::VectorData{T}, _) where {T} for i = 1:data.size @inbounds data.a[i] = data.b[i] + (data.scalar * data.c[i]) end end -function nstream!(data::VectorData{T}) where {T} +function nstream!(data::VectorData{T}, _) where {T} for i = 1:data.size @inbounds data.a[i] += data.b[i] + data.scalar * data.c[i] end end -function dot(data::VectorData{T}) where {T} +function dot(data::VectorData{T}, _) where {T} sum = zero(T) for i = 1:data.size @inbounds sum += data.a[i] * data.b[i] @@ -57,7 +68,7 @@ function dot(data::VectorData{T}) where {T} return sum end -function read_data(data::VectorData{T})::VectorData{T} where {T} +function read_data(data::VectorData{T}, _)::VectorData{T} where {T} return data end From c445b646909c95ee25658d7ac7ef8bc0e381f55b Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Wed, 18 Aug 2021 02:00:50 +0100 Subject: [PATCH 13/17] Address CUDA comments Drop soft=false for AMDGPU as this option was removed Update dependencies --- JuliaStream.jl/AMDGPU/Manifest.toml | 4 ++-- JuliaStream.jl/CUDA/Manifest.toml | 8 ++++---- .../KernelAbstractions/Manifest.toml | 4 ++-- JuliaStream.jl/oneAPI/Manifest.toml | 4 ++-- JuliaStream.jl/src/AMDGPUStream.jl | 6 ------ JuliaStream.jl/src/CUDAStream.jl | 20 +++++++++---------- 6 files changed, 19 insertions(+), 27 deletions(-) diff --git a/JuliaStream.jl/AMDGPU/Manifest.toml b/JuliaStream.jl/AMDGPU/Manifest.toml index 6e27f0a..5d1a8a7 100644 --- a/JuliaStream.jl/AMDGPU/Manifest.toml +++ b/JuliaStream.jl/AMDGPU/Manifest.toml @@ -115,9 +115,9 @@ version = "4.2.0" [[LLVMExtra_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "a9b1130c4728b0e462a1c28772954650039eb847" +git-tree-sha1 = 
"2d5a0044d6505f4771b5c82de87393f0c9741154" uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab" -version = "0.0.7+0" +version = "0.0.8+0" [[LibCURL]] deps = ["LibCURL_jll", "MozillaCACerts_jll"] diff --git a/JuliaStream.jl/CUDA/Manifest.toml b/JuliaStream.jl/CUDA/Manifest.toml index 7330228..af0acfc 100644 --- a/JuliaStream.jl/CUDA/Manifest.toml +++ b/JuliaStream.jl/CUDA/Manifest.toml @@ -40,9 +40,9 @@ version = "0.4.1" [[CUDA]] deps = ["AbstractFFTs", "Adapt", "BFloat16s", "CEnum", "CompilerSupportLibraries_jll", "ExprTools", "GPUArrays", "GPUCompiler", "LLVM", "LazyArtifacts", "Libdl", "LinearAlgebra", "Logging", "Printf", "Random", "Random123", "RandomNumbers", "Reexport", "Requires", "SparseArrays", "SpecialFunctions", "TimerOutputs"] -git-tree-sha1 = "9303b20dfa74e4bcb4da425d351d551fbb5850be" +git-tree-sha1 = "c583f3ccdce071b8a8bce9bf3d5d5409eaf36d2b" uuid = "052768ef-5323-5732-b1bb-66c8b64840ba" -version = "3.4.0" +version = "3.4.1" [[ChainRulesCore]] deps = ["Compat", "LinearAlgebra", "SparseArrays"] @@ -122,9 +122,9 @@ version = "4.2.0" [[LLVMExtra_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "a9b1130c4728b0e462a1c28772954650039eb847" +git-tree-sha1 = "2d5a0044d6505f4771b5c82de87393f0c9741154" uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab" -version = "0.0.7+0" +version = "0.0.8+0" [[LazyArtifacts]] deps = ["Artifacts", "Pkg"] diff --git a/JuliaStream.jl/KernelAbstractions/Manifest.toml b/JuliaStream.jl/KernelAbstractions/Manifest.toml index 5c24cf5..25fd8d1 100644 --- a/JuliaStream.jl/KernelAbstractions/Manifest.toml +++ b/JuliaStream.jl/KernelAbstractions/Manifest.toml @@ -185,9 +185,9 @@ version = "4.2.0" [[LLVMExtra_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "a9b1130c4728b0e462a1c28772954650039eb847" +git-tree-sha1 = "2d5a0044d6505f4771b5c82de87393f0c9741154" uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab" -version = "0.0.7+0" +version = "0.0.8+0" [[LazyArtifacts]] deps = ["Artifacts", "Pkg"] diff --git 
a/JuliaStream.jl/oneAPI/Manifest.toml b/JuliaStream.jl/oneAPI/Manifest.toml index ca932aa..3aab94b 100644 --- a/JuliaStream.jl/oneAPI/Manifest.toml +++ b/JuliaStream.jl/oneAPI/Manifest.toml @@ -104,9 +104,9 @@ version = "4.2.0" [[LLVMExtra_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "a9b1130c4728b0e462a1c28772954650039eb847" +git-tree-sha1 = "2d5a0044d6505f4771b5c82de87393f0c9741154" uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab" -version = "0.0.7+0" +version = "0.0.8+0" [[LibCURL]] deps = ["LibCURL_jll", "MozillaCACerts_jll"] diff --git a/JuliaStream.jl/src/AMDGPUStream.jl b/JuliaStream.jl/src/AMDGPUStream.jl index 3ed9748..8347637 100644 --- a/JuliaStream.jl/src/AMDGPUStream.jl +++ b/JuliaStream.jl/src/AMDGPUStream.jl @@ -61,7 +61,6 @@ function copy!(data::ROCData{T}, _) where {T} return end AMDGPU.wait( - soft = false, # soft wait causes HSA_REFCOUNT overflow issues @roc groupsize = TBSize gridsize = data.size kernel(data.a, data.c) ) end @@ -73,7 +72,6 @@ function mul!(data::ROCData{T}, _) where {T} return end AMDGPU.wait( - soft = false, # soft wait causes HSA_REFCOUNT overflow issues @roc groupsize = TBSize gridsize = data.size kernel(data.b, data.c, data.scalar) ) end @@ -85,7 +83,6 @@ function add!(data::ROCData{T}, _) where {T} return end AMDGPU.wait( - soft = false, # soft wait causes HSA_REFCOUNT overflow issues @roc groupsize = TBSize gridsize = data.size kernel(data.a, data.b, data.c) ) end @@ -97,7 +94,6 @@ function triad!(data::ROCData{T}, _) where {T} return end AMDGPU.wait( - soft = false, # soft wait causes HSA_REFCOUNT overflow issues @roc groupsize = TBSize gridsize = data.size kernel( data.a, data.b, @@ -114,7 +110,6 @@ function nstream!(data::ROCData{T}, _) where {T} return end AMDGPU.wait( - soft = false, # soft wait causes HSA_REFCOUNT overflow issues @roc groupsize = TBSize gridsize = data.size kernel( data.a, data.b, @@ -155,7 +150,6 @@ function dot(data::ROCData{T}, _) where {T} end partial_sum = 
ROCArray{T}(undef, DotBlocks) AMDGPU.wait( - soft = false, # soft wait causes HSA_REFCOUNT overflow issues @roc groupsize = TBSize gridsize = TBSize * DotBlocks kernel( data.a, data.b, diff --git a/JuliaStream.jl/src/CUDAStream.jl b/JuliaStream.jl/src/CUDAStream.jl index dd4fc44..b46b3c9 100644 --- a/JuliaStream.jl/src/CUDAStream.jl +++ b/JuliaStream.jl/src/CUDAStream.jl @@ -21,7 +21,6 @@ function make_stream( error("arraysize ($(arraysize)) must be divisible by $(TBSize)!") end - # so CUDA's device is 0 indexed, so -1 from Julia CUDA.device!(device[1]) selected = CUDA.device() # show_reason is set to true here so it dumps CUDA info @@ -46,14 +45,14 @@ function make_stream( end function init_arrays!(data::CuData{T}, _, init::Tuple{T,T,T}) where {T} - CUDA.fill!(data.a, init[1]) - CUDA.fill!(data.b, init[2]) - CUDA.fill!(data.c, init[3]) + fill!(data.a, init[1]) + fill!(data.b, init[2]) + fill!(data.c, init[3]) end function copy!(data::CuData{T}, _) where {T} function kernel(a, c) - i = (blockIdx().x - 1) * blockDim().x + threadIdx().x # only blockIdx starts at 1 + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x @inbounds c[i] = a[i] return end @@ -63,7 +62,7 @@ end function mul!(data::CuData{T}, _) where {T} function kernel(b, c, scalar) - i = (blockIdx().x - 1) * blockDim().x + threadIdx().x # only blockIdx starts at 1 + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x @inbounds b[i] = scalar * c[i] return end @@ -73,7 +72,7 @@ end function add!(data::CuData{T}, _) where {T} function kernel(a, b, c) - i = (blockIdx().x - 1) * blockDim().x + threadIdx().x # only blockIdx starts at 1 + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x @inbounds c[i] = a[i] + b[i] return end @@ -83,7 +82,7 @@ end function triad!(data::CuData{T}, _) where {T} function kernel(a, b, c, scalar) - i = (blockIdx().x - 1) * blockDim().x + threadIdx().x # only blockIdx starts at 1 + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x @inbounds a[i] = b[i] + (scalar * c[i]) 
return end @@ -98,7 +97,7 @@ end function nstream!(data::CuData{T}, _) where {T} function kernel(a, b, c, scalar) - i = (blockIdx().x - 1) * blockDim().x + threadIdx().x # only blockIdx starts at 1 + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x @inbounds a[i] += b[i] + scalar * c[i] return end @@ -119,7 +118,7 @@ function dot(data::CuData{T}, _) where {T} @inbounds tb_sum[local_i] = 0.0 # do dot first - i = (blockIdx().x - 1) * blockDim().x + threadIdx().x # only blockIdx starts at 1 + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x while i <= size @inbounds tb_sum[local_i] += a[i] * b[i] i += blockDim().x * gridDim().x @@ -143,7 +142,6 @@ function dot(data::CuData{T}, _) where {T} end partial_sum = CuArray{T}(undef, DotBlocks) @cuda blocks = DotBlocks threads = TBSize kernel(data.a, data.b, data.size, partial_sum) - CUDA.synchronize() return sum(partial_sum) end From 4853457dca6e3c5ce94f83f3227cd69d49bacba4 Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Fri, 27 Aug 2021 14:04:58 +0100 Subject: [PATCH 14/17] Add type annotation for all kernels Update dependencies --- JuliaStream.jl/AMDGPU/Manifest.toml | 8 ++++---- JuliaStream.jl/CUDA/Manifest.toml | 16 +++++++-------- .../KernelAbstractions/Manifest.toml | 20 +++++++++---------- JuliaStream.jl/Manifest.toml | 12 +++++------ JuliaStream.jl/oneAPI/Manifest.toml | 12 +++++------ JuliaStream.jl/src/AMDGPUStream.jl | 12 +++++------ JuliaStream.jl/src/CUDAStream.jl | 12 +++++------ .../src/KernelAbstractionsStream.jl | 12 +++++------ JuliaStream.jl/src/oneAPIStream.jl | 12 +++++------ 9 files changed, 58 insertions(+), 58 deletions(-) diff --git a/JuliaStream.jl/AMDGPU/Manifest.toml b/JuliaStream.jl/AMDGPU/Manifest.toml index 5d1a8a7..6525501 100644 --- a/JuliaStream.jl/AMDGPU/Manifest.toml +++ b/JuliaStream.jl/AMDGPU/Manifest.toml @@ -109,15 +109,15 @@ version = "1.3.0" [[LLVM]] deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Printf", "Unicode"] -git-tree-sha1 = "d6041ad706cf458b2c9f3e501152488a26451e9c" 
+git-tree-sha1 = "23a47d417a3cd9c2e73c854bac7dd4731c105ef7" uuid = "929cbde3-209d-540e-8aea-75f648917ca0" -version = "4.2.0" +version = "4.4.0" [[LLVMExtra_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "2d5a0044d6505f4771b5c82de87393f0c9741154" +git-tree-sha1 = "9c360e5ce980b88bb31a7b086dbb19469008154b" uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab" -version = "0.0.8+0" +version = "0.0.10+0" [[LibCURL]] deps = ["LibCURL_jll", "MozillaCACerts_jll"] diff --git a/JuliaStream.jl/CUDA/Manifest.toml b/JuliaStream.jl/CUDA/Manifest.toml index af0acfc..ef6da14 100644 --- a/JuliaStream.jl/CUDA/Manifest.toml +++ b/JuliaStream.jl/CUDA/Manifest.toml @@ -52,9 +52,9 @@ version = "1.3.0" [[Compat]] deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "SHA", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"] -git-tree-sha1 = "79b9563ef3f2cc5fc6d3046a5ee1a57c9de52495" +git-tree-sha1 = "727e463cfebd0c7b999bbf3e9e7e16f254b94193" uuid = "34da2185-b29b-5c13-b0c7-acf172513d20" -version = "3.33.0" +version = "3.34.0" [[CompilerSupportLibraries_jll]] deps = ["Artifacts", "Libdl"] @@ -116,15 +116,15 @@ version = "1.3.0" [[LLVM]] deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Printf", "Unicode"] -git-tree-sha1 = "d6041ad706cf458b2c9f3e501152488a26451e9c" +git-tree-sha1 = "23a47d417a3cd9c2e73c854bac7dd4731c105ef7" uuid = "929cbde3-209d-540e-8aea-75f648917ca0" -version = "4.2.0" +version = "4.4.0" [[LLVMExtra_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "2d5a0044d6505f4771b5c82de87393f0c9741154" +git-tree-sha1 = "9c360e5ce980b88bb31a7b086dbb19469008154b" uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab" -version = "0.0.8+0" +version = "0.0.10+0" [[LazyArtifacts]] deps = ["Artifacts", "Pkg"] @@ -231,9 +231,9 @@ uuid = "e6cf234a-135c-5ec9-84dd-332b85af5143" version = "1.5.3" 
[[Reexport]] -git-tree-sha1 = "5f6c21241f0f655da3952fd60aa18477cf96c220" +git-tree-sha1 = "45e428421666073eab6f2da5c9d310d99bb12f9b" uuid = "189a3867-3050-52da-a836-e630ba90ab69" -version = "1.1.0" +version = "1.2.2" [[Requires]] deps = ["UUIDs"] diff --git a/JuliaStream.jl/KernelAbstractions/Manifest.toml b/JuliaStream.jl/KernelAbstractions/Manifest.toml index 25fd8d1..bfc562f 100644 --- a/JuliaStream.jl/KernelAbstractions/Manifest.toml +++ b/JuliaStream.jl/KernelAbstractions/Manifest.toml @@ -69,9 +69,9 @@ uuid = "72cfdca4-0801-4ab0-bf6a-d52aa10adc57" version = "0.3.0" [[Cassette]] -git-tree-sha1 = "087e76b8d48c014112ba890892c33be42ad10504" +git-tree-sha1 = "b4b1d61ebbae2bc69a45e3a6b8439b4e411bc131" uuid = "7057c7e9-c182-5462-911a-8362d720325c" -version = "0.3.7" +version = "0.3.8" [[ChainRulesCore]] deps = ["Compat", "LinearAlgebra", "SparseArrays"] @@ -81,9 +81,9 @@ version = "1.3.0" [[Compat]] deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "SHA", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"] -git-tree-sha1 = "79b9563ef3f2cc5fc6d3046a5ee1a57c9de52495" +git-tree-sha1 = "727e463cfebd0c7b999bbf3e9e7e16f254b94193" uuid = "34da2185-b29b-5c13-b0c7-acf172513d20" -version = "3.33.0" +version = "3.34.0" [[CompilerSupportLibraries_jll]] deps = ["Artifacts", "Libdl"] @@ -179,15 +179,15 @@ version = "0.7.0" [[LLVM]] deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Printf", "Unicode"] -git-tree-sha1 = "d6041ad706cf458b2c9f3e501152488a26451e9c" +git-tree-sha1 = "23a47d417a3cd9c2e73c854bac7dd4731c105ef7" uuid = "929cbde3-209d-540e-8aea-75f648917ca0" -version = "4.2.0" +version = "4.4.0" [[LLVMExtra_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "2d5a0044d6505f4771b5c82de87393f0c9741154" +git-tree-sha1 = "9c360e5ce980b88bb31a7b086dbb19469008154b" uuid = 
"dad2f222-ce93-54a1-a47d-0025e8a3acab" -version = "0.0.8+0" +version = "0.0.10+0" [[LazyArtifacts]] deps = ["Artifacts", "Pkg"] @@ -354,9 +354,9 @@ uuid = "e6cf234a-135c-5ec9-84dd-332b85af5143" version = "1.5.3" [[Reexport]] -git-tree-sha1 = "5f6c21241f0f655da3952fd60aa18477cf96c220" +git-tree-sha1 = "45e428421666073eab6f2da5c9d310d99bb12f9b" uuid = "189a3867-3050-52da-a836-e630ba90ab69" -version = "1.1.0" +version = "1.2.2" [[Requires]] deps = ["UUIDs"] diff --git a/JuliaStream.jl/Manifest.toml b/JuliaStream.jl/Manifest.toml index 14f2029..c096e05 100644 --- a/JuliaStream.jl/Manifest.toml +++ b/JuliaStream.jl/Manifest.toml @@ -69,9 +69,9 @@ uuid = "72cfdca4-0801-4ab0-bf6a-d52aa10adc57" version = "0.3.0" [[Cassette]] -git-tree-sha1 = "087e76b8d48c014112ba890892c33be42ad10504" +git-tree-sha1 = "b4b1d61ebbae2bc69a45e3a6b8439b4e411bc131" uuid = "7057c7e9-c182-5462-911a-8362d720325c" -version = "0.3.7" +version = "0.3.8" [[ChainRulesCore]] deps = ["Compat", "LinearAlgebra", "SparseArrays"] @@ -81,9 +81,9 @@ version = "1.3.0" [[Compat]] deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "SHA", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"] -git-tree-sha1 = "79b9563ef3f2cc5fc6d3046a5ee1a57c9de52495" +git-tree-sha1 = "727e463cfebd0c7b999bbf3e9e7e16f254b94193" uuid = "34da2185-b29b-5c13-b0c7-acf172513d20" -version = "3.33.0" +version = "3.34.0" [[CompilerSupportLibraries_jll]] deps = ["Artifacts", "Libdl"] @@ -312,9 +312,9 @@ uuid = "e6cf234a-135c-5ec9-84dd-332b85af5143" version = "1.5.3" [[Reexport]] -git-tree-sha1 = "5f6c21241f0f655da3952fd60aa18477cf96c220" +git-tree-sha1 = "45e428421666073eab6f2da5c9d310d99bb12f9b" uuid = "189a3867-3050-52da-a836-e630ba90ab69" -version = "1.1.0" +version = "1.2.2" [[Requires]] deps = ["UUIDs"] diff --git a/JuliaStream.jl/oneAPI/Manifest.toml 
b/JuliaStream.jl/oneAPI/Manifest.toml index 3aab94b..82c40fd 100644 --- a/JuliaStream.jl/oneAPI/Manifest.toml +++ b/JuliaStream.jl/oneAPI/Manifest.toml @@ -34,9 +34,9 @@ version = "1.3.0" [[Compat]] deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "SHA", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"] -git-tree-sha1 = "79b9563ef3f2cc5fc6d3046a5ee1a57c9de52495" +git-tree-sha1 = "727e463cfebd0c7b999bbf3e9e7e16f254b94193" uuid = "34da2185-b29b-5c13-b0c7-acf172513d20" -version = "3.33.0" +version = "3.34.0" [[CompilerSupportLibraries_jll]] deps = ["Artifacts", "Libdl"] @@ -98,15 +98,15 @@ version = "1.3.0" [[LLVM]] deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Printf", "Unicode"] -git-tree-sha1 = "d6041ad706cf458b2c9f3e501152488a26451e9c" +git-tree-sha1 = "23a47d417a3cd9c2e73c854bac7dd4731c105ef7" uuid = "929cbde3-209d-540e-8aea-75f648917ca0" -version = "4.2.0" +version = "4.4.0" [[LLVMExtra_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "2d5a0044d6505f4771b5c82de87393f0c9741154" +git-tree-sha1 = "9c360e5ce980b88bb31a7b086dbb19469008154b" uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab" -version = "0.0.8+0" +version = "0.0.10+0" [[LibCURL]] deps = ["LibCURL_jll", "MozillaCACerts_jll"] diff --git a/JuliaStream.jl/src/AMDGPUStream.jl b/JuliaStream.jl/src/AMDGPUStream.jl index 8347637..4dd220c 100644 --- a/JuliaStream.jl/src/AMDGPUStream.jl +++ b/JuliaStream.jl/src/AMDGPUStream.jl @@ -55,7 +55,7 @@ function init_arrays!(data::ROCData{T}, _, init::Tuple{T,T,T}) where {T} end function copy!(data::ROCData{T}, _) where {T} - function kernel(a, c) + function kernel(a::AbstractArray{T}, c::AbstractArray{T}) i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x # only workgroupIdx starts at 1 @inbounds c[i] = a[i] return @@ -66,7 +66,7 @@ function copy!(data::ROCData{T}, 
_) where {T} end function mul!(data::ROCData{T}, _) where {T} - function kernel(b, c, scalar) + function kernel(b::AbstractArray{T}, c::AbstractArray{T}, scalar::T) i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x # only workgroupIdx starts at 1 @inbounds b[i] = scalar * c[i] return @@ -77,7 +77,7 @@ function mul!(data::ROCData{T}, _) where {T} end function add!(data::ROCData{T}, _) where {T} - function kernel(a, b, c) + function kernel(a::AbstractArray{T}, b::AbstractArray{T}, c::AbstractArray{T}) i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x # only workgroupIdx starts at 1 @inbounds c[i] = a[i] + b[i] return @@ -88,7 +88,7 @@ function add!(data::ROCData{T}, _) where {T} end function triad!(data::ROCData{T}, _) where {T} - function kernel(a, b, c, scalar) + function kernel(a::AbstractArray{T}, b::AbstractArray{T}, c::AbstractArray{T}, scalar::T) i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x # only workgroupIdx starts at 1 @inbounds a[i] = b[i] + (scalar * c[i]) return @@ -104,7 +104,7 @@ function triad!(data::ROCData{T}, _) where {T} end function nstream!(data::ROCData{T}, _) where {T} - function kernel(a, b, c, scalar) + function kernel(a::AbstractArray{T}, b::AbstractArray{T}, c::AbstractArray{T}, scalar::T) i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x # only workgroupIdx starts at 1 @inbounds a[i] += b[i] + scalar * c[i] return @@ -120,7 +120,7 @@ function nstream!(data::ROCData{T}, _) where {T} end function dot(data::ROCData{T}, _) where {T} - function kernel(a, b, size, partial) + function kernel(a::AbstractArray{T}, b::AbstractArray{T}, size::Int, partial::AbstractArray{T}) tb_sum = ROCDeviceArray((TBSize,), alloc_local(:reduce, T, TBSize)) local_i = workitemIdx().x @inbounds tb_sum[local_i] = 0.0 diff --git a/JuliaStream.jl/src/CUDAStream.jl b/JuliaStream.jl/src/CUDAStream.jl index b46b3c9..da3698e 100644 --- a/JuliaStream.jl/src/CUDAStream.jl +++ b/JuliaStream.jl/src/CUDAStream.jl @@ 
-51,7 +51,7 @@ function init_arrays!(data::CuData{T}, _, init::Tuple{T,T,T}) where {T} end function copy!(data::CuData{T}, _) where {T} - function kernel(a, c) + function kernel(a::AbstractArray{T}, c::AbstractArray{T}) i = (blockIdx().x - 1) * blockDim().x + threadIdx().x @inbounds c[i] = a[i] return @@ -61,7 +61,7 @@ function copy!(data::CuData{T}, _) where {T} end function mul!(data::CuData{T}, _) where {T} - function kernel(b, c, scalar) + function kernel(b::AbstractArray{T}, c::AbstractArray{T}, scalar::T) i = (blockIdx().x - 1) * blockDim().x + threadIdx().x @inbounds b[i] = scalar * c[i] return @@ -71,7 +71,7 @@ function mul!(data::CuData{T}, _) where {T} end function add!(data::CuData{T}, _) where {T} - function kernel(a, b, c) + function kernel(a::AbstractArray{T}, b::AbstractArray{T}, c::AbstractArray{T}) i = (blockIdx().x - 1) * blockDim().x + threadIdx().x @inbounds c[i] = a[i] + b[i] return @@ -81,7 +81,7 @@ function add!(data::CuData{T}, _) where {T} end function triad!(data::CuData{T}, _) where {T} - function kernel(a, b, c, scalar) + function kernel(a::AbstractArray{T}, b::AbstractArray{T}, c::AbstractArray{T}, scalar::T) i = (blockIdx().x - 1) * blockDim().x + threadIdx().x @inbounds a[i] = b[i] + (scalar * c[i]) return @@ -96,7 +96,7 @@ function triad!(data::CuData{T}, _) where {T} end function nstream!(data::CuData{T}, _) where {T} - function kernel(a, b, c, scalar) + function kernel(a::AbstractArray{T}, b::AbstractArray{T}, c::AbstractArray{T}, scalar::T) i = (blockIdx().x - 1) * blockDim().x + threadIdx().x @inbounds a[i] += b[i] + scalar * c[i] return @@ -112,7 +112,7 @@ end function dot(data::CuData{T}, _) where {T} # direct port of the reduction in CUDAStream.cu - function kernel(a, b, size, partial) + function kernel(a::AbstractArray{T}, b::AbstractArray{T}, size::Int, partial::AbstractArray{T}) tb_sum = @cuStaticSharedMem(T, TBSize) local_i = threadIdx().x @inbounds tb_sum[local_i] = 0.0 diff --git 
a/JuliaStream.jl/src/KernelAbstractionsStream.jl b/JuliaStream.jl/src/KernelAbstractionsStream.jl index 8cc3699..2b9d9ad 100644 --- a/JuliaStream.jl/src/KernelAbstractionsStream.jl +++ b/JuliaStream.jl/src/KernelAbstractionsStream.jl @@ -136,7 +136,7 @@ function init_arrays!( end function copy!(data::StreamData{T,C}, context::Context) where {T,C} - @kernel function kernel(@Const(a), c) + @kernel function kernel(@Const(a::AbstractArray{T}), c) i = @index(Global) @inbounds c[i] = a[i] end @@ -144,7 +144,7 @@ function copy!(data::StreamData{T,C}, context::Context) where {T,C} end function mul!(data::StreamData{T,C}, context::Context) where {T,C} - @kernel function kernel(b, @Const(c), scalar) + @kernel function kernel(b::AbstractArray{T}, @Const(c::AbstractArray{T}), scalar::T) i = @index(Global) @inbounds b[i] = scalar * c[i] end @@ -152,7 +152,7 @@ function mul!(data::StreamData{T,C}, context::Context) where {T,C} end function add!(data::StreamData{T,C}, context::Context) where {T,C} - @kernel function kernel(@Const(a), @Const(b), c) + @kernel function kernel(@Const(a::AbstractArray{T}), @Const(b::AbstractArray{T}), c) i = @index(Global) @inbounds c[i] = a[i] + b[i] end @@ -160,7 +160,7 @@ function add!(data::StreamData{T,C}, context::Context) where {T,C} end function triad!(data::StreamData{T,C}, context::Context) where {T,C} - @kernel function kernel(a, @Const(b), @Const(c), scalar) + @kernel function kernel(a::AbstractArray{T}, @Const(b::AbstractArray{T}), @Const(c), scalar::T) i = @index(Global) @inbounds a[i] = b[i] + (scalar * c[i]) end @@ -176,7 +176,7 @@ function triad!(data::StreamData{T,C}, context::Context) where {T,C} end function nstream!(data::StreamData{T,C}, context::Context) where {T,C} - @kernel function kernel(a, @Const(b), @Const(c), scalar) + @kernel function kernel(a::AbstractArray{T}, @Const(b::AbstractArray{T}), @Const(c), scalar::T) i = @index(Global) @inbounds a[i] += b[i] + scalar * c[i] end @@ -192,7 +192,7 @@ function 
nstream!(data::StreamData{T,C}, context::Context) where {T,C} end function dot(data::StreamData{T,C}, context::Context) where {T,C} - @kernel function kernel(@Const(a), @Const(b), size, partial) + @kernel function kernel(@Const(a::AbstractArray{T}), @Const(b::AbstractArray{T}), size::Int, partial::AbstractArray{T}) local_i = @index(Local) group_i = @index(Group) tb_sum = @localmem T TBSize diff --git a/JuliaStream.jl/src/oneAPIStream.jl b/JuliaStream.jl/src/oneAPIStream.jl index 1bc319d..83f100e 100644 --- a/JuliaStream.jl/src/oneAPIStream.jl +++ b/JuliaStream.jl/src/oneAPIStream.jl @@ -53,7 +53,7 @@ function init_arrays!(data::oneData{T}, _, init::Tuple{T,T,T}) where {T} end function copy!(data::oneData{T}, groupsize::Int) where {T} - function kernel(a, c) + function kernel(a::AbstractArray{T}, c::AbstractArray{T}) i = get_global_id() @inbounds c[i] = a[i] return @@ -66,7 +66,7 @@ function copy!(data::oneData{T}, groupsize::Int) where {T} end function mul!(data::oneData{T}, groupsize::Int) where {T} - function kernel(b, c, scalar) + function kernel(b::AbstractArray{T}, c::AbstractArray{T}, scalar::T) i = get_global_id() @inbounds b[i] = scalar * c[i] return @@ -80,7 +80,7 @@ function mul!(data::oneData{T}, groupsize::Int) where {T} end function add!(data::oneData{T}, groupsize::Int) where {T} - function kernel(a, b, c) + function kernel(a::AbstractArray{T}, b::AbstractArray{T}, c::AbstractArray{T}) i = get_global_id() @inbounds c[i] = a[i] + b[i] return @@ -94,7 +94,7 @@ function add!(data::oneData{T}, groupsize::Int) where {T} end function triad!(data::oneData{T}, groupsize::Int) where {T} - function kernel(a, b, c, scalar) + function kernel(a::AbstractArray{T}, b::AbstractArray{T}, c::AbstractArray{T}, scalar::T) i = get_global_id() @inbounds a[i] = b[i] + (scalar * c[i]) return @@ -109,7 +109,7 @@ function triad!(data::oneData{T}, groupsize::Int) where {T} end function nstream!(data::oneData{T}, groupsize::Int) where {T} - function kernel(a, b, c, scalar) + 
function kernel(a::AbstractArray{T}, b::AbstractArray{T}, c::AbstractArray{T}, scalar::T) i = get_global_id() @inbounds a[i] += b[i] + scalar * c[i] return @@ -124,7 +124,7 @@ function nstream!(data::oneData{T}, groupsize::Int) where {T} end function dot(data::oneData{T}, groupsize::Int) where {T} - function kernel(a, b, size, partial) + function kernel(a::AbstractArray{T}, b::AbstractArray{T}, size::Int, partial::AbstractArray{T}) wg_sum = @LocalMemory(T, (DotWGSize,)) li = get_local_id() @inbounds wg_sum[li] = 0.0 From 13cb8ffced2033d86875aa4e56de91c7c5da9926 Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Sat, 28 Aug 2021 11:10:49 +0100 Subject: [PATCH 15/17] Use custom static reduction for CPU --- JuliaStream.jl/src/ThreadedStream.jl | 40 +++++++++++++++++++++++++--- 1 file changed, 37 insertions(+), 3 deletions(-) diff --git a/JuliaStream.jl/src/ThreadedStream.jl b/JuliaStream.jl/src/ThreadedStream.jl index 4422e66..0faabeb 100644 --- a/JuliaStream.jl/src/ThreadedStream.jl +++ b/JuliaStream.jl/src/ThreadedStream.jl @@ -63,12 +63,46 @@ function nstream!(data::VectorData{T}, _) where {T} end end +# Threads.@threads/Threads.@spawn doesn't support OpenMP's firstprivate, etc +function static_par_ranged(f::Function, range::Int, n::Int) + stride = range ÷ n + rem = range % n + strides = map(0:n) do i + width = stride + (i < rem ? 1 : 0) + offset = i < rem ? 
(stride + 1) * i : ((stride + 1) * rem) + (stride * (i - rem)) + (offset, width) + end + ccall(:jl_enter_threaded_region, Cvoid, ()) + try + foreach(wait, map(1:n) do group + (offset, size) = strides[group] + task = Task(() -> f(group, offset+1, offset+size)) + task.sticky = true + ccall(:jl_set_task_tid, Cvoid, (Any, Cint), task, group-1) # ccall, so 0-based for group + schedule(task) + end) + finally + ccall(:jl_exit_threaded_region, Cvoid, ()) + end +end + function dot(data::VectorData{T}, _) where {T} - partial = zeros(T, Threads.nthreads()) - Threads.@threads for i = 1:data.size - @inbounds partial[Threads.threadid()] += data.a[i] * data.b[i] + partial = Vector{T}(undef, Threads.nthreads()) + static_par_ranged(data.size, Threads.nthreads()) do group, startidx, endidx + acc = zero(T) + @fastmath for i = startidx:endidx + @inbounds acc += data.a[i] * data.b[i] + end + @inbounds partial[group] = acc end return sum(partial) + # This doesn't do well on aarch64 because of the excessive Threads.threadid() ccall + # and inhibited vectorisation from the lack of @fastmath + # partial = zeros(T, Threads.nthreads()) + # Threads.@threads for i = 1:data.size + # @inbounds partial[Threads.threadid()] += (data.a[i] * data.b[i]) + # end + # return sum(partial) end function read_data(data::VectorData{T}, _)::VectorData{T} where {T} From 41f17673650e94d53f691ca9c6a848ffed70f346 Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Sat, 28 Aug 2021 11:16:19 +0100 Subject: [PATCH 16/17] Pause GC during benchmark to reduce noise --- JuliaStream.jl/src/Stream.jl | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/JuliaStream.jl/src/Stream.jl b/JuliaStream.jl/src/Stream.jl index b7fc940..1905c81 100644 --- a/JuliaStream.jl/src/Stream.jl +++ b/JuliaStream.jl/src/Stream.jl @@ -260,8 +260,9 @@ function main() init::Tuple{type,type,type} = DefaultInit scalar::type = DefaultScalar - (data, context) = make_stream(config.arraysize, scalar, device, config.csv) + GC.enable(false) + 
(data, context) = make_stream(config.arraysize, scalar, device, config.csv) init_arrays!(data, context, init) if benchmark == All (timings, sum) = run_all!(data, context, config.numtimes) @@ -289,6 +290,8 @@ function main() else error("Bad benchmark $(benchmark)") end + + GC.enable(true) if !valid exit(1) From 78b52a496c26bb2d7b89d5d3d24c22d98992c077 Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Sat, 28 Aug 2021 11:39:08 +0100 Subject: [PATCH 17/17] Use @simd instead of @fastmath for CPU reduction --- JuliaStream.jl/src/ThreadedStream.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/JuliaStream.jl/src/ThreadedStream.jl b/JuliaStream.jl/src/ThreadedStream.jl index 0faabeb..f282fda 100644 --- a/JuliaStream.jl/src/ThreadedStream.jl +++ b/JuliaStream.jl/src/ThreadedStream.jl @@ -90,14 +90,14 @@ function dot(data::VectorData{T}, _) where {T} partial = Vector{T}(undef, Threads.nthreads()) static_par_ranged(data.size, Threads.nthreads()) do group, startidx, endidx acc = zero(T) - @fastmath for i = startidx:endidx + @simd for i = startidx:endidx @inbounds acc += data.a[i] * data.b[i] end @inbounds partial[group] = acc end return sum(partial) # This doesn't do well on aarch64 because of the excessive Threads.threadid() ccall - # and inhibited vectorisation from the lack of @fastmath + # and inhibited vectorisation from the lack of @simd # partial = zeros(T, Threads.nthreads()) # Threads.@threads for i = 1:data.size # @inbounds partial[Threads.threadid()] += (data.a[i] * data.b[i])