diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml index 5bed7b8..b7cc493 100644 --- a/.github/workflows/main.yaml +++ b/.github/workflows/main.yaml @@ -3,7 +3,50 @@ on: [push, pull_request] jobs: - "test_compile_cpp": + + test-java: + runs-on: ubuntu-18.04 + defaults: + run: + working-directory: ./java-stream + steps: + - uses: actions/checkout@v2 + - name: Test build project + run: ./mvnw clean package + - name: Test run + if: ${{ ! cancelled() }} + run: java -jar target/java-stream.jar --arraysize 2048 + + test-julia: + runs-on: ubuntu-18.04 + defaults: + run: + working-directory: ./JuliaStream.jl + steps: + - uses: actions/checkout@v2 + - name: Setup project + run: julia --project -e 'import Pkg; Pkg.instantiate()' + - name: Test run PlainStream.jl + if: ${{ ! cancelled() }} + run: julia --project src/PlainStream.jl --arraysize 2048 + - name: Test run ThreadedStream.jl + if: ${{ ! cancelled() }} + run: julia --threads 2 --project src/ThreadedStream.jl --arraysize 2048 + - name: Test run DistributedStream.jl (no flag) + if: ${{ ! cancelled() }} + run: julia --project src/DistributedStream.jl --arraysize 2048 + - name: Test run DistributedStream.jl (-p 2) + if: ${{ ! cancelled() }} + run: julia -p 2 --project src/DistributedStream.jl --arraysize 2048 + - name: Test run CUDAStream.jl + if: ${{ ! cancelled() }} + run: julia --project src/CUDAStream.jl --list + - name: Test run AMDGPUStream.jl + if: ${{ ! 
cancelled() }} + run: julia --project src/AMDGPUStream.jl --list + + + test-cpp: runs-on: ubuntu-18.04 defaults: run: diff --git a/.gitignore b/.gitignore index 614eb0f..012d0e8 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,27 @@ +cuda-stream +ocl-stream +omp-stream +acc-stream +raja-stream +kokkos-stream +std-stream +sycl-stream +hip-stream +tbb-stream + +*.o +*.bc +*.sycl +*.tar +*.gz +*.a + +KokkosCore_config.* + +.DS_Store + +Makefile build/ cmake-build-*/ diff --git a/CHANGELOG.md b/CHANGELOG.md index 3dbabed..976964a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,8 @@ All notable changes to this project will be documented in this file. - Added nstream kernel from PRK with associate command line option. - CMake build system added for all models. - SYCL device check for FP64 support. +- New implementation using TBB. +- Compiler options for Fujitsu added to OpenMP. ### Changed - Default branch renamed from `master` to `main`. diff --git a/CITATION.cff b/CITATION.cff new file mode 100644 index 0000000..cac63ea --- /dev/null +++ b/CITATION.cff @@ -0,0 +1,14 @@ +cff-version: 1.1.0 +message: If you use this software, please cite it as below. 
+authors: + - family-names: Deakin + given-names: Tom + affiliation: University of Bristol + website: https://hpc.tomdeakin.com + - family-names: McIntosh-Smith + given-names: Simon + affiliation: University of Bristol + website: https://uob-hpc.github.io +title: BabelStream +version: 3.4 +date-released: 2019-04-10 diff --git a/JuliaStream.jl/.JuliaFormatter.toml b/JuliaStream.jl/.JuliaFormatter.toml new file mode 100644 index 0000000..ac95ddd --- /dev/null +++ b/JuliaStream.jl/.JuliaFormatter.toml @@ -0,0 +1,2 @@ +indent = 2 +margin = 100 \ No newline at end of file diff --git a/JuliaStream.jl/.gitignore b/JuliaStream.jl/.gitignore new file mode 100644 index 0000000..12b143b --- /dev/null +++ b/JuliaStream.jl/.gitignore @@ -0,0 +1,5 @@ +*.jl.cov +*.jl.*.cov +*.jl.mem +/docs/build/ +/docs/Manifest.toml \ No newline at end of file diff --git a/JuliaStream.jl/AMDGPU/Manifest.toml b/JuliaStream.jl/AMDGPU/Manifest.toml new file mode 100644 index 0000000..6525501 --- /dev/null +++ b/JuliaStream.jl/AMDGPU/Manifest.toml @@ -0,0 +1,415 @@ +# This file is machine-generated - editing it directly is not advised + +[[AMDGPU]] +deps = ["AbstractFFTs", "Adapt", "BinaryProvider", "CEnum", "GPUArrays", "GPUCompiler", "HIP_jll", "LLVM", "Libdl", "LinearAlgebra", "MacroTools", "Pkg", "Printf", "ROCmDeviceLibs_jll", "Random", "Requires", "Setfield", "hsa_rocr_jll"] +git-tree-sha1 = "d64c97447a753cfbf0158d6c7be513f34526d559" +uuid = "21141c5a-9bdb-4563-92ae-f87d6854732e" +version = "0.2.12" + +[[AbstractFFTs]] +deps = ["LinearAlgebra"] +git-tree-sha1 = "485ee0867925449198280d4af84bdb46a2a404d0" +uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c" +version = "1.0.1" + +[[Adapt]] +deps = ["LinearAlgebra"] +git-tree-sha1 = "84918055d15b3114ede17ac6a7182f68870c16f7" +uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" +version = "3.3.1" + +[[ArgParse]] +deps = ["Logging", "TextWrap"] +git-tree-sha1 = "3102bce13da501c9104df33549f511cd25264d7d" +uuid = "c7e460c6-2fb9-53a9-8c5b-16f535851c63" +version = 
"1.1.4" + +[[ArgTools]] +uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f" + +[[Artifacts]] +uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" + +[[Base64]] +uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" + +[[BinaryProvider]] +deps = ["Libdl", "Logging", "SHA"] +git-tree-sha1 = "ecdec412a9abc8db54c0efc5548c64dfce072058" +uuid = "b99e7846-7c00-51b0-8f62-c81ae34c0232" +version = "0.5.10" + +[[Bzip2_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "19a35467a82e236ff51bc17a3a44b69ef35185a2" +uuid = "6e34b625-4abd-537c-b88f-471c36dfa7a0" +version = "1.0.8+0" + +[[CEnum]] +git-tree-sha1 = "215a9aa4a1f23fbd05b92769fdd62559488d70e9" +uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82" +version = "0.4.1" + +[[ConstructionBase]] +deps = ["LinearAlgebra"] +git-tree-sha1 = "f74e9d5388b8620b4cee35d4c5a618dd4dc547f4" +uuid = "187b0558-2788-49d3-abe0-74a17ed4e7c9" +version = "1.3.0" + +[[Dates]] +deps = ["Printf"] +uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" + +[[Downloads]] +deps = ["ArgTools", "LibCURL", "NetworkOptions"] +uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6" + +[[Elfutils_jll]] +deps = ["Artifacts", "Bzip2_jll", "JLLWrappers", "Libdl", "Pkg", "XZ_jll", "Zlib_jll", "argp_standalone_jll", "fts_jll", "obstack_jll"] +git-tree-sha1 = "8f9fcde6d89b0a3ca51cb2028beab462705c5436" +uuid = "ab5a07f8-06af-567f-a878-e8bb879eba5a" +version = "0.182.0+0" + +[[ExprTools]] +git-tree-sha1 = "b7e3d17636b348f005f11040025ae8c6f645fe92" +uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04" +version = "0.1.6" + +[[Future]] +deps = ["Random"] +uuid = "9fa8497b-333b-5362-9e8d-4d0656e87820" + +[[GPUArrays]] +deps = ["AbstractFFTs", "Adapt", "LinearAlgebra", "Printf", "Random", "Serialization", "Statistics"] +git-tree-sha1 = "ececbf05f8904c92814bdbd0aafd5540b0bf2e9a" +uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" +version = "7.0.1" + +[[GPUCompiler]] +deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "TimerOutputs", "UUIDs"] +git-tree-sha1 = 
"4ed2616d5e656c8716736b64da86755467f26cf5" +uuid = "61eb1bfa-7361-4325-ad38-22787b887f55" +version = "0.12.9" + +[[HIP_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "ROCmCompilerSupport_jll", "ROCmDeviceLibs_jll", "ROCmOpenCLRuntime_jll", "hsa_rocr_jll"] +git-tree-sha1 = "5097d8f7b6842156ab0928371b3d03fefd8decab" +uuid = "2696aab5-0948-5276-aa9a-2a86a37016b8" +version = "4.0.0+1" + +[[InteractiveUtils]] +deps = ["Markdown"] +uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" + +[[JLLWrappers]] +deps = ["Preferences"] +git-tree-sha1 = "642a199af8b68253517b80bd3bfd17eb4e84df6e" +uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210" +version = "1.3.0" + +[[LLVM]] +deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Printf", "Unicode"] +git-tree-sha1 = "23a47d417a3cd9c2e73c854bac7dd4731c105ef7" +uuid = "929cbde3-209d-540e-8aea-75f648917ca0" +version = "4.4.0" + +[[LLVMExtra_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "9c360e5ce980b88bb31a7b086dbb19469008154b" +uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab" +version = "0.0.10+0" + +[[LibCURL]] +deps = ["LibCURL_jll", "MozillaCACerts_jll"] +uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21" + +[[LibCURL_jll]] +deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"] +uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0" + +[[LibGit2]] +deps = ["Base64", "NetworkOptions", "Printf", "SHA"] +uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" + +[[LibSSH2_jll]] +deps = ["Artifacts", "Libdl", "MbedTLS_jll"] +uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8" + +[[Libdl]] +uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" + +[[Libgcrypt_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Libgpg_error_jll", "Pkg"] +git-tree-sha1 = "64613c82a59c120435c067c2b809fc61cf5166ae" +uuid = "d4300ac3-e22c-5743-9152-c294e39db1e4" +version = "1.8.7+0" + +[[Libglvnd_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libX11_jll", "Xorg_libXext_jll"] +git-tree-sha1 = 
"7739f837d6447403596a75d19ed01fd08d6f56bf" +uuid = "7e76a0d4-f3c7-5321-8279-8d96eeed0f29" +version = "1.3.0+3" + +[[Libgpg_error_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "c333716e46366857753e273ce6a69ee0945a6db9" +uuid = "7add5ba3-2f88-524e-9cd5-f83b8a55f7b8" +version = "1.42.0+0" + +[[Libiconv_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "42b62845d70a619f063a7da093d995ec8e15e778" +uuid = "94ce4f54-9a6c-5748-9c1c-f9c7231a4531" +version = "1.16.1+1" + +[[LinearAlgebra]] +deps = ["Libdl"] +uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" + +[[Logging]] +uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" + +[[MacroTools]] +deps = ["Markdown", "Random"] +git-tree-sha1 = "0fb723cd8c45858c22169b2e42269e53271a6df7" +uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" +version = "0.5.7" + +[[Markdown]] +deps = ["Base64"] +uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" + +[[MbedTLS_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1" + +[[MozillaCACerts_jll]] +uuid = "14a3606d-f60d-562e-9121-12d972cd8159" + +[[NUMA_jll]] +deps = ["Libdl", "Pkg"] +git-tree-sha1 = "778f9bd14400cff2c32ed357e12766ac0e3d766e" +uuid = "7f51dc2b-bb24-59f8-b771-bb1490e4195d" +version = "2.0.13+1" + +[[NetworkOptions]] +uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908" + +[[OrderedCollections]] +git-tree-sha1 = "85f8e6578bf1f9ee0d11e7bb1b1456435479d47c" +uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" +version = "1.4.1" + +[[Parameters]] +deps = ["OrderedCollections", "UnPack"] +git-tree-sha1 = "2276ac65f1e236e0a6ea70baff3f62ad4c625345" +uuid = "d96e819e-fc66-5662-9728-84c9c7592b0a" +version = "0.12.2" + +[[Pkg]] +deps = ["Artifacts", "Dates", "Downloads", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"] +uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" + +[[Preferences]] +deps = ["TOML"] +git-tree-sha1 = 
"00cfd92944ca9c760982747e9a1d0d5d86ab1e5a" +uuid = "21216c6a-2e73-6563-6e65-726566657250" +version = "1.2.2" + +[[Printf]] +deps = ["Unicode"] +uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" + +[[REPL]] +deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"] +uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" + +[[ROCmCompilerSupport_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "ROCmDeviceLibs_jll", "hsa_rocr_jll"] +git-tree-sha1 = "56ddcfb5d8b60c9f8c1bc619886f8d363fd1926d" +uuid = "8fbdd1d2-db62-5cd0-981e-905da1486e17" +version = "4.0.0+1" + +[[ROCmDeviceLibs_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Zlib_jll"] +git-tree-sha1 = "d764f0f28b5af89aa004871a6a38e5d061f77257" +uuid = "873c0968-716b-5aa7-bb8d-d1e2e2aeff2d" +version = "4.0.0+0" + +[[ROCmOpenCLRuntime_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Libglvnd_jll", "Pkg", "ROCmCompilerSupport_jll", "ROCmDeviceLibs_jll", "Xorg_libX11_jll", "Xorg_xorgproto_jll", "hsa_rocr_jll"] +git-tree-sha1 = "f9e3e2cb40a7990535efa7da9b9dd0e0b458a973" +uuid = "10ae2a08-2eea-53f8-8c20-eec175020e9f" +version = "4.0.0+1" + +[[Random]] +deps = ["Serialization"] +uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" + +[[Requires]] +deps = ["UUIDs"] +git-tree-sha1 = "4036a3bd08ac7e968e27c203d45f5fff15020621" +uuid = "ae029012-a4dd-5104-9daa-d747884805df" +version = "1.1.3" + +[[SHA]] +uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" + +[[Serialization]] +uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" + +[[Setfield]] +deps = ["ConstructionBase", "Future", "MacroTools", "Requires"] +git-tree-sha1 = "fca29e68c5062722b5b4435594c3d1ba557072a3" +uuid = "efcf1570-3423-57d1-acb7-fd33fddbac46" +version = "0.7.1" + +[[Sockets]] +uuid = "6462fe0b-24de-5631-8697-dd941f90decc" + +[[SparseArrays]] +deps = ["LinearAlgebra", "Random"] +uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" + +[[Statistics]] +deps = ["LinearAlgebra", "SparseArrays"] +uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" + +[[TOML]] +deps = ["Dates"] 
+uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76" + +[[Tar]] +deps = ["ArgTools", "SHA"] +uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e" + +[[TextWrap]] +git-tree-sha1 = "9250ef9b01b66667380cf3275b3f7488d0e25faf" +uuid = "b718987f-49a8-5099-9789-dcd902bef87d" +version = "1.0.1" + +[[TimerOutputs]] +deps = ["ExprTools", "Printf"] +git-tree-sha1 = "209a8326c4f955e2442c07b56029e88bb48299c7" +uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" +version = "0.5.12" + +[[UUIDs]] +deps = ["Random", "SHA"] +uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" + +[[UnPack]] +git-tree-sha1 = "387c1f73762231e86e0c9c5443ce3b4a0a9a0c2b" +uuid = "3a884ed6-31ef-47d7-9d2a-63182c4928ed" +version = "1.0.2" + +[[Unicode]] +uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" + +[[XML2_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Libiconv_jll", "Pkg", "Zlib_jll"] +git-tree-sha1 = "1acf5bdf07aa0907e0a37d3718bb88d4b687b74a" +uuid = "02c8fc9c-b97f-50b9-bbe4-9be30ff0a78a" +version = "2.9.12+0" + +[[XSLT_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Libgcrypt_jll", "Libgpg_error_jll", "Libiconv_jll", "Pkg", "XML2_jll", "Zlib_jll"] +git-tree-sha1 = "91844873c4085240b95e795f692c4cec4d805f8a" +uuid = "aed1982a-8fda-507f-9586-7b0439959a61" +version = "1.1.34+0" + +[[XZ_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "a921669cd9a45c23031fd4eb904f5cc3d20de415" +uuid = "ffd25f8a-64ca-5728-b0f7-c24cf3aae800" +version = "5.2.5+2" + +[[Xorg_libX11_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libxcb_jll", "Xorg_xtrans_jll"] +git-tree-sha1 = "5be649d550f3f4b95308bf0183b82e2582876527" +uuid = "4f6342f7-b3d2-589e-9d20-edeb45f2b2bc" +version = "1.6.9+4" + +[[Xorg_libXau_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "4e490d5c960c314f33885790ed410ff3a94ce67e" +uuid = "0c0b7dd1-d40b-584c-a123-a41640f87eec" +version = "1.0.9+4" + +[[Xorg_libXdmcp_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = 
"4fe47bd2247248125c428978740e18a681372dd4" +uuid = "a3789734-cfe1-5b06-b2d0-1dd0d9d62d05" +version = "1.1.3+4" + +[[Xorg_libXext_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libX11_jll"] +git-tree-sha1 = "b7c0aa8c376b31e4852b360222848637f481f8c3" +uuid = "1082639a-0dae-5f34-9b06-72781eeb8cb3" +version = "1.3.4+4" + +[[Xorg_libpthread_stubs_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "6783737e45d3c59a4a4c4091f5f88cdcf0908cbb" +uuid = "14d82f49-176c-5ed1-bb49-ad3f5cbd8c74" +version = "0.1.0+3" + +[[Xorg_libxcb_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "XSLT_jll", "Xorg_libXau_jll", "Xorg_libXdmcp_jll", "Xorg_libpthread_stubs_jll"] +git-tree-sha1 = "daf17f441228e7a3833846cd048892861cff16d6" +uuid = "c7cfdc94-dc32-55de-ac96-5a1b8d977c5b" +version = "1.13.0+3" + +[[Xorg_xorgproto_jll]] +deps = ["Libdl", "Pkg"] +git-tree-sha1 = "9a9eb8ce756fe0bca01b4be16da770e18d264972" +uuid = "c4d99508-4286-5418-9131-c86396af500b" +version = "2019.2.0+2" + +[[Xorg_xtrans_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "79c31e7844f6ecf779705fbc12146eb190b7d845" +uuid = "c5fb5394-a638-5e4d-96e5-b29de1b5cf10" +version = "1.4.0+3" + +[[Zlib_jll]] +deps = ["Libdl"] +uuid = "83775a58-1f1d-513f-b197-d71354ab007a" + +[[argp_standalone_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "feaf9f6293003c2bf53056fd6930d677ed340b34" +uuid = "c53206cc-00f7-50bf-ad1e-3ae1f6e49bc3" +version = "1.3.1+0" + +[[fts_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "78732b942383d2cb521df8a1a0814911144e663d" +uuid = "d65627f6-89bd-53e8-8ab5-8b75ff535eee" +version = "1.2.7+1" + +[[hsa_rocr_jll]] +deps = ["Artifacts", "Elfutils_jll", "JLLWrappers", "Libdl", "NUMA_jll", "Pkg", "Zlib_jll", "hsakmt_roct_jll"] +git-tree-sha1 = "df8d73efec8b1e53ad527d208f5343c0368f0fcd" +uuid = "dd59ff1a-a01a-568d-8b29-0669330f116a" +version = "4.0.0+0" + +[[hsakmt_roct_jll]] +deps = 
["Artifacts", "JLLWrappers", "Libdl", "NUMA_jll", "Pkg"] +git-tree-sha1 = "80e0c9940e15cfd6f1f1e9d9f3953ec4d48d3d4a" +uuid = "1cecccd7-a9b6-5045-9cdc-a44c19b16d76" +version = "4.0.0+0" + +[[nghttp2_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d" + +[[obstack_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "1c4a6b66e934fc6db4649cb2910c72f53bbfea7e" +uuid = "c88a4935-d25e-5644-aacc-5db6f1b8ef79" +version = "1.2.2+0" + +[[p7zip_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0" diff --git a/JuliaStream.jl/AMDGPU/Project.toml b/JuliaStream.jl/AMDGPU/Project.toml new file mode 100644 index 0000000..5ab8447 --- /dev/null +++ b/JuliaStream.jl/AMDGPU/Project.toml @@ -0,0 +1,7 @@ +[deps] +AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" +ArgParse = "c7e460c6-2fb9-53a9-8c5b-16f535851c63" +Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a" + +[compat] +julia = "1.6" diff --git a/JuliaStream.jl/CUDA/Manifest.toml b/JuliaStream.jl/CUDA/Manifest.toml new file mode 100644 index 0000000..ef6da14 --- /dev/null +++ b/JuliaStream.jl/CUDA/Manifest.toml @@ -0,0 +1,316 @@ +# This file is machine-generated - editing it directly is not advised + +[[AbstractFFTs]] +deps = ["LinearAlgebra"] +git-tree-sha1 = "485ee0867925449198280d4af84bdb46a2a404d0" +uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c" +version = "1.0.1" + +[[Adapt]] +deps = ["LinearAlgebra"] +git-tree-sha1 = "84918055d15b3114ede17ac6a7182f68870c16f7" +uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" +version = "3.3.1" + +[[ArgParse]] +deps = ["Logging", "TextWrap"] +git-tree-sha1 = "3102bce13da501c9104df33549f511cd25264d7d" +uuid = "c7e460c6-2fb9-53a9-8c5b-16f535851c63" +version = "1.1.4" + +[[ArgTools]] +uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f" + +[[Artifacts]] +uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" + +[[BFloat16s]] +deps = ["LinearAlgebra", "Test"] +git-tree-sha1 = "4af69e205efc343068dc8722b8dfec1ade89254a" +uuid = 
"ab4f0b2a-ad5b-11e8-123f-65d77653426b" +version = "0.1.0" + +[[Base64]] +uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" + +[[CEnum]] +git-tree-sha1 = "215a9aa4a1f23fbd05b92769fdd62559488d70e9" +uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82" +version = "0.4.1" + +[[CUDA]] +deps = ["AbstractFFTs", "Adapt", "BFloat16s", "CEnum", "CompilerSupportLibraries_jll", "ExprTools", "GPUArrays", "GPUCompiler", "LLVM", "LazyArtifacts", "Libdl", "LinearAlgebra", "Logging", "Printf", "Random", "Random123", "RandomNumbers", "Reexport", "Requires", "SparseArrays", "SpecialFunctions", "TimerOutputs"] +git-tree-sha1 = "c583f3ccdce071b8a8bce9bf3d5d5409eaf36d2b" +uuid = "052768ef-5323-5732-b1bb-66c8b64840ba" +version = "3.4.1" + +[[ChainRulesCore]] +deps = ["Compat", "LinearAlgebra", "SparseArrays"] +git-tree-sha1 = "bdc0937269321858ab2a4f288486cb258b9a0af7" +uuid = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" +version = "1.3.0" + +[[Compat]] +deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "SHA", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"] +git-tree-sha1 = "727e463cfebd0c7b999bbf3e9e7e16f254b94193" +uuid = "34da2185-b29b-5c13-b0c7-acf172513d20" +version = "3.34.0" + +[[CompilerSupportLibraries_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae" + +[[Dates]] +deps = ["Printf"] +uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" + +[[DelimitedFiles]] +deps = ["Mmap"] +uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab" + +[[Distributed]] +deps = ["Random", "Serialization", "Sockets"] +uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b" + +[[DocStringExtensions]] +deps = ["LibGit2"] +git-tree-sha1 = "a32185f5428d3986f47c2ab78b1f216d5e6cc96f" +uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae" +version = "0.8.5" + +[[Downloads]] +deps = ["ArgTools", "LibCURL", "NetworkOptions"] +uuid = 
"f43a241f-c20a-4ad4-852c-f6b1247861c6" + +[[ExprTools]] +git-tree-sha1 = "b7e3d17636b348f005f11040025ae8c6f645fe92" +uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04" +version = "0.1.6" + +[[GPUArrays]] +deps = ["Adapt", "LinearAlgebra", "Printf", "Random", "Serialization", "Statistics"] +git-tree-sha1 = "8fac1cf7d6ce0f2249c7acaf25d22e1e85c4a07f" +uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" +version = "8.0.2" + +[[GPUCompiler]] +deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "TimerOutputs", "UUIDs"] +git-tree-sha1 = "4ed2616d5e656c8716736b64da86755467f26cf5" +uuid = "61eb1bfa-7361-4325-ad38-22787b887f55" +version = "0.12.9" + +[[InteractiveUtils]] +deps = ["Markdown"] +uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" + +[[IrrationalConstants]] +git-tree-sha1 = "f76424439413893a832026ca355fe273e93bce94" +uuid = "92d709cd-6900-40b7-9082-c6be49f344b6" +version = "0.1.0" + +[[JLLWrappers]] +deps = ["Preferences"] +git-tree-sha1 = "642a199af8b68253517b80bd3bfd17eb4e84df6e" +uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210" +version = "1.3.0" + +[[LLVM]] +deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Printf", "Unicode"] +git-tree-sha1 = "23a47d417a3cd9c2e73c854bac7dd4731c105ef7" +uuid = "929cbde3-209d-540e-8aea-75f648917ca0" +version = "4.4.0" + +[[LLVMExtra_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "9c360e5ce980b88bb31a7b086dbb19469008154b" +uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab" +version = "0.0.10+0" + +[[LazyArtifacts]] +deps = ["Artifacts", "Pkg"] +uuid = "4af54fe1-eca0-43a8-85a7-787d91b784e3" + +[[LibCURL]] +deps = ["LibCURL_jll", "MozillaCACerts_jll"] +uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21" + +[[LibCURL_jll]] +deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"] +uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0" + +[[LibGit2]] +deps = ["Base64", "NetworkOptions", "Printf", "SHA"] +uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" + +[[LibSSH2_jll]] +deps = ["Artifacts", "Libdl", 
"MbedTLS_jll"] +uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8" + +[[Libdl]] +uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" + +[[LinearAlgebra]] +deps = ["Libdl"] +uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" + +[[LogExpFunctions]] +deps = ["DocStringExtensions", "IrrationalConstants", "LinearAlgebra"] +git-tree-sha1 = "3d682c07e6dd250ed082f883dc88aee7996bf2cc" +uuid = "2ab3a3ac-af41-5b50-aa03-7779005ae688" +version = "0.3.0" + +[[Logging]] +uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" + +[[Markdown]] +deps = ["Base64"] +uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" + +[[MbedTLS_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1" + +[[Mmap]] +uuid = "a63ad114-7e13-5084-954f-fe012c677804" + +[[MozillaCACerts_jll]] +uuid = "14a3606d-f60d-562e-9121-12d972cd8159" + +[[NetworkOptions]] +uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908" + +[[OpenSpecFun_jll]] +deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "13652491f6856acfd2db29360e1bbcd4565d04f1" +uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e" +version = "0.5.5+0" + +[[OrderedCollections]] +git-tree-sha1 = "85f8e6578bf1f9ee0d11e7bb1b1456435479d47c" +uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" +version = "1.4.1" + +[[Parameters]] +deps = ["OrderedCollections", "UnPack"] +git-tree-sha1 = "2276ac65f1e236e0a6ea70baff3f62ad4c625345" +uuid = "d96e819e-fc66-5662-9728-84c9c7592b0a" +version = "0.12.2" + +[[Pkg]] +deps = ["Artifacts", "Dates", "Downloads", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"] +uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" + +[[Preferences]] +deps = ["TOML"] +git-tree-sha1 = "00cfd92944ca9c760982747e9a1d0d5d86ab1e5a" +uuid = "21216c6a-2e73-6563-6e65-726566657250" +version = "1.2.2" + +[[Printf]] +deps = ["Unicode"] +uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" + +[[REPL]] +deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"] 
+uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" + +[[Random]] +deps = ["Serialization"] +uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" + +[[Random123]] +deps = ["Libdl", "Random", "RandomNumbers"] +git-tree-sha1 = "0e8b146557ad1c6deb1367655e052276690e71a3" +uuid = "74087812-796a-5b5d-8853-05524746bad3" +version = "1.4.2" + +[[RandomNumbers]] +deps = ["Random", "Requires"] +git-tree-sha1 = "043da614cc7e95c703498a491e2c21f58a2b8111" +uuid = "e6cf234a-135c-5ec9-84dd-332b85af5143" +version = "1.5.3" + +[[Reexport]] +git-tree-sha1 = "45e428421666073eab6f2da5c9d310d99bb12f9b" +uuid = "189a3867-3050-52da-a836-e630ba90ab69" +version = "1.2.2" + +[[Requires]] +deps = ["UUIDs"] +git-tree-sha1 = "4036a3bd08ac7e968e27c203d45f5fff15020621" +uuid = "ae029012-a4dd-5104-9daa-d747884805df" +version = "1.1.3" + +[[SHA]] +uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" + +[[Serialization]] +uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" + +[[SharedArrays]] +deps = ["Distributed", "Mmap", "Random", "Serialization"] +uuid = "1a1011a3-84de-559e-8e89-a11a2f7dc383" + +[[Sockets]] +uuid = "6462fe0b-24de-5631-8697-dd941f90decc" + +[[SparseArrays]] +deps = ["LinearAlgebra", "Random"] +uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" + +[[SpecialFunctions]] +deps = ["ChainRulesCore", "LogExpFunctions", "OpenSpecFun_jll"] +git-tree-sha1 = "a322a9493e49c5f3a10b50df3aedaf1cdb3244b7" +uuid = "276daf66-3868-5448-9aa4-cd146d93841b" +version = "1.6.1" + +[[Statistics]] +deps = ["LinearAlgebra", "SparseArrays"] +uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" + +[[TOML]] +deps = ["Dates"] +uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76" + +[[Tar]] +deps = ["ArgTools", "SHA"] +uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e" + +[[Test]] +deps = ["InteractiveUtils", "Logging", "Random", "Serialization"] +uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" + +[[TextWrap]] +git-tree-sha1 = "9250ef9b01b66667380cf3275b3f7488d0e25faf" +uuid = "b718987f-49a8-5099-9789-dcd902bef87d" +version = "1.0.1" + +[[TimerOutputs]] +deps = 
["ExprTools", "Printf"] +git-tree-sha1 = "209a8326c4f955e2442c07b56029e88bb48299c7" +uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" +version = "0.5.12" + +[[UUIDs]] +deps = ["Random", "SHA"] +uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" + +[[UnPack]] +git-tree-sha1 = "387c1f73762231e86e0c9c5443ce3b4a0a9a0c2b" +uuid = "3a884ed6-31ef-47d7-9d2a-63182c4928ed" +version = "1.0.2" + +[[Unicode]] +uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" + +[[Zlib_jll]] +deps = ["Libdl"] +uuid = "83775a58-1f1d-513f-b197-d71354ab007a" + +[[nghttp2_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d" + +[[p7zip_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0" diff --git a/JuliaStream.jl/CUDA/Project.toml b/JuliaStream.jl/CUDA/Project.toml new file mode 100644 index 0000000..e50582e --- /dev/null +++ b/JuliaStream.jl/CUDA/Project.toml @@ -0,0 +1,7 @@ +[deps] +ArgParse = "c7e460c6-2fb9-53a9-8c5b-16f535851c63" +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" +Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a" + +[compat] +julia = "1.6" diff --git a/JuliaStream.jl/KernelAbstractions/Manifest.toml b/JuliaStream.jl/KernelAbstractions/Manifest.toml new file mode 100644 index 0000000..bfc562f --- /dev/null +++ b/JuliaStream.jl/KernelAbstractions/Manifest.toml @@ -0,0 +1,547 @@ +# This file is machine-generated - editing it directly is not advised + +[[AMDGPU]] +deps = ["AbstractFFTs", "Adapt", "BinaryProvider", "CEnum", "GPUArrays", "GPUCompiler", "HIP_jll", "LLVM", "Libdl", "LinearAlgebra", "MacroTools", "Pkg", "Printf", "ROCmDeviceLibs_jll", "Random", "Requires", "Setfield", "hsa_rocr_jll"] +git-tree-sha1 = "d64c97447a753cfbf0158d6c7be513f34526d559" +uuid = "21141c5a-9bdb-4563-92ae-f87d6854732e" +version = "0.2.12" + +[[AbstractFFTs]] +deps = ["LinearAlgebra"] +git-tree-sha1 = "485ee0867925449198280d4af84bdb46a2a404d0" +uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c" +version = "1.0.1" + +[[Adapt]] +deps = ["LinearAlgebra"] 
+git-tree-sha1 = "84918055d15b3114ede17ac6a7182f68870c16f7" +uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" +version = "3.3.1" + +[[ArgParse]] +deps = ["Logging", "TextWrap"] +git-tree-sha1 = "3102bce13da501c9104df33549f511cd25264d7d" +uuid = "c7e460c6-2fb9-53a9-8c5b-16f535851c63" +version = "1.1.4" + +[[ArgTools]] +uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f" + +[[Artifacts]] +uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" + +[[BFloat16s]] +deps = ["LinearAlgebra", "Test"] +git-tree-sha1 = "4af69e205efc343068dc8722b8dfec1ade89254a" +uuid = "ab4f0b2a-ad5b-11e8-123f-65d77653426b" +version = "0.1.0" + +[[Base64]] +uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" + +[[BinaryProvider]] +deps = ["Libdl", "Logging", "SHA"] +git-tree-sha1 = "ecdec412a9abc8db54c0efc5548c64dfce072058" +uuid = "b99e7846-7c00-51b0-8f62-c81ae34c0232" +version = "0.5.10" + +[[Bzip2_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "19a35467a82e236ff51bc17a3a44b69ef35185a2" +uuid = "6e34b625-4abd-537c-b88f-471c36dfa7a0" +version = "1.0.8+0" + +[[CEnum]] +git-tree-sha1 = "215a9aa4a1f23fbd05b92769fdd62559488d70e9" +uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82" +version = "0.4.1" + +[[CUDA]] +deps = ["AbstractFFTs", "Adapt", "BFloat16s", "CEnum", "CompilerSupportLibraries_jll", "DataStructures", "ExprTools", "GPUArrays", "GPUCompiler", "LLVM", "LazyArtifacts", "Libdl", "LinearAlgebra", "Logging", "Printf", "Random", "Random123", "RandomNumbers", "Reexport", "Requires", "SparseArrays", "SpecialFunctions", "TimerOutputs"] +git-tree-sha1 = "5e696e37e51b01ae07bd9f700afe6cbd55250bce" +uuid = "052768ef-5323-5732-b1bb-66c8b64840ba" +version = "3.3.4" + +[[CUDAKernels]] +deps = ["Adapt", "CUDA", "Cassette", "KernelAbstractions", "SpecialFunctions", "StaticArrays"] +git-tree-sha1 = "81f76297b63c67723b1d60f5e7e002ae3393974b" +uuid = "72cfdca4-0801-4ab0-bf6a-d52aa10adc57" +version = "0.3.0" + +[[Cassette]] +git-tree-sha1 = "b4b1d61ebbae2bc69a45e3a6b8439b4e411bc131" +uuid = 
"7057c7e9-c182-5462-911a-8362d720325c" +version = "0.3.8" + +[[ChainRulesCore]] +deps = ["Compat", "LinearAlgebra", "SparseArrays"] +git-tree-sha1 = "bdc0937269321858ab2a4f288486cb258b9a0af7" +uuid = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" +version = "1.3.0" + +[[Compat]] +deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "SHA", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"] +git-tree-sha1 = "727e463cfebd0c7b999bbf3e9e7e16f254b94193" +uuid = "34da2185-b29b-5c13-b0c7-acf172513d20" +version = "3.34.0" + +[[CompilerSupportLibraries_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae" + +[[ConstructionBase]] +deps = ["LinearAlgebra"] +git-tree-sha1 = "f74e9d5388b8620b4cee35d4c5a618dd4dc547f4" +uuid = "187b0558-2788-49d3-abe0-74a17ed4e7c9" +version = "1.3.0" + +[[DataStructures]] +deps = ["Compat", "InteractiveUtils", "OrderedCollections"] +git-tree-sha1 = "7d9d316f04214f7efdbb6398d545446e246eff02" +uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" +version = "0.18.10" + +[[Dates]] +deps = ["Printf"] +uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" + +[[DelimitedFiles]] +deps = ["Mmap"] +uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab" + +[[Distributed]] +deps = ["Random", "Serialization", "Sockets"] +uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b" + +[[DocStringExtensions]] +deps = ["LibGit2"] +git-tree-sha1 = "a32185f5428d3986f47c2ab78b1f216d5e6cc96f" +uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae" +version = "0.8.5" + +[[Downloads]] +deps = ["ArgTools", "LibCURL", "NetworkOptions"] +uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6" + +[[Elfutils_jll]] +deps = ["Artifacts", "Bzip2_jll", "JLLWrappers", "Libdl", "Pkg", "XZ_jll", "Zlib_jll", "argp_standalone_jll", "fts_jll", "obstack_jll"] +git-tree-sha1 = "8f9fcde6d89b0a3ca51cb2028beab462705c5436" +uuid = 
"ab5a07f8-06af-567f-a878-e8bb879eba5a" +version = "0.182.0+0" + +[[ExprTools]] +git-tree-sha1 = "b7e3d17636b348f005f11040025ae8c6f645fe92" +uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04" +version = "0.1.6" + +[[Future]] +deps = ["Random"] +uuid = "9fa8497b-333b-5362-9e8d-4d0656e87820" + +[[GPUArrays]] +deps = ["AbstractFFTs", "Adapt", "LinearAlgebra", "Printf", "Random", "Serialization", "Statistics"] +git-tree-sha1 = "ececbf05f8904c92814bdbd0aafd5540b0bf2e9a" +uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" +version = "7.0.1" + +[[GPUCompiler]] +deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "TimerOutputs", "UUIDs"] +git-tree-sha1 = "4ed2616d5e656c8716736b64da86755467f26cf5" +uuid = "61eb1bfa-7361-4325-ad38-22787b887f55" +version = "0.12.9" + +[[HIP_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "ROCmCompilerSupport_jll", "ROCmDeviceLibs_jll", "ROCmOpenCLRuntime_jll", "hsa_rocr_jll"] +git-tree-sha1 = "5097d8f7b6842156ab0928371b3d03fefd8decab" +uuid = "2696aab5-0948-5276-aa9a-2a86a37016b8" +version = "4.0.0+1" + +[[InteractiveUtils]] +deps = ["Markdown"] +uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" + +[[IrrationalConstants]] +git-tree-sha1 = "f76424439413893a832026ca355fe273e93bce94" +uuid = "92d709cd-6900-40b7-9082-c6be49f344b6" +version = "0.1.0" + +[[JLLWrappers]] +deps = ["Preferences"] +git-tree-sha1 = "642a199af8b68253517b80bd3bfd17eb4e84df6e" +uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210" +version = "1.3.0" + +[[KernelAbstractions]] +deps = ["Adapt", "Cassette", "InteractiveUtils", "MacroTools", "SpecialFunctions", "StaticArrays", "UUIDs"] +git-tree-sha1 = "5e6c70389c1b1e40adb81664ca8cea6ce8127afc" +uuid = "63c18a36-062a-441e-b654-da1e3ab1ce7c" +version = "0.7.0" + +[[LLVM]] +deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Printf", "Unicode"] +git-tree-sha1 = "23a47d417a3cd9c2e73c854bac7dd4731c105ef7" +uuid = "929cbde3-209d-540e-8aea-75f648917ca0" +version = "4.4.0" + +[[LLVMExtra_jll]] +deps = ["Artifacts", "JLLWrappers", 
"Libdl", "Pkg"] +git-tree-sha1 = "9c360e5ce980b88bb31a7b086dbb19469008154b" +uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab" +version = "0.0.10+0" + +[[LazyArtifacts]] +deps = ["Artifacts", "Pkg"] +uuid = "4af54fe1-eca0-43a8-85a7-787d91b784e3" + +[[LibCURL]] +deps = ["LibCURL_jll", "MozillaCACerts_jll"] +uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21" + +[[LibCURL_jll]] +deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"] +uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0" + +[[LibGit2]] +deps = ["Base64", "NetworkOptions", "Printf", "SHA"] +uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" + +[[LibSSH2_jll]] +deps = ["Artifacts", "Libdl", "MbedTLS_jll"] +uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8" + +[[Libdl]] +uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" + +[[Libgcrypt_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Libgpg_error_jll", "Pkg"] +git-tree-sha1 = "64613c82a59c120435c067c2b809fc61cf5166ae" +uuid = "d4300ac3-e22c-5743-9152-c294e39db1e4" +version = "1.8.7+0" + +[[Libglvnd_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libX11_jll", "Xorg_libXext_jll"] +git-tree-sha1 = "7739f837d6447403596a75d19ed01fd08d6f56bf" +uuid = "7e76a0d4-f3c7-5321-8279-8d96eeed0f29" +version = "1.3.0+3" + +[[Libgpg_error_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "c333716e46366857753e273ce6a69ee0945a6db9" +uuid = "7add5ba3-2f88-524e-9cd5-f83b8a55f7b8" +version = "1.42.0+0" + +[[Libiconv_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "42b62845d70a619f063a7da093d995ec8e15e778" +uuid = "94ce4f54-9a6c-5748-9c1c-f9c7231a4531" +version = "1.16.1+1" + +[[LinearAlgebra]] +deps = ["Libdl"] +uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" + +[[LogExpFunctions]] +deps = ["DocStringExtensions", "IrrationalConstants", "LinearAlgebra"] +git-tree-sha1 = "3d682c07e6dd250ed082f883dc88aee7996bf2cc" +uuid = "2ab3a3ac-af41-5b50-aa03-7779005ae688" +version = "0.3.0" + +[[Logging]] +uuid = 
"56ddb016-857b-54e1-b83d-db4d58db5568" + +[[MacroTools]] +deps = ["Markdown", "Random"] +git-tree-sha1 = "0fb723cd8c45858c22169b2e42269e53271a6df7" +uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" +version = "0.5.7" + +[[Markdown]] +deps = ["Base64"] +uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" + +[[MbedTLS_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1" + +[[Mmap]] +uuid = "a63ad114-7e13-5084-954f-fe012c677804" + +[[MozillaCACerts_jll]] +uuid = "14a3606d-f60d-562e-9121-12d972cd8159" + +[[NUMA_jll]] +deps = ["Libdl", "Pkg"] +git-tree-sha1 = "778f9bd14400cff2c32ed357e12766ac0e3d766e" +uuid = "7f51dc2b-bb24-59f8-b771-bb1490e4195d" +version = "2.0.13+1" + +[[NetworkOptions]] +uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908" + +[[OpenSpecFun_jll]] +deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "13652491f6856acfd2db29360e1bbcd4565d04f1" +uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e" +version = "0.5.5+0" + +[[OrderedCollections]] +git-tree-sha1 = "85f8e6578bf1f9ee0d11e7bb1b1456435479d47c" +uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" +version = "1.4.1" + +[[Parameters]] +deps = ["OrderedCollections", "UnPack"] +git-tree-sha1 = "2276ac65f1e236e0a6ea70baff3f62ad4c625345" +uuid = "d96e819e-fc66-5662-9728-84c9c7592b0a" +version = "0.12.2" + +[[Pkg]] +deps = ["Artifacts", "Dates", "Downloads", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"] +uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" + +[[Preferences]] +deps = ["TOML"] +git-tree-sha1 = "00cfd92944ca9c760982747e9a1d0d5d86ab1e5a" +uuid = "21216c6a-2e73-6563-6e65-726566657250" +version = "1.2.2" + +[[Printf]] +deps = ["Unicode"] +uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" + +[[REPL]] +deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"] +uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" + +[[ROCKernels]] +deps = ["AMDGPU", "Adapt", "Cassette", 
"KernelAbstractions", "SpecialFunctions", "StaticArrays"] +git-tree-sha1 = "41105b861342637dde17797bdd9aaa537aca646b" +uuid = "7eb9e9f0-4bd3-4c4c-8bef-26bd9629d9b9" +version = "0.2.0" + +[[ROCmCompilerSupport_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "ROCmDeviceLibs_jll", "hsa_rocr_jll"] +git-tree-sha1 = "56ddcfb5d8b60c9f8c1bc619886f8d363fd1926d" +uuid = "8fbdd1d2-db62-5cd0-981e-905da1486e17" +version = "4.0.0+1" + +[[ROCmDeviceLibs_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Zlib_jll"] +git-tree-sha1 = "d764f0f28b5af89aa004871a6a38e5d061f77257" +uuid = "873c0968-716b-5aa7-bb8d-d1e2e2aeff2d" +version = "4.0.0+0" + +[[ROCmOpenCLRuntime_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Libglvnd_jll", "Pkg", "ROCmCompilerSupport_jll", "ROCmDeviceLibs_jll", "Xorg_libX11_jll", "Xorg_xorgproto_jll", "hsa_rocr_jll"] +git-tree-sha1 = "f9e3e2cb40a7990535efa7da9b9dd0e0b458a973" +uuid = "10ae2a08-2eea-53f8-8c20-eec175020e9f" +version = "4.0.0+1" + +[[Random]] +deps = ["Serialization"] +uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" + +[[Random123]] +deps = ["Libdl", "Random", "RandomNumbers"] +git-tree-sha1 = "0e8b146557ad1c6deb1367655e052276690e71a3" +uuid = "74087812-796a-5b5d-8853-05524746bad3" +version = "1.4.2" + +[[RandomNumbers]] +deps = ["Random", "Requires"] +git-tree-sha1 = "043da614cc7e95c703498a491e2c21f58a2b8111" +uuid = "e6cf234a-135c-5ec9-84dd-332b85af5143" +version = "1.5.3" + +[[Reexport]] +git-tree-sha1 = "45e428421666073eab6f2da5c9d310d99bb12f9b" +uuid = "189a3867-3050-52da-a836-e630ba90ab69" +version = "1.2.2" + +[[Requires]] +deps = ["UUIDs"] +git-tree-sha1 = "4036a3bd08ac7e968e27c203d45f5fff15020621" +uuid = "ae029012-a4dd-5104-9daa-d747884805df" +version = "1.1.3" + +[[SHA]] +uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" + +[[Serialization]] +uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" + +[[Setfield]] +deps = ["ConstructionBase", "Future", "MacroTools", "Requires"] +git-tree-sha1 = 
"fca29e68c5062722b5b4435594c3d1ba557072a3" +uuid = "efcf1570-3423-57d1-acb7-fd33fddbac46" +version = "0.7.1" + +[[SharedArrays]] +deps = ["Distributed", "Mmap", "Random", "Serialization"] +uuid = "1a1011a3-84de-559e-8e89-a11a2f7dc383" + +[[Sockets]] +uuid = "6462fe0b-24de-5631-8697-dd941f90decc" + +[[SparseArrays]] +deps = ["LinearAlgebra", "Random"] +uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" + +[[SpecialFunctions]] +deps = ["ChainRulesCore", "LogExpFunctions", "OpenSpecFun_jll"] +git-tree-sha1 = "a322a9493e49c5f3a10b50df3aedaf1cdb3244b7" +uuid = "276daf66-3868-5448-9aa4-cd146d93841b" +version = "1.6.1" + +[[StaticArrays]] +deps = ["LinearAlgebra", "Random", "Statistics"] +git-tree-sha1 = "3240808c6d463ac46f1c1cd7638375cd22abbccb" +uuid = "90137ffa-7385-5640-81b9-e52037218182" +version = "1.2.12" + +[[Statistics]] +deps = ["LinearAlgebra", "SparseArrays"] +uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" + +[[TOML]] +deps = ["Dates"] +uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76" + +[[Tar]] +deps = ["ArgTools", "SHA"] +uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e" + +[[Test]] +deps = ["InteractiveUtils", "Logging", "Random", "Serialization"] +uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" + +[[TextWrap]] +git-tree-sha1 = "9250ef9b01b66667380cf3275b3f7488d0e25faf" +uuid = "b718987f-49a8-5099-9789-dcd902bef87d" +version = "1.0.1" + +[[TimerOutputs]] +deps = ["ExprTools", "Printf"] +git-tree-sha1 = "209a8326c4f955e2442c07b56029e88bb48299c7" +uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" +version = "0.5.12" + +[[UUIDs]] +deps = ["Random", "SHA"] +uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" + +[[UnPack]] +git-tree-sha1 = "387c1f73762231e86e0c9c5443ce3b4a0a9a0c2b" +uuid = "3a884ed6-31ef-47d7-9d2a-63182c4928ed" +version = "1.0.2" + +[[Unicode]] +uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" + +[[XML2_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Libiconv_jll", "Pkg", "Zlib_jll"] +git-tree-sha1 = "1acf5bdf07aa0907e0a37d3718bb88d4b687b74a" +uuid = 
"02c8fc9c-b97f-50b9-bbe4-9be30ff0a78a" +version = "2.9.12+0" + +[[XSLT_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Libgcrypt_jll", "Libgpg_error_jll", "Libiconv_jll", "Pkg", "XML2_jll", "Zlib_jll"] +git-tree-sha1 = "91844873c4085240b95e795f692c4cec4d805f8a" +uuid = "aed1982a-8fda-507f-9586-7b0439959a61" +version = "1.1.34+0" + +[[XZ_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "a921669cd9a45c23031fd4eb904f5cc3d20de415" +uuid = "ffd25f8a-64ca-5728-b0f7-c24cf3aae800" +version = "5.2.5+2" + +[[Xorg_libX11_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libxcb_jll", "Xorg_xtrans_jll"] +git-tree-sha1 = "5be649d550f3f4b95308bf0183b82e2582876527" +uuid = "4f6342f7-b3d2-589e-9d20-edeb45f2b2bc" +version = "1.6.9+4" + +[[Xorg_libXau_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "4e490d5c960c314f33885790ed410ff3a94ce67e" +uuid = "0c0b7dd1-d40b-584c-a123-a41640f87eec" +version = "1.0.9+4" + +[[Xorg_libXdmcp_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "4fe47bd2247248125c428978740e18a681372dd4" +uuid = "a3789734-cfe1-5b06-b2d0-1dd0d9d62d05" +version = "1.1.3+4" + +[[Xorg_libXext_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libX11_jll"] +git-tree-sha1 = "b7c0aa8c376b31e4852b360222848637f481f8c3" +uuid = "1082639a-0dae-5f34-9b06-72781eeb8cb3" +version = "1.3.4+4" + +[[Xorg_libpthread_stubs_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "6783737e45d3c59a4a4c4091f5f88cdcf0908cbb" +uuid = "14d82f49-176c-5ed1-bb49-ad3f5cbd8c74" +version = "0.1.0+3" + +[[Xorg_libxcb_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "XSLT_jll", "Xorg_libXau_jll", "Xorg_libXdmcp_jll", "Xorg_libpthread_stubs_jll"] +git-tree-sha1 = "daf17f441228e7a3833846cd048892861cff16d6" +uuid = "c7cfdc94-dc32-55de-ac96-5a1b8d977c5b" +version = "1.13.0+3" + +[[Xorg_xorgproto_jll]] +deps = ["Libdl", "Pkg"] +git-tree-sha1 = 
"9a9eb8ce756fe0bca01b4be16da770e18d264972" +uuid = "c4d99508-4286-5418-9131-c86396af500b" +version = "2019.2.0+2" + +[[Xorg_xtrans_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "79c31e7844f6ecf779705fbc12146eb190b7d845" +uuid = "c5fb5394-a638-5e4d-96e5-b29de1b5cf10" +version = "1.4.0+3" + +[[Zlib_jll]] +deps = ["Libdl"] +uuid = "83775a58-1f1d-513f-b197-d71354ab007a" + +[[argp_standalone_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "feaf9f6293003c2bf53056fd6930d677ed340b34" +uuid = "c53206cc-00f7-50bf-ad1e-3ae1f6e49bc3" +version = "1.3.1+0" + +[[fts_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "78732b942383d2cb521df8a1a0814911144e663d" +uuid = "d65627f6-89bd-53e8-8ab5-8b75ff535eee" +version = "1.2.7+1" + +[[hsa_rocr_jll]] +deps = ["Artifacts", "Elfutils_jll", "JLLWrappers", "Libdl", "NUMA_jll", "Pkg", "Zlib_jll", "hsakmt_roct_jll"] +git-tree-sha1 = "df8d73efec8b1e53ad527d208f5343c0368f0fcd" +uuid = "dd59ff1a-a01a-568d-8b29-0669330f116a" +version = "4.0.0+0" + +[[hsakmt_roct_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "NUMA_jll", "Pkg"] +git-tree-sha1 = "80e0c9940e15cfd6f1f1e9d9f3953ec4d48d3d4a" +uuid = "1cecccd7-a9b6-5045-9cdc-a44c19b16d76" +version = "4.0.0+0" + +[[nghttp2_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d" + +[[obstack_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "1c4a6b66e934fc6db4649cb2910c72f53bbfea7e" +uuid = "c88a4935-d25e-5644-aacc-5db6f1b8ef79" +version = "1.2.2+0" + +[[p7zip_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0" diff --git a/JuliaStream.jl/KernelAbstractions/Project.toml b/JuliaStream.jl/KernelAbstractions/Project.toml new file mode 100644 index 0000000..71715ff --- /dev/null +++ b/JuliaStream.jl/KernelAbstractions/Project.toml @@ -0,0 +1,11 @@ +[deps] +AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" +ArgParse = 
"c7e460c6-2fb9-53a9-8c5b-16f535851c63" +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" +CUDAKernels = "72cfdca4-0801-4ab0-bf6a-d52aa10adc57" +KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" +Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a" +ROCKernels = "7eb9e9f0-4bd3-4c4c-8bef-26bd9629d9b9" + +[compat] +julia = "1.6" diff --git a/JuliaStream.jl/Manifest.toml b/JuliaStream.jl/Manifest.toml new file mode 100644 index 0000000..c096e05 --- /dev/null +++ b/JuliaStream.jl/Manifest.toml @@ -0,0 +1,493 @@ +# This file is machine-generated - editing it directly is not advised + +[[AMDGPU]] +deps = ["AbstractFFTs", "Adapt", "BinaryProvider", "CEnum", "GPUArrays", "GPUCompiler", "LLVM", "Libdl", "LinearAlgebra", "MacroTools", "Printf", "Random", "Requires", "Setfield", "hsa_rocr_jll", "hsakmt_roct_jll"] +git-tree-sha1 = "04fdb3923ac6f55fa7347dce0f0f6f10e321e2e9" +uuid = "21141c5a-9bdb-4563-92ae-f87d6854732e" +version = "0.2.7" + +[[AbstractFFTs]] +deps = ["LinearAlgebra"] +git-tree-sha1 = "485ee0867925449198280d4af84bdb46a2a404d0" +uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c" +version = "1.0.1" + +[[Adapt]] +deps = ["LinearAlgebra"] +git-tree-sha1 = "84918055d15b3114ede17ac6a7182f68870c16f7" +uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" +version = "3.3.1" + +[[ArgParse]] +deps = ["Logging", "TextWrap"] +git-tree-sha1 = "3102bce13da501c9104df33549f511cd25264d7d" +uuid = "c7e460c6-2fb9-53a9-8c5b-16f535851c63" +version = "1.1.4" + +[[ArgTools]] +uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f" + +[[Artifacts]] +uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" + +[[BFloat16s]] +deps = ["LinearAlgebra", "Test"] +git-tree-sha1 = "4af69e205efc343068dc8722b8dfec1ade89254a" +uuid = "ab4f0b2a-ad5b-11e8-123f-65d77653426b" +version = "0.1.0" + +[[Base64]] +uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" + +[[BinaryProvider]] +deps = ["Libdl", "Logging", "SHA"] +git-tree-sha1 = "ecdec412a9abc8db54c0efc5548c64dfce072058" +uuid = "b99e7846-7c00-51b0-8f62-c81ae34c0232" +version = 
"0.5.10" + +[[Bzip2_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "19a35467a82e236ff51bc17a3a44b69ef35185a2" +uuid = "6e34b625-4abd-537c-b88f-471c36dfa7a0" +version = "1.0.8+0" + +[[CEnum]] +git-tree-sha1 = "215a9aa4a1f23fbd05b92769fdd62559488d70e9" +uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82" +version = "0.4.1" + +[[CUDA]] +deps = ["AbstractFFTs", "Adapt", "BFloat16s", "CEnum", "CompilerSupportLibraries_jll", "DataStructures", "ExprTools", "GPUArrays", "GPUCompiler", "LLVM", "LazyArtifacts", "Libdl", "LinearAlgebra", "Logging", "MacroTools", "Memoize", "Printf", "Random", "Random123", "RandomNumbers", "Reexport", "Requires", "SparseArrays", "SpecialFunctions", "TimerOutputs"] +git-tree-sha1 = "364179416eabc34c9ca32126a6bdb431680c3bad" +uuid = "052768ef-5323-5732-b1bb-66c8b64840ba" +version = "3.2.1" + +[[CUDAKernels]] +deps = ["Adapt", "CUDA", "Cassette", "KernelAbstractions", "SpecialFunctions", "StaticArrays"] +git-tree-sha1 = "81f76297b63c67723b1d60f5e7e002ae3393974b" +uuid = "72cfdca4-0801-4ab0-bf6a-d52aa10adc57" +version = "0.3.0" + +[[Cassette]] +git-tree-sha1 = "b4b1d61ebbae2bc69a45e3a6b8439b4e411bc131" +uuid = "7057c7e9-c182-5462-911a-8362d720325c" +version = "0.3.8" + +[[ChainRulesCore]] +deps = ["Compat", "LinearAlgebra", "SparseArrays"] +git-tree-sha1 = "bdc0937269321858ab2a4f288486cb258b9a0af7" +uuid = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" +version = "1.3.0" + +[[Compat]] +deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "SHA", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"] +git-tree-sha1 = "727e463cfebd0c7b999bbf3e9e7e16f254b94193" +uuid = "34da2185-b29b-5c13-b0c7-acf172513d20" +version = "3.34.0" + +[[CompilerSupportLibraries_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae" + +[[ConstructionBase]] +deps = 
["LinearAlgebra"] +git-tree-sha1 = "f74e9d5388b8620b4cee35d4c5a618dd4dc547f4" +uuid = "187b0558-2788-49d3-abe0-74a17ed4e7c9" +version = "1.3.0" + +[[DataStructures]] +deps = ["Compat", "InteractiveUtils", "OrderedCollections"] +git-tree-sha1 = "7d9d316f04214f7efdbb6398d545446e246eff02" +uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" +version = "0.18.10" + +[[Dates]] +deps = ["Printf"] +uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" + +[[DelimitedFiles]] +deps = ["Mmap"] +uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab" + +[[Distributed]] +deps = ["Random", "Serialization", "Sockets"] +uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b" + +[[DocStringExtensions]] +deps = ["LibGit2"] +git-tree-sha1 = "a32185f5428d3986f47c2ab78b1f216d5e6cc96f" +uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae" +version = "0.8.5" + +[[Downloads]] +deps = ["ArgTools", "LibCURL", "NetworkOptions"] +uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6" + +[[Elfutils_jll]] +deps = ["Artifacts", "Bzip2_jll", "JLLWrappers", "Libdl", "Pkg", "XZ_jll", "Zlib_jll", "argp_standalone_jll", "fts_jll", "obstack_jll"] +git-tree-sha1 = "8f9fcde6d89b0a3ca51cb2028beab462705c5436" +uuid = "ab5a07f8-06af-567f-a878-e8bb879eba5a" +version = "0.182.0+0" + +[[ExprTools]] +git-tree-sha1 = "b7e3d17636b348f005f11040025ae8c6f645fe92" +uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04" +version = "0.1.6" + +[[Future]] +deps = ["Random"] +uuid = "9fa8497b-333b-5362-9e8d-4d0656e87820" + +[[GPUArrays]] +deps = ["AbstractFFTs", "Adapt", "LinearAlgebra", "Printf", "Random", "Serialization", "Statistics"] +git-tree-sha1 = "df5b8569904c5c10e84c640984cfff054b18c086" +uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" +version = "6.4.1" + +[[GPUCompiler]] +deps = ["DataStructures", "ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "Scratch", "Serialization", "TimerOutputs", "UUIDs"] +git-tree-sha1 = "42d635f6d87af125b86288df3819f805fb4d851a" +uuid = "61eb1bfa-7361-4325-ad38-22787b887f55" +version = "0.11.5" + +[[InteractiveUtils]] +deps = 
["Markdown"] +uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" + +[[IrrationalConstants]] +git-tree-sha1 = "f76424439413893a832026ca355fe273e93bce94" +uuid = "92d709cd-6900-40b7-9082-c6be49f344b6" +version = "0.1.0" + +[[JLLWrappers]] +deps = ["Preferences"] +git-tree-sha1 = "642a199af8b68253517b80bd3bfd17eb4e84df6e" +uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210" +version = "1.3.0" + +[[KernelAbstractions]] +deps = ["Adapt", "Cassette", "InteractiveUtils", "MacroTools", "SpecialFunctions", "StaticArrays", "UUIDs"] +git-tree-sha1 = "5e6c70389c1b1e40adb81664ca8cea6ce8127afc" +uuid = "63c18a36-062a-441e-b654-da1e3ab1ce7c" +version = "0.7.0" + +[[LLVM]] +deps = ["CEnum", "Libdl", "Printf", "Unicode"] +git-tree-sha1 = "f57ac3fd2045b50d3db081663837ac5b4096947e" +uuid = "929cbde3-209d-540e-8aea-75f648917ca0" +version = "3.9.0" + +[[LazyArtifacts]] +deps = ["Artifacts", "Pkg"] +uuid = "4af54fe1-eca0-43a8-85a7-787d91b784e3" + +[[LibCURL]] +deps = ["LibCURL_jll", "MozillaCACerts_jll"] +uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21" + +[[LibCURL_jll]] +deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"] +uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0" + +[[LibGit2]] +deps = ["Base64", "NetworkOptions", "Printf", "SHA"] +uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" + +[[LibSSH2_jll]] +deps = ["Artifacts", "Libdl", "MbedTLS_jll"] +uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8" + +[[Libdl]] +uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" + +[[LinearAlgebra]] +deps = ["Libdl"] +uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" + +[[LogExpFunctions]] +deps = ["DocStringExtensions", "IrrationalConstants", "LinearAlgebra"] +git-tree-sha1 = "3d682c07e6dd250ed082f883dc88aee7996bf2cc" +uuid = "2ab3a3ac-af41-5b50-aa03-7779005ae688" +version = "0.3.0" + +[[Logging]] +uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" + +[[MacroTools]] +deps = ["Markdown", "Random"] +git-tree-sha1 = "0fb723cd8c45858c22169b2e42269e53271a6df7" +uuid = 
"1914dd2f-81c6-5fcd-8719-6d5c9610ff09" +version = "0.5.7" + +[[Markdown]] +deps = ["Base64"] +uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" + +[[MbedTLS_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1" + +[[Memoize]] +deps = ["MacroTools"] +git-tree-sha1 = "2b1dfcba103de714d31c033b5dacc2e4a12c7caa" +uuid = "c03570c3-d221-55d1-a50c-7939bbd78826" +version = "0.4.4" + +[[Mmap]] +uuid = "a63ad114-7e13-5084-954f-fe012c677804" + +[[MozillaCACerts_jll]] +uuid = "14a3606d-f60d-562e-9121-12d972cd8159" + +[[NEO_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "gmmlib_jll", "libigc_jll", "oneAPI_Level_Zero_Headers_jll"] +git-tree-sha1 = "c753dd029eb0837658bf8eaee041c19e4ce5bb8c" +uuid = "700fe977-ac61-5f37-bbc8-c6c4b2b6a9fd" +version = "21.12.19358+0" + +[[NUMA_jll]] +deps = ["Libdl", "Pkg"] +git-tree-sha1 = "778f9bd14400cff2c32ed357e12766ac0e3d766e" +uuid = "7f51dc2b-bb24-59f8-b771-bb1490e4195d" +version = "2.0.13+1" + +[[NetworkOptions]] +uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908" + +[[OpenSpecFun_jll]] +deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "13652491f6856acfd2db29360e1bbcd4565d04f1" +uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e" +version = "0.5.5+0" + +[[OrderedCollections]] +git-tree-sha1 = "85f8e6578bf1f9ee0d11e7bb1b1456435479d47c" +uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" +version = "1.4.1" + +[[Parameters]] +deps = ["OrderedCollections", "UnPack"] +git-tree-sha1 = "2276ac65f1e236e0a6ea70baff3f62ad4c625345" +uuid = "d96e819e-fc66-5662-9728-84c9c7592b0a" +version = "0.12.2" + +[[Pkg]] +deps = ["Artifacts", "Dates", "Downloads", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"] +uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" + +[[Preferences]] +deps = ["TOML"] +git-tree-sha1 = "00cfd92944ca9c760982747e9a1d0d5d86ab1e5a" +uuid = "21216c6a-2e73-6563-6e65-726566657250" +version = 
"1.2.2" + +[[Printf]] +deps = ["Unicode"] +uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" + +[[REPL]] +deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"] +uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" + +[[ROCKernels]] +deps = ["AMDGPU", "Adapt", "Cassette", "KernelAbstractions", "SpecialFunctions", "StaticArrays"] +git-tree-sha1 = "41105b861342637dde17797bdd9aaa537aca646b" +uuid = "7eb9e9f0-4bd3-4c4c-8bef-26bd9629d9b9" +version = "0.2.0" + +[[Random]] +deps = ["Serialization"] +uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" + +[[Random123]] +deps = ["Libdl", "Random", "RandomNumbers"] +git-tree-sha1 = "0e8b146557ad1c6deb1367655e052276690e71a3" +uuid = "74087812-796a-5b5d-8853-05524746bad3" +version = "1.4.2" + +[[RandomNumbers]] +deps = ["Random", "Requires"] +git-tree-sha1 = "043da614cc7e95c703498a491e2c21f58a2b8111" +uuid = "e6cf234a-135c-5ec9-84dd-332b85af5143" +version = "1.5.3" + +[[Reexport]] +git-tree-sha1 = "45e428421666073eab6f2da5c9d310d99bb12f9b" +uuid = "189a3867-3050-52da-a836-e630ba90ab69" +version = "1.2.2" + +[[Requires]] +deps = ["UUIDs"] +git-tree-sha1 = "4036a3bd08ac7e968e27c203d45f5fff15020621" +uuid = "ae029012-a4dd-5104-9daa-d747884805df" +version = "1.1.3" + +[[SHA]] +uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" + +[[SPIRV_LLVM_Translator_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "8cca87d57f6ddf19373cc9791fddc741406c8fbf" +uuid = "4a5d46fc-d8cf-5151-a261-86b458210efb" +version = "11.0.0+2" + +[[SPIRV_Tools_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "c0324b7e07bc4649f755bfe7e00f7c6ed6aa353f" +uuid = "6ac6d60f-d740-5983-97d7-a4482c0689f4" +version = "2021.2.0+0" + +[[Scratch]] +deps = ["Dates"] +git-tree-sha1 = "0b4b7f1393cff97c33891da2a0bf69c6ed241fda" +uuid = "6c6a2e73-6563-6170-7368-637461726353" +version = "1.1.0" + +[[Serialization]] +uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" + +[[Setfield]] +deps = ["ConstructionBase", "Future", "MacroTools", "Requires"] 
+git-tree-sha1 = "fca29e68c5062722b5b4435594c3d1ba557072a3" +uuid = "efcf1570-3423-57d1-acb7-fd33fddbac46" +version = "0.7.1" + +[[SharedArrays]] +deps = ["Distributed", "Mmap", "Random", "Serialization"] +uuid = "1a1011a3-84de-559e-8e89-a11a2f7dc383" + +[[Sockets]] +uuid = "6462fe0b-24de-5631-8697-dd941f90decc" + +[[SparseArrays]] +deps = ["LinearAlgebra", "Random"] +uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" + +[[SpecialFunctions]] +deps = ["ChainRulesCore", "LogExpFunctions", "OpenSpecFun_jll"] +git-tree-sha1 = "a322a9493e49c5f3a10b50df3aedaf1cdb3244b7" +uuid = "276daf66-3868-5448-9aa4-cd146d93841b" +version = "1.6.1" + +[[StaticArrays]] +deps = ["LinearAlgebra", "Random", "Statistics"] +git-tree-sha1 = "3240808c6d463ac46f1c1cd7638375cd22abbccb" +uuid = "90137ffa-7385-5640-81b9-e52037218182" +version = "1.2.12" + +[[Statistics]] +deps = ["LinearAlgebra", "SparseArrays"] +uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" + +[[TOML]] +deps = ["Dates"] +uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76" + +[[Tar]] +deps = ["ArgTools", "SHA"] +uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e" + +[[Test]] +deps = ["InteractiveUtils", "Logging", "Random", "Serialization"] +uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" + +[[TextWrap]] +git-tree-sha1 = "9250ef9b01b66667380cf3275b3f7488d0e25faf" +uuid = "b718987f-49a8-5099-9789-dcd902bef87d" +version = "1.0.1" + +[[TimerOutputs]] +deps = ["ExprTools", "Printf"] +git-tree-sha1 = "209a8326c4f955e2442c07b56029e88bb48299c7" +uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" +version = "0.5.12" + +[[UUIDs]] +deps = ["Random", "SHA"] +uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" + +[[UnPack]] +git-tree-sha1 = "387c1f73762231e86e0c9c5443ce3b4a0a9a0c2b" +uuid = "3a884ed6-31ef-47d7-9d2a-63182c4928ed" +version = "1.0.2" + +[[Unicode]] +uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" + +[[XZ_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "a921669cd9a45c23031fd4eb904f5cc3d20de415" +uuid = 
"ffd25f8a-64ca-5728-b0f7-c24cf3aae800" +version = "5.2.5+2" + +[[Zlib_jll]] +deps = ["Libdl"] +uuid = "83775a58-1f1d-513f-b197-d71354ab007a" + +[[argp_standalone_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "feaf9f6293003c2bf53056fd6930d677ed340b34" +uuid = "c53206cc-00f7-50bf-ad1e-3ae1f6e49bc3" +version = "1.3.1+0" + +[[fts_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "78732b942383d2cb521df8a1a0814911144e663d" +uuid = "d65627f6-89bd-53e8-8ab5-8b75ff535eee" +version = "1.2.7+1" + +[[gmmlib_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "4067ef455d4fa67febe26efc3f9565a9bb7ba911" +uuid = "09858cae-167c-5acb-9302-fddc6874d481" +version = "20.3.2+0" + +[[hsa_rocr_jll]] +deps = ["Artifacts", "Elfutils_jll", "JLLWrappers", "Libdl", "NUMA_jll", "Pkg", "Zlib_jll", "hsakmt_roct_jll"] +git-tree-sha1 = "42189f176d6ae4f37c0c0e652fec339bb0bfab5d" +uuid = "dd59ff1a-a01a-568d-8b29-0669330f116a" +version = "3.7.0+1" + +[[hsakmt_roct_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "NUMA_jll", "Pkg"] +git-tree-sha1 = "8a9ee6c091e952e4ea6585d15131d43f789ae041" +uuid = "1cecccd7-a9b6-5045-9cdc-a44c19b16d76" +version = "3.8.0+0" + +[[libigc_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "6140dbf267f7ab57fb791b49f2114374218b5c20" +uuid = "94295238-5935-5bd7-bb0f-b00942e9bdd5" +version = "1.0.6712+0" + +[[nghttp2_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d" + +[[obstack_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "1c4a6b66e934fc6db4649cb2910c72f53bbfea7e" +uuid = "c88a4935-d25e-5644-aacc-5db6f1b8ef79" +version = "1.2.2+0" + +[[oneAPI]] +deps = ["Adapt", "CEnum", "ExprTools", "GPUArrays", "GPUCompiler", "LLVM", "LinearAlgebra", "NEO_jll", "Printf", "Random", "SPIRV_LLVM_Translator_jll", "SPIRV_Tools_jll", "SpecialFunctions", "oneAPI_Level_Zero_Loader_jll"] +git-tree-sha1 = 
"b4a4b84c864e75fe885a1643525f0c97ce310dd9" +uuid = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b" +version = "0.1.3" + +[[oneAPI_Level_Zero_Headers_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "48982fbfd2f3d0a30d644563dcf96892d252b395" +uuid = "f4bc562b-d309-54f8-9efb-476e56f0410d" +version = "1.1.2+1" + +[[oneAPI_Level_Zero_Loader_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "oneAPI_Level_Zero_Headers_jll"] +git-tree-sha1 = "1fa53dfdd32a732f09c254c86403e1abab653fb2" +uuid = "13eca655-d68d-5b81-8367-6d99d727ab01" +version = "1.3.6+0" + +[[p7zip_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0" diff --git a/JuliaStream.jl/Project.toml b/JuliaStream.jl/Project.toml new file mode 100644 index 0000000..9c7d49d --- /dev/null +++ b/JuliaStream.jl/Project.toml @@ -0,0 +1,19 @@ +name = "JuliaStream" +uuid = "1bdcc9b7-f5ed-4705-bc7b-be1b748ec681" +authors = ["Wei-Chen Lin "] +version = "3.4.0" + +[deps] +AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" +ArgParse = "c7e460c6-2fb9-53a9-8c5b-16f535851c63" +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" +CUDAKernels = "72cfdca4-0801-4ab0-bf6a-d52aa10adc57" +Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b" +ExprTools = "e2ba6199-217a-4e67-a87a-7c52f15ade04" +KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" +Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a" +ROCKernels = "7eb9e9f0-4bd3-4c4c-8bef-26bd9629d9b9" +oneAPI = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b" + +[compat] +julia = "1.6" diff --git a/JuliaStream.jl/README.md b/JuliaStream.jl/README.md new file mode 100644 index 0000000..6204da7 --- /dev/null +++ b/JuliaStream.jl/README.md @@ -0,0 +1,67 @@ +JuliaStream.jl +============== + +This is an implementation of BabelStream in Julia which contains the following variants: + + * `PlainStream.jl` - Single threaded `for` + * `ThreadedStream.jl` - Threaded implementation with `Threads.@threads` macros + * `DistributedStream.jl` - Process based 
parallelism with `@distributed` macros + * `CUDAStream.jl` - Direct port of BabelStream's native CUDA implementation using [CUDA.jl](https://github.com/JuliaGPU/CUDA.jl) + * `AMDGPUStream.jl` - Direct port of BabelStream's native HIP implementation using [AMDGPU.jl](https://github.com/JuliaGPU/AMDGPU.jl) + * `oneAPIStream.jl` - Direct port of BabelStream's native SYCL implementation using [oneAPI.jl](https://github.com/JuliaGPU/oneAPI.jl) + * `KernelAbstractionsStream.jl` - Direct port of BabelStream's native CUDA implementation using [KernelAbstractions.jl](https://github.com/JuliaGPU/KernelAbstractions.jl) + +### Build & Run + +Prerequisites + + * Julia >= 1.6 + +A set of reduced-dependency projects is available for the following backends and implementations: + + * `AMDGPU` supports: + - `AMDGPUStream.jl` + * `CUDA` supports: + - `CUDAStream.jl` + * `oneAPI` supports: + - `oneAPIStream.jl` + * `KernelAbstractions` supports: + - `KernelAbstractionsStream.jl` + * `Threaded` supports: + - `PlainStream.jl` + - `ThreadedStream.jl` + - `DistributedStream.jl` + +With Julia on path, run your selected benchmark with: + +```shell +> cd JuliaStream.jl +> julia --project=<BACKEND> -e 'import Pkg; Pkg.instantiate()' # only required on first run +> julia --project=<BACKEND> src/<IMPL>Stream.jl +``` + +For example, to run the CUDA implementation: + +```shell +> cd JuliaStream.jl +> julia --project=CUDA -e 'import Pkg; Pkg.instantiate()' +> julia --project=CUDA src/CUDAStream.jl +``` + +**Important:** + * Julia is 1-indexed, so N >= 1 in `--device N`. + * Thread count for `ThreadedStream` must be set via the `JULIA_NUM_THREADS` environment variable (e.g. `export JULIA_NUM_THREADS=$(nproc)`), otherwise it defaults to 1. + * Worker count for `DistributedStream` is set with `-p <N>` as per the [documentation](https://docs.julialang.org/en/v1/manual/distributed-computing).
+ * Certain implementations such as CUDA and AMDGPU will do hardware detection at runtime and may download and/or compile further software packages for the platform. + +*** + +Alternatively, the top-level project `Project.toml` contains all dependencies needed to run all implementations in `src`. +There may be instances where some packages are locked to an older version because of transitive dependency requirements. + +To run the benchmark using the top-level project, run the benchmark with: +```shell +> cd JuliaStream.jl +> julia --project -e 'import Pkg; Pkg.instantiate()' +> julia --project src/Stream.jl +``` \ No newline at end of file diff --git a/JuliaStream.jl/Threaded/Manifest.toml b/JuliaStream.jl/Threaded/Manifest.toml new file mode 100644 index 0000000..608e2da --- /dev/null +++ b/JuliaStream.jl/Threaded/Manifest.toml @@ -0,0 +1,31 @@ +# This file is machine-generated - editing it directly is not advised + +[[ArgParse]] +deps = ["Logging", "TextWrap"] +git-tree-sha1 = "3102bce13da501c9104df33549f511cd25264d7d" +uuid = "c7e460c6-2fb9-53a9-8c5b-16f535851c63" +version = "1.1.4" + +[[Logging]] +uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" + +[[OrderedCollections]] +git-tree-sha1 = "85f8e6578bf1f9ee0d11e7bb1b1456435479d47c" +uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" +version = "1.4.1" + +[[Parameters]] +deps = ["OrderedCollections", "UnPack"] +git-tree-sha1 = "2276ac65f1e236e0a6ea70baff3f62ad4c625345" +uuid = "d96e819e-fc66-5662-9728-84c9c7592b0a" +version = "0.12.2" + +[[TextWrap]] +git-tree-sha1 = "9250ef9b01b66667380cf3275b3f7488d0e25faf" +uuid = "b718987f-49a8-5099-9789-dcd902bef87d" +version = "1.0.1" + +[[UnPack]] +git-tree-sha1 = "387c1f73762231e86e0c9c5443ce3b4a0a9a0c2b" +uuid = "3a884ed6-31ef-47d7-9d2a-63182c4928ed" +version = "1.0.2" diff --git a/JuliaStream.jl/Threaded/Project.toml b/JuliaStream.jl/Threaded/Project.toml new file mode 100644 index 0000000..b65bdf5 --- /dev/null +++ b/JuliaStream.jl/Threaded/Project.toml @@ -0,0 +1,6 @@ +[deps] 
+ArgParse = "c7e460c6-2fb9-53a9-8c5b-16f535851c63" +Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a" + +[compat] +julia = "1.6" diff --git a/JuliaStream.jl/oneAPI/Manifest.toml b/JuliaStream.jl/oneAPI/Manifest.toml new file mode 100644 index 0000000..82c40fd --- /dev/null +++ b/JuliaStream.jl/oneAPI/Manifest.toml @@ -0,0 +1,319 @@ +# This file is machine-generated - editing it directly is not advised + +[[Adapt]] +deps = ["LinearAlgebra"] +git-tree-sha1 = "84918055d15b3114ede17ac6a7182f68870c16f7" +uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" +version = "3.3.1" + +[[ArgParse]] +deps = ["Logging", "TextWrap"] +git-tree-sha1 = "3102bce13da501c9104df33549f511cd25264d7d" +uuid = "c7e460c6-2fb9-53a9-8c5b-16f535851c63" +version = "1.1.4" + +[[ArgTools]] +uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f" + +[[Artifacts]] +uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" + +[[Base64]] +uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" + +[[CEnum]] +git-tree-sha1 = "215a9aa4a1f23fbd05b92769fdd62559488d70e9" +uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82" +version = "0.4.1" + +[[ChainRulesCore]] +deps = ["Compat", "LinearAlgebra", "SparseArrays"] +git-tree-sha1 = "bdc0937269321858ab2a4f288486cb258b9a0af7" +uuid = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" +version = "1.3.0" + +[[Compat]] +deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "SHA", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"] +git-tree-sha1 = "727e463cfebd0c7b999bbf3e9e7e16f254b94193" +uuid = "34da2185-b29b-5c13-b0c7-acf172513d20" +version = "3.34.0" + +[[CompilerSupportLibraries_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae" + +[[Dates]] +deps = ["Printf"] +uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" + +[[DelimitedFiles]] +deps = ["Mmap"] +uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab" + +[[Distributed]] +deps 
= ["Random", "Serialization", "Sockets"] +uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b" + +[[DocStringExtensions]] +deps = ["LibGit2"] +git-tree-sha1 = "a32185f5428d3986f47c2ab78b1f216d5e6cc96f" +uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae" +version = "0.8.5" + +[[Downloads]] +deps = ["ArgTools", "LibCURL", "NetworkOptions"] +uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6" + +[[ExprTools]] +git-tree-sha1 = "b7e3d17636b348f005f11040025ae8c6f645fe92" +uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04" +version = "0.1.6" + +[[GPUArrays]] +deps = ["Adapt", "LinearAlgebra", "Printf", "Random", "Serialization", "Statistics"] +git-tree-sha1 = "8fac1cf7d6ce0f2249c7acaf25d22e1e85c4a07f" +uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" +version = "8.0.2" + +[[GPUCompiler]] +deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "TimerOutputs", "UUIDs"] +git-tree-sha1 = "4ed2616d5e656c8716736b64da86755467f26cf5" +uuid = "61eb1bfa-7361-4325-ad38-22787b887f55" +version = "0.12.9" + +[[InteractiveUtils]] +deps = ["Markdown"] +uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" + +[[IrrationalConstants]] +git-tree-sha1 = "f76424439413893a832026ca355fe273e93bce94" +uuid = "92d709cd-6900-40b7-9082-c6be49f344b6" +version = "0.1.0" + +[[JLLWrappers]] +deps = ["Preferences"] +git-tree-sha1 = "642a199af8b68253517b80bd3bfd17eb4e84df6e" +uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210" +version = "1.3.0" + +[[LLVM]] +deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Printf", "Unicode"] +git-tree-sha1 = "23a47d417a3cd9c2e73c854bac7dd4731c105ef7" +uuid = "929cbde3-209d-540e-8aea-75f648917ca0" +version = "4.4.0" + +[[LLVMExtra_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "9c360e5ce980b88bb31a7b086dbb19469008154b" +uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab" +version = "0.0.10+0" + +[[LibCURL]] +deps = ["LibCURL_jll", "MozillaCACerts_jll"] +uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21" + +[[LibCURL_jll]] +deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", 
"Zlib_jll", "nghttp2_jll"] +uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0" + +[[LibGit2]] +deps = ["Base64", "NetworkOptions", "Printf", "SHA"] +uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" + +[[LibSSH2_jll]] +deps = ["Artifacts", "Libdl", "MbedTLS_jll"] +uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8" + +[[Libdl]] +uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" + +[[LinearAlgebra]] +deps = ["Libdl"] +uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" + +[[LogExpFunctions]] +deps = ["DocStringExtensions", "IrrationalConstants", "LinearAlgebra"] +git-tree-sha1 = "3d682c07e6dd250ed082f883dc88aee7996bf2cc" +uuid = "2ab3a3ac-af41-5b50-aa03-7779005ae688" +version = "0.3.0" + +[[Logging]] +uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" + +[[Markdown]] +deps = ["Base64"] +uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" + +[[MbedTLS_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1" + +[[Mmap]] +uuid = "a63ad114-7e13-5084-954f-fe012c677804" + +[[MozillaCACerts_jll]] +uuid = "14a3606d-f60d-562e-9121-12d972cd8159" + +[[NEO_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "gmmlib_jll", "libigc_jll", "oneAPI_Level_Zero_Headers_jll"] +git-tree-sha1 = "2bfc354b5684821dcc88f1e477cefd0dd03c60b5" +uuid = "700fe977-ac61-5f37-bbc8-c6c4b2b6a9fd" +version = "21.31.20514+0" + +[[NetworkOptions]] +uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908" + +[[OpenSpecFun_jll]] +deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "13652491f6856acfd2db29360e1bbcd4565d04f1" +uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e" +version = "0.5.5+0" + +[[OrderedCollections]] +git-tree-sha1 = "85f8e6578bf1f9ee0d11e7bb1b1456435479d47c" +uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" +version = "1.4.1" + +[[Parameters]] +deps = ["OrderedCollections", "UnPack"] +git-tree-sha1 = "2276ac65f1e236e0a6ea70baff3f62ad4c625345" +uuid = "d96e819e-fc66-5662-9728-84c9c7592b0a" +version = "0.12.2" + +[[Pkg]] +deps = ["Artifacts", "Dates", 
"Downloads", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"] +uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" + +[[Preferences]] +deps = ["TOML"] +git-tree-sha1 = "00cfd92944ca9c760982747e9a1d0d5d86ab1e5a" +uuid = "21216c6a-2e73-6563-6e65-726566657250" +version = "1.2.2" + +[[Printf]] +deps = ["Unicode"] +uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" + +[[REPL]] +deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"] +uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" + +[[Random]] +deps = ["Serialization"] +uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" + +[[SHA]] +uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" + +[[SPIRV_LLVM_Translator_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "8cca87d57f6ddf19373cc9791fddc741406c8fbf" +uuid = "4a5d46fc-d8cf-5151-a261-86b458210efb" +version = "11.0.0+2" + +[[SPIRV_Tools_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "c0324b7e07bc4649f755bfe7e00f7c6ed6aa353f" +uuid = "6ac6d60f-d740-5983-97d7-a4482c0689f4" +version = "2021.2.0+0" + +[[Serialization]] +uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" + +[[SharedArrays]] +deps = ["Distributed", "Mmap", "Random", "Serialization"] +uuid = "1a1011a3-84de-559e-8e89-a11a2f7dc383" + +[[Sockets]] +uuid = "6462fe0b-24de-5631-8697-dd941f90decc" + +[[SparseArrays]] +deps = ["LinearAlgebra", "Random"] +uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" + +[[SpecialFunctions]] +deps = ["ChainRulesCore", "LogExpFunctions", "OpenSpecFun_jll"] +git-tree-sha1 = "a322a9493e49c5f3a10b50df3aedaf1cdb3244b7" +uuid = "276daf66-3868-5448-9aa4-cd146d93841b" +version = "1.6.1" + +[[Statistics]] +deps = ["LinearAlgebra", "SparseArrays"] +uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" + +[[TOML]] +deps = ["Dates"] +uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76" + +[[Tar]] +deps = ["ArgTools", "SHA"] +uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e" + +[[Test]] +deps = 
["InteractiveUtils", "Logging", "Random", "Serialization"] +uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" + +[[TextWrap]] +git-tree-sha1 = "9250ef9b01b66667380cf3275b3f7488d0e25faf" +uuid = "b718987f-49a8-5099-9789-dcd902bef87d" +version = "1.0.1" + +[[TimerOutputs]] +deps = ["ExprTools", "Printf"] +git-tree-sha1 = "209a8326c4f955e2442c07b56029e88bb48299c7" +uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" +version = "0.5.12" + +[[UUIDs]] +deps = ["Random", "SHA"] +uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" + +[[UnPack]] +git-tree-sha1 = "387c1f73762231e86e0c9c5443ce3b4a0a9a0c2b" +uuid = "3a884ed6-31ef-47d7-9d2a-63182c4928ed" +version = "1.0.2" + +[[Unicode]] +uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" + +[[Zlib_jll]] +deps = ["Libdl"] +uuid = "83775a58-1f1d-513f-b197-d71354ab007a" + +[[gmmlib_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "0d5e5461d21b14853b4c332045c57d2601c403bd" +uuid = "09858cae-167c-5acb-9302-fddc6874d481" +version = "21.2.1+0" + +[[libigc_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "b30a895e7ea52991a3f984ab0302c42858d766c0" +uuid = "94295238-5935-5bd7-bb0f-b00942e9bdd5" +version = "1.0.8173+0" + +[[nghttp2_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d" + +[[oneAPI]] +deps = ["Adapt", "CEnum", "ExprTools", "GPUArrays", "GPUCompiler", "LLVM", "LinearAlgebra", "NEO_jll", "Printf", "Random", "SPIRV_LLVM_Translator_jll", "SPIRV_Tools_jll", "SpecialFunctions", "oneAPI_Level_Zero_Headers_jll", "oneAPI_Level_Zero_Loader_jll"] +git-tree-sha1 = "92e8eefdd4694597994590230ab329545804bdb3" +uuid = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b" +version = "0.2.0" + +[[oneAPI_Level_Zero_Headers_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "e1d123ff9ada6c469a1eaf57e33a74c3cb26a5a4" +uuid = "f4bc562b-d309-54f8-9efb-476e56f0410d" +version = "1.2.13+0" + +[[oneAPI_Level_Zero_Loader_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", 
"oneAPI_Level_Zero_Headers_jll"] +git-tree-sha1 = "50124857f7e87420655929a9c8ca86749826af11" +uuid = "13eca655-d68d-5b81-8367-6d99d727ab01" +version = "1.4.1+0" + +[[p7zip_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0" diff --git a/JuliaStream.jl/oneAPI/Project.toml b/JuliaStream.jl/oneAPI/Project.toml new file mode 100644 index 0000000..9f89f82 --- /dev/null +++ b/JuliaStream.jl/oneAPI/Project.toml @@ -0,0 +1,7 @@ +[deps] +ArgParse = "c7e460c6-2fb9-53a9-8c5b-16f535851c63" +Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a" +oneAPI = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b" + +[compat] +julia = "1.6" diff --git a/JuliaStream.jl/src/AMDGPUStream.jl b/JuliaStream.jl/src/AMDGPUStream.jl new file mode 100644 index 0000000..4dd220c --- /dev/null +++ b/JuliaStream.jl/src/AMDGPUStream.jl @@ -0,0 +1,167 @@ +# AMDGPU.jl doesn't support CPU agents, so this isn't a feature-complete ROCmStream, only AMD GPUs +include("Stream.jl") +using AMDGPU + +const ROCData = StreamData{T,ROCArray{T}} where {T} +const TBSize = 1024::Int +const DotBlocks = 256::Int + +function devices()::Vector{DeviceWithRepr} + try + # AMDGPU.agents()'s internal iteration order isn't stable + sorted = sort(AMDGPU.get_agents(:gpu), by = repr) + map(x -> (x, repr(x), "AMDGPU.jl"), sorted) + catch + # probably unsupported + String[] + end +end + +function make_stream( + arraysize::Int, + scalar::T, + device::DeviceWithRepr, + silent::Bool, +)::Tuple{ROCData{T},Nothing} where {T} + + if arraysize % TBSize != 0 + error("arraysize ($(arraysize)) must be divisible by $(TBSize)!") + end + + # XXX AMDGPU doesn't expose an API for setting the default like CUDA.device!() + # but AMDGPU.get_default_agent returns DEFAULT_AGENT so we can do it by hand + AMDGPU.DEFAULT_AGENT[] = device[1] + selected = AMDGPU.get_default_agent() + if !silent + println("Using GPU HSA device: $(AMDGPU.get_name(selected)) ($(repr(selected)))") + println("Kernel parameters : <<<$(arraysize),$(TBSize)>>>") + 
end + return ( + ROCData{T}( + ROCArray{T}(undef, arraysize), + ROCArray{T}(undef, arraysize), + ROCArray{T}(undef, arraysize), + scalar, + arraysize, + ), + nothing, + ) +end + +function init_arrays!(data::ROCData{T}, _, init::Tuple{T,T,T}) where {T} + AMDGPU.fill!(data.a, init[1]) + AMDGPU.fill!(data.b, init[2]) + AMDGPU.fill!(data.c, init[3]) +end + +function copy!(data::ROCData{T}, _) where {T} + function kernel(a::AbstractArray{T}, c::AbstractArray{T}) + i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x # only workgroupIdx starts at 1 + @inbounds c[i] = a[i] + return + end + AMDGPU.wait( + @roc groupsize = TBSize gridsize = data.size kernel(data.a, data.c) + ) +end + +function mul!(data::ROCData{T}, _) where {T} + function kernel(b::AbstractArray{T}, c::AbstractArray{T}, scalar::T) + i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x # only workgroupIdx starts at 1 + @inbounds b[i] = scalar * c[i] + return + end + AMDGPU.wait( + @roc groupsize = TBSize gridsize = data.size kernel(data.b, data.c, data.scalar) + ) +end + +function add!(data::ROCData{T}, _) where {T} + function kernel(a::AbstractArray{T}, b::AbstractArray{T}, c::AbstractArray{T}) + i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x # only workgroupIdx starts at 1 + @inbounds c[i] = a[i] + b[i] + return + end + AMDGPU.wait( + @roc groupsize = TBSize gridsize = data.size kernel(data.a, data.b, data.c) + ) +end + +function triad!(data::ROCData{T}, _) where {T} + function kernel(a::AbstractArray{T}, b::AbstractArray{T}, c::AbstractArray{T}, scalar::T) + i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x # only workgroupIdx starts at 1 + @inbounds a[i] = b[i] + (scalar * c[i]) + return + end + AMDGPU.wait( + @roc groupsize = TBSize gridsize = data.size kernel( + data.a, + data.b, + data.c, + data.scalar, + ) + ) +end + +function nstream!(data::ROCData{T}, _) where {T} + function kernel(a::AbstractArray{T}, b::AbstractArray{T}, c::AbstractArray{T}, 
scalar::T) + i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x # only workgroupIdx starts at 1 + @inbounds a[i] += b[i] + scalar * c[i] + return + end + AMDGPU.wait( + @roc groupsize = TBSize gridsize = data.size kernel( + data.a, + data.b, + data.c, + data.scalar, + ) + ) +end + +function dot(data::ROCData{T}, _) where {T} + function kernel(a::AbstractArray{T}, b::AbstractArray{T}, size::Int, partial::AbstractArray{T}) + tb_sum = ROCDeviceArray((TBSize,), alloc_local(:reduce, T, TBSize)) + local_i = workitemIdx().x + @inbounds tb_sum[local_i] = 0.0 + + # do dot first + i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x # only workgroupIdx starts at 1 + while i <= size + @inbounds tb_sum[local_i] += a[i] * b[i] + i += TBSize * DotBlocks # XXX don't use (workgroupDim().x * gridDimWG().x) here + end + + # then tree reduction + offset = workgroupDim().x ÷ 2 + while offset > 0 + sync_workgroup() + if (local_i - 1) < offset + @inbounds tb_sum[local_i] += tb_sum[local_i+offset] + end + offset ÷= 2 + end + + if (local_i == 1) + @inbounds partial[workgroupIdx().x] = tb_sum[local_i] + end + + return + end + partial_sum = ROCArray{T}(undef, DotBlocks) + AMDGPU.wait( + @roc groupsize = TBSize gridsize = TBSize * DotBlocks kernel( + data.a, + data.b, + data.size, + partial_sum, + ) + ) + return sum(partial_sum) +end + +function read_data(data::ROCData{T}, _)::VectorData{T} where {T} + return VectorData{T}(data.a, data.b, data.c, data.scalar, data.size) +end + +main() \ No newline at end of file diff --git a/JuliaStream.jl/src/CUDAStream.jl b/JuliaStream.jl/src/CUDAStream.jl new file mode 100644 index 0000000..da3698e --- /dev/null +++ b/JuliaStream.jl/src/CUDAStream.jl @@ -0,0 +1,152 @@ +include("Stream.jl") +using CUDA + +const CuData = StreamData{T,CuArray{T}} where {T} +const TBSize = 1024::Int +const DotBlocks = 256::Int + +function devices()::Vector{DeviceWithRepr} + return !CUDA.functional(false) ? 
String[] : + map(d -> (d, "$(CUDA.name(d)) ($(repr(d)))", "CUDA.jl"), CUDA.devices()) +end + +function make_stream( + arraysize::Int, + scalar::T, + device::DeviceWithRepr, + silent::Bool, +)::Tuple{CuData{T},Nothing} where {T} + + if arraysize % TBSize != 0 + error("arraysize ($(arraysize)) must be divisible by $(TBSize)!") + end + + CUDA.device!(device[1]) + selected = CUDA.device() + # show_reason is set to true here so it dumps CUDA info + # for us regardless of whether it's functional + if !CUDA.functional(true) + error("Non-functional CUDA configuration") + end + if !silent + println("Using CUDA device: $(CUDA.name(selected)) ($(repr(selected)))") + println("Kernel parameters: <<<$(arraysize ÷ TBSize),$(TBSize)>>>") + end + return ( + CuData{T}( + CuArray{T}(undef, arraysize), + CuArray{T}(undef, arraysize), + CuArray{T}(undef, arraysize), + scalar, + arraysize, + ), + nothing, + ) +end + +function init_arrays!(data::CuData{T}, _, init::Tuple{T,T,T}) where {T} + fill!(data.a, init[1]) + fill!(data.b, init[2]) + fill!(data.c, init[3]) +end + +function copy!(data::CuData{T}, _) where {T} + function kernel(a::AbstractArray{T}, c::AbstractArray{T}) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + @inbounds c[i] = a[i] + return + end + @cuda blocks = data.size ÷ TBSize threads = TBSize kernel(data.a, data.c) + CUDA.synchronize() +end + +function mul!(data::CuData{T}, _) where {T} + function kernel(b::AbstractArray{T}, c::AbstractArray{T}, scalar::T) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + @inbounds b[i] = scalar * c[i] + return + end + @cuda blocks = data.size ÷ TBSize threads = TBSize kernel(data.b, data.c, data.scalar) + CUDA.synchronize() +end + +function add!(data::CuData{T}, _) where {T} + function kernel(a::AbstractArray{T}, b::AbstractArray{T}, c::AbstractArray{T}) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + @inbounds c[i] = a[i] + b[i] + return + end + @cuda blocks = data.size ÷ TBSize threads = TBSize 
kernel(data.a, data.b, data.c) + CUDA.synchronize() +end + +function triad!(data::CuData{T}, _) where {T} + function kernel(a::AbstractArray{T}, b::AbstractArray{T}, c::AbstractArray{T}, scalar::T) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + @inbounds a[i] = b[i] + (scalar * c[i]) + return + end + @cuda blocks = data.size ÷ TBSize threads = TBSize kernel( + data.a, + data.b, + data.c, + data.scalar, + ) + CUDA.synchronize() +end + +function nstream!(data::CuData{T}, _) where {T} + function kernel(a::AbstractArray{T}, b::AbstractArray{T}, c::AbstractArray{T}, scalar::T) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + @inbounds a[i] += b[i] + scalar * c[i] + return + end + @cuda blocks = data.size ÷ TBSize threads = TBSize kernel( + data.a, + data.b, + data.c, + data.scalar, + ) + CUDA.synchronize() +end + +function dot(data::CuData{T}, _) where {T} + # direct port of the reduction in CUDAStream.cu + function kernel(a::AbstractArray{T}, b::AbstractArray{T}, size::Int, partial::AbstractArray{T}) + tb_sum = @cuStaticSharedMem(T, TBSize) + local_i = threadIdx().x + @inbounds tb_sum[local_i] = 0.0 + + # do dot first + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + while i <= size + @inbounds tb_sum[local_i] += a[i] * b[i] + i += blockDim().x * gridDim().x + end + + # then tree reduction + offset = blockDim().x ÷ 2 + while offset > 0 + sync_threads() + if (local_i - 1) < offset + @inbounds tb_sum[local_i] += tb_sum[local_i+offset] + end + offset ÷= 2 + end + + if (local_i == 1) + @inbounds partial[blockIdx().x] = tb_sum[local_i] + end + + return + end + partial_sum = CuArray{T}(undef, DotBlocks) + @cuda blocks = DotBlocks threads = TBSize kernel(data.a, data.b, data.size, partial_sum) + return sum(partial_sum) +end + +function read_data(data::CuData{T}, _)::VectorData{T} where {T} + return VectorData{T}(data.a, data.b, data.c, data.scalar, data.size) +end + +main() \ No newline at end of file diff --git 
a/JuliaStream.jl/src/DistributedStream.jl b/JuliaStream.jl/src/DistributedStream.jl new file mode 100644 index 0000000..2e80168 --- /dev/null +++ b/JuliaStream.jl/src/DistributedStream.jl @@ -0,0 +1,85 @@ +using Distributed + +@everywhere using Pkg +@everywhere Pkg.activate("."; io = devnull) # don't spam `Activating environment at...` +@everywhere include("StreamData.jl") +@everywhere include("Stream.jl") +@everywhere using SharedArrays +@everywhere const SharedArrayData = StreamData{T,SharedArray{T}} where {T} + +function devices()::Vector{DeviceWithRepr} + return [(undef, "CPU (localhost) $(nworkers())P", "Distributed.jl")] +end + +function make_stream( + arraysize::Int, + scalar::T, + _::DeviceWithRepr, + silent::Bool, +)::Tuple{SharedArrayData{T},Nothing} where {T} + + if !silent + println("Using max $(nworkers()) process(es) + 1 master") + end + return ( + SharedArrayData{T}( + SharedArray{T}(arraysize), + SharedArray{T}(arraysize), + SharedArray{T}(arraysize), + scalar, + arraysize, + ), + nothing, + ) +end + +function init_arrays!(data::SharedArrayData{T}, _, init::Tuple{T,T,T}) where {T} + + @sync @distributed for i = 1:data.size + @inbounds data.a[i] = init[1] + @inbounds data.b[i] = init[2] + @inbounds data.c[i] = init[3] + end +end + +function copy!(data::SharedArrayData{T}, _) where {T} + @sync @distributed for i = 1:data.size + @inbounds data.c[i] = data.a[i] + end +end + +function mul!(data::SharedArrayData{T}, _) where {T} + @sync @distributed for i = 1:data.size + @inbounds data.b[i] = data.scalar * data.c[i] + end +end + +function add!(data::SharedArrayData{T}, _) where {T} + @sync @distributed for i = 1:data.size + @inbounds data.c[i] = data.a[i] + data.b[i] + end +end + +function triad!(data::SharedArrayData{T}, _) where {T} + @sync @distributed for i = 1:data.size + @inbounds data.a[i] = data.b[i] + (data.scalar * data.c[i]) + end +end + +function nstream!(data::SharedArrayData{T}, _) where {T} + @sync @distributed for i = 1:data.size + 
@inbounds data.a[i] += data.b[i] + data.scalar * data.c[i] + end +end + +function dot(data::SharedArrayData{T}, _) where {T} + return @distributed (+) for i = 1:data.size + @inbounds data.a[i] * data.b[i] + end +end + +function read_data(data::SharedArrayData{T}, _)::VectorData{T} where {T} + return VectorData{T}(data.a, data.b, data.c, data.scalar, data.size) +end + +main() \ No newline at end of file diff --git a/JuliaStream.jl/src/JuliaStream.jl b/JuliaStream.jl/src/JuliaStream.jl new file mode 100644 index 0000000..e01d46d --- /dev/null +++ b/JuliaStream.jl/src/JuliaStream.jl @@ -0,0 +1,4 @@ +module JuliaStream +end + +println("Please run benchmarks directly via `julia --project src/Stream.jl`") \ No newline at end of file diff --git a/JuliaStream.jl/src/KernelAbstractionsStream.jl b/JuliaStream.jl/src/KernelAbstractionsStream.jl new file mode 100644 index 0000000..2b9d9ad --- /dev/null +++ b/JuliaStream.jl/src/KernelAbstractionsStream.jl @@ -0,0 +1,255 @@ +using ROCKernels, CUDAKernels, KernelAbstractions, CUDA, AMDGPU +include("Stream.jl") + +const CuData = StreamData{T,CUDA.CuArray{T}} where {T} +const ROCData = StreamData{T,AMDGPU.ROCArray{T}} where {T} + +const TBSize = 1024::Int +const DotBlocks = 256::Int + +@enum Backend cuda rocm cpu + +struct Context + backend::Backend + device::Device +end + +function list_rocm_devices()::Vector{DeviceWithRepr} + try + # AMDGPU.agents()'s internal iteration order isn't stable + sorted = sort(AMDGPU.get_agents(:gpu), by = repr) + map(x -> (x, repr(x), rocm), sorted) + catch + # probably unsupported + [] + end +end + +function list_cuda_devices()::Vector{DeviceWithRepr} + return !CUDA.functional(false) ? 
String[] : + map(d -> (d, "$(CUDA.name(d)) ($(repr(d)))", cuda), CUDA.devices()) +end + +function devices()::Vector{DeviceWithRepr} + cudas = list_cuda_devices() + rocms = list_rocm_devices() + cpus = [(undef, "$(Sys.cpu_info()[1].model) ($(Threads.nthreads())T)", cpu)] + vcat(cpus, cudas, rocms) +end + +function make_stream( + arraysize::Int, + scalar::T, + device::DeviceWithRepr, + silent::Bool, +) where {T} + + if arraysize % TBSize != 0 + error("arraysize ($(arraysize)) must be divisible by $(TBSize)!") + end + + (selected, _, backend) = device + if backend == cpu + if !silent + println("Using CPU with max $(Threads.nthreads()) threads") + end + partialsum = Vector{T}(undef, DotBlocks) + data = VectorData{T}( + Vector{T}(undef, arraysize), + Vector{T}(undef, arraysize), + Vector{T}(undef, arraysize), + scalar, + arraysize, + ) + backenddevice = CPU() + elseif backend == cuda + CUDA.device!(selected) + if CUDA.device() != selected + error("Cannot select CUDA device, expecting $selected, but got $(CUDA.device())") + end + if !CUDA.functional(true) + error("Non-functional CUDA configuration") + end + if !silent + println("Using CUDA device: $(CUDA.name(selected)) ($(repr(selected)))") + end + partialsum = CuArray{T}(undef, DotBlocks) + data = CuData{T}( + CuArray{T}(undef, arraysize), + CuArray{T}(undef, arraysize), + CuArray{T}(undef, arraysize), + scalar, + arraysize, + ) + backenddevice = CUDADevice() + elseif backend == rocm + AMDGPU.DEFAULT_AGENT[] = selected + if AMDGPU.get_default_agent() != selected + error( + "Cannot select HSA device, expecting $selected, but got $(AMDGPU.get_default_agent())", + ) + end + if !silent + println("Using GPU HSA device: $(AMDGPU.get_name(selected)) ($(repr(selected)))") + end + partialsum = ROCArray{T}(undef, DotBlocks) + data = ROCData{T}( + ROCArray{T}(undef, arraysize), + ROCArray{T}(undef, arraysize), + ROCArray{T}(undef, arraysize), + scalar, + arraysize, + ) + backenddevice = ROCDevice() + else + error("unsupported 
backend $(backend)") + end + + if !silent + println("Kernel parameters : <<<$(data.size),$(TBSize)>>>") + end + return (data, Context(backend, backenddevice)) +end + +function init_arrays!( + data::StreamData{T,C}, + context::Context, + init::Tuple{T,T,T}, +) where {T,C} + if context.backend == cpu + Threads.@threads for i = 1:data.size + @inbounds data.a[i] = init[1] + @inbounds data.b[i] = init[2] + @inbounds data.c[i] = init[3] + end + elseif context.backend == cuda + CUDA.fill!(data.a, init[1]) + CUDA.fill!(data.b, init[2]) + CUDA.fill!(data.c, init[3]) + elseif context.backend == rocm + AMDGPU.fill!(data.a, init[1]) + AMDGPU.fill!(data.b, init[2]) + AMDGPU.fill!(data.c, init[3]) + else + error("unsupported backend $(context.backend)") + end +end + +function copy!(data::StreamData{T,C}, context::Context) where {T,C} + @kernel function kernel(@Const(a::AbstractArray{T}), c) + i = @index(Global) + @inbounds c[i] = a[i] + end + wait(kernel(context.device, TBSize)(data.a, data.c, ndrange = data.size)) +end + +function mul!(data::StreamData{T,C}, context::Context) where {T,C} + @kernel function kernel(b::AbstractArray{T}, @Const(c::AbstractArray{T}), scalar::T) + i = @index(Global) + @inbounds b[i] = scalar * c[i] + end + wait(kernel(context.device, TBSize)(data.b, data.c, data.scalar, ndrange = data.size)) +end + +function add!(data::StreamData{T,C}, context::Context) where {T,C} + @kernel function kernel(@Const(a::AbstractArray{T}), @Const(b::AbstractArray{T}), c) + i = @index(Global) + @inbounds c[i] = a[i] + b[i] + end + wait(kernel(context.device, TBSize)(data.a, data.b, data.c, ndrange = data.size)) +end + +function triad!(data::StreamData{T,C}, context::Context) where {T,C} + @kernel function kernel(a::AbstractArray{T}, @Const(b::AbstractArray{T}), @Const(c), scalar::T) + i = @index(Global) + @inbounds a[i] = b[i] + (scalar * c[i]) + end + wait( + kernel(context.device, TBSize)( + data.a, + data.b, + data.c, + data.scalar, + ndrange = data.size, + ), + ) +end +
+function nstream!(data::StreamData{T,C}, context::Context) where {T,C} + @kernel function kernel(a::AbstractArray{T}, @Const(b::AbstractArray{T}), @Const(c), scalar::T) + i = @index(Global) + @inbounds a[i] += b[i] + scalar * c[i] + end + wait( + kernel(context.device, TBSize)( + data.a, + data.b, + data.c, + data.scalar, + ndrange = data.size, + ), + ) +end + +function dot(data::StreamData{T,C}, context::Context) where {T,C} + @kernel function kernel(@Const(a::AbstractArray{T}), @Const(b::AbstractArray{T}), size::Int, partial::AbstractArray{T}) + local_i = @index(Local) + group_i = @index(Group) + tb_sum = @localmem T TBSize + @inbounds tb_sum[local_i] = 0.0 + + # do dot first + i = @index(Global) + while i <= size + @inbounds tb_sum[local_i] += a[i] * b[i] + i += TBSize * DotBlocks + end + + # then tree reduction + # FIXME this does not compile when targeting CPUs: + # see https://github.com/JuliaGPU/KernelAbstractions.jl/issues/262 + offset = @private Int64 (1,) + @inbounds begin + offset[1] = @groupsize()[1] ÷ 2 + while offset[1] > 0 + @synchronize + if (local_i - 1) < offset[1] + tb_sum[local_i] += tb_sum[local_i+offset[1]] + end + offset[1] ÷= 2 + end + end + + if (local_i == 1) + @inbounds partial[group_i] = tb_sum[local_i] + end + end + + if context.backend == cpu + partial_sum = Vector{T}(undef, DotBlocks) + elseif context.backend == cuda + partial_sum = CuArray{T}(undef, DotBlocks) + elseif context.backend == rocm + partial_sum = ROCArray{T}(undef, DotBlocks) + else + error("unsupported backend $(context.backend)") + end + + wait( + kernel(context.device, TBSize)( + data.a, + data.b, + data.size, + partial_sum, + ndrange = TBSize * DotBlocks, + ), + ) + + return sum(partial_sum) +end + +function read_data(data::StreamData{T,C}, _::Context)::VectorData{T} where {T,C} + return VectorData{T}(data.a, data.b, data.c, data.scalar, data.size) +end + +main() diff --git a/JuliaStream.jl/src/PlainStream.jl b/JuliaStream.jl/src/PlainStream.jl new file mode 100644 index
0000000..654d6eb --- /dev/null +++ b/JuliaStream.jl/src/PlainStream.jl @@ -0,0 +1,75 @@ +include("Stream.jl") + +function devices()::Vector{DeviceWithRepr} + return [(undef, "CPU", "Plain")] +end + +function make_stream( + arraysize::Int, + scalar::T, + _::DeviceWithRepr, + silent::Bool, +)::Tuple{VectorData{T},Nothing} where {T} + return ( + VectorData{T}( + Vector{T}(undef, arraysize), + Vector{T}(undef, arraysize), + Vector{T}(undef, arraysize), + scalar, + arraysize, + ), + nothing + ) +end + +function init_arrays!(data::VectorData{T}, _, init::Tuple{T,T,T}) where {T} + for i = 1:data.size + @inbounds data.a[i] = init[1] + @inbounds data.b[i] = init[2] + @inbounds data.c[i] = init[3] + end +end + +function copy!(data::VectorData{T}, _) where {T} + for i = 1:data.size + @inbounds data.c[i] = data.a[i] + end +end + +function mul!(data::VectorData{T}, _) where {T} + for i = 1:data.size + @inbounds data.b[i] = data.scalar * data.c[i] + end +end + +function add!(data::VectorData{T}, _) where {T} + for i = 1:data.size + @inbounds data.c[i] = data.a[i] + data.b[i] + end +end + +function triad!(data::VectorData{T}, _) where {T} + for i = 1:data.size + @inbounds data.a[i] = data.b[i] + (data.scalar * data.c[i]) + end +end + +function nstream!(data::VectorData{T}, _) where {T} + for i = 1:data.size + @inbounds data.a[i] += data.b[i] + data.scalar * data.c[i] + end +end + +function dot(data::VectorData{T}, _) where {T} + sum = zero(T) + for i = 1:data.size + @inbounds sum += data.a[i] * data.b[i] + end + return sum +end + +function read_data(data::VectorData{T}, _)::VectorData{T} where {T} + return data +end + +main() \ No newline at end of file diff --git a/JuliaStream.jl/src/Stream.jl b/JuliaStream.jl/src/Stream.jl new file mode 100644 index 0000000..1905c81 --- /dev/null +++ b/JuliaStream.jl/src/Stream.jl @@ -0,0 +1,300 @@ +using ArgParse +using Parameters +using Printf +using Base: Float64, Int + +include("StreamData.jl") + +const VectorData = StreamData{T,Vector{T}}
where {T} + +const DeviceWithRepr = Tuple{Any,String,Any} + +struct Timings + copy::Vector{Float64} + mul::Vector{Float64} + add::Vector{Float64} + triad::Vector{Float64} + dot::Vector{Float64} + Timings(n) = new(zeros(n), zeros(n), zeros(n), zeros(n), zeros(n)) +end + +@enum Benchmark All Triad Nstream + +function run_all!(data::StreamData{T,C}, context, times::Int)::Tuple{Timings,T} where {T,C} + timings = Timings(times) + lastSum::T = 0 + for i = 1:times + @inbounds timings.copy[i] = @elapsed copy!(data, context) + @inbounds timings.mul[i] = @elapsed mul!(data, context) + @inbounds timings.add[i] = @elapsed add!(data, context) + @inbounds timings.triad[i] = @elapsed triad!(data, context) + @inbounds timings.dot[i] = @elapsed lastSum = dot(data, context) + end + return (timings, lastSum) +end + +function run_triad!(data::StreamData{T,C}, context, times::Int)::Float64 where {T,C} + return @elapsed for _ = 1:times + triad!(data, context) + end +end + +function run_nstream!( + data::StreamData{T,C}, + context, + times::Int, +)::Vector{Float64} where {T,C} + timings::Vector{Float64} = zeros(times) + for i = 1:times + @inbounds timings[i] = @elapsed nstream!(data, context) + end + return timings +end + +function check_solutions( + data::StreamData{T,C}, + times::Int, + init::Tuple{T,T,T}, + benchmark::Benchmark, + dot::Union{T,Nothing}, +) where {T,C} + (gold_a, gold_b, gold_c) = init + for _ = 1:times + if benchmark == All + gold_c = gold_a + gold_b = data.scalar * gold_c + gold_c = gold_a + gold_b + gold_a = gold_b + data.scalar * gold_c + elseif benchmark == Triad + gold_a = gold_b + data.scalar * gold_c + elseif benchmark == Nstream + gold_a += gold_b + data.scalar * gold_c + else + error("Unknown benchmark", benchmark) + end + end + + tolerance = eps(T) * 100 + function validate_xs(name::String, xs::AbstractArray{T}, from::T) + error = (map(x -> abs(x - from), xs) |> sum) / length(xs) + failed = error > tolerance + if failed + println("Validation failed on $name. 
Average error $error") + end + !failed + end + a_valid = validate_xs("a", data.a, gold_a) + b_valid = validate_xs("b", data.b, gold_b) + c_valid = validate_xs("c", data.c, gold_c) + dot_valid = + dot !== nothing ? + begin + gold_sum = gold_a * gold_b * data.size + error = abs((dot - gold_sum) / gold_sum) + failed = error > 1.0e-8 + if failed + println( + "Validation failed on sum. Error $error \nSum was $dot but should be $gold_sum", + ) + end + !failed + end : true + + a_valid && b_valid && c_valid && dot_valid +end + +@with_kw mutable struct Config + list::Bool = false + device::Int = 1 + numtimes::Int = 100 + arraysize::Int = 33554432 + float::Bool = false + triad_only::Bool = false + nstream_only::Bool = false + csv::Bool = false + mibibytes::Bool = false +end + +function parse_options(given::Config) + s = ArgParseSettings() + @add_arg_table s begin + "--list" + help = "List available devices" + action = :store_true + "--device", "-d" + help = "Select device at DEVICE, NOTE: Julia is 1-indexed" + arg_type = Int + default = given.device + "--numtimes", "-n" + help = "Run the test NUMTIMES times (NUM >= 2)" + arg_type = Int + default = given.numtimes + "--arraysize", "-s" + help = "Use ARRAYSIZE elements in the array" + arg_type = Int + default = given.arraysize + "--float" + help = "Use floats (rather than doubles)" + action = :store_true + "--triad_only" + help = "Only run triad" + action = :store_true + "--nstream_only" + help = "Only run nstream" + action = :store_true + "--csv" + help = "Output as csv table" + action = :store_true + "--mibibytes" + help = "Use MiB=2^20 for bandwidth calculation (default MB=10^6)" + action = :store_true + end + args = parse_args(s) + # surely there's a better way than doing this: + for (arg, val) in args + setproperty!(given, Symbol(arg), val) + end +end + +const DefaultInit = (0.1, 0.2, 0.0) +const DefaultScalar = 0.4 +const Version = "3.4.0" + +function main() + + config::Config = Config() + parse_options(config) + + if 
config.list + for (i, (_,repr, impl)) in enumerate(devices()) + println("[$i] ($impl) $repr") + end + exit(0) + end + + ds = devices() + # TODO implement substring device match + if config.device < 1 || config.device > length(ds) + error( + "Device $(config.device) out of range (1..$(length(ds))), NOTE: Julia is 1-indexed", + ) + else + device = ds[config.device] + end + + type = config.float ? Float32 : Float64 + + if config.nstream_only && !config.triad_only + benchmark = Nstream + elseif !config.nstream_only && config.triad_only + benchmark = Triad + elseif !config.nstream_only && !config.triad_only + benchmark = All + elseif config.nstream_only && config.triad_only + error("Both triad and nstream are enabled, pick one or omit both to run all benchmarks") + else + error("Invalid config: $(repr(config))") + end + + array_bytes = config.arraysize * sizeof(type) + total_bytes = array_bytes * 3 + (mega_scale, mega_suffix, giga_scale, giga_suffix) = + !config.mibibytes ? (1.0e-6, "MB", 1.0e-9, "GB") : (2^-20, "MiB", 2^-30, "GiB") + + if !config.csv + println("""BabelStream + Version: $Version + Implementation: Julia; $(PROGRAM_FILE)""") + println("Running kernels $(config.numtimes) times") + if benchmark == Triad + println("Number of elements: $(config.arraysize)") + end + println("Precision: $(config.float ? 
"float" : "double")") + r1 = n -> round(n; digits = 1) + println( + "Array size: $(r1(mega_scale * array_bytes)) $mega_suffix(=$(r1(giga_scale * array_bytes)) $giga_suffix)", + ) + println( + "Total size: $(r1(mega_scale * total_bytes)) $mega_suffix(=$(r1(giga_scale * total_bytes)) $giga_suffix)", + ) + end + + function mk_row(xs::Vector{Float64}, name::String, total_bytes::Int) + tail = Base.rest(xs) + min = Iterators.minimum(tail) + max = Iterators.maximum(tail) + avg = Iterators.sum(tail) / Iterators.length(tail) + mbps = mega_scale * total_bytes / min + if config.csv + return [ + ("function", name), + ("num_times", config.numtimes), + ("n_elements", config.arraysize), + ("sizeof", total_bytes), + ("max_m$( config.mibibytes ? "i" : "")bytes_per_sec", mbps), + ("min_runtime", min), + ("max_runtime", max), + ("avg_runtime", avg), + ] + else + return [ + ("Function", name), + ("M$(config.mibibytes ? "i" : "")Bytes/sec", round(mbps; digits = 3)), + ("Min (sec)", round(min; digits = 5)), + ("Max", round(max; digits = 5)), + ("Average", round(avg; digits = 5)), + ] + end + end + + function tabulate(rows::Vector{Tuple{String,Any}}...) + header = Base.first(rows) + padding = config.csv ? 0 : 12 + sep = config.csv ? 
"," : "" + map(x -> rpad(x[1], padding), header) |> x -> join(x, sep) |> println + for row in rows + map(x -> rpad(x[2], padding), row) |> x -> join(x, sep) |> println + end + end + + init::Tuple{type,type,type} = DefaultInit + scalar::type = DefaultScalar + + GC.enable(false) + + (data, context) = make_stream(config.arraysize, scalar, device, config.csv) + init_arrays!(data, context, init) + if benchmark == All + (timings, sum) = run_all!(data, context, config.numtimes) + valid = check_solutions(read_data(data, context), config.numtimes, init, benchmark, sum) + tabulate( + mk_row(timings.copy, "Copy", 2 * array_bytes), + mk_row(timings.mul, "Mul", 2 * array_bytes), + mk_row(timings.add, "Add", 3 * array_bytes), + mk_row(timings.triad, "Triad", 3 * array_bytes), + mk_row(timings.dot, "Dot", 2 * array_bytes), + ) + elseif benchmark == Nstream + timings = run_nstream!(data, context, config.numtimes) + valid = + check_solutions(read_data(data, context), config.numtimes, init, benchmark, nothing) + tabulate(mk_row(timings, "Nstream", 4 * array_bytes)) + elseif benchmark == Triad + elapsed = run_triad!(data, context, config.numtimes) + valid = + check_solutions(read_data(data, context), config.numtimes, init, benchmark, nothing) + total_bytes = 3 * array_bytes * config.numtimes + bandwidth = giga_scale * (total_bytes / elapsed) + println("Runtime (seconds): $(round(elapsed; digits=5))") + println("Bandwidth ($giga_suffix/s): $(round(bandwidth; digits=3)) ") + else + error("Bad benchmark $(benchmark)") + end + + GC.enable(true) + + if !valid + exit(1) + end + +end diff --git a/JuliaStream.jl/src/StreamData.jl b/JuliaStream.jl/src/StreamData.jl new file mode 100644 index 0000000..55e055a --- /dev/null +++ b/JuliaStream.jl/src/StreamData.jl @@ -0,0 +1,7 @@ +struct StreamData{T,C<:AbstractArray{T}} + a::C + b::C + c::C + scalar::T + size::Int +end diff --git a/JuliaStream.jl/src/ThreadedStream.jl b/JuliaStream.jl/src/ThreadedStream.jl new file mode 100644 index 
0000000..f282fda --- /dev/null +++ b/JuliaStream.jl/src/ThreadedStream.jl @@ -0,0 +1,112 @@ +include("Stream.jl") + +function devices()::Vector{DeviceWithRepr} + return [(undef, "$(Sys.cpu_info()[1].model) ($(Threads.nthreads())T)", "Threaded")] +end + +function make_stream( + arraysize::Int, + scalar::T, + _::DeviceWithRepr, + silent::Bool, +)::Tuple{VectorData{T},Nothing} where {T} + if !silent + println("Using max $(Threads.nthreads()) threads") + end + return ( + VectorData{T}( + Vector{T}(undef, arraysize), + Vector{T}(undef, arraysize), + Vector{T}(undef, arraysize), + scalar, + arraysize, + ), + nothing + ) +end + +function init_arrays!(data::VectorData{T}, _, init::Tuple{T,T,T}) where {T} + Threads.@threads for i = 1:data.size + @inbounds data.a[i] = init[1] + @inbounds data.b[i] = init[2] + @inbounds data.c[i] = init[3] + end +end + +function copy!(data::VectorData{T}, _) where {T} + Threads.@threads for i = 1:data.size + @inbounds data.c[i] = data.a[i] + end +end + +function mul!(data::VectorData{T}, _) where {T} + Threads.@threads for i = 1:data.size + @inbounds data.b[i] = data.scalar * data.c[i] + end +end + +function add!(data::VectorData{T}, _) where {T} + Threads.@threads for i = 1:data.size + @inbounds data.c[i] = data.a[i] + data.b[i] + end +end + +function triad!(data::VectorData{T}, _) where {T} + Threads.@threads for i = 1:data.size + @inbounds data.a[i] = data.b[i] + (data.scalar * data.c[i]) + end +end + +function nstream!(data::VectorData{T}, _) where {T} + Threads.@threads for i = 1:data.size + @inbounds data.a[i] += data.b[i] + data.scalar * data.c[i] + end +end + +# Threads.@threads/Threads.@spawn doesn't support OpenMP's firstprivate, etc +function static_par_ranged(f::Function, range::Int, n::Int) + stride = range ÷ n + rem = range % n + strides = map(0:n) do i + width = stride + (i < rem ? 1 : 0) + offset = i < rem ? 
(stride + 1) * i : ((stride + 1) * rem) + (stride * (i - rem)) + (offset, width) + end + ccall(:jl_enter_threaded_region, Cvoid, ()) + try + foreach(wait, map(1:n) do group + (offset, size) = strides[group] + task = Task(() -> f(group, offset+1, offset+size)) + task.sticky = true + ccall(:jl_set_task_tid, Cvoid, (Any, Cint), task, group-1) # ccall, so 0-based for group + schedule(task) + end) + finally + ccall(:jl_exit_threaded_region, Cvoid, ()) + end +end + +function dot(data::VectorData{T}, _) where {T} + partial = Vector{T}(undef, Threads.nthreads()) + static_par_ranged(data.size, Threads.nthreads()) do group, startidx, endidx + acc = zero(T) + @simd for i = startidx:endidx + @inbounds acc += data.a[i] * data.b[i] + end + @inbounds partial[group] = acc + end + return sum(partial) + # This doesn't do well on aarch64 because of the excessive Threads.threadid() ccall + # and inhibited vectorisation from the lack of @simd + # partial = zeros(T, Threads.nthreads()) + # Threads.@threads for i = 1:data.size + # @inbounds partial[Threads.threadid()] += (data.a[i] * data.b[i]) + # end + # return sum(partial) +end + +function read_data(data::VectorData{T}, _)::VectorData{T} where {T} + return data +end + +main() \ No newline at end of file diff --git a/JuliaStream.jl/src/oneAPIStream.jl b/JuliaStream.jl/src/oneAPIStream.jl new file mode 100644 index 0000000..83f100e --- /dev/null +++ b/JuliaStream.jl/src/oneAPIStream.jl @@ -0,0 +1,170 @@ +using Base.Iterators: println +using Base.Iterators: println +using Printf: Iterators + +include("Stream.jl") +using oneAPI + +const oneData = StreamData{T,oneArray{T}} where {T} +const DotWGSize = 256::Int + +function devices()::Vector{DeviceWithRepr} + all = map(oneL0.devices, oneL0.drivers()) |> Iterators.flatten |> Iterators.collect + map(dev -> (dev, repr("text/plain", dev), "oneAPi.jl"), all) +end + +function make_stream( + arraysize::Int, + scalar::T, + device::DeviceWithRepr, + silent::Bool, +)::Tuple{oneData{T},Int} where {T} + 
+ oneAPI.allowscalar(false) + oneAPI.device!(device[1]) + + props = oneL0.compute_properties(oneAPI.device()) + groupsize = min(props.maxTotalGroupSize, arraysize) + + if arraysize % groupsize != 0 + error("arraysize ($(arraysize)) must be divisible by $(groupsize)!") + end + + if !silent + println("Using L0 device: $(repr("text/plain",device[1]))") + println("Kernel parameters : <<<$(arraysize),$(groupsize)>>>") + end + return ( + oneData{T}( + oneArray{T}(undef, arraysize), + oneArray{T}(undef, arraysize), + oneArray{T}(undef, arraysize), + scalar, + arraysize, + ), + groupsize, + ) +end + +function init_arrays!(data::oneData{T}, _, init::Tuple{T,T,T}) where {T} + oneAPI.fill!(data.a, init[1]) + oneAPI.fill!(data.b, init[2]) + oneAPI.fill!(data.c, init[3]) +end + +function copy!(data::oneData{T}, groupsize::Int) where {T} + function kernel(a::AbstractArray{T}, c::AbstractArray{T}) + i = get_global_id() + @inbounds c[i] = a[i] + return + end + @oneapi items = groupsize groups = data.size ÷ groupsize kernel( # + data.a, + data.c, + ) + oneAPI.synchronize() +end + +function mul!(data::oneData{T}, groupsize::Int) where {T} + function kernel(b::AbstractArray{T}, c::AbstractArray{T}, scalar::T) + i = get_global_id() + @inbounds b[i] = scalar * c[i] + return + end + @oneapi items = groupsize groups = data.size ÷ groupsize kernel( # + data.b, + data.c, + data.scalar, + ) + oneAPI.synchronize() +end + +function add!(data::oneData{T}, groupsize::Int) where {T} + function kernel(a::AbstractArray{T}, b::AbstractArray{T}, c::AbstractArray{T}) + i = get_global_id() + @inbounds c[i] = a[i] + b[i] + return + end + @oneapi items = groupsize groups = data.size ÷ groupsize kernel( # + data.a, + data.b, + data.c, + ) + oneAPI.synchronize() +end + +function triad!(data::oneData{T}, groupsize::Int) where {T} + function kernel(a::AbstractArray{T}, b::AbstractArray{T}, c::AbstractArray{T}, scalar::T) + i = get_global_id() + @inbounds a[i] = b[i] + (scalar * c[i]) + return + end + 
@oneapi items = groupsize groups = data.size ÷ groupsize kernel( # + data.a, + data.b, + data.c, + data.scalar, + ) + oneAPI.synchronize() +end + +function nstream!(data::oneData{T}, groupsize::Int) where {T} + function kernel(a::AbstractArray{T}, b::AbstractArray{T}, c::AbstractArray{T}, scalar::T) + i = get_global_id() + @inbounds a[i] += b[i] + scalar * c[i] + return + end + @oneapi items = groupsize groups = data.size ÷ groupsize kernel( # + data.a, + data.b, + data.c, + data.scalar, + ) + oneAPI.synchronize() +end + +function dot(data::oneData{T}, groupsize::Int) where {T} + function kernel(a::AbstractArray{T}, b::AbstractArray{T}, size::Int, partial::AbstractArray{T}) + wg_sum = @LocalMemory(T, (DotWGSize,)) + li = get_local_id() + @inbounds wg_sum[li] = 0.0 + + # do dot first + i = get_global_id() + while i <= size + @inbounds wg_sum[li] += a[i] * b[i] + i += get_global_size() + end + + # then tree reduction + offset = get_local_size() ÷ 2 + while offset > 0 + barrier() + if li <= offset + @inbounds wg_sum[li] += wg_sum[li+offset] + end + offset ÷= 2 + end + + if li == 1 + @inbounds partial[get_group_id()] = wg_sum[li] + end + + return + end + partial_sum = oneArray{T}(undef, groupsize) + @oneapi items = DotWGSize groups = groupsize kernel( + data.a, + data.b, + data.size, + partial_sum, + ) + oneAPI.synchronize() + return sum(partial_sum) +end + +function read_data(data::oneData{T}, _)::VectorData{T} where {T} + return VectorData{T}(data.a, data.b, data.c, data.scalar, data.size) +end + +main() \ No newline at end of file diff --git a/JuliaStream.jl/update_all.sh b/JuliaStream.jl/update_all.sh new file mode 100755 index 0000000..ad6c2ee --- /dev/null +++ b/JuliaStream.jl/update_all.sh @@ -0,0 +1,7 @@ +#!/bin/bash +# shellcheck disable=SC2034 disable=SC2153 + +for BACKEND in "." 
"AMDGPU" "CUDA" "oneAPI" "Threaded" "KernelAbstractions" +do + julia --project="$BACKEND" -e 'import Pkg; Pkg.resolve(); Pkg.instantiate(); Pkg.update(); Pkg.gc();' +done \ No newline at end of file diff --git a/README.md b/README.md index 3eaf9a5..598b4cd 100644 --- a/README.md +++ b/README.md @@ -19,9 +19,12 @@ Currently implemented are: - Kokkos - RAJA - SYCL + - TBB This code was previously called GPU-STREAM. +This project also contains implementations in alternative languages with different build systems: +* Scala - [scala-stream](./scala-stream) How is this different to STREAM? -------------------------------- @@ -92,7 +95,7 @@ For example: Use this for linking extra libraries (e.g `-lmylib`, or simply `mylib`) -- CXX_EXTRA_LINKER_FLAGS: Append to linker flags (i.e GCC's `-Wl` or equivalent) --- Available models: omp;ocl;std;std20;hip;cuda;kokkos;sycl;acc;raja +-- Available models: omp;ocl;std;std20;hip;cuda;kokkos;sycl;acc;raja;tbb -- Selected model : ocl -- Supported flags: @@ -160,10 +163,12 @@ Citing Please cite BabelStream via this reference: -> Deakin T, Price J, Martineau M, McIntosh-Smith S. GPU-STREAM v2.0: Benchmarking the achievable memory bandwidth of many-core processors across diverse parallel programming models. 2016. Paper presented at P^3MA Workshop at ISC High Performance, Frankfurt, Germany. +> Deakin T, Price J, Martineau M, McIntosh-Smith S. GPU-STREAM v2.0: Benchmarking the achievable memory bandwidth of many-core processors across diverse parallel programming models. 2016. Paper presented at P^3MA Workshop at ISC High Performance, Frankfurt, Germany. DOI: 10.1007/978-3-319-46079-6_34 **Other BabelStream publications:** +> Deakin T, Price J, Martineau M, McIntosh-Smith S. Evaluating attainable memory bandwidth of parallel programming models via BabelStream. International Journal of Computational Science and Engineering. Special issue. Vol. 17, No. 3, pp. 247–262. 2018. DOI: 10.1504/IJCSE.2018.095847 + > Deakin T, McIntosh-Smith S. 
GPU-STREAM: Benchmarking the achievable memory bandwidth of Graphics Processing Units. 2015. Poster session presented at IEEE/ACM SuperComputing, Austin, United States. You can view the [Poster and Extended Abstract](http://sc15.supercomputing.org/sites/all/themes/SC15images/tech_poster/tech_poster_pages/post150.html). @@ -172,8 +177,6 @@ You can view the [Poster and Extended Abstract](http://sc16.supercomputing.org/s > Raman K, Deakin T, Price J, McIntosh-Smith S. Improving achieved memory bandwidth from C++ codes on Intel Xeon Phi Processor (Knights Landing). IXPUG Spring Meeting, Cambridge, UK, 2017. -> Deakin T, Price J, Martineau M, McIntosh-Smith S. Evaluating attainable memory bandwidth of parallel programming models via BabelStream. International Journal of Computational Science and Engineering. Special issue (in press). 2017. - > Deakin T, Price J, McIntosh-Smith S. Portable methods for measuring cache hierarchy performance. 2017. Poster sessions presented at IEEE/ACM SuperComputing, Denver, United States. You can view the [Poster and Extended Abstract](http://sc17.supercomputing.org/SC17%20Archive/tech_poster/tech_poster_pages/post155.html) diff --git a/TBB.cmake b/TBB.cmake new file mode 100644 index 0000000..e4d6bac --- /dev/null +++ b/TBB.cmake @@ -0,0 +1,29 @@ + +register_flag_optional(ONE_TBB_DIR + "Absolute path to oneTBB (with header `onetbb/tbb.h`) distribution, the directory should contain at least `include/` and `lib/. + If unspecified, the system TBB (with header `tbb/tbb.h`) will be used via CMake's find_package(TBB)." + "") + + +register_flag_optional(PARTITIONER + "Partitioner specifies how a loop template should partition its work among threads. + Possible values are: + AUTO - Optimize range subdivision based on work-stealing events. + AFFINITY - Proportional splitting that optimizes for cache affinity. + STATIC - Distribute work uniformly with no additional load balancing. 
+ SIMPLE - Recursively split its range until it cannot be further subdivided. + See https://spec.oneapi.com/versions/latest/elements/oneTBB/source/algorithms.html#partitioners for more details." + "AUTO") + +macro(setup) + if(ONE_TBB_DIR) + set(TBB_ROOT "${ONE_TBB_DIR}") # see https://github.com/Kitware/VTK/blob/0a31a9a3c1531ae238ac96a372fec4be42282863/CMake/FindTBB.cmake#L34 + # docs on Intel's website refers to TBB_DIR which is not correct + endif() + + + # see https://github.com/oneapi-src/oneTBB/blob/master/cmake/README.md#tbbconfig---integration-of-binary-packages + find_package(TBB REQUIRED) + register_link_library(TBB::tbb) + register_definitions(PARTITIONER_${PARTITIONER}) +endmacro() diff --git a/TBB.make b/TBB.make new file mode 100644 index 0000000..c224a5a --- /dev/null +++ b/TBB.make @@ -0,0 +1,56 @@ + +ifndef COMPILER +define compiler_help +Set COMPILER to change flags (defaulting to GNU). +Available compilers are: + GNU INTEL INTEL_LEGACY + +endef +$(info $(compiler_help)) +COMPILER=GNU +endif + + +CXX_GNU = g++ +CXX_INTEL = icpx +CXX_INTEL_LEGACY = icpc +CXX = $(COMPILER_$(COMPILER)) + +CXXFLAGS_GNU = -march=native +CXXFLAGS_INTEL = -march=native +CXXFLAGS_INTEL_LEGACY = -qopt-streaming-stores=always + +CXX = $(CXX_$(COMPILER)) +CXXFLAGS = -std=c++11 -O3 $(CXXFLAGS_$(COMPILER)) + + + +ifndef PARTITIONER +define partitioner_help +Set PARTITIONER to select TBB's partitioner. +Partitioner specifies how a loop template should partition its work among threads. + +Available options: + AUTO - Optimize range subdivision based on work-stealing events. + AFFINITY - Proportional splitting that optimizes for cache affinity. + STATIC - Distribute work uniformly with no additional load balancing. + SIMPLE - Recursively split its range until it cannot be further subdivided. + +See https://spec.oneapi.com/versions/latest/elements/oneTBB/source/algorithms.html#partitioners +for more details. 
+ +endef +$(info $(partitioner_help)) +PARTITIONER=AUTO +endif + +PARTITIONER_MODE = -DPARTITIONER_$(PARTITIONER) + + +tbb-stream: main.cpp TBBStream.cpp + $(CXX) -DTBB $(PARTITIONER_MODE) $(CXXFLAGS) $^ $(EXTRA_FLAGS) -I$(TBB_DIR)/include -Wl,-rpath,$(TBB_DIR)/lib/intel64/gcc4.8 $(TBB_DIR)/lib/intel64/gcc4.8/libtbb.so -o $@ + +.PHONY: clean +clean: + rm -f tbb-stream + diff --git a/TBBStream.cpp b/TBBStream.cpp new file mode 100644 index 0000000..9c34a50 --- /dev/null +++ b/TBBStream.cpp @@ -0,0 +1,134 @@ +// Copyright (c) 2020 Tom Deakin +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#include "TBBStream.hpp" + +template +TBBStream::TBBStream(const int ARRAY_SIZE, int device) + : partitioner(), range(0, ARRAY_SIZE), a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE) +{ + if(device != 0){ + throw std::runtime_error("Device != 0 is not supported by TBB"); + } + std::cout << "Using TBB partitioner: " PARTITIONER_NAME << std::endl; +} + + +template +void TBBStream::init_arrays(T initA, T initB, T initC) +{ + + tbb::parallel_for(range, [&](const tbb::blocked_range& r) { + for (size_t i = r.begin(); i < r.end(); ++i) { + a[i] = initA; + b[i] = initB; + c[i] = initC; + } + }, partitioner); + +} + +template +void TBBStream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) +{ + // Element-wise copy. 
+ h_a = a; + h_b = b; + h_c = c; +} + +template +void TBBStream::copy() +{ + tbb::parallel_for(range, [&](const tbb::blocked_range& r) { + for (size_t i = r.begin(); i < r.end(); ++i) { + c[i] = a[i]; + } + }, partitioner); +} + +template +void TBBStream::mul() +{ + const T scalar = startScalar; + + tbb::parallel_for(range, [&](const tbb::blocked_range& r) { + for (size_t i = r.begin(); i < r.end(); ++i) { + b[i] = scalar * c[i]; + } + }, partitioner); + +} + +template +void TBBStream::add() +{ + + tbb::parallel_for(range, [&](const tbb::blocked_range& r) { + for (size_t i = r.begin(); i < r.end(); ++i) { + c[i] = a[i] + b[i]; + } + }, partitioner); + +} + +template +void TBBStream::triad() +{ + const T scalar = startScalar; + + tbb::parallel_for(range, [&](const tbb::blocked_range& r) { + for (size_t i = r.begin(); i < r.end(); ++i) { + a[i] = b[i] + scalar * c[i]; + } + }, partitioner); + +} + +template +void TBBStream::nstream() +{ + const T scalar = startScalar; + + tbb::parallel_for(range, [&](const tbb::blocked_range& r) { + for (size_t i = r.begin(); i < r.end(); ++i) { + a[i] += b[i] + scalar * c[i]; + } + }, partitioner); + +} + +template +T TBBStream::dot() +{ + // sum += a[i] * b[i]; + return + tbb::parallel_reduce(range, T{}, [&](const tbb::blocked_range& r, T acc) { + for (size_t i = r.begin(); i < r.end(); ++i) { + acc += a[i] * b[i]; + } + return acc; + }, std::plus(), partitioner); +} + +void listDevices(void) +{ + std::cout << "Listing devices is not supported by TBB" << std::endl; +} + +std::string getDeviceName(const int device) +{ + return std::string("Device name unavailable"); +} + +std::string getDeviceDriver(const int) +{ + return std::string("Device driver unavailable"); +} + +template class TBBStream; +template class TBBStream; + diff --git a/TBBStream.hpp b/TBBStream.hpp new file mode 100644 index 0000000..90763a9 --- /dev/null +++ b/TBBStream.hpp @@ -0,0 +1,62 @@ +// Copyright (c) 2020 Tom Deakin +// University of Bristol HPC +// +// For 
full license terms please see the LICENSE file distributed with this +// source code + +#pragma once + +#include +#include +#include "tbb/tbb.h" +#include "Stream.h" + +#define IMPLEMENTATION_STRING "TBB" + +#if defined(PARTITIONER_AUTO) +using tbb_partitioner = tbb::auto_partitioner; +#define PARTITIONER_NAME "auto_partitioner" +#elif defined(PARTITIONER_AFFINITY) +using tbb_partitioner = tbb::affinity_partitioner; +#define PARTITIONER_NAME "affinity_partitioner" +#elif defined(PARTITIONER_STATIC) +using tbb_partitioner = tbb::static_partitioner; +#define PARTITIONER_NAME "static_partitioner" +#elif defined(PARTITIONER_SIMPLE) +using tbb_partitioner = tbb::simple_partitioner; +#define PARTITIONER_NAME "simple_partitioner" +#else +// default to auto +using tbb_partitioner = tbb::auto_partitioner; +#define PARTITIONER_NAME "auto_partitioner" +#endif + + +template +class TBBStream : public Stream +{ + protected: + + tbb_partitioner partitioner; + tbb::blocked_range range; + // Device side pointers + std::vector a; + std::vector b; + std::vector c; + + public: + TBBStream(const int, int); + ~TBBStream() = default; + + virtual void copy() override; + virtual void add() override; + virtual void mul() override; + virtual void triad() override; + virtual void nstream() override; + virtual T dot() override; + + virtual void init_arrays(T initA, T initB, T initC) override; + virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; + +}; + diff --git a/THRUST.cmake b/THRUST.cmake new file mode 100644 index 0000000..0c286c2 --- /dev/null +++ b/THRUST.cmake @@ -0,0 +1,91 @@ + +register_flag_optional(THRUST_IMPL + "Which Thrust implementation to use, supported options include: + - CUDA (via https://github.com/NVIDIA/thrust) + - ROCM (via https://github.com/ROCmSoftwarePlatform/rocThrust) + " + "CUDA") + +register_flag_optional(SDK_DIR + "Path to the selected Thrust implementation (e.g `/opt/nvidia/hpc_sdk/Linux_x86_64/21.9/cuda/include` for NVHPC, 
`/opt/rocm` for ROCm)" + "") + +register_flag_optional(BACKEND + "[THRUST_IMPL==CUDA] CUDA's Thrust implementation supports the following backends: + - CUDA + - OMP + - TBB + " + "CUDA") + +register_flag_optional(CMAKE_CUDA_COMPILER + "[THRUST_IMPL==CUDA] Path to the CUDA nvcc compiler" + "") + +# XXX we may want to drop this eventually and use CMAKE_CUDA_ARCHITECTURES directly +register_flag_optional(CUDA_ARCH + "[THRUST_IMPL==CUDA] Nvidia architecture, will be passed in via `-arch=` (e.g `sm_70`) for nvcc" + "") + +register_flag_optional(CUDA_EXTRA_FLAGS + "[THRUST_IMPL==CUDA] Additional CUDA flags passed to nvcc, this is appended after `CUDA_ARCH`" + "") + + +macro(setup) + set(CMAKE_CXX_STANDARD 14) + + if ("${THRUST_IMPL}" STREQUAL "CUDA") + + # see CUDA.cmake, we're only adding a few Thrust related libraries here + + if (POLICY CMP0104) + cmake_policy(SET CMP0104 OLD) + endif () + + # add -forward-unknown-to-host-compiler for compatibility reasons + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-extended-lambda -forward-unknown-to-host-compiler -arch=${CUDA_ARCH} ${CUDA_EXTRA_FLAGS}") + enable_language(CUDA) + # CMake defaults to -O2 for CUDA at Release, let's wipe that and use the global RELEASE_FLAG + # appended later + wipe_gcc_style_optimisation_flags(CMAKE_CUDA_FLAGS_${BUILD_TYPE}) + + message(STATUS "NVCC flags: ${CMAKE_CUDA_FLAGS} ${CMAKE_CUDA_FLAGS_${BUILD_TYPE}}") + + + if (SDK_DIR) + find_package(CUB REQUIRED CONFIG PATHS ${SDK_DIR}/cub) + find_package(Thrust REQUIRED CONFIG PATHS ${SDK_DIR}/thrust) + else () + find_package(CUB REQUIRED CONFIG) + find_package(Thrust REQUIRED CONFIG) + endif () + + message(STATUS "Using Thrust backend: ${BACKEND}") + + # this creates the interface that we can link to + thrust_create_target(Thrust HOST CPP DEVICE ${BACKEND}) + + register_link_library(Thrust) + elseif ("${THRUST_IMPL}" STREQUAL "ROCM") + if (SDK_DIR) + find_package(rocprim REQUIRED CONFIG PATHS ${SDK_DIR}/rocprim) + find_package(rocthrust REQUIRED CONFIG 
PATHS ${SDK_DIR}/rocthrust) + else () + find_package(rocprim REQUIRED CONFIG) + find_package(rocthrust REQUIRED CONFIG) + endif () + + # for HIP we treat *.cu files as CXX otherwise CMake doesn't compile them + set_source_files_properties(${IMPL_SOURCES} PROPERTIES LANGUAGE CXX) + + register_link_library(roc::rocthrust) + else () + message(FATAL_ERROR "Unsupported THRUST_IMPL provided: ${THRUST_IMPL}") + endif () + + +endmacro() + + + \ No newline at end of file diff --git a/ThrustStream.cu b/ThrustStream.cu new file mode 100644 index 0000000..3a57ab0 --- /dev/null +++ b/ThrustStream.cu @@ -0,0 +1,235 @@ +// Copyright (c) 2020 Tom Deakin +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#include "ThrustStream.h" +#include +#include +#include +#include + +static inline void synchronise() +{ +// rocThrust doesn't synchronise between thrust calls +#if defined(THRUST_DEVICE_SYSTEM_HIP) && THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_HIP + hipDeviceSynchronize(); +#endif +} + +template +ThrustStream::ThrustStream(const int ARRAY_SIZE, int device) + : array_size{ARRAY_SIZE}, a(array_size), b(array_size), c(array_size) { + std::cout << "Using CUDA device: " << getDeviceName(device) << std::endl; + std::cout << "Driver: " << getDeviceDriver(device) << std::endl; + std::cout << "Thrust version: " << THRUST_VERSION << std::endl; + +#if THRUST_DEVICE_SYSTEM == 0 + // as per Thrust docs, 0 is reserved for undefined backend + std::cout << "Thrust backend: undefined" << std::endl; +#elif THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA + std::cout << "Thrust backend: CUDA" << std::endl; +#elif THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_OMP + std::cout << "Thrust backend: OMP" << std::endl; +#elif THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_TBB + std::cout << "Thrust backend: TBB" << std::endl; +#elif THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CPP + std::cout << "Thrust backend: CPP" << 
std::endl; +#elif THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_TBB + std::cout << "Thrust backend: TBB" << std::endl; +#else + +#if defined(THRUST_DEVICE_SYSTEM_HIP) && THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_HIP + std::cout << "Thrust backend: HIP" << std::endl; +#else + std::cout << "Thrust backend: " << THRUST_DEVICE_SYSTEM << "(unknown)" << std::endl; +#endif + +#endif + +} + +template +void ThrustStream::init_arrays(T initA, T initB, T initC) +{ + thrust::fill(a.begin(), a.end(), initA); + thrust::fill(b.begin(), b.end(), initB); + thrust::fill(c.begin(), c.end(), initC); + synchronise(); +} + +template +void ThrustStream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) +{ + thrust::copy(a.begin(), a.end(), h_a.begin()); + thrust::copy(b.begin(), b.end(), h_b.begin()); + thrust::copy(c.begin(), c.end(), h_c.begin()); +} + +template +void ThrustStream::copy() +{ + thrust::copy(a.begin(), a.end(),c.begin()); + synchronise(); +} + +template +void ThrustStream::mul() +{ + const T scalar = startScalar; + thrust::transform( + c.begin(), + c.end(), + b.begin(), + [=] __device__ __host__ (const T &ci){ + return ci * scalar; + } + ); + synchronise(); +} + +template +void ThrustStream::add() +{ + thrust::transform( + thrust::make_zip_iterator(thrust::make_tuple(a.begin(), b.begin())), + thrust::make_zip_iterator(thrust::make_tuple(a.end(), b.end())), + c.begin(), + thrust::make_zip_function( + [] __device__ __host__ (const T& ai, const T& bi){ + return ai + bi; + }) + ); + synchronise(); +} + +template +void ThrustStream::triad() +{ + const T scalar = startScalar; + thrust::transform( + thrust::make_zip_iterator(thrust::make_tuple(b.begin(), c.begin())), + thrust::make_zip_iterator(thrust::make_tuple(b.end(), c.end())), + a.begin(), + thrust::make_zip_function( + [=] __device__ __host__ (const T& bi, const T& ci){ + return bi + scalar * ci; + }) + ); + synchronise(); +} + +template +void ThrustStream::nstream() +{ + const T scalar = startScalar; + 
thrust::transform( + thrust::make_zip_iterator(thrust::make_tuple(a.begin(), b.begin(), c.begin())), + thrust::make_zip_iterator(thrust::make_tuple(a.end(), b.end(), c.end())), + a.begin(), + thrust::make_zip_function( + [=] __device__ __host__ (const T& ai, const T& bi, const T& ci){ + return ai + bi + scalar * ci; + }) + ); + synchronise(); +} + +template +T ThrustStream::dot() +{ + return thrust::inner_product(a.begin(), a.end(), b.begin(), T{}); +} + +#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA || \ + (defined(THRUST_DEVICE_SYSTEM_HIP) && THRUST_DEVICE_SYSTEM_HIP == THRUST_DEVICE_SYSTEM) + +#ifdef __NVCC__ +#define IMPL_FN__(fn) cuda ## fn +#define IMPL_TYPE__(tpe) cuda ## tpe +#elif defined(__HIP_PLATFORM_HCC__) +#define IMPL_FN__(fn) hip ## fn +#define IMPL_TYPE__(tpe) hip ## tpe ## _t +#else +# error Unsupported compiler for Thrust +#endif + +void check_error(void) +{ + IMPL_FN__(Error_t) err = IMPL_FN__(GetLastError()); + if (err != IMPL_FN__(Success)) + { + std::cerr << "Error: " << IMPL_FN__(GetErrorString(err)) << std::endl; + exit(err); + } +} + +void listDevices(void) +{ + // Get number of devices + int count; + IMPL_FN__(GetDeviceCount(&count)); + check_error(); + + // Print device names + if (count == 0) + { + std::cerr << "No devices found." 
<< std::endl; + } + else + { + std::cout << std::endl; + std::cout << "Devices:" << std::endl; + for (int i = 0; i < count; i++) + { + std::cout << i << ": " << getDeviceName(i) << std::endl; + } + std::cout << std::endl; + } +} + +std::string getDeviceName(const int device) +{ + IMPL_TYPE__(DeviceProp) props = {}; + IMPL_FN__(GetDeviceProperties(&props, device)); + check_error(); + return std::string(props.name); +} + + +std::string getDeviceDriver(const int device) +{ + IMPL_FN__(SetDevice(device)); + check_error(); + int driver; + IMPL_FN__(DriverGetVersion(&driver)); + check_error(); + return std::to_string(driver); +} + +#undef IMPL_FN__ +#undef IMPL_TYPE__ + +#else + +void listDevices(void) +{ + std::cout << "0: CPU" << std::endl; +} + +std::string getDeviceName(const int) +{ + return std::string("(device name unavailable)"); +} + +std::string getDeviceDriver(const int) +{ + return std::string("(device driver unavailable)"); +} + +#endif + +template class ThrustStream; +template class ThrustStream; + diff --git a/ThrustStream.h b/ThrustStream.h new file mode 100644 index 0000000..f87ace7 --- /dev/null +++ b/ThrustStream.h @@ -0,0 +1,43 @@ +// Copyright (c) 2020 Tom Deakin +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#pragma once + +#include +#include +#include + +#include "Stream.h" + +#define IMPLEMENTATION_STRING "Thrust" + +template +class ThrustStream : public Stream +{ + protected: + // Size of arrays + int array_size; + + thrust::device_vector a; + thrust::device_vector b; + thrust::device_vector c; + + public: + ThrustStream(const int, int); + ~ThrustStream() = default; + + virtual void copy() override; + virtual void add() override; + virtual void mul() override; + virtual void triad() override; + virtual void nstream() override; + virtual T dot() override; + + virtual void init_arrays(T initA, T initB, T initC) override; + virtual void read_arrays(std::vector& a, 
std::vector& b, std::vector& c) override; + +}; + diff --git a/java-stream/.gitignore b/java-stream/.gitignore new file mode 100644 index 0000000..2ed994a --- /dev/null +++ b/java-stream/.gitignore @@ -0,0 +1,128 @@ +## File-based project format: +.idea +*.iws +*.iml + +## Plugin-specific files: + +# IntelliJ +/out/ + +# mpeltonen/sbt-idea plugin +.idea_modules/ + +# JIRA plugin +atlassian-ide-plugin.xml + +# Crashlytics plugin (for Android Studio and IntelliJ) +com_crashlytics_export_strings.xml +crashlytics.properties +crashlytics-build.properties +fabric.properties +### VisualStudioCode template +.vscode/* +!.vscode/settings.json +!.vscode/tasks.json +!.vscode/launch.json +!.vscode/extensions.json +### Linux template +*~ + +# temporary files which can be created if a process still has a handle open of a deleted file +.fuse_hidden* + +# KDE directory preferences +.directory + +# Linux trash folder which might appear on any partition or disk +.Trash-* + +# .nfs files are created when an open file is removed but is still being accessed +.nfs* + +# Windows thumbnail cache files +Thumbs.db +ehthumbs.db +ehthumbs_vista.db + +# Folder config file +Desktop.ini + +# Recycle Bin used on file shares +$RECYCLE.BIN/ + +# Windows Installer files +*.cab +*.msi +*.msm +*.msp + +# Windows shortcuts +*.lnk +### Maven template +target/ +pom.xml.tag +pom.xml.releaseBackup +pom.xml.versionsBackup +pom.xml.next +release.properties +dependency-reduced-pom.xml +buildNumber.properties +.mvn/timing.properties + +# Avoid ignoring Maven wrapper jar file (.jar files are usually ignored) +!/.mvn/wrapper/maven-wrapper.jar +### Java template +# Compiled class file +*.class + +# Log file +*.log + +# BlueJ files +*.ctxt + +# Mobile Tools for Java (J2ME) +.mtj.tmp/ + +# Package Files # +*.jar +*.war +*.ear +*.zip +*.tar.gz +*.rar + +# virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml +hs_err_pid* +### macOS template +*.DS_Store +.AppleDouble +.LSOverride + +# 
Icon must end with two \r +Icon + +# Thumbnails +._* + +# Files that might appear in the root of a volume +.DocumentRevisions-V100 +.fseventsd +.Spotlight-V100 +.TemporaryItems +.Trashes +.VolumeIcon.icns +.com.apple.timemachine.donotpresent + +# Directories potentially created on remote AFP share +.AppleDB +.AppleDesktop +Network Trash Folder +Temporary Items +.apdisk + + +!.mvn/**/* + +settings.xml diff --git a/java-stream/.mvn/wrapper/maven-wrapper.jar b/java-stream/.mvn/wrapper/maven-wrapper.jar new file mode 100644 index 0000000..9cc84ea Binary files /dev/null and b/java-stream/.mvn/wrapper/maven-wrapper.jar differ diff --git a/java-stream/.mvn/wrapper/maven-wrapper.properties b/java-stream/.mvn/wrapper/maven-wrapper.properties new file mode 100644 index 0000000..56bb016 --- /dev/null +++ b/java-stream/.mvn/wrapper/maven-wrapper.properties @@ -0,0 +1 @@ +distributionUrl=https://repo1.maven.org/maven2/org/apache/maven/apache-maven/3.5.0/apache-maven-3.5.0-bin.zip \ No newline at end of file diff --git a/java-stream/README.md b/java-stream/README.md new file mode 100644 index 0000000..6c233da --- /dev/null +++ b/java-stream/README.md @@ -0,0 +1,172 @@ +java-stream +=========== + +This is an implementation of BabelStream in Java 8 which contains the following implementations: + +* `jdk-plain` - Single threaded `for` +* `jdk-stream` - Threaded implementation using JDK8's parallel stream API +* `tornadovm` - A [TornadoVM](https://github.com/beehive-lab/TornadoVM) implementation for + PTX/OpenCL +* `aparapi` - A [Aparapi](https://git.qoto.org/aparapi/aparapi) implementation for OpenCL + +### Build & Run + +Prerequisites + +* JDK >= 8 + +To run the benchmark, first create a binary: + +```shell +> cd java-stream +> ./mvnw clean package +``` + +The binary will be located at `./target/java-stream.jar`. 
Run it with: + +```shell +> java -version +openjdk version "11.0.11" 2021-04-20 +OpenJDK Runtime Environment GraalVM CE 21.1.0 (build 11.0.11+8-jvmci-21.1-b05) +OpenJDK 64-Bit Server VM GraalVM CE 21.1.0 (build 11.0.11+8-jvmci-21.1-b05, mixed mode) +> java -jar target/java-stream.jar --help +``` + +For best results, benchmark with the following JVM flags: + +``` +-XX:-UseOnStackReplacement # disable OSR, not useful for this benchmark as we are measuring peak performance +-XX:-TieredCompilation # disable C1, go straight to C2 +-XX:ReservedCodeCacheSize=512m # don't flush compiled code out of cache at any point +``` + +Worked example: + +```shell +> java -XX:-UseOnStackReplacement -XX:-TieredCompilation -XX:ReservedCodeCacheSize=512m -jar target/java-stream.jar +BabelStream +Version: 3.4 +Implementation: jdk-stream; (Java 11.0.11;Red Hat, Inc.; home=/usr/lib/jvm/java-11-openjdk-11.0.11.0.9-4.fc33.x86_64) +Running all 100 times +Precision: double +Array size: 268.4 MB (=0.3 GB) +Total size: 805.3 MB (=0.8 GB) +Function MBytes/sec Min (sec) Max Average +Copy 17145.538 0.03131 0.04779 0.03413 +Mul 16759.092 0.03203 0.04752 0.03579 +Add 19431.954 0.04144 0.05866 0.04503 +Triad 19763.970 0.04075 0.05388 0.04510 +Dot 26646.894 0.02015 0.03013 0.02259 +``` + +If your OpenCL/CUDA installation is not at the default location, TornadoVM and Aparapi may fail to +detect your devices. In those cases, you may specify the library directly, for example: + +```shell +> LD_PRELOAD=/opt/rocm-4.0.0/opencl/lib/libOpenCL.so.1.2 java -jar target/java-stream.jar ... +``` + +### Instructions for TornadoVM + +The TornadoVM implementation requires you to run the binary with a patched JVM. 
Follow the +official [instructions](https://github.com/beehive-lab/TornadoVM/blob/master/assembly/src/docs/10_INSTALL_WITH_GRAALVM.md) +or use the following simplified instructions: + +Prerequisites + +* CMake >= 3.6 +* GCC or clang/LLVM (GCC >= 5.5) +* Python >= 2.7 +* Maven >= 3.6.3 +* OpenCL headers >= 1.2 and/or CUDA SDK >= 9.0 + +First, get a copy of the TornadoVM source: + +```shell +> cd +> git clone https://github.com/beehive-lab/TornadoVM tornadovm +``` + +Take note of the required GraalVM version +in `tornadovm/assembly/src/docs/10_INSTALL_WITH_GRAALVM.md`. We'll use `21.1.0` in this example. +Now, obtain a copy of GraalVM and make sure the version matches the one required by TornadoVM: + +```shell +> wget https://github.com/graalvm/graalvm-ce-builds/releases/download/vm-21.1.0/graalvm-ce-java11-linux-amd64-21.1.0.tar.gz +> tar -xf graalvm-ce-java11-linux-amd64-21.1.0.tar.gz +``` + +Next, create `~/tornadovm/etc/sources.env` and populate the file with the following: + +```shell +#!/bin/bash +export JAVA_HOME=$HOME/graalvm-ce-java11-21.1.0 # path to the GraalVM directory extracted above +export PATH=$PWD/bin/bin:$PATH +export TORNADO_SDK=$PWD/bin/sdk +export CMAKE_ROOT=/usr # path to CMake binary +``` + +Proceed to compile TornadoVM: + +```shell +> cd ~/tornadovm +> . 
etc/sources.env +> make graal-jdk-11-plus BACKEND={ptx,opencl} +``` + +To test your build, source the environment file: + +```shell +> source ~/tornadovm/etc/sources.env +> LD_PRELOAD=/opt/rocm-4.0.0/opencl/lib/libOpenCL.so.1.2 tornado --devices +Number of Tornado drivers: 1 +Total number of OpenCL devices : 3 +Tornado device=0:0 + AMD Accelerated Parallel Processing -- gfx1012 + Global Memory Size: 4.0 GB + Local Memory Size: 64.0 KB + Workgroup Dimensions: 3 + Max WorkGroup Configuration: [1024, 1024, 1024] + Device OpenCL C version: OpenCL C 2.0 + +Tornado device=0:1 + Portable Computing Language -- pthread-AMD Ryzen 9 3900X 12-Core Processor + Global Memory Size: 60.7 GB + Local Memory Size: 8.0 MB + Workgroup Dimensions: 3 + Max WorkGroup Configuration: [4096, 4096, 4096] + Device OpenCL C version: OpenCL C 1.2 pocl + +Tornado device=0:2 + NVIDIA CUDA -- NVIDIA GeForce GT 710 + Global Memory Size: 981.3 MB + Local Memory Size: 48.0 KB + Workgroup Dimensions: 3 + Max WorkGroup Configuration: [1024, 1024, 64] + Device OpenCL C version: OpenCL C 1.2 +``` + +You can now use TornadoVM to run java-stream: + +```shell +> tornado -jar ~/java-stream/target/java-stream.jar --impl tornadovm --arraysize 65536 +BabelStream +Version: 3.4 +Implementation: tornadovm; (Java 11.0.11;GraalVM Community; home=~/graalvm-ce-java11-21.1.0) +Running all 100 times +Precision: double +Array size: 0.5 MB (=0.0 GB) +Total size: 1.6 MB (=0.0 GB) +Using TornadoVM device: + - Name : NVIDIA GeForce GT 710 CL_DEVICE_TYPE_GPU (available) + - Id : opencl-0-0 + - Platform : NVIDIA CUDA + - Backend : OpenCL +Function MBytes/sec Min (sec) Max Average +Copy 8791.100 0.00012 0.00079 0.00015 +Mul 8774.107 0.00012 0.00061 0.00014 +Add 9903.313 0.00016 0.00030 0.00018 +Triad 9861.031 0.00016 0.00030 0.00018 +Dot 2799.465 0.00037 0.00056 0.00041 +``` + diff --git a/java-stream/mvnw b/java-stream/mvnw new file mode 100755 index 0000000..5bf251c --- /dev/null 
+++ b/java-stream/mvnw @@ -0,0 +1,225 @@ +#!/bin/sh +# ---------------------------------------------------------------------------- +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# ---------------------------------------------------------------------------- + +# ---------------------------------------------------------------------------- +# Maven2 Start Up Batch script +# +# Required ENV vars: +# ------------------ +# JAVA_HOME - location of a JDK home dir +# +# Optional ENV vars +# ----------------- +# M2_HOME - location of maven2's installed home dir +# MAVEN_OPTS - parameters passed to the Java VM when running Maven +# e.g. to debug Maven itself, use +# set MAVEN_OPTS=-Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=y,address=8000 +# MAVEN_SKIP_RC - flag to disable loading of mavenrc files +# ---------------------------------------------------------------------------- + +if [ -z "$MAVEN_SKIP_RC" ] ; then + + if [ -f /etc/mavenrc ] ; then + . /etc/mavenrc + fi + + if [ -f "$HOME/.mavenrc" ] ; then + . "$HOME/.mavenrc" + fi + +fi + +# OS specific support. $var _must_ be set to either true or false. 
+cygwin=false; +darwin=false; +mingw=false +case "`uname`" in + CYGWIN*) cygwin=true ;; + MINGW*) mingw=true;; + Darwin*) darwin=true + # Use /usr/libexec/java_home if available, otherwise fall back to /Library/Java/Home + # See https://developer.apple.com/library/mac/qa/qa1170/_index.html + if [ -z "$JAVA_HOME" ]; then + if [ -x "/usr/libexec/java_home" ]; then + export JAVA_HOME="`/usr/libexec/java_home`" + else + export JAVA_HOME="/Library/Java/Home" + fi + fi + ;; +esac + +if [ -z "$JAVA_HOME" ] ; then + if [ -r /etc/gentoo-release ] ; then + JAVA_HOME=`java-config --jre-home` + fi +fi + +if [ -z "$M2_HOME" ] ; then + ## resolve links - $0 may be a link to maven's home + PRG="$0" + + # need this for relative symlinks + while [ -h "$PRG" ] ; do + ls=`ls -ld "$PRG"` + link=`expr "$ls" : '.*-> \(.*\)$'` + if expr "$link" : '/.*' > /dev/null; then + PRG="$link" + else + PRG="`dirname "$PRG"`/$link" + fi + done + + saveddir=`pwd` + + M2_HOME=`dirname "$PRG"`/.. + + # make it fully qualified + M2_HOME=`cd "$M2_HOME" && pwd` + + cd "$saveddir" + # echo Using m2 at $M2_HOME +fi + +# For Cygwin, ensure paths are in UNIX format before anything is touched +if $cygwin ; then + [ -n "$M2_HOME" ] && + M2_HOME=`cygpath --unix "$M2_HOME"` + [ -n "$JAVA_HOME" ] && + JAVA_HOME=`cygpath --unix "$JAVA_HOME"` + [ -n "$CLASSPATH" ] && + CLASSPATH=`cygpath --path --unix "$CLASSPATH"` +fi + +# For Migwn, ensure paths are in UNIX format before anything is touched +if $mingw ; then + [ -n "$M2_HOME" ] && + M2_HOME="`(cd "$M2_HOME"; pwd)`" + [ -n "$JAVA_HOME" ] && + JAVA_HOME="`(cd "$JAVA_HOME"; pwd)`" + # TODO classpath? +fi + +if [ -z "$JAVA_HOME" ]; then + javaExecutable="`which javac`" + if [ -n "$javaExecutable" ] && ! [ "`expr \"$javaExecutable\" : '\([^ ]*\)'`" = "no" ]; then + # readlink(1) is not available as standard on Solaris 10. + readLink=`which readlink` + if [ ! 
`expr "$readLink" : '\([^ ]*\)'` = "no" ]; then + if $darwin ; then + javaHome="`dirname \"$javaExecutable\"`" + javaExecutable="`cd \"$javaHome\" && pwd -P`/javac" + else + javaExecutable="`readlink -f \"$javaExecutable\"`" + fi + javaHome="`dirname \"$javaExecutable\"`" + javaHome=`expr "$javaHome" : '\(.*\)/bin'` + JAVA_HOME="$javaHome" + export JAVA_HOME + fi + fi +fi + +if [ -z "$JAVACMD" ] ; then + if [ -n "$JAVA_HOME" ] ; then + if [ -x "$JAVA_HOME/jre/sh/java" ] ; then + # IBM's JDK on AIX uses strange locations for the executables + JAVACMD="$JAVA_HOME/jre/sh/java" + else + JAVACMD="$JAVA_HOME/bin/java" + fi + else + JAVACMD="`which java`" + fi +fi + +if [ ! -x "$JAVACMD" ] ; then + echo "Error: JAVA_HOME is not defined correctly." >&2 + echo " We cannot execute $JAVACMD" >&2 + exit 1 +fi + +if [ -z "$JAVA_HOME" ] ; then + echo "Warning: JAVA_HOME environment variable is not set." +fi + +CLASSWORLDS_LAUNCHER=org.codehaus.plexus.classworlds.launcher.Launcher + +# traverses directory structure from process work directory to filesystem root +# first directory with .mvn subdirectory is considered project base directory +find_maven_basedir() { + + if [ -z "$1" ] + then + echo "Path not specified to find_maven_basedir" + return 1 + fi + + basedir="$1" + wdir="$1" + while [ "$wdir" != '/' ] ; do + if [ -d "$wdir"/.mvn ] ; then + basedir=$wdir + break + fi + # workaround for JBEAP-8937 (on Solaris 10/Sparc) + if [ -d "${wdir}" ]; then + wdir=`cd "$wdir/.."; pwd` + fi + # end of workaround + done + echo "${basedir}" +} + +# concatenates all lines of a file +concat_lines() { + if [ -f "$1" ]; then + echo "$(tr -s '\n' ' ' < "$1")" + fi +} + +BASE_DIR=`find_maven_basedir "$(pwd)"` +if [ -z "$BASE_DIR" ]; then + exit 1; +fi + +export MAVEN_PROJECTBASEDIR=${MAVEN_BASEDIR:-"$BASE_DIR"} +echo $MAVEN_PROJECTBASEDIR +MAVEN_OPTS="$(concat_lines "$MAVEN_PROJECTBASEDIR/.mvn/jvm.config") $MAVEN_OPTS" + +# For Cygwin, switch paths to Windows format before running java +if 
$cygwin; then + [ -n "$M2_HOME" ] && + M2_HOME=`cygpath --path --windows "$M2_HOME"` + [ -n "$JAVA_HOME" ] && + JAVA_HOME=`cygpath --path --windows "$JAVA_HOME"` + [ -n "$CLASSPATH" ] && + CLASSPATH=`cygpath --path --windows "$CLASSPATH"` + [ -n "$MAVEN_PROJECTBASEDIR" ] && + MAVEN_PROJECTBASEDIR=`cygpath --path --windows "$MAVEN_PROJECTBASEDIR"` +fi + +WRAPPER_LAUNCHER=org.apache.maven.wrapper.MavenWrapperMain + +exec "$JAVACMD" \ + $MAVEN_OPTS \ + -classpath "$MAVEN_PROJECTBASEDIR/.mvn/wrapper/maven-wrapper.jar" \ + "-Dmaven.home=${M2_HOME}" "-Dmaven.multiModuleProjectDirectory=${MAVEN_PROJECTBASEDIR}" \ + ${WRAPPER_LAUNCHER} $MAVEN_CONFIG "$@" diff --git a/java-stream/mvnw.cmd b/java-stream/mvnw.cmd new file mode 100644 index 0000000..019bd74 --- /dev/null +++ b/java-stream/mvnw.cmd @@ -0,0 +1,143 @@ +@REM ---------------------------------------------------------------------------- +@REM Licensed to the Apache Software Foundation (ASF) under one +@REM or more contributor license agreements. See the NOTICE file +@REM distributed with this work for additional information +@REM regarding copyright ownership. The ASF licenses this file +@REM to you under the Apache License, Version 2.0 (the +@REM "License"); you may not use this file except in compliance +@REM with the License. You may obtain a copy of the License at +@REM +@REM http://www.apache.org/licenses/LICENSE-2.0 +@REM +@REM Unless required by applicable law or agreed to in writing, +@REM software distributed under the License is distributed on an +@REM "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +@REM KIND, either express or implied. See the License for the +@REM specific language governing permissions and limitations +@REM under the License. 
+@REM ---------------------------------------------------------------------------- + +@REM ---------------------------------------------------------------------------- +@REM Maven2 Start Up Batch script +@REM +@REM Required ENV vars: +@REM JAVA_HOME - location of a JDK home dir +@REM +@REM Optional ENV vars +@REM M2_HOME - location of maven2's installed home dir +@REM MAVEN_BATCH_ECHO - set to 'on' to enable the echoing of the batch commands +@REM MAVEN_BATCH_PAUSE - set to 'on' to wait for a key stroke before ending +@REM MAVEN_OPTS - parameters passed to the Java VM when running Maven +@REM e.g. to debug Maven itself, use +@REM set MAVEN_OPTS=-Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=y,address=8000 +@REM MAVEN_SKIP_RC - flag to disable loading of mavenrc files +@REM ---------------------------------------------------------------------------- + +@REM Begin all REM lines with '@' in case MAVEN_BATCH_ECHO is 'on' +@echo off +@REM enable echoing my setting MAVEN_BATCH_ECHO to 'on' +@if "%MAVEN_BATCH_ECHO%" == "on" echo %MAVEN_BATCH_ECHO% + +@REM set %HOME% to equivalent of $HOME +if "%HOME%" == "" (set "HOME=%HOMEDRIVE%%HOMEPATH%") + +@REM Execute a user defined script before this one +if not "%MAVEN_SKIP_RC%" == "" goto skipRcPre +@REM check for pre script, once with legacy .bat ending and once with .cmd ending +if exist "%HOME%\mavenrc_pre.bat" call "%HOME%\mavenrc_pre.bat" +if exist "%HOME%\mavenrc_pre.cmd" call "%HOME%\mavenrc_pre.cmd" +:skipRcPre + +@setlocal + +set ERROR_CODE=0 + +@REM To isolate internal variables from possible post scripts, we use another setlocal +@setlocal + +@REM ==== START VALIDATION ==== +if not "%JAVA_HOME%" == "" goto OkJHome + +echo. +echo Error: JAVA_HOME not found in your environment. >&2 +echo Please set the JAVA_HOME variable in your environment to match the >&2 +echo location of your Java installation. >&2 +echo. +goto error + +:OkJHome +if exist "%JAVA_HOME%\bin\java.exe" goto init + +echo. 
+echo Error: JAVA_HOME is set to an invalid directory. >&2 +echo JAVA_HOME = "%JAVA_HOME%" >&2 +echo Please set the JAVA_HOME variable in your environment to match the >&2 +echo location of your Java installation. >&2 +echo. +goto error + +@REM ==== END VALIDATION ==== + +:init + +@REM Find the project base dir, i.e. the directory that contains the folder ".mvn". +@REM Fallback to current working directory if not found. + +set MAVEN_PROJECTBASEDIR=%MAVEN_BASEDIR% +IF NOT "%MAVEN_PROJECTBASEDIR%"=="" goto endDetectBaseDir + +set EXEC_DIR=%CD% +set WDIR=%EXEC_DIR% +:findBaseDir +IF EXIST "%WDIR%"\.mvn goto baseDirFound +cd .. +IF "%WDIR%"=="%CD%" goto baseDirNotFound +set WDIR=%CD% +goto findBaseDir + +:baseDirFound +set MAVEN_PROJECTBASEDIR=%WDIR% +cd "%EXEC_DIR%" +goto endDetectBaseDir + +:baseDirNotFound +set MAVEN_PROJECTBASEDIR=%EXEC_DIR% +cd "%EXEC_DIR%" + +:endDetectBaseDir + +IF NOT EXIST "%MAVEN_PROJECTBASEDIR%\.mvn\jvm.config" goto endReadAdditionalConfig + +@setlocal EnableExtensions EnableDelayedExpansion +for /F "usebackq delims=" %%a in ("%MAVEN_PROJECTBASEDIR%\.mvn\jvm.config") do set JVM_CONFIG_MAVEN_PROPS=!JVM_CONFIG_MAVEN_PROPS! 
%%a +@endlocal & set JVM_CONFIG_MAVEN_PROPS=%JVM_CONFIG_MAVEN_PROPS% + +:endReadAdditionalConfig + +SET MAVEN_JAVA_EXE="%JAVA_HOME%\bin\java.exe" + +set WRAPPER_JAR="%MAVEN_PROJECTBASEDIR%\.mvn\wrapper\maven-wrapper.jar" +set WRAPPER_LAUNCHER=org.apache.maven.wrapper.MavenWrapperMain + +%MAVEN_JAVA_EXE% %JVM_CONFIG_MAVEN_PROPS% %MAVEN_OPTS% %MAVEN_DEBUG_OPTS% -classpath %WRAPPER_JAR% "-Dmaven.multiModuleProjectDirectory=%MAVEN_PROJECTBASEDIR%" %WRAPPER_LAUNCHER% %MAVEN_CONFIG% %* +if ERRORLEVEL 1 goto error +goto end + +:error +set ERROR_CODE=1 + +:end +@endlocal & set ERROR_CODE=%ERROR_CODE% + +if not "%MAVEN_SKIP_RC%" == "" goto skipRcPost +@REM check for post script, once with legacy .bat ending and once with .cmd ending +if exist "%HOME%\mavenrc_post.bat" call "%HOME%\mavenrc_post.bat" +if exist "%HOME%\mavenrc_post.cmd" call "%HOME%\mavenrc_post.cmd" +:skipRcPost + +@REM pause the script if MAVEN_BATCH_PAUSE is set to 'on' +if "%MAVEN_BATCH_PAUSE%" == "on" pause + +if "%MAVEN_TERMINATE_CMD%" == "on" exit %ERROR_CODE% + +exit /B %ERROR_CODE% diff --git a/java-stream/pom.xml b/java-stream/pom.xml new file mode 100644 index 0000000..ffaee72 --- /dev/null +++ b/java-stream/pom.xml @@ -0,0 +1,133 @@ + + + + 4.0.0 + + java-stream + javastream + 3.4.0 + + + UTF-8 + UTF-8 + 5.7.2 + + + + + universityOfManchester-graal + https://raw.githubusercontent.com/beehive-lab/tornado/maven-tornadovm + + + + + + + com.beust + jcommander + 1.81 + + + + tornado + tornado-api + 0.9 + + + + com.aparapi + aparapi + 2.0.0 + + + + org.scala-lang + scala-library + + + + + + org.junit.jupiter + junit-jupiter-engine + ${junit.version} + test + + + org.junit.jupiter + junit-jupiter-params + ${junit.version} + test + + + + + + + + maven-compiler-plugin + 3.8.1 + + 1.8 + 1.8 + -Xlint:all + true + true + + + + org.apache.maven.plugins + maven-surefire-plugin + 3.0.0-M5 + + + + + maven-shade-plugin + 3.2.4 + + + package + + shade + + + + + javastream.Main + + + + + *:* + + META-INF/*.MF + + + + 
${project.artifactId} + + + + + + + com.coveo + fmt-maven-plugin + 2.9.1 + + + + format + + + + + + + + + \ No newline at end of file diff --git a/java-stream/src/main/java/javastream/FractionalMaths.java b/java-stream/src/main/java/javastream/FractionalMaths.java new file mode 100644 index 0000000..982a28a --- /dev/null +++ b/java-stream/src/main/java/javastream/FractionalMaths.java @@ -0,0 +1,45 @@ +package javastream; + +/** + * This class represents our Fractional typeclass. Java's type system isn't unified so we have to do + * insane things for parametric operations on fractional types. + */ +@SuppressWarnings("unchecked") +public final class FractionalMaths { + + private FractionalMaths() { + throw new AssertionError(); + } + + public static T from(Class evidence, Number n) { + if (evidence == Double.TYPE || evidence == Double.class) + return (T) Double.valueOf(n.doubleValue()); + else if (evidence == Float.TYPE || evidence == Float.class) + return (T) Float.valueOf(n.floatValue()); + throw new IllegalArgumentException(); + } + + public static T plus(T x, T y) { + if (x instanceof Double) return (T) Double.valueOf(x.doubleValue() + y.doubleValue()); + else if (x instanceof Float) return (T) Float.valueOf(x.floatValue() + y.floatValue()); + throw new IllegalArgumentException(); + } + + static T minus(T x, T y) { + if (x instanceof Double) return (T) Double.valueOf(x.doubleValue() - y.doubleValue()); + else if (x instanceof Float) return (T) Float.valueOf(x.floatValue() - y.floatValue()); + throw new IllegalArgumentException(); + } + + public static T times(T x, T y) { + if (x instanceof Double) return (T) Double.valueOf(x.doubleValue() * y.doubleValue()); + else if (x instanceof Float) return (T) Float.valueOf(x.floatValue() * y.floatValue()); + throw new IllegalArgumentException(); + } + + static T divide(T x, T y) { + if (x instanceof Double) return (T) Double.valueOf(x.doubleValue() / y.doubleValue()); + else if (x instanceof Float) return (T) 
Float.valueOf(x.floatValue() / y.floatValue()); + throw new IllegalArgumentException(); + } +} diff --git a/java-stream/src/main/java/javastream/JavaStream.java b/java-stream/src/main/java/javastream/JavaStream.java new file mode 100644 index 0000000..7ab96cb --- /dev/null +++ b/java-stream/src/main/java/javastream/JavaStream.java @@ -0,0 +1,172 @@ +package javastream; + +import java.time.Duration; +import java.util.AbstractMap; +import java.util.AbstractMap.SimpleImmutableEntry; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Map.Entry; +import java.util.Objects; +import java.util.function.Function; +import java.util.function.Supplier; +import java.util.stream.Collectors; +import java.util.stream.IntStream; +import javastream.Main.Config; + +public abstract class JavaStream { + + public static final class Data { + final T[] a, b, c; + + public Data(T[] a, T[] b, T[] c) { + this.a = Objects.requireNonNull(a); + this.b = Objects.requireNonNull(b); + this.c = Objects.requireNonNull(c); + } + } + + static final class Timings { + final List copy = new ArrayList<>(); + final List mul = new ArrayList<>(); + final List add = new ArrayList<>(); + final List triad = new ArrayList<>(); + final List dot = new ArrayList<>(); + } + + protected final Config config; + + protected JavaStream(Config config) { + this.config = config; + } + + protected abstract List listDevices(); + + protected abstract void initArrays(); + + protected abstract void copy(); + + protected abstract void mul(); + + protected abstract void add(); + + protected abstract void triad(); + + protected abstract void nstream(); + + protected abstract T dot(); + + protected abstract Data data(); + + public static class EnumeratedStream extends JavaStream { + + protected final JavaStream actual; + private final Entry, JavaStream>>[] options; + + @SafeVarargs + @SuppressWarnings("varargs") + public EnumeratedStream( + Config config, Entry, JavaStream>>... 
options) { + super(config); + this.actual = options[config.options.device].getValue().apply(config); + this.options = options; + } + + @Override + protected List listDevices() { + return Arrays.stream(options).map(Entry::getKey).collect(Collectors.toList()); + } + + @Override + public void initArrays() { + actual.initArrays(); + } + + @Override + public void copy() { + actual.copy(); + } + + @Override + public void mul() { + actual.mul(); + } + + @Override + public void add() { + actual.add(); + } + + @Override + public void triad() { + actual.triad(); + } + + @Override + public void nstream() { + actual.nstream(); + } + + @Override + public T dot() { + return actual.dot(); + } + + @Override + public Data data() { + return actual.data(); + } + } + + public static Double[] boxed(double[] xs) { + return Arrays.stream(xs).boxed().toArray(Double[]::new); + } + + public static Float[] boxed(float[] xs) { + return IntStream.range(0, xs.length).mapToObj(i -> xs[i]).toArray(Float[]::new); + } + + private static AbstractMap.SimpleImmutableEntry timed(Supplier f) { + long start = System.nanoTime(); + T r = f.get(); + long end = System.nanoTime(); + return new AbstractMap.SimpleImmutableEntry<>(Duration.ofNanos(end - start), r); + } + + private static Duration timed(Runnable f) { + long start = System.nanoTime(); + f.run(); + long end = System.nanoTime(); + return Duration.ofNanos(end - start); + } + + final SimpleImmutableEntry, T> runAll(int times) { + Timings timings = new Timings<>(); + T lastSum = null; + for (int i = 0; i < times; i++) { + timings.copy.add(timed(this::copy)); + timings.mul.add(timed(this::mul)); + timings.add.add(timed(this::add)); + timings.triad.add(timed(this::triad)); + SimpleImmutableEntry dot = timed(this::dot); + timings.dot.add(dot.getKey()); + lastSum = dot.getValue(); + } + return new SimpleImmutableEntry<>(timings, lastSum); + } + + final Duration runTriad(int times) { + return timed( + () -> { + for (int i = 0; i < times; i++) { + triad(); + 
} + }); + } + + final List runNStream(int times) { + return IntStream.range(0, times) + .mapToObj(i -> timed(this::nstream)) + .collect(Collectors.toList()); + } +} diff --git a/java-stream/src/main/java/javastream/Main.java b/java-stream/src/main/java/javastream/Main.java new file mode 100644 index 0000000..32b67a4 --- /dev/null +++ b/java-stream/src/main/java/javastream/Main.java @@ -0,0 +1,425 @@ +package javastream; + +import static javastream.FractionalMaths.divide; +import static javastream.FractionalMaths.from; +import static javastream.FractionalMaths.minus; +import static javastream.FractionalMaths.plus; +import static javastream.FractionalMaths.times; + +import com.beust.jcommander.JCommander; +import com.beust.jcommander.Parameter; +import java.time.Duration; +import java.util.AbstractMap.SimpleImmutableEntry; +import java.util.Arrays; +import java.util.DoubleSummaryStatistics; +import java.util.List; +import java.util.Map.Entry; +import java.util.Objects; +import java.util.Optional; +import java.util.concurrent.TimeUnit; +import java.util.function.Function; +import java.util.stream.Collectors; +import javastream.JavaStream.Data; +import javastream.JavaStream.Timings; +import javastream.aparapi.AparapiStreams; +import javastream.jdk.JdkStreams; +import javastream.jdk.PlainStream; +import javastream.tornadovm.TornadoVMStreams; + +public class Main { + + enum Benchmark { + NSTREAM, + TRIAD, + ALL + } + + public static class Options { + + @Parameter(names = "--list", description = "List available devices for all implementations") + boolean list = false; + + @Parameter( + names = "--device", + description = "Select device at , see --list for options") + public int device = 0; + + @Parameter( + names = "--impl", + description = "Select implementation at , see --list for options") + public String impl = ""; + + @Parameter( + names = {"--numtimes", "-n"}, + description = "Run the test times (NUM >= 2)") + public int numtimes = 100; + + @Parameter( + names = 
{"--arraysize", "-s"}, + description = "Use elements in the array") + public int arraysize = 33554432; + + @Parameter(names = "--float", description = "Use floats (rather than doubles)") + public boolean useFloat = false; + + @Parameter(names = "--triad-only", description = "Only run triad") + public boolean triadOnly = false; + + @Parameter(names = "--nstream-only", description = "Only run nstream") + public boolean nstreamOnly = false; + + @Parameter(names = "--csv", description = "Output as csv table") + public boolean csv = false; + + @Parameter( + names = "--mibibytes", + description = "Use MiB=2^20 for bandwidth calculation (default MB=10^6)") + public boolean mibibytes = false; + + @Parameter(names = "--dot-tolerance", description = "Tolerance for dot kernel verification") + public double dotTolerance = 1.0e-8; + + public boolean isVerboseBenchmark() { + return !list && !csv; + } + } + + public static final class Config { + public final Options options; + public final Benchmark benchmark; + public final int typeSize; + public final Class evidence; + public final T ulp, scalar, initA, initB, initC; + + public Config( + Options options, + Benchmark benchmark, + int typeSize, + Class evidence, + T ulp, + T scalar, + T initA, + T initB, + T initC) { + this.options = Objects.requireNonNull(options); + this.benchmark = Objects.requireNonNull(benchmark); + this.typeSize = typeSize; + this.evidence = Objects.requireNonNull(evidence); + this.ulp = Objects.requireNonNull(ulp); + this.scalar = Objects.requireNonNull(scalar); + this.initA = Objects.requireNonNull(initA); + this.initB = Objects.requireNonNull(initB); + this.initC = Objects.requireNonNull(initC); + } + } + + static final class Implementation { + final String name; + final Function, JavaStream> makeFloat; + final Function, JavaStream> makeDouble; + + Implementation( + String name, + Function, JavaStream> makeFloat, + Function, JavaStream> makeDouble) { + this.name = Objects.requireNonNull(name); + 
this.makeFloat = Objects.requireNonNull(makeFloat); + this.makeDouble = Objects.requireNonNull(makeDouble); + } + } + + static boolean run( + String name, Config config, Function, JavaStream> mkStream) { + + Options opt = config.options; + + int arrayBytes = opt.arraysize * config.typeSize; + int totalBytes = arrayBytes * 3; + + String megaSuffix = opt.mibibytes ? "MiB" : "MB"; + String gigaSuffix = opt.mibibytes ? "GiB" : "GB"; + + double megaScale = opt.mibibytes ? Math.pow(2.0, -20) : 1.0e-6; + double gigaScale = opt.mibibytes ? Math.pow(2.0, -30) : 1.0e-9; + + if (!opt.csv) { + + String vendor = System.getProperty("java.vendor"); + String ver = System.getProperty("java.version"); + String home = System.getProperty("java.home"); + + System.out.println("BabelStream"); + System.out.printf("Version: %s%n", VERSION); + System.out.printf( + "Implementation: %s (Java %s; %s; JAVA_HOME=%s)%n", name, ver, vendor, home); + final String benchmarkName; + switch (config.benchmark) { + case NSTREAM: + benchmarkName = "nstream"; + break; + case TRIAD: + benchmarkName = "triad"; + break; + case ALL: + benchmarkName = "all"; + break; + default: + throw new AssertionError("Unexpected value: " + config.benchmark); + } + System.out.println("Running " + benchmarkName + " " + opt.numtimes + " times"); + + if (config.benchmark == Benchmark.TRIAD) { + System.out.println("Number of elements: " + opt.arraysize); + } + + System.out.println("Precision: " + (opt.useFloat ? 
"float" : "double")); + System.out.printf( + "Array size: %.1f %s (=%.1f %s)%n", + (megaScale * arrayBytes), megaSuffix, (gigaScale * arrayBytes), gigaSuffix); + System.out.printf( + "Total size: %.1f %s (=%.1f %s)%n", + (megaScale * totalBytes), megaSuffix, (gigaScale * totalBytes), gigaSuffix); + } + + JavaStream stream = mkStream.apply(config); + + stream.initArrays(); + + final boolean ok; + switch (config.benchmark) { + case ALL: + Entry, T> results = stream.runAll(opt.numtimes); + ok = checkSolutions(stream.data(), config, Optional.of(results.getValue())); + Timings timings = results.getKey(); + tabulateCsv( + opt.csv, + mkCsvRow(timings.copy, "Copy", 2 * arrayBytes, megaScale, opt), + mkCsvRow(timings.mul, "Mul", 2 * arrayBytes, megaScale, opt), + mkCsvRow(timings.add, "Add", 3 * arrayBytes, megaScale, opt), + mkCsvRow(timings.triad, "Triad", 3 * arrayBytes, megaScale, opt), + mkCsvRow(timings.dot, "Dot", 2 * arrayBytes, megaScale, opt)); + break; + case NSTREAM: + List nstreamResults = stream.runNStream(opt.numtimes); + ok = checkSolutions(stream.data(), config, Optional.empty()); + tabulateCsv(opt.csv, mkCsvRow(nstreamResults, "Nstream", 4 * arrayBytes, megaScale, opt)); + break; + case TRIAD: + Duration triadResult = stream.runTriad(opt.numtimes); + ok = checkSolutions(stream.data(), config, Optional.empty()); + int triadTotalBytes = 3 * arrayBytes * opt.numtimes; + double bandwidth = megaScale * (triadTotalBytes / durationToSeconds(triadResult)); + System.out.printf("Runtime (seconds): %.5f", durationToSeconds(triadResult)); + System.out.printf("Bandwidth (%s/s): %.3f ", gigaSuffix, bandwidth); + break; + default: + throw new AssertionError(); + } + return ok; + } + + private static boolean checkWithinTolerance( + String name, T[] xs, T gold, T tolerance) { + // it's ok to default to double for error calculation + double error = + Arrays.stream(xs) + .mapToDouble(x -> Math.abs(minus(x, gold).doubleValue())) + .summaryStatistics() + .getAverage(); + 
boolean failed = error > tolerance.doubleValue(); + if (failed) { + System.err.printf("Validation failed on %s. Average error %s%n", name, error); + } + return !failed; + } + + @SuppressWarnings("OptionalUsedAsFieldOrParameterType") + static boolean checkSolutions( + Data data, Config config, Optional dotSum) { + T goldA = config.initA; + T goldB = config.initB; + T goldC = config.initC; + + for (int i = 0; i < config.options.numtimes; i++) { + switch (config.benchmark) { + case ALL: + goldC = goldA; + goldB = times(config.scalar, goldC); + goldC = plus(goldA, goldB); + goldA = plus(goldB, times(config.scalar, goldC)); + break; + case TRIAD: + goldA = plus(goldB, times(config.scalar, goldC)); + break; + case NSTREAM: + goldA = plus(goldA, plus(goldB, times(config.scalar, goldC))); + break; + } + } + + T tolerance = times(config.ulp, from(config.evidence, 100)); + boolean aValid = checkWithinTolerance("a", data.a, goldA, tolerance); + boolean bValid = checkWithinTolerance("b", data.b, goldB, tolerance); + boolean cValid = checkWithinTolerance("c", data.c, goldC, tolerance); + + final T finalGoldA = goldA; + final T finalGoldB = goldB; + boolean sumValid = + dotSum + .map( + actual -> { + T goldSum = + times( + times(finalGoldA, finalGoldB), + from(config.evidence, config.options.arraysize)); + double error = Math.abs(divide(minus(actual, goldSum), goldSum).doubleValue()); + boolean failed = error > config.options.dotTolerance; + if (failed) { + System.err.printf( + "Validation failed on sum. 
Error %s \nSum was %s but should be %s%n", + error, actual, goldSum); + } + return !failed; + }) + .orElse(true); + + return aValid && bValid && cValid && sumValid; + } + + private static double durationToSeconds(Duration d) { + return d.toNanos() / (double) TimeUnit.SECONDS.toNanos(1); + } + + private static List> mkCsvRow( + List xs, String name, int totalBytes, double megaScale, Options opt) { + DoubleSummaryStatistics stats = + xs.stream().skip(1).mapToDouble(Main::durationToSeconds).summaryStatistics(); + if (stats.getCount() <= 0) { + throw new IllegalArgumentException("No min/max for " + name + "(size=" + totalBytes + ")"); + } + double mbps = megaScale * (double) totalBytes / stats.getMin(); + return opt.csv + ? Arrays.asList( + new SimpleImmutableEntry<>("function", name), + new SimpleImmutableEntry<>("num_times", opt.numtimes + ""), + new SimpleImmutableEntry<>("n_elements", opt.arraysize + ""), + new SimpleImmutableEntry<>("sizeof", totalBytes + ""), + new SimpleImmutableEntry<>( + "max_m" + (opt.mibibytes ? "i" : "") + "bytes_per_sec", mbps + ""), + new SimpleImmutableEntry<>("min_runtime", stats.getMin() + ""), + new SimpleImmutableEntry<>("max_runtime", stats.getMax() + ""), + new SimpleImmutableEntry<>("avg_runtime", stats.getAverage() + "")) + : Arrays.asList( + new SimpleImmutableEntry<>("Function", name), + new SimpleImmutableEntry<>( + "M" + (opt.mibibytes ? "i" : "") + "Bytes/sec", String.format("%.3f", mbps)), + new SimpleImmutableEntry<>("Min (sec)", String.format("%.5f", stats.getMin())), + new SimpleImmutableEntry<>("Max", String.format("%.5f", stats.getMax())), + new SimpleImmutableEntry<>("Average", String.format("%.5f", stats.getAverage()))); + } + + private static String padSpace(String s, int length) { + if (length == 0) return s; + return String.format("%1$-" + length + "s", s); + } + + @SafeVarargs + @SuppressWarnings("varargs") + private static void tabulateCsv(boolean csv, List>... 
rows) { + if (rows.length == 0) throw new IllegalArgumentException("Empty tabulation"); + int padding = csv ? 0 : 12; + String sep = csv ? "," : ""; + System.out.println( + rows[0].stream().map(x -> padSpace(x.getKey(), padding)).collect(Collectors.joining(sep))); + for (List> row : rows) { + System.out.println( + row.stream().map(x -> padSpace(x.getValue(), padding)).collect(Collectors.joining(sep))); + } + } + + private static final String VERSION = "3.4"; + + private static final float START_SCALAR = 0.4f; + private static final float START_A = 0.1f; + private static final float START_B = 0.2f; + private static final float START_C = 0.0f; + + private static final List IMPLEMENTATIONS = + Arrays.asList( + new Implementation("jdk-stream", JdkStreams.FLOAT, JdkStreams.DOUBLE), + new Implementation("jdk-plain", PlainStream.FLOAT, PlainStream.DOUBLE), + new Implementation("tornadovm", TornadoVMStreams.FLOAT, TornadoVMStreams.DOUBLE), + new Implementation("aparapi", AparapiStreams.FLOAT, AparapiStreams.DOUBLE)); + + public static int run(String[] args) { + Options opt = new Options(); + JCommander.newBuilder().addObject(opt).build().parse(args); + + final Benchmark benchmark; + if (opt.nstreamOnly && opt.triadOnly) + throw new RuntimeException( + "Both triad and nstream are enabled, pick one or omit both to run all benchmarks"); + else if (opt.nstreamOnly) benchmark = Benchmark.NSTREAM; + else if (opt.triadOnly) benchmark = Benchmark.TRIAD; + else benchmark = Benchmark.ALL; + + final Config floatConfig = + new Config<>( + opt, + benchmark, + Float.BYTES, + Float.class, // XXX not Float.TYPE, we want the boxed one + Math.ulp(1.f), + START_SCALAR, + START_A, + START_B, + START_C); + final Config doubleConfig = + new Config<>( + opt, + benchmark, + Double.BYTES, + Double.class, // XXX not Double.TYPE, we want the boxed one + Math.ulp(1.d), + (double) START_SCALAR, + (double) START_A, + (double) START_B, + (double) START_C); + + if (opt.list) { + System.out.println("Set 
implementation with --impl and device with --device :"); + for (Implementation entry : IMPLEMENTATIONS) { + System.out.println("Implementation: " + entry.name); + try { + List devices = entry.makeDouble.apply(doubleConfig).listDevices(); + for (int i = 0; i < devices.size(); i++) { + System.out.println("\t[" + i + "] " + devices.get(i)); + } + } catch (Exception e) { + System.out.println("\t(Unsupported: " + e.getMessage() + ")"); + } + } + return 0; + } + + String implName = (opt.impl.isEmpty()) ? IMPLEMENTATIONS.get(0).name : opt.impl; + Implementation impl = + IMPLEMENTATIONS.stream() + .filter(x -> implName.compareToIgnoreCase(x.name) == 0) + .findFirst() + .orElseThrow( + () -> + new IllegalArgumentException("Implementation " + opt.impl + " does not exist")); + + boolean ok = + opt.useFloat + ? run(impl.name, floatConfig, impl.makeFloat) + : run(impl.name, doubleConfig, impl.makeDouble); + + return ok ? 0 : 1; + } + + public static void main(String[] args) { + System.exit(run(args)); + } +} diff --git a/java-stream/src/main/java/javastream/aparapi/AparapiStreams.java b/java-stream/src/main/java/javastream/aparapi/AparapiStreams.java new file mode 100644 index 0000000..ab2de52 --- /dev/null +++ b/java-stream/src/main/java/javastream/aparapi/AparapiStreams.java @@ -0,0 +1,129 @@ +package javastream.aparapi; + +import com.aparapi.device.Device; +import com.aparapi.device.Device.TYPE; +import com.aparapi.device.JavaDevice; +import com.aparapi.device.OpenCLDevice; +import com.aparapi.internal.kernel.KernelManager; +import java.util.Collection; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.function.Function; +import java.util.stream.Collectors; +import java.util.stream.Stream; +import javastream.JavaStream; +import javastream.Main.Config; + +public final class AparapiStreams { + + private AparapiStreams() {} + + public static final Function, JavaStream> DOUBLE = + config -> new Generic<>(config, SpecialisedDoubleKernel::new); + + public 
static final Function, JavaStream> FLOAT = + config -> new Generic<>(config, SpecialisedFloatKernel::new); + + private static List enumerateDevices() { + + // JavaDevice.SEQUENTIAL doesn't work when arraysize > 1, so we omit it entirely + Stream cpuDevices = Stream.of(JavaDevice.ALTERNATIVE_ALGORITHM); + + Stream clDevices = + Stream.of(TYPE.values()).map(OpenCLDevice::listDevices).flatMap(Collection::stream); + + return Stream.concat(clDevices, cpuDevices).collect(Collectors.toList()); + } + + private static String deviceName(Device device) { + return device.toString(); + } + + private static final class Generic extends JavaStream { + + private final GenericAparapiStreamKernel kernels; + + Generic(Config config, GenericAparapiStreamKernel.Factory factory) { + super(config); + Device device = enumerateDevices().get(config.options.device); + + final int numGroups; + final int workGroupSize; + if (device instanceof JavaDevice) { + numGroups = Runtime.getRuntime().availableProcessors(); + workGroupSize = + config.typeSize * 2; // closest thing to CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE + + } else if (device instanceof OpenCLDevice) { + numGroups = ((OpenCLDevice) device).getMaxComputeUnits(); + workGroupSize = device.getMaxWorkGroupSize(); + } else { + throw new AssertionError("Unknown device type " + device.getClass()); + } + + if (config.options.isVerboseBenchmark()) { + System.out.println("Using Aparapi OpenCL device: " + device); + System.out.println(" - numGroups : " + numGroups); + System.out.println(" - workGroupSize : " + workGroupSize); + String showCL = System.getProperty("com.aparapi.enableShowGeneratedOpenCL"); + if (showCL == null || !showCL.equals("true")) { + System.out.println( + "(Add `-Dcom.aparapi.enableShowGeneratedOpenCL=true` to show generated OpenCL source)"); + } + } + + LinkedHashSet candidate = new LinkedHashSet<>(); + candidate.add(device); + + kernels = factory.create(config, numGroups, workGroupSize); + 
KernelManager.instance().setPreferredDevices(kernels, candidate); + } + + @Override + public List listDevices() { + return enumerateDevices().stream() + .map(AparapiStreams::deviceName) + .collect(Collectors.toList()); + } + + @Override + public void initArrays() { + kernels.init(); + } + + @Override + public void copy() { + kernels.copy(); + } + + @Override + public void mul() { + kernels.mul(); + } + + @Override + public void add() { + kernels.add(); + } + + @Override + public void triad() { + kernels.triad(); + } + + @Override + public void nstream() { + kernels.nstream(); + } + + @Override + public T dot() { + return kernels.dot(); + } + + @Override + public Data data() { + return kernels.syncAndDispose(); + } + } +} diff --git a/java-stream/src/main/java/javastream/aparapi/GenericAparapiStreamKernel.java b/java-stream/src/main/java/javastream/aparapi/GenericAparapiStreamKernel.java new file mode 100644 index 0000000..526b472 --- /dev/null +++ b/java-stream/src/main/java/javastream/aparapi/GenericAparapiStreamKernel.java @@ -0,0 +1,68 @@ +package javastream.aparapi; + +import com.aparapi.Kernel; +import com.aparapi.Range; +import javastream.JavaStream.Data; +import javastream.Main.Config; + +abstract class GenericAparapiStreamKernel extends Kernel { + + protected static final int FN_COPY = 1; + protected static final int FN_MUL = 2; + protected static final int FN_ADD = 3; + protected static final int FN_TRIAD = 4; + protected static final int FN_NSTREAM = 5; + protected static final int FN_DOT = 6; + protected final Config config; + protected final int arraysize, numGroups, workGroupSize; + + interface Factory { + GenericAparapiStreamKernel create(Config config, int numGroups, int workGroupSize); + } + + GenericAparapiStreamKernel(Config config, int numGroups, int workGroupSize) { + this.config = config; + this.arraysize = config.options.arraysize; + this.numGroups = numGroups; + this.workGroupSize = workGroupSize; + setExplicit(true); + } + + protected int 
function; + + public abstract void init(); + + public void copy() { + function = FN_COPY; + execute(arraysize); + } + + public void mul() { + function = FN_MUL; + execute(arraysize); + } + + public void add() { + function = FN_ADD; + execute(arraysize); + } + + public void triad() { + function = FN_TRIAD; + execute(arraysize); + } + + public void nstream() { + function = FN_NSTREAM; + execute(arraysize); + } + + protected Kernel partialDot() { + function = FN_DOT; + return execute(Range.create(numGroups * workGroupSize, workGroupSize)); + } + + abstract T dot(); + + abstract Data syncAndDispose(); +} diff --git a/java-stream/src/main/java/javastream/aparapi/SpecialisedDoubleKernel.java b/java-stream/src/main/java/javastream/aparapi/SpecialisedDoubleKernel.java new file mode 100644 index 0000000..56a59af --- /dev/null +++ b/java-stream/src/main/java/javastream/aparapi/SpecialisedDoubleKernel.java @@ -0,0 +1,74 @@ +package javastream.aparapi; + +import java.util.Arrays; +import javastream.JavaStream; +import javastream.JavaStream.Data; +import javastream.Main.Config; + +final class SpecialisedDoubleKernel extends GenericAparapiStreamKernel { + private final double scalar; + final double[] a, b, c; + private final double[] partialSum; + @Local private final double[] workGroupSum; + + SpecialisedDoubleKernel(Config config, int numGroups, int workGroupSize) { + super(config, numGroups, workGroupSize); + this.scalar = config.scalar; + this.a = new double[this.arraysize]; + this.b = new double[this.arraysize]; + this.c = new double[this.arraysize]; + + this.partialSum = new double[numGroups]; + this.workGroupSum = new double[workGroupSize]; + } + + @SuppressWarnings("DuplicatedCode") + @Override + public void run() { + int i = getGlobalId(); + if (function == FN_COPY) { + c[i] = a[i]; + } else if (function == FN_MUL) { + b[i] = scalar * c[i]; + } else if (function == FN_ADD) { + c[i] = a[i] + b[i]; + } else if (function == FN_TRIAD) { + a[i] = b[i] + scalar * c[i]; + } 
else if (function == FN_NSTREAM) { + a[i] += b[i] + scalar * c[i]; + } else if (function == FN_DOT) { + int localId = getLocalId(0); + workGroupSum[localId] = 0.0; + for (; i < arraysize; i += getGlobalSize(0)) workGroupSum[localId] += a[i] * b[i]; + for (int offset = getLocalSize(0) / 2; offset > 0; offset /= 2) { + localBarrier(); + if (localId < offset) { + workGroupSum[localId] += workGroupSum[localId + offset]; + } + } + if (localId == 0) partialSum[getGroupId(0)] = workGroupSum[localId]; + } + } + + @Override + public void init() { + Arrays.fill(a, config.initA); + Arrays.fill(b, config.initB); + Arrays.fill(c, config.initC); + put(a).put(b).put(c); + } + + @Override + public Double dot() { + partialDot().get(partialSum); + double sum = 0; + for (double v : partialSum) sum += v; + return sum; + } + + @Override + public Data syncAndDispose() { + get(a).get(b).get(c).dispose(); + return new Data<>(JavaStream.boxed(a), JavaStream.boxed(b), JavaStream.boxed(c)); + } +} diff --git a/java-stream/src/main/java/javastream/aparapi/SpecialisedFloatKernel.java b/java-stream/src/main/java/javastream/aparapi/SpecialisedFloatKernel.java new file mode 100644 index 0000000..6919f06 --- /dev/null +++ b/java-stream/src/main/java/javastream/aparapi/SpecialisedFloatKernel.java @@ -0,0 +1,75 @@ +package javastream.aparapi; + +import static javastream.JavaStream.boxed; + +import java.util.Arrays; +import javastream.JavaStream.Data; +import javastream.Main.Config; + +final class SpecialisedFloatKernel extends GenericAparapiStreamKernel { + private final float scalar; + final float[] a, b, c; + private final float[] partialSum; + @Local private final float[] workGroupSum; + + SpecialisedFloatKernel(Config config, int numGroups, int workGroupSize) { + super(config, numGroups, workGroupSize); + this.scalar = config.scalar; + this.a = new float[this.arraysize]; + this.b = new float[this.arraysize]; + this.c = new float[this.arraysize]; + + this.partialSum = new float[numGroups]; + 
this.workGroupSum = new float[workGroupSize]; + } + + @SuppressWarnings("DuplicatedCode") + @Override + public void run() { + int i = getGlobalId(); + if (function == FN_COPY) { + c[i] = a[i]; + } else if (function == FN_MUL) { + b[i] = scalar * c[i]; + } else if (function == FN_ADD) { + c[i] = a[i] + b[i]; + } else if (function == FN_TRIAD) { + a[i] = b[i] + scalar * c[i]; + } else if (function == FN_NSTREAM) { + a[i] += b[i] + scalar * c[i]; + } else if (function == FN_DOT) { + int localId = getLocalId(0); + workGroupSum[localId] = 0.f; + for (; i < arraysize; i += getGlobalSize(0)) workGroupSum[localId] += a[i] * b[i]; + for (int offset = getLocalSize(0) / 2; offset > 0; offset /= 2) { + localBarrier(); + if (localId < offset) { + workGroupSum[localId] += workGroupSum[localId + offset]; + } + } + if (localId == 0) partialSum[getGroupId(0)] = workGroupSum[localId]; + } + } + + @Override + public void init() { + Arrays.fill(a, config.initA); + Arrays.fill(b, config.initB); + Arrays.fill(c, config.initC); + put(a).put(b).put(c); + } + + @Override + public Float dot() { + partialDot().get(partialSum); + float sum = 0; + for (float v : partialSum) sum += v; + return sum; + } + + @Override + public Data syncAndDispose() { + get(a).get(b).get(c).dispose(); + return new Data<>(boxed(a), boxed(b), boxed(c)); + } +} diff --git a/java-stream/src/main/java/javastream/jdk/GenericPlainStream.java b/java-stream/src/main/java/javastream/jdk/GenericPlainStream.java new file mode 100644 index 0000000..7f210fa --- /dev/null +++ b/java-stream/src/main/java/javastream/jdk/GenericPlainStream.java @@ -0,0 +1,92 @@ +package javastream.jdk; + +import static javastream.FractionalMaths.from; +import static javastream.FractionalMaths.plus; +import static javastream.FractionalMaths.times; + +import java.lang.reflect.Array; +import java.util.Collections; +import java.util.List; +import javastream.JavaStream; +import javastream.Main.Config; + +final class GenericPlainStream extends JavaStream 
{ + + private final T[] a; + private final T[] b; + private final T[] c; + + @SuppressWarnings("unchecked") + GenericPlainStream(Config config) { + super(config); + this.a = (T[]) Array.newInstance(config.evidence, config.options.arraysize); + this.b = (T[]) Array.newInstance(config.evidence, config.options.arraysize); + this.c = (T[]) Array.newInstance(config.evidence, config.options.arraysize); + } + + @Override + public List listDevices() { + return Collections.singletonList("JVM"); + } + + @Override + public void initArrays() { + for (int i = 0; i < config.options.arraysize; i++) { + a[i] = config.initA; + b[i] = config.initB; + c[i] = config.initC; + } + } + + @SuppressWarnings("ManualArrayCopy") + @Override + public void copy() { + for (int i = 0; i < config.options.arraysize; i++) { + c[i] = a[i]; + } + } + + @Override + public void mul() { + for (int i = 0; i < config.options.arraysize; i++) { + b[i] = times(config.scalar, c[i]); + } + } + + @Override + public void add() { + + for (int i = 0; i < config.options.arraysize; i++) { + c[i] = plus(a[i], b[i]); + } + } + + @Override + public void triad() { + + for (int i = 0; i < config.options.arraysize; i++) { + a[i] = plus(b[i], times(config.scalar, c[i])); + } + } + + @Override + public void nstream() { + for (int i = 0; i < config.options.arraysize; i++) { + a[i] = plus(a[i], plus(b[i], times(config.scalar, c[i]))); + } + } + + @Override + public T dot() { + T acc = from(config.evidence, 0); + for (int i = 0; i < config.options.arraysize; i++) { + acc = plus(acc, times(a[i], b[i])); + } + return acc; + } + + @Override + public Data data() { + return new Data<>(a, b, c); + } +} diff --git a/java-stream/src/main/java/javastream/jdk/GenericStream.java b/java-stream/src/main/java/javastream/jdk/GenericStream.java new file mode 100644 index 0000000..1e65b8f --- /dev/null +++ b/java-stream/src/main/java/javastream/jdk/GenericStream.java @@ -0,0 +1,86 @@ +package javastream.jdk; + +import static 
javastream.FractionalMaths.from; +import static javastream.FractionalMaths.plus; +import static javastream.FractionalMaths.times; + +import java.lang.reflect.Array; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.stream.IntStream; +import javastream.FractionalMaths; +import javastream.JavaStream; +import javastream.Main.Config; + +/** + * We use + * + *
 <pre>Arrays.parallelSetAll</pre> + * + * <p>here as it internally calls + * + * <pre>IntStream.range(0, array.length).parallel().forEach(...)</pre>
+ */ +final class GenericStream extends JavaStream { + + private final T[] a, b, c; + + @SuppressWarnings("unchecked") + GenericStream(Config config) { + super(config); + this.a = (T[]) Array.newInstance(config.evidence, config.options.arraysize); + this.b = (T[]) Array.newInstance(config.evidence, config.options.arraysize); + this.c = (T[]) Array.newInstance(config.evidence, config.options.arraysize); + } + + @Override + public List listDevices() { + return Collections.singletonList("JVM"); + } + + @Override + public void initArrays() { + Arrays.parallelSetAll(a, i -> config.initA); + Arrays.parallelSetAll(b, i -> config.initB); + Arrays.parallelSetAll(c, i -> config.initC); + } + + @Override + public void copy() { + Arrays.parallelSetAll(c, i -> a[i]); + } + + @Override + public void mul() { + Arrays.parallelSetAll(b, i -> times(config.scalar, c[i])); + } + + @Override + public void add() { + Arrays.parallelSetAll(c, i -> plus(a[i], b[i])); + } + + @Override + public void triad() { + Arrays.parallelSetAll(a, i -> plus(b[i], times(config.scalar, c[i]))); + } + + @Override + public void nstream() { + Arrays.parallelSetAll(a, i -> plus(a[i], plus(b[i], times(config.scalar, c[i])))); + } + + @Override + public T dot() { + return IntStream.range(0, config.options.arraysize) + .parallel() + .mapToObj(i -> times(a[i], b[i])) + .reduce(from(config.evidence, 0), FractionalMaths::plus); + } + + @Override + public Data data() { + return new Data<>(a, b, c); + } +} diff --git a/java-stream/src/main/java/javastream/jdk/JdkStreams.java b/java-stream/src/main/java/javastream/jdk/JdkStreams.java new file mode 100644 index 0000000..5b58be7 --- /dev/null +++ b/java-stream/src/main/java/javastream/jdk/JdkStreams.java @@ -0,0 +1,26 @@ +package javastream.jdk; + +import java.util.AbstractMap.SimpleImmutableEntry; +import java.util.function.Function; +import javastream.JavaStream; +import javastream.JavaStream.EnumeratedStream; +import javastream.Main.Config; + +public final class 
JdkStreams { + + private JdkStreams() {} + + public static final Function, JavaStream> FLOAT = + config -> + new EnumeratedStream<>( + config, + new SimpleImmutableEntry<>("specialised", SpecialisedFloatStream::new), + new SimpleImmutableEntry<>("generic", GenericStream::new)); + + public static final Function, JavaStream> DOUBLE = + config -> + new EnumeratedStream<>( + config, + new SimpleImmutableEntry<>("specialised", SpecialisedDoubleStream::new), + new SimpleImmutableEntry<>("generic", GenericStream::new)); +} diff --git a/java-stream/src/main/java/javastream/jdk/PlainStream.java b/java-stream/src/main/java/javastream/jdk/PlainStream.java new file mode 100644 index 0000000..f9281e8 --- /dev/null +++ b/java-stream/src/main/java/javastream/jdk/PlainStream.java @@ -0,0 +1,26 @@ +package javastream.jdk; + +import java.util.AbstractMap.SimpleImmutableEntry; +import java.util.function.Function; +import javastream.JavaStream; +import javastream.JavaStream.EnumeratedStream; +import javastream.Main.Config; + +public final class PlainStream { + + private PlainStream() {} + + public static final Function, JavaStream> FLOAT = + config -> + new EnumeratedStream<>( + config, + new SimpleImmutableEntry<>("specialised", SpecialisedPlainFloatStream::new), + new SimpleImmutableEntry<>("generic", GenericPlainStream::new)); + + public static final Function, JavaStream> DOUBLE = + config -> + new EnumeratedStream<>( + config, + new SimpleImmutableEntry<>("specialised", SpecialisedPlainDoubleStream::new), + new SimpleImmutableEntry<>("generic", GenericPlainStream::new)); +} diff --git a/java-stream/src/main/java/javastream/jdk/SpecialisedDoubleStream.java b/java-stream/src/main/java/javastream/jdk/SpecialisedDoubleStream.java new file mode 100644 index 0000000..26406a6 --- /dev/null +++ b/java-stream/src/main/java/javastream/jdk/SpecialisedDoubleStream.java @@ -0,0 +1,84 @@ +package javastream.jdk; + +import java.util.Collections; +import java.util.List; +import 
java.util.stream.IntStream; +import javastream.JavaStream; +import javastream.Main.Config; + +final class SpecialisedDoubleStream extends JavaStream { + + private final double[] a, b, c; + + SpecialisedDoubleStream(Config config) { + super(config); + this.a = new double[config.options.arraysize]; + this.b = new double[config.options.arraysize]; + this.c = new double[config.options.arraysize]; + } + + @Override + public List listDevices() { + return Collections.singletonList("JVM"); + } + + @Override + public void initArrays() { + IntStream.range(0, config.options.arraysize) // + .parallel() + .forEach( + i -> { + a[i] = config.initA; + b[i] = config.initB; + c[i] = config.initC; + }); + } + + @Override + public void copy() { + IntStream.range(0, config.options.arraysize) // + .parallel() + .forEach(i -> c[i] = a[i]); + } + + @Override + public void mul() { + IntStream.range(0, config.options.arraysize) // + .parallel() + .forEach(i -> b[i] = config.scalar * c[i]); + } + + @Override + public void add() { + IntStream.range(0, config.options.arraysize) // + .parallel() + .forEach(i -> c[i] = a[i] + b[i]); + } + + @Override + public void triad() { + IntStream.range(0, config.options.arraysize) // + .parallel() + .forEach(i -> a[i] = b[i] + config.scalar * c[i]); + } + + @Override + public void nstream() { + IntStream.range(0, config.options.arraysize) // + .parallel() + .forEach(i -> a[i] += b[i] + config.scalar * c[i]); + } + + @Override + public Double dot() { + return IntStream.range(0, config.options.arraysize) + .parallel() + .mapToDouble(i -> a[i] * b[i]) + .reduce(0f, Double::sum); + } + + @Override + public Data data() { + return new Data<>(boxed(a), boxed(b), boxed(c)); + } +} diff --git a/java-stream/src/main/java/javastream/jdk/SpecialisedFloatStream.java b/java-stream/src/main/java/javastream/jdk/SpecialisedFloatStream.java new file mode 100644 index 0000000..6c414c1 --- /dev/null +++ b/java-stream/src/main/java/javastream/jdk/SpecialisedFloatStream.java @@ 
-0,0 +1,84 @@ +package javastream.jdk; + +import java.util.Collections; +import java.util.List; +import java.util.stream.IntStream; +import javastream.JavaStream; +import javastream.Main.Config; + +final class SpecialisedFloatStream extends JavaStream { + + private final float[] a, b, c; + + SpecialisedFloatStream(Config config) { + super(config); + this.a = new float[config.options.arraysize]; + this.b = new float[config.options.arraysize]; + this.c = new float[config.options.arraysize]; + } + + @Override + public List listDevices() { + return Collections.singletonList("JVM"); + } + + @Override + public void initArrays() { + IntStream.range(0, config.options.arraysize) // + .parallel() + .forEach( + i -> { + a[i] = config.initA; + b[i] = config.initB; + c[i] = config.initC; + }); + } + + @Override + public void copy() { + IntStream.range(0, config.options.arraysize) // + .parallel() + .forEach(i -> c[i] = a[i]); + } + + @Override + public void mul() { + IntStream.range(0, config.options.arraysize) // + .parallel() + .forEach(i -> b[i] = config.scalar * c[i]); + } + + @Override + public void add() { + IntStream.range(0, config.options.arraysize) // + .parallel() + .forEach(i -> c[i] = a[i] + b[i]); + } + + @Override + public void triad() { + IntStream.range(0, config.options.arraysize) // + .parallel() + .forEach(i -> a[i] = b[i] + config.scalar * c[i]); + } + + @Override + public void nstream() { + IntStream.range(0, config.options.arraysize) // + .parallel() + .forEach(i -> a[i] += b[i] + config.scalar * c[i]); + } + + @Override + public Float dot() { + return IntStream.range(0, config.options.arraysize) // + .parallel() + .mapToObj(i -> a[i] * b[i]) // XXX there isn't a specialised Stream for floats + .reduce(0f, Float::sum); + } + + @Override + public Data data() { + return new Data<>(boxed(a), boxed(b), boxed(c)); + } +} diff --git a/java-stream/src/main/java/javastream/jdk/SpecialisedPlainDoubleStream.java 
b/java-stream/src/main/java/javastream/jdk/SpecialisedPlainDoubleStream.java
new file mode 100644
index 0000000..afda2ef
--- /dev/null
+++ b/java-stream/src/main/java/javastream/jdk/SpecialisedPlainDoubleStream.java
@@ -0,0 +1,89 @@
+package javastream.jdk;
+
+import java.util.Collections;
+import java.util.List;
+import javastream.JavaStream;
+import javastream.Main.Config;
+
+/**
+ * Stream kernels written as plain sequential {@code for} loops over primitive {@code double[]}
+ * arrays.
+ */
+final class SpecialisedPlainDoubleStream extends JavaStream {
+
+  private final double[] a;
+  private final double[] b;
+  private final double[] c;
+
+  SpecialisedPlainDoubleStream(Config config) {
+    super(config);
+    this.a = new double[config.options.arraysize];
+    this.b = new double[config.options.arraysize];
+    this.c = new double[config.options.arraysize];
+  }
+
+  @Override
+  public List listDevices() {
+    return Collections.singletonList("JVM");
+  }
+
+  @Override
+  public void initArrays() { // fill a, b, c with the configured initial values
+    for (int i = 0; i < config.options.arraysize; i++) {
+      a[i] = config.initA;
+      b[i] = config.initB;
+      c[i] = config.initC;
+    }
+  }
+
+  @SuppressWarnings("ManualArrayCopy")
+  @Override
+  public void copy() { // c = a
+    for (int i = 0; i < config.options.arraysize; i++) {
+      c[i] = a[i];
+    }
+  }
+
+  @Override
+  public void mul() { // b = scalar * c
+    for (int i = 0; i < config.options.arraysize; i++) {
+      b[i] = config.scalar * c[i];
+    }
+  }
+
+  @Override
+  public void add() { // c = a + b
+    for (int i = 0; i < config.options.arraysize; i++) {
+      c[i] = a[i] + b[i];
+    }
+  }
+
+  @Override
+  public void triad() { // a = b + scalar * c
+    for (int i = 0; i < config.options.arraysize; i++) {
+      a[i] = b[i] + config.scalar * c[i];
+    }
+  }
+
+  @Override
+  public void nstream() { // a += b + scalar * c
+    for (int i = 0; i < config.options.arraysize; i++) {
+      a[i] += b[i] + config.scalar * c[i];
+    }
+  }
+
+  /** Sum of a[i] * b[i], accumulated in index order into a {@code double}. */
+  @Override
+  public Double dot() {
+    double acc = 0.0; // double literal: the accumulator is a double, not a float
+    for (int i = 0; i < config.options.arraysize; i++) {
+      acc += a[i] * b[i];
+    }
+    return acc;
+  }
+
+  @Override
+  public Data data() {
+    return new Data<>(boxed(a), boxed(b), boxed(c));
+  }
+}
diff --git
a/java-stream/src/main/java/javastream/jdk/SpecialisedPlainFloatStream.java b/java-stream/src/main/java/javastream/jdk/SpecialisedPlainFloatStream.java new file mode 100644 index 0000000..9ccee53 --- /dev/null +++ b/java-stream/src/main/java/javastream/jdk/SpecialisedPlainFloatStream.java @@ -0,0 +1,84 @@ +package javastream.jdk; + +import java.util.Collections; +import java.util.List; +import javastream.JavaStream; +import javastream.Main.Config; + +final class SpecialisedPlainFloatStream extends JavaStream { + + private final float[] a; + private final float[] b; + private final float[] c; + + SpecialisedPlainFloatStream(Config config) { + super(config); + this.a = new float[config.options.arraysize]; + this.b = new float[config.options.arraysize]; + this.c = new float[config.options.arraysize]; + } + + @Override + public List listDevices() { + return Collections.singletonList("JVM"); + } + + @Override + public void initArrays() { + for (int i = 0; i < config.options.arraysize; i++) { + a[i] = config.initA; + b[i] = config.initB; + c[i] = config.initC; + } + } + + @SuppressWarnings("ManualArrayCopy") + @Override + public void copy() { + for (int i = 0; i < config.options.arraysize; i++) { + c[i] = a[i]; + } + } + + @Override + public void mul() { + for (int i = 0; i < config.options.arraysize; i++) { + b[i] = config.scalar * c[i]; + } + } + + @Override + public void add() { + for (int i = 0; i < config.options.arraysize; i++) { + c[i] = a[i] + b[i]; + } + } + + @Override + public void triad() { + for (int i = 0; i < config.options.arraysize; i++) { + a[i] = b[i] + config.scalar * c[i]; + } + } + + @Override + public void nstream() { + for (int i = 0; i < config.options.arraysize; i++) { + a[i] += b[i] + config.scalar * c[i]; + } + } + + @Override + public Float dot() { + float acc = 0f; + for (int i = 0; i < config.options.arraysize; i++) { + acc += a[i] * b[i]; + } + return acc; + } + + @Override + public Data data() { + return new Data<>(boxed(a), boxed(b), 
boxed(c)); + } +} diff --git a/java-stream/src/main/java/javastream/tornadovm/GenericTornadoVMStream.java b/java-stream/src/main/java/javastream/tornadovm/GenericTornadoVMStream.java new file mode 100644 index 0000000..d936df6 --- /dev/null +++ b/java-stream/src/main/java/javastream/tornadovm/GenericTornadoVMStream.java @@ -0,0 +1,98 @@ +package javastream.tornadovm; + +import java.util.List; +import java.util.stream.Collectors; +import javastream.JavaStream; +import javastream.Main.Config; +import uk.ac.manchester.tornado.api.TaskSchedule; +import uk.ac.manchester.tornado.api.TornadoRuntimeCI; +import uk.ac.manchester.tornado.api.common.TornadoDevice; +import uk.ac.manchester.tornado.api.runtime.TornadoRuntime; + +abstract class GenericTornadoVMStream extends JavaStream { + + protected final TornadoDevice device; + + protected TaskSchedule copyTask; + protected TaskSchedule mulTask; + protected TaskSchedule addTask; + protected TaskSchedule triadTask; + protected TaskSchedule nstreamTask; + protected TaskSchedule dotTask; + + GenericTornadoVMStream(Config config) { + super(config); + + try { + TornadoRuntimeCI runtime = TornadoRuntime.getTornadoRuntime(); + List devices = TornadoVMStreams.enumerateDevices(runtime); + device = devices.get(config.options.device); + + if (config.options.isVerboseBenchmark()) { + System.out.println("Using TornadoVM device:"); + System.out.println(" - Name : " + device.getDescription()); + System.out.println(" - Id : " + device.getDeviceName()); + System.out.println(" - Platform : " + device.getPlatformName()); + System.out.println(" - Backend : " + device.getTornadoVMBackend().name()); + } + } catch (Throwable e) { + throw new RuntimeException( + "Unable to initialise TornadoVM, make sure you are running the binary with the `tornado -jar ...` wrapper and not `java -jar ...`", + e); + } + } + + protected static TaskSchedule mkSchedule() { + return new TaskSchedule(""); + } + + @Override + public List listDevices() { + return 
TornadoVMStreams.enumerateDevices(TornadoRuntime.getTornadoRuntime()).stream() + .map(d -> d.getDescription() + "(" + d.getDeviceName() + ")") + .collect(Collectors.toList()); + } + + @Override + public void initArrays() { + this.copyTask.warmup(); + this.mulTask.warmup(); + this.addTask.warmup(); + this.triadTask.warmup(); + this.nstreamTask.warmup(); + this.dotTask.warmup(); + } + + @Override + public void copy() { + this.copyTask.execute(); + } + + @Override + public void mul() { + this.mulTask.execute(); + } + + @Override + public void add() { + this.addTask.execute(); + } + + @Override + public void triad() { + this.triadTask.execute(); + } + + @Override + public void nstream() { + this.nstreamTask.execute(); + } + + protected abstract T getSum(); + + @Override + public T dot() { + this.dotTask.execute(); + return getSum(); + } +} diff --git a/java-stream/src/main/java/javastream/tornadovm/SpecialisedDouble.java b/java-stream/src/main/java/javastream/tornadovm/SpecialisedDouble.java new file mode 100644 index 0000000..7712e31 --- /dev/null +++ b/java-stream/src/main/java/javastream/tornadovm/SpecialisedDouble.java @@ -0,0 +1,88 @@ +package javastream.tornadovm; + +import java.util.Arrays; +import javastream.Main.Config; +import uk.ac.manchester.tornado.api.annotations.Parallel; +import uk.ac.manchester.tornado.api.annotations.Reduce; + +final class SpecialisedDouble extends GenericTornadoVMStream { + + @SuppressWarnings("ManualArrayCopy") + private static void copy(int size, double[] a, double[] c) { + for (@Parallel int i = 0; i < size; i++) { + c[i] = a[i]; + } + } + + private static void mul(int size, double[] b, double[] c, double scalar) { + for (@Parallel int i = 0; i < size; i++) { + b[i] = scalar * c[i]; + } + } + + private static void add(int size, double[] a, double[] b, double[] c) { + for (@Parallel int i = 0; i < size; i++) { + c[i] = a[i] + b[i]; + } + } + + private static void triad(int size, double[] a, double[] b, double[] c, double scalar) { 
+    for (@Parallel int i = 0; i < size; i++) {
+      a[i] = b[i] + scalar * c[i];
+    }
+  }
+
+  private static void nstream(int size, double[] a, double[] b, double[] c, double scalar) {
+    for (@Parallel int i = 0; i < size; i++) {
+      a[i] += b[i] + scalar * c[i]; // accumulate into a, matching the jdk nstream kernels
+    }
+  }
+
+  private static void dot_(
+      double[] a, double[] b, @Reduce double[] acc) { // prevent name clash with CL's dot
+    acc[0] = 0;
+    for (@Parallel int i = 0; i < a.length; i++) {
+      acc[0] += a[i] * b[i];
+    }
+  }
+
+  private final double[] a, b, c;
+  private final double[] dotSum; // one-element buffer that dot_ reduces into
+
+  @SuppressWarnings({"PrimitiveArrayArgumentToVarargsMethod", "DuplicatedCode"})
+  SpecialisedDouble(Config config) {
+    super(config);
+    final int size = config.options.arraysize;
+    final double scalar = config.scalar;
+    a = new double[size];
+    b = new double[size];
+    c = new double[size];
+    dotSum = new double[1];
+    this.copyTask = mkSchedule().task("", SpecialisedDouble::copy, size, a, c);
+    this.mulTask = mkSchedule().task("", SpecialisedDouble::mul, size, b, c, scalar);
+    this.addTask = mkSchedule().task("", SpecialisedDouble::add, size, a, b, c);
+    this.triadTask = mkSchedule().task("", SpecialisedDouble::triad, size, a, b, c, scalar);
+    this.nstreamTask = mkSchedule().task("", SpecialisedDouble::nstream, size, a, b, c, scalar);
+    this.dotTask = mkSchedule().task("", SpecialisedDouble::dot_, a, b, dotSum).streamOut(dotSum);
+  }
+
+  @Override
+  public void initArrays() {
+    super.initArrays(); // warms up every task schedule before the host arrays are filled
+    Arrays.fill(a, config.initA);
+    Arrays.fill(b, config.initB);
+    Arrays.fill(c, config.initC);
+    TornadoVMStreams.xferToDevice(device, a, b, c);
+  }
+
+  @Override
+  protected Double getSum() {
+    return dotSum[0];
+  }
+
+  @Override
+  public Data data() {
+    TornadoVMStreams.xferFromDevice(device, a, b, c);
+    return new Data<>(boxed(a), boxed(b), boxed(c));
+  }
+}
diff --git a/java-stream/src/main/java/javastream/tornadovm/SpecialisedFloat.java b/java-stream/src/main/java/javastream/tornadovm/SpecialisedFloat.java
new file mode 100644
index 0000000..e61cfe9
--- /dev/null
+++ b/java-stream/src/main/java/javastream/tornadovm/SpecialisedFloat.java
@@ -0,0 +1,88 @@
+package javastream.tornadovm;
+
+import java.util.Arrays;
+import javastream.Main.Config;
+import uk.ac.manchester.tornado.api.annotations.Parallel;
+import uk.ac.manchester.tornado.api.annotations.Reduce;
+
+final class SpecialisedFloat extends GenericTornadoVMStream {
+
+  @SuppressWarnings("ManualArrayCopy")
+  private static void copy(int size, float[] a, float[] c) { // c = a
+    for (@Parallel int i = 0; i < size; i++) {
+      c[i] = a[i];
+    }
+  }
+
+  private static void mul(int size, float[] b, float[] c, float scalar) { // b = scalar * c
+    for (@Parallel int i = 0; i < size; i++) {
+      b[i] = scalar * c[i];
+    }
+  }
+
+  private static void add(int size, float[] a, float[] b, float[] c) { // c = a + b
+    for (@Parallel int i = 0; i < size; i++) {
+      c[i] = a[i] + b[i];
+    }
+  }
+
+  private static void triad(int size, float[] a, float[] b, float[] c, float scalar) {
+    for (@Parallel int i = 0; i < size; i++) {
+      a[i] = b[i] + scalar * c[i];
+    }
+  }
+
+  private static void nstream(int size, float[] a, float[] b, float[] c, float scalar) {
+    for (@Parallel int i = 0; i < size; i++) {
+      a[i] += b[i] + scalar * c[i]; // accumulate into a, matching the jdk nstream kernels
+    }
+  }
+
+  private static void dot_(
+      float[] a, float[] b, @Reduce float[] acc) { // prevent name clash with CL's dot
+    acc[0] = 0;
+    for (@Parallel int i = 0; i < a.length; i++) {
+      acc[0] += a[i] * b[i];
+    }
+  }
+
+  private final float[] a, b, c;
+  private final float[] dotSum; // one-element buffer that dot_ reduces into
+
+  @SuppressWarnings({"PrimitiveArrayArgumentToVarargsMethod", "DuplicatedCode"})
+  SpecialisedFloat(Config config) {
+    super(config);
+    final int size = config.options.arraysize;
+    final float scalar = config.scalar;
+    a = new float[size];
+    b = new float[size];
+    c = new float[size];
+    dotSum = new float[1];
+    this.copyTask = mkSchedule().task("", SpecialisedFloat::copy, size, a, c);
+    this.mulTask = mkSchedule().task("", SpecialisedFloat::mul, size, b, c, scalar);
+    this.addTask =
mkSchedule().task("", SpecialisedFloat::add, size, a, b, c); + this.triadTask = mkSchedule().task("", SpecialisedFloat::triad, size, a, b, c, scalar); + this.nstreamTask = mkSchedule().task("", SpecialisedFloat::nstream, size, a, b, c, scalar); + this.dotTask = mkSchedule().task("", SpecialisedFloat::dot_, a, b, dotSum).streamOut(dotSum); + } + + @Override + public void initArrays() { + super.initArrays(); + Arrays.fill(a, config.initA); + Arrays.fill(b, config.initB); + Arrays.fill(c, config.initC); + TornadoVMStreams.xferToDevice(device, a, b, c); + } + + @Override + protected Float getSum() { + return dotSum[0]; + } + + @Override + public Data data() { + TornadoVMStreams.xferFromDevice(device, a, b, c); + return new Data<>(boxed(a), boxed(b), boxed(c)); + } +} diff --git a/java-stream/src/main/java/javastream/tornadovm/TornadoVMStreams.java b/java-stream/src/main/java/javastream/tornadovm/TornadoVMStreams.java new file mode 100644 index 0000000..68eecad --- /dev/null +++ b/java-stream/src/main/java/javastream/tornadovm/TornadoVMStreams.java @@ -0,0 +1,42 @@ +package javastream.tornadovm; + +import java.util.List; +import java.util.function.Function; +import java.util.stream.Collectors; +import java.util.stream.IntStream; +import javastream.JavaStream; +import javastream.Main.Config; +import uk.ac.manchester.tornado.api.TornadoRuntimeCI; +import uk.ac.manchester.tornado.api.common.TornadoDevice; +import uk.ac.manchester.tornado.api.mm.TornadoGlobalObjectState; +import uk.ac.manchester.tornado.api.runtime.TornadoRuntime; + +public final class TornadoVMStreams { + + private TornadoVMStreams() {} + + static void xferToDevice(TornadoDevice device, Object... 
xs) { + for (Object x : xs) { + TornadoGlobalObjectState state = TornadoRuntime.getTornadoRuntime().resolveObject(x); + List writeEvent = device.ensurePresent(x, state.getDeviceState(device), null, 0, 0); + if (writeEvent != null) writeEvent.forEach(e -> device.resolveEvent(e).waitOn()); + } + } + + static void xferFromDevice(TornadoDevice device, Object... xs) { + for (Object x : xs) { + TornadoGlobalObjectState state = TornadoRuntime.getTornadoRuntime().resolveObject(x); + device.resolveEvent(device.streamOut(x, 0, state.getDeviceState(device), null)).waitOn(); + } + } + + static List enumerateDevices(TornadoRuntimeCI runtime) { + return IntStream.range(0, runtime.getNumDrivers()) + .mapToObj(runtime::getDriver) + .flatMap(d -> IntStream.range(0, d.getDeviceCount()).mapToObj(d::getDevice)) + .collect(Collectors.toList()); + } + + public static final Function, JavaStream> FLOAT = SpecialisedFloat::new; + public static final Function, JavaStream> DOUBLE = SpecialisedDouble::new; +} diff --git a/java-stream/src/test/java/javastream/SmokeTest.java b/java-stream/src/test/java/javastream/SmokeTest.java new file mode 100644 index 0000000..2ceca44 --- /dev/null +++ b/java-stream/src/test/java/javastream/SmokeTest.java @@ -0,0 +1,93 @@ +package javastream; + +import java.io.ByteArrayOutputStream; +import java.io.PrintStream; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.stream.Collectors; +import java.util.stream.Stream; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; + +public class SmokeTest { + + // taken from https://stackoverflow.com/a/32146095/896997 + private static Stream> ofCombinations( + List> collections, List current) { + return collections.isEmpty() + ? 
Stream.of(current) + : collections.get(0).stream() + .flatMap( + e -> { + List list = new ArrayList<>(current); + list.add(e); + return ofCombinations(collections.subList(1, collections.size()), list); + }); + } + + @SuppressWarnings("unused") + private static Stream options() { + + LinkedHashMap> impls = new LinkedHashMap<>(); + impls.put("jdk-stream", Arrays.asList(0, 1)); + impls.put("jdk-plain", Arrays.asList(0, 1)); + // skip aparapi as none of the jdk fallbacks work correctly + // skip tornadovm as it has no jdk fallback + + List configs = + impls.entrySet().stream() + .flatMap( + e -> + Stream.concat(Stream.of(""), e.getValue().stream().map(i -> "--device " + i)) + .map(d -> "--impl " + e.getKey() + " " + d)) + .collect(Collectors.toList()); + + return ofCombinations( + new ArrayList<>( + Arrays.asList( + configs, + Arrays.asList("", "--csv"), + // XXX floats usually have a 1.0^-5 error which misses 10^-8 + Arrays.asList("", "--float --dot-tolerance 1.0e-5"), + Arrays.asList("", "--triad-only", "--nstream-only"), + Arrays.asList("", "--mibibytes"))), + Collections.emptyList()) + .map( + xs -> + Arguments.of( + xs.stream() // + .map(String::trim) // + .collect(Collectors.joining(" ")) + .trim())); + } + + @ParameterizedTest + @MethodSource("options") + void testIt(String args) { + String line = "--arraysize 2048 " + args; + + // redirect stdout/stderr and only print if anything fails + ByteArrayOutputStream outContent = new ByteArrayOutputStream(); + ByteArrayOutputStream errContent = new ByteArrayOutputStream(); + PrintStream originalOut = System.out; + PrintStream originalErr = System.err; + + System.setOut(new PrintStream(outContent)); + System.setErr(new PrintStream(errContent)); + int run = Main.run(line.split("\\s+")); + System.setOut(originalOut); + System.setErr(originalErr); + + if (run != 0) { + System.out.println(outContent); + System.err.println(errContent); + Assertions.assertEquals(0, run, "`" + line + "` did not return 0"); + } + } +} diff 
--git a/scala-stream/.bsp/sbt.json b/scala-stream/.bsp/sbt.json new file mode 100644 index 0000000..2e1edb1 --- /dev/null +++ b/scala-stream/.bsp/sbt.json @@ -0,0 +1 @@ +{"name":"sbt","version":"1.5.2","bspVersion":"2.0.0-M5","languages":["scala"],"argv":["/usr/lib/jvm/java-11-openjdk-11.0.11.0.9-2.fc33.x86_64/bin/java","-Xms100m","-Xmx100m","-classpath","/home/tom/.local/share/JetBrains/Toolbox/apps/IDEA-U/ch-0/211.7142.45.plugins/Scala/launcher/sbt-launch.jar","xsbt.boot.Boot","-bsp","--sbt-launch-jar=/home/tom/.local/share/JetBrains/Toolbox/apps/IDEA-U/ch-0/211.7142.45.plugins/Scala/launcher/sbt-launch.jar"]} \ No newline at end of file diff --git a/scala-stream/.gitignore b/scala-stream/.gitignore new file mode 100644 index 0000000..2f7896d --- /dev/null +++ b/scala-stream/.gitignore @@ -0,0 +1 @@ +target/ diff --git a/scala-stream/.jvmopts b/scala-stream/.jvmopts new file mode 100644 index 0000000..c1ef295 --- /dev/null +++ b/scala-stream/.jvmopts @@ -0,0 +1,2 @@ +-Xmx4096m +-Xss4m \ No newline at end of file diff --git a/scala-stream/.scalafmt.conf b/scala-stream/.scalafmt.conf new file mode 100644 index 0000000..8c7d0c8 --- /dev/null +++ b/scala-stream/.scalafmt.conf @@ -0,0 +1,34 @@ +version = "3.0.0-RC2" +runner.dialect = scala3 + +style = defaultWithAlign + +maxColumn = 100 + +align.preset = more + +rewrite.rules = [ + AvoidInfix + RedundantBraces + RedundantParens + AsciiSortImports + PreferCurlyFors +] + +rewrite.neverInfix.excludeFilters = [until + to + by + eq + ne + "should.*" + "contain.*" + "must.*" + in + be + taggedAs + thrownBy + synchronized + have + when + size + theSameElementsAs] \ No newline at end of file diff --git a/scala-stream/README.md b/scala-stream/README.md new file mode 100644 index 0000000..bf0e3f4 --- /dev/null +++ b/scala-stream/README.md @@ -0,0 +1,102 @@ +ScalaStream +=========== + +This is an implementation of BabelStream +in [Scala 3](https://docs.scala-lang.org/scala3/new-in-scala3.html) on the JVM. 
In theory, this +implementation also covers Java. Scala and Java, like any other programming language, have their own +ecosystem of library-supported parallel programming frameworks; we currently implement the +following: + +* Parallel streams (introduced in Java 8) - `src/main/scala/scalastream/J8SStream.scala` +* [Scala Parallel Collections](https://github.com/scala/scala-parallel-collections) + - `src/main/scala/scalastream/ParStream.scala` + +As the benchmark is relatively simple, we also implement some baselines: + +* Single-threaded Scala `for` (i.e. `foreach` sugar) - `src/main/scala/scalastream/PlainStream.scala` +* Manual parallelism with Java executors - `src/main/scala/scalastream/ThreadedStream.scala` + +### Performance considerations + +As Scala 3 defaults to Scala 2.13's standard library, we roll our own `Fractional` typeclass with +liberal use of inlining and specialisation. This is motivated by 2.13 stdlib's lack of +specialisation for primitive types on the default `Fractional` and `Numeric` typeclasses. + +The use of [Spire](https://github.com/typelevel/spire) to mitigate this was attempted; however, due +to its use of Scala 2 macros, it currently doesn't compile with Scala 3. + +### Build & Run + +Prerequisites + +* JDK >= 8 on any of its supported platforms; known working implementations: + - OpenJDK + distributions ([Amazon Corretto](https://docs.aws.amazon.com/corretto/latest/corretto-11-ug/downloads-list.html) + , [Azul](https://www.azul.com/downloads/?version=java-11-lts&package=jdk) + , [AdoptOpenJDK](https://adoptopenjdk.net/), etc) + - Oracle Graal CE/EE 8+ + +To run the benchmark, first create a binary: + +```shell +> ./sbt assembly +``` + +The binary will be located at `./target/scala-3.0.0/scala-stream.jar`.
Run it with: + +```shell +> java -version +openjdk version "11.0.11" 2021-04-20 +OpenJDK Runtime Environment 18.9 (build 11.0.11+9) +OpenJDK 64-Bit Server VM 18.9 (build 11.0.11+9, mixed mode, sharing) +> java -jar target/scala-3.0.0/scala-stream.jar --help + +``` + +For best results, benchmark with the following JVM flags: + +``` +-XX:-UseOnStackReplacement # disable OSR, not useful for this benchmark as we are measuring peak performance +-XX:-TieredCompilation # disable C1, go straight to C2 +-XX:ReservedCodeCacheSize=512m # don't flush compiled code out of cache at any point +``` + +Worked example: + +```shell +> java -XX:-UseOnStackReplacement -XX:-TieredCompilation -XX:ReservedCodeCacheSize=512m -jar target/scala-3.0.0/scala-stream.jar + +BabelStream +Version: 3.4.0 +Implementation: Scala Parallel Collections; Scala (Java 11.0.11; Red Hat, Inc.; home=/usr/lib/jvm/java-11-openjdk-11.0.11.0.9-2.fc33.x86_64) +Running kernels 100 times +Precision: double +Array size: 268.4 MB (=0.3 GB) +Total size: 805.3 MB (=0.8 GB) +Function MBytes/sec Min (sec) Max Average +Copy 4087.077 0.13136 0.24896 0.15480 +Mul 2934.709 0.18294 0.28706 0.21627 +Add 3016.342 0.26698 0.39835 0.31119 +Triad 3016.496 0.26697 0.37612 0.31040 +Dot 2216.096 0.24226 0.41235 0.28264 + +``` + +### Graal Native Image + +The port has partial support for Graal Native Image; to generate one, run: + +```shell +> ./sbt nativeImage +``` + +The ELF binary will be located at `./target/native-image/scala-stream`; relocation should work on +the same architecture the binary is built on. + +There's an ongoing bug with Scala 3's use of `lazy val`s where the program crashes at declaration +site. Currently, Scala Parallel Collections uses this feature internally, so selecting this device +will crash at runtime. + +The bug originates from the use of `Unsafe` in `lazy val` for thread safety guarantees.
It seems +that Graal only supports limited uses of this JVM implementation detail and Scala 3 happens to be on +the unsupported side. \ No newline at end of file diff --git a/scala-stream/build.sbt b/scala-stream/build.sbt new file mode 100644 index 0000000..4194acb --- /dev/null +++ b/scala-stream/build.sbt @@ -0,0 +1,29 @@ +lazy val mainCls = Some("scalastream.App") + +lazy val root = (project in file(".")) + .enablePlugins(NativeImagePlugin) + .settings( + scalaVersion := "3.0.0", + version := "3.4.0", + organization := "uk.ac.bristol.uob-hpc", + organizationName := "University of Bristol", + Compile / mainClass := mainCls, + assembly / mainClass := mainCls, + scalacOptions ~= filterConsoleScalacOptions, + assembly / assemblyJarName := "scala-stream.jar", + nativeImageOptions := Seq( + "--no-fallback", + "-H:ReflectionConfigurationFiles=../../reflect-config.json" + ), + nativeImageVersion := "21.1.0", + (Global / excludeLintKeys) += nativeImageVersion, + name := "scala-stream", + libraryDependencies ++= Seq( + // Lazy val implementation in Scala 3 triggers an exception in nativeImage, use 2_13 for arg parsing for now otherwise we can't get to the benchmarking part + ("com.github.scopt" %% "scopt" % "4.0.1").cross(CrossVersion.for3Use2_13), + // par also uses lazy val at some point, so it doesn't work in nativeImage + "org.scala-lang.modules" %% "scala-parallel-collections" % "1.0.3", + "net.openhft" % "affinity" % "3.21ea1", + "org.slf4j" % "slf4j-simple" % "1.7.30" // for affinity + ) + ) diff --git a/scala-stream/project/build.properties b/scala-stream/project/build.properties new file mode 100644 index 0000000..19479ba --- /dev/null +++ b/scala-stream/project/build.properties @@ -0,0 +1 @@ +sbt.version=1.5.2 diff --git a/scala-stream/project/plugins.sbt b/scala-stream/project/plugins.sbt new file mode 100644 index 0000000..2c82902 --- /dev/null +++ b/scala-stream/project/plugins.sbt @@ -0,0 +1,6 @@ +addSbtPlugin("com.timushev.sbt" % "sbt-updates" % "0.5.3") 
+addSbtPlugin("io.github.davidgregory084" % "sbt-tpolecat" % "0.1.17") +addSbtPlugin("org.scalameta" % "sbt-native-image" % "0.3.0") +addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.15.0") +addSbtPlugin("ch.epfl.scala" % "sbt-scalafix" % "0.9.27") +addSbtPlugin("org.scalameta" % "sbt-scalafmt" % "2.4.2") diff --git a/scala-stream/reflect-config.json b/scala-stream/reflect-config.json new file mode 100644 index 0000000..9e8b089 --- /dev/null +++ b/scala-stream/reflect-config.json @@ -0,0 +1,11 @@ +[ + { + "name": "sun.misc.Unsafe", + "fields": [ + { + "name": "theUnsafe", + "allowUnsafeAccess": true + } + ] + } +] \ No newline at end of file diff --git a/scala-stream/sbt b/scala-stream/sbt new file mode 100755 index 0000000..efdfda6 --- /dev/null +++ b/scala-stream/sbt @@ -0,0 +1,3 @@ +#!/usr/bin/env bash + +./sbt-dist/bin/sbt "$@" \ No newline at end of file diff --git a/scala-stream/sbt-dist/bin/java9-rt-export.jar b/scala-stream/sbt-dist/bin/java9-rt-export.jar new file mode 100644 index 0000000..cbabfb0 Binary files /dev/null and b/scala-stream/sbt-dist/bin/java9-rt-export.jar differ diff --git a/scala-stream/sbt-dist/bin/sbt b/scala-stream/sbt-dist/bin/sbt new file mode 100755 index 0000000..cca77be --- /dev/null +++ b/scala-stream/sbt-dist/bin/sbt @@ -0,0 +1,177 @@ +#!/usr/bin/env bash + + +### ------------------------------- ### +### Helper methods for BASH scripts ### +### ------------------------------- ### + +realpath () { +( + TARGET_FILE="$1" + FIX_CYGPATH="$2" + + cd "$(dirname "$TARGET_FILE")" + TARGET_FILE=$(basename "$TARGET_FILE") + + COUNT=0 + while [ -L "$TARGET_FILE" -a $COUNT -lt 100 ] + do + TARGET_FILE=$(readlink "$TARGET_FILE") + cd "$(dirname "$TARGET_FILE")" + TARGET_FILE=$(basename "$TARGET_FILE") + COUNT=$(($COUNT + 1)) + done + + # make sure we grab the actual windows path, instead of cygwin's path. 
+ if [[ "x$FIX_CYGPATH" != "x" ]]; then + echo "$(cygwinpath "$(pwd -P)/$TARGET_FILE")" + else + echo "$(pwd -P)/$TARGET_FILE" + fi +) +} + + +# Uses uname to detect if we're in the odd cygwin environment. +is_cygwin() { + local os=$(uname -s) + case "$os" in + CYGWIN*) return 0 ;; + MINGW*) return 0 ;; + MSYS*) return 0 ;; + *) return 1 ;; + esac +} + +# TODO - Use nicer bash-isms here. +CYGWIN_FLAG=$(if is_cygwin; then echo true; else echo false; fi) + + +# This can fix cygwin style /cygdrive paths so we get the +# windows style paths. +cygwinpath() { + local file="$1" + if [[ "$CYGWIN_FLAG" == "true" ]]; then + echo $(cygpath -w $file) + else + echo $file + fi +} + +. "$(dirname "$(realpath "$0")")/sbt-launch-lib.bash" + + +declare -r noshare_opts="-Dsbt.global.base=project/.sbtboot -Dsbt.boot.directory=project/.boot -Dsbt.ivy.home=project/.ivy" +declare -r sbt_opts_file=".sbtopts" +declare -r etc_sbt_opts_file="/etc/sbt/sbtopts" +declare -r dist_sbt_opts_file="${sbt_home}/conf/sbtopts" +declare -r win_sbt_opts_file="${sbt_home}/conf/sbtconfig.txt" + +usage() { + cat < path to global settings/plugins directory (default: ~/.sbt) + -sbt-boot path to shared boot directory (default: ~/.sbt/boot in 0.11 series) + -ivy path to local Ivy repository (default: ~/.ivy2) + -mem set memory options (default: $sbt_default_mem, which is $(get_mem_opts)) + -no-share use all local caches; no sharing + -no-global uses global caches, but does not use global ~/.sbt directory. + -jvm-debug Turn on JVM debugging, open at the given port. 
+ -batch Disable interactive mode + + # sbt version (default: from project/build.properties if present, else latest release) + -sbt-version use the specified version of sbt + -sbt-jar use the specified jar as the sbt launcher + -sbt-rc use an RC version of sbt + -sbt-snapshot use a snapshot version of sbt + + # java version (default: java from PATH, currently $(java -version 2>&1 | grep version)) + -java-home alternate JAVA_HOME + + # jvm options and output control + JAVA_OPTS environment variable, if unset uses "$java_opts" + .jvmopts if this file exists in the current directory, its contents + are appended to JAVA_OPTS + SBT_OPTS environment variable, if unset uses "$default_sbt_opts" + .sbtopts if this file exists in the current directory, its contents + are prepended to the runner args + /etc/sbt/sbtopts if this file exists, it is prepended to the runner args + -Dkey=val pass -Dkey=val directly to the java runtime + -J-X pass option -X directly to the java runtime + (-J is stripped) + -S-X add -X to sbt's scalacOptions (-S is stripped) + +In the case of duplicated or conflicting options, the order above +shows precedence: JAVA_OPTS lowest, command line options highest. 
+EOM +} + + + +process_my_args () { + while [[ $# -gt 0 ]]; do + case "$1" in + -no-colors) addJava "-Dsbt.log.noformat=true" && shift ;; + -no-share) addJava "$noshare_opts" && shift ;; + -no-global) addJava "-Dsbt.global.base=$(pwd)/project/.sbtboot" && shift ;; + -sbt-boot) require_arg path "$1" "$2" && addJava "-Dsbt.boot.directory=$2" && shift 2 ;; + -sbt-dir) require_arg path "$1" "$2" && addJava "-Dsbt.global.base=$2" && shift 2 ;; + -debug-inc) addJava "-Dxsbt.inc.debug=true" && shift ;; + -batch) exec + link=$(expr "$ls" : '.*-> \(.*\)$') + if expr "$link" : '/.*' > /dev/null; then + SCRIPT="$link" + else + SCRIPT=$(dirname "$SCRIPT")/"$link" + fi +done +declare -r sbt_bin_dir="$(dirname "$SCRIPT")" +declare -r sbt_home="$(dirname "$sbt_bin_dir")" + +echoerr () { + echo 1>&2 "$@" +} +vlog () { + [[ $verbose || $debug ]] && echoerr "$@" +} +dlog () { + [[ $debug ]] && echoerr "$@" +} + +jar_file () { + echo "$(cygwinpath "${sbt_home}/bin/sbt-launch.jar")" +} + +acquire_sbt_jar () { + sbt_jar="$(jar_file)" + + if [[ ! -f "$sbt_jar" ]]; then + echoerr "Could not find launcher jar: $sbt_jar" + exit 2 + fi +} + +rt_export_file () { + echo "${sbt_bin_dir}/java9-rt-export.jar" +} + +execRunner () { + # print the arguments one to a line, quoting any containing spaces + [[ $verbose || $debug ]] && echo "# Executing command line:" && { + for arg; do + if printf "%s\n" "$arg" | grep -q ' '; then + printf "\"%s\"\n" "$arg" + else + printf "%s\n" "$arg" + fi + done + echo "" + } + + # THis used to be exec, but we loose the ability to re-hook stty then + # for cygwin... Maybe we should flag the feature here... 
+ "$@" +} + +addJava () { + dlog "[addJava] arg = '$1'" + java_args=( "${java_args[@]}" "$1" ) +} +addSbt () { + dlog "[addSbt] arg = '$1'" + sbt_commands=( "${sbt_commands[@]}" "$1" ) +} +addResidual () { + dlog "[residual] arg = '$1'" + residual_args=( "${residual_args[@]}" "$1" ) +} +addDebugger () { + addJava "-agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=$1" +} + +get_mem_opts () { + # if we detect any of these settings in ${JAVA_OPTS} or ${JAVA_TOOL_OPTIONS} we need to NOT output our settings. + # The reason is the Xms/Xmx, if they don't line up, cause errors. + if [[ "${JAVA_OPTS}" == *-Xmx* ]] || [[ "${JAVA_OPTS}" == *-Xms* ]] || [[ "${JAVA_OPTS}" == *-XX:MaxPermSize* ]] || [[ "${JAVA_OPTS}" == *-XX:MaxMetaspaceSize* ]] || [[ "${JAVA_OPTS}" == *-XX:ReservedCodeCacheSize* ]]; then + echo "" + elif [[ "${JAVA_TOOL_OPTIONS}" == *-Xmx* ]] || [[ "${JAVA_TOOL_OPTIONS}" == *-Xms* ]] || [[ "${JAVA_TOOL_OPTIONS}" == *-XX:MaxPermSize* ]] || [[ "${JAVA_TOOL_OPTIONS}" == *-XX:MaxMetaspaceSize* ]] || [[ "${JAVA_TOOL_OPTIONS}" == *-XX:ReservedCodeCacheSize* ]]; then + echo "" + elif [[ "${SBT_OPTS}" == *-Xmx* ]] || [[ "${SBT_OPTS}" == *-Xms* ]] || [[ "${SBT_OPTS}" == *-XX:MaxPermSize* ]] || [[ "${SBT_OPTS}" == *-XX:MaxMetaspaceSize* ]] || [[ "${SBT_OPTS}" == *-XX:ReservedCodeCacheSize* ]]; then + echo "" + else + # a ham-fisted attempt to move some memory settings in concert + # so they need not be messed around with individually. 
+ local mem=${1:-$sbt_default_mem} + local codecache=$(( $mem / 8 )) + (( $codecache > 128 )) || codecache=128 + (( $codecache < 512 )) || codecache=512 + local class_metadata_size=$(( $codecache * 2 )) + if [[ -z $java_version ]]; then + java_version=$(jdk_version) + fi + local class_metadata_opt=$((( $java_version < 8 )) && echo "MaxPermSize" || echo "MaxMetaspaceSize") + + local arg_xms=$([[ "${java_args[@]}" == *-Xms* ]] && echo "" || echo "-Xms${mem}m") + local arg_xmx=$([[ "${java_args[@]}" == *-Xmx* ]] && echo "" || echo "-Xmx${mem}m") + local arg_rccs=$([[ "${java_args[@]}" == *-XX:ReservedCodeCacheSize* ]] && echo "" || echo "-XX:ReservedCodeCacheSize=${codecache}m") + local arg_meta=$([[ "${java_args[@]}" == *-XX:${class_metadata_opt}* && ! (( $java_version < 8 )) ]] && echo "" || echo "-XX:${class_metadata_opt}=${class_metadata_size}m") + + echo "${arg_xms} ${arg_xmx} ${arg_rccs} ${arg_meta}" + fi +} + +get_gc_opts () { + local older_than_9=$(( $java_version < 9 )) + + if [[ "$older_than_9" == "1" ]]; then + # don't need to worry about gc + echo "" + elif [[ "${JAVA_OPTS}" =~ Use.*GC ]] || [[ "${JAVA_TOOL_OPTIONS}" =~ Use.*GC ]] || [[ "${SBT_OPTS}" =~ Use.*GC ]] ; then + # GC arg has been passed in - don't change + echo "" + else + # Java 9+ so revert to old + echo "-XX:+UseParallelGC" + fi +} + +require_arg () { + local type="$1" + local opt="$2" + local arg="$3" + if [[ -z "$arg" ]] || [[ "${arg:0:1}" == "-" ]]; then + echo "$opt requires <$type> argument" + exit 1 + fi +} + +is_function_defined() { + declare -f "$1" > /dev/null +} + +# parses JDK version from the -version output line. 
+# 8 for 1.8.0_nn, 9 for 9-ea etc, and "no_java" for undetected +jdk_version() { + local result + local lines=$("$java_cmd" -Xms32M -Xmx32M -version 2>&1 | tr '\r' '\n') + local IFS=$'\n' + for line in $lines; do + if [[ (-z $result) && ($line = *"version \""*) ]] + then + local ver=$(echo $line | sed -e 's/.*version "\(.*\)"\(.*\)/\1/; 1q') + # on macOS sed doesn't support '?' + if [[ $ver = "1."* ]] + then + result=$(echo $ver | sed -e 's/1\.\([0-9]*\)\(.*\)/\1/; 1q') + else + result=$(echo $ver | sed -e 's/\([0-9]*\)\(.*\)/\1/; 1q') + fi + fi + done + if [[ -z $result ]] + then + result=no_java + fi + echo "$result" +} + +process_args () { + while [[ $# -gt 0 ]]; do + case "$1" in + -h|-help) usage; exit 1 ;; + -v|-verbose) verbose=1 && shift ;; + -d|-debug) debug=1 && addSbt "-debug" && shift ;; + + -ivy) require_arg path "$1" "$2" && addJava "-Dsbt.ivy.home=$2" && shift 2 ;; + -mem) require_arg integer "$1" "$2" && sbt_mem="$2" && shift 2 ;; + -jvm-debug) require_arg port "$1" "$2" && addDebugger $2 && shift 2 ;; + -batch) exec /dev/null 2>&1 && { + mkdir -p "$target_preloaded" + rsync -a --ignore-existing "$source_preloaded" "$target_preloaded" + } + } + } +} + +# Detect that we have java installed. +checkJava() { + local required_version="$1" + # Now check to see if it's a good enough version + local good_enough="$(expr $java_version ">=" $required_version)" + if [[ "$java_version" == "" ]]; then + echo + echo "No Java Development Kit (JDK) installation was detected." + echo Please go to http://www.oracle.com/technetwork/java/javase/downloads/ and download. + echo + exit 1 + elif [[ "$good_enough" != "1" ]]; then + echo + echo "The Java Development Kit (JDK) installation you have is not up to date." + echo $script_name requires at least version $required_version+, you have + echo version $java_version + echo + echo Please go to http://www.oracle.com/technetwork/java/javase/downloads/ and download + echo a valid JDK and install before running $script_name. 
+ echo + exit 1 + fi +} + +copyRt() { + local at_least_9="$(expr $java_version ">=" 9)" + if [[ "$at_least_9" == "1" ]]; then + rtexport=$(rt_export_file) + # The grep for java9-rt-ext- matches the filename prefix printed in Export.java + java9_ext=$("$java_cmd" ${JAVA_OPTS} ${SBT_OPTS:-$default_sbt_opts} ${java_args[@]} \ + -jar "$rtexport" --rt-ext-dir | grep java9-rt-ext-) + java9_rt=$(echo "$java9_ext/rt.jar") + vlog "[copyRt] java9_rt = '$java9_rt'" + if [[ ! -f "$java9_rt" ]]; then + echo Copying runtime jar. + mkdir -p "$java9_ext" + execRunner "$java_cmd" \ + ${JAVA_OPTS} \ + ${SBT_OPTS:-$default_sbt_opts} \ + ${java_args[@]} \ + -jar "$rtexport" \ + "${java9_rt}" + fi + addJava "-Dscala.ext.dirs=${java9_ext}" + fi +} + +run() { + # process the combined args, then reset "$@" to the residuals + process_args "$@" + set -- "${residual_args[@]}" + argumentCount=$# + + # Copy preloaded repo to user's preloaded directory + syncPreloaded + + # no jar? download it. + [[ -f "$sbt_jar" ]] || acquire_sbt_jar "$sbt_version" || { + # still no jar? uh-oh. + echo "Download failed. Obtain the sbt-launch.jar manually and place it at $sbt_jar" + exit 1 + } + + # TODO - java check should be configurable... + checkJava "6" + + # Java 9 support + copyRt + + #If we're in cygwin, we should use the windows config, and terminal hacks + if [[ "$CYGWIN_FLAG" == "true" ]]; then + stty -icanon min 1 -echo > /dev/null 2>&1 + addJava "-Djline.terminal=jline.UnixTerminal" + addJava "-Dsbt.cygwin=true" + fi + + # run sbt + execRunner "$java_cmd" \ + $(get_mem_opts $sbt_mem) \ + $(get_gc_opts) \ + ${JAVA_OPTS} \ + ${SBT_OPTS:-$default_sbt_opts} \ + ${java_args[@]} \ + -jar "$sbt_jar" \ + "${sbt_commands[@]}" \ + "${residual_args[@]}" + + exit_code=$? + + # Clean up the terminal from cygwin hacks. 
+ if [[ "$CYGWIN_FLAG" == "true" ]]; then + stty icanon echo > /dev/null 2>&1 + fi + exit $exit_code +} diff --git a/scala-stream/sbt-dist/bin/sbt-launch.jar b/scala-stream/sbt-dist/bin/sbt-launch.jar new file mode 100644 index 0000000..26ab884 Binary files /dev/null and b/scala-stream/sbt-dist/bin/sbt-launch.jar differ diff --git a/scala-stream/sbt-dist/bin/sbt.bat b/scala-stream/sbt-dist/bin/sbt.bat new file mode 100644 index 0000000..1827961 --- /dev/null +++ b/scala-stream/sbt-dist/bin/sbt.bat @@ -0,0 +1,212 @@ +@REM SBT launcher script +@REM +@REM Environment: +@REM JAVA_HOME - location of a JDK home dir (mandatory) +@REM SBT_OPTS - JVM options (optional) +@REM Configuration: +@REM sbtconfig.txt found in the SBT_HOME. + +@REM ZOMG! We need delayed expansion to build up CFG_OPTS later +@setlocal enabledelayedexpansion + +@echo off +set SBT_HOME=%~dp0 +set SBT_ARGS= + +rem FIRST we load the config file of extra options. +set FN=%SBT_HOME%\..\conf\sbtconfig.txt +set CFG_OPTS= +FOR /F "tokens=* eol=# usebackq delims=" %%i IN ("%FN%") DO ( + set DO_NOT_REUSE_ME=%%i + rem ZOMG (Part #2) WE use !! here to delay the expansion of + rem CFG_OPTS, otherwise it remains "" for this loop. + set CFG_OPTS=!CFG_OPTS! !DO_NOT_REUSE_ME! +) + +rem poor man's jenv (which is not available on Windows) +IF DEFINED JAVA_HOMES ( + IF EXIST .java-version FOR /F %%A IN (.java-version) DO ( + SET JAVA_HOME=%JAVA_HOMES%\%%A + SET JDK_HOME=%JAVA_HOMES%\%%A + ) +) +rem must set PATH or wrong javac is used for java projects +IF DEFINED JAVA_HOME SET "PATH=%JAVA_HOME%\bin;%PATH%" + +rem users can set JAVA_OPTS via .jvmopts (sbt-extras style) +IF EXIST .jvmopts FOR /F %%A IN (.jvmopts) DO ( + SET _jvmopts_line=%%A + IF NOT "!_jvmopts_line:~0,1!"=="#" ( + SET JAVA_OPTS=%%A !JAVA_OPTS! 
+ ) +) +rem We use the value of the JAVACMD environment variable if defined +set _JAVACMD=%JAVACMD% + +if "%_JAVACMD%"=="" ( + if not "%JAVA_HOME%"=="" ( + if exist "%JAVA_HOME%\bin\java.exe" set "_JAVACMD=%JAVA_HOME%\bin\java.exe" + ) +) + +if "%_JAVACMD%"=="" set _JAVACMD=java + +rem We use the value of the JAVA_OPTS environment variable if defined, rather than the config. +set _JAVA_OPTS=%JAVA_OPTS% +if "%_JAVA_OPTS%"=="" set _JAVA_OPTS=%CFG_OPTS% + +set INIT_SBT_VERSION=1.2.8 + +:args_loop +if "%~1" == "" goto args_end + +if "%~1" == "-jvm-debug" ( + set JVM_DEBUG=true + set /a JVM_DEBUG_PORT=5005 2>nul >nul +) else if "!JVM_DEBUG!" == "true" ( + set /a JVM_DEBUG_PORT=%1 2>nul >nul + if not "%~1" == "!JVM_DEBUG_PORT!" ( + set SBT_ARGS=!SBT_ARGS! %1 + ) +) else if /I "%~1" == "new" ( + set sbt_new=true + set SBT_ARGS=!SBT_ARGS! %1 +) else ( + set SBT_ARGS=!SBT_ARGS! %1 +) + +shift +goto args_loop +:args_end + +rem Confirm a user's intent if the current directory does not look like an sbt +rem top-level directory and the "new" command was not given. +if not exist build.sbt ( + if not exist project\ ( + if not defined sbt_new ( + echo [warn] Neither build.sbt nor a 'project' directory in the current directory: %CD% + setlocal +:confirm + echo c^) continue + echo q^) quit + + set /P reply=?^ + if /I "!reply!" == "c" ( + goto confirm_end + ) else if /I "!reply!" == "q" ( + exit /B 1 + ) + + goto confirm +:confirm_end + endlocal + ) + ) +) + +call :process + +call :checkjava + +call :copyrt + +if defined JVM_DEBUG_PORT ( + set _JAVA_OPTS=!_JAVA_OPTS! -agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=!JVM_DEBUG_PORT! 
+) + +call :sync_preloaded + +call :run %SBT_ARGS% + +if ERRORLEVEL 1 goto error +goto end + +:run + +"%_JAVACMD%" %_JAVA_OPTS% %SBT_OPTS% -cp "%SBT_HOME%sbt-launch.jar" xsbt.boot.Boot %* +goto :eof + +:process +rem Parses x out of 1.x; for example 8 out of java version 1.8.0_xx +rem Otherwise, parses the major version; 9 out of java version 9-ea +set JAVA_VERSION=0 +for /f "tokens=3" %%g in ('"%_JAVACMD%" -Xms32M -Xmx32M -version 2^>^&1 ^| findstr /i version') do ( + set JAVA_VERSION=%%g +) +set JAVA_VERSION=%JAVA_VERSION:"=% +for /f "delims=.-_ tokens=1-2" %%v in ("%JAVA_VERSION%") do ( + if /I "%%v" EQU "1" ( + set JAVA_VERSION=%%w + ) else ( + set JAVA_VERSION=%%v + ) +) +exit /B 0 + +:checkjava +set required_version=6 +if /I %JAVA_VERSION% GEQ %required_version% ( + exit /B 0 +) +echo. +echo The Java Development Kit (JDK) installation you have is not up to date. +echo sbt requires at least version %required_version%+, you have +echo version %JAVA_VERSION% +echo. +echo Please go to http://www.oracle.com/technetwork/java/javase/downloads/ and download +echo a valid JDK and install before running sbt. +echo. +exit /B 1 + +:copyrt +if /I %JAVA_VERSION% GEQ 9 ( + set rtexport=!SBT_HOME!java9-rt-export.jar + + "%_JAVACMD%" %_JAVA_OPTS% %SBT_OPTS% -jar "!rtexport!" --rt-ext-dir > "%TEMP%.\rtext.txt" + set /p java9_ext= < "%TEMP%.\rtext.txt" + set java9_rt=!java9_ext!\rt.jar + + if not exist "!java9_rt!" ( + mkdir "!java9_ext!" + "%_JAVACMD%" %_JAVA_OPTS% %SBT_OPTS% -jar "!rtexport!" "!java9_rt!" + ) + set _JAVA_OPTS=!_JAVA_OPTS! -Dscala.ext.dirs="!java9_ext!" + + rem check to see if a GC has been set in the opts + echo !_JAVA_OPTS! | findstr /r "Use.*GC" >nul + if ERRORLEVEL 1 ( + rem don't have a GC set - revert to old GC + set _JAVA_OPTS=!_JAVA_OPTS! 
-XX:+UseParallelGC + ) +) +exit /B 0 + +:sync_preloaded +if "%INIT_SBT_VERSION%"=="" ( + rem FIXME: better %INIT_SBT_VERSION% detection + FOR /F "tokens=* USEBACKQ" %%F IN (`dir /b "%SBT_HOME%\..\lib\local-preloaded\org.scala-sbt\sbt" /B`) DO ( + SET INIT_SBT_VERSION=%%F + ) +) +set PRELOAD_SBT_JAR="%UserProfile%\.sbt\preloaded\org.scala-sbt\sbt\%INIT_SBT_VERSION%\jars\sbt.jar" +if /I %JAVA_VERSION% GEQ 8 ( + where robocopy >nul 2>nul + if %ERRORLEVEL% equ 0 ( + REM echo %PRELOAD_SBT_JAR% + if not exist %PRELOAD_SBT_JAR% ( + if exist "%SBT_HOME%\..\lib\local-preloaded\" ( + echo "about to robocopy" + robocopy "%SBT_HOME%\..\lib\local-preloaded" "%UserProfile%\.sbt\preloaded" /E + ) + ) + ) +) +exit /B 0 + +:error +@endlocal +exit /B 1 + +:end +@endlocal +exit /B 0 diff --git a/scala-stream/sbt-dist/conf/sbtconfig.txt b/scala-stream/sbt-dist/conf/sbtconfig.txt new file mode 100644 index 0000000..a4da43e --- /dev/null +++ b/scala-stream/sbt-dist/conf/sbtconfig.txt @@ -0,0 +1,14 @@ +# Set the java args to high + +-Xmx512M + +-XX:MaxPermSize=256m + +-XX:ReservedCodeCacheSize=128m + + + +# Set the extra SBT options + +-Dsbt.log.format=true + diff --git a/scala-stream/sbt-dist/conf/sbtopts b/scala-stream/sbt-dist/conf/sbtopts new file mode 100644 index 0000000..f018465 --- /dev/null +++ b/scala-stream/sbt-dist/conf/sbtopts @@ -0,0 +1,49 @@ +# ------------------------------------------------ # +# The SBT Configuration file. # +# ------------------------------------------------ # + + +# Disable ANSI color codes +# +#-no-colors + +# Starts sbt even if the current directory contains no sbt project. +# +-sbt-create + +# Path to global settings/plugins directory (default: ~/.sbt) +# +#-sbt-dir /etc/sbt + +# Path to shared boot directory (default: ~/.sbt/boot in 0.11 series) +# +#-sbt-boot ~/.sbt/boot + +# Path to local Ivy repository (default: ~/.ivy2) +# +#-ivy ~/.ivy2 + +# set memory options +# +#-mem + +# Use local caches for projects, no sharing. 
+# +#-no-share + +# Put SBT in offline mode. +# +#-offline + +# Sets the SBT version to use. +#-sbt-version 0.11.3 + +# Scala version (default: latest release) +# +#-scala-home +#-scala-version + +# java version (default: java from PATH, currently $(java -version |& grep version)) +# +#-java-home +
diff --git a/scala-stream/src/main/scala/scalastream/J8SStream.scala b/scala-stream/src/main/scala/scalastream/J8SStream.scala
new file mode 100644
index 0000000..ba509a5
--- /dev/null
+++ b/scala-stream/src/main/scala/scalastream/J8SStream.scala
@@ -0,0 +1,55 @@
+package scalastream
+
+import scalastream.App.{Config, Data}
+
+import scala.collection.immutable.ArraySeq
+import scala.reflect.{ClassTag, classTag}
+
+/** BabelStream kernels that walk the array indices with a parallel `java.util.stream.IntStream`.
+  *
+  * @param config supplies the array size, the scalar and the initial values of `a`, `b` and `c`
+  */
+class J8SStream[@specialized(Float, Double) A: Fractional: ClassTag](val config: Config[A])
+    extends ScalaStream[A]:
+
+  private var a: Array[A] = _
+  private var b: Array[A] = _
+  private var c: Array[A] = _
+  private val scalar: A = config.scalar
+
+  /** Builds a new parallel stream over the indices `[0, arraysize)` on every use. */
+  inline private def stream =
+    java.util.stream.IntStream.range(0, config.options.arraysize).parallel()
+
+  /** Allocates `a`, `b` and `c` and fills them from `config.init` through the parallel stream. */
+  override inline def initArrays(): Unit =
+    a = Array.ofDim(config.options.arraysize)
+    b = Array.ofDim(config.options.arraysize)
+    c = Array.ofDim(config.options.arraysize)
+    stream.forEach { i =>
+      a(i) = config.init._1
+      b(i) = config.init._2
+      c(i) = config.init._3
+    }
+
+  override inline def copy(): Unit = stream.forEach(i => c(i) = a(i))
+  override inline def mul(): Unit = stream.forEach(i => b(i) = scalar * c(i))
+  override inline def add(): Unit = stream.forEach(i => c(i) = a(i) + b(i))
+  override inline def triad(): Unit = stream.forEach(i => a(i) = b(i) + scalar * c(i))
+  // nstream accumulates into `a`; App.validate replays it as `goldA += goldB + scalar * goldC`.
+  override inline def nstream(): Unit =
+    stream.forEach(i => a(i) = a(i) + (b(i) + scalar * c(i)))
+
+  /** Dot product of `a` and `b`, reduced through the parallel stream. */
+  override inline def dot(): A =
+    // horrible special-case for double, there isn't a mapToFloat so we give up on that
+    val cls = classTag[A].runtimeClass
+    if java.lang.Double.TYPE == cls then
+      stream
+        .mapToDouble(i => (a(i) * b(i)).asInstanceOf[Double])
+        .reduce(0, (l: Double, r: Double) => l + r)
+        .asInstanceOf[A]
+    else stream.mapToObj[A](i => a(i) * b(i)).reduce(0.fractional, (l: A, r: A) => l + r)
+
+  /** Returns the three arrays converted to `ArraySeq`s. */
+  override inline def data(): Data[A] = Data(a.to(ArraySeq), b.to(ArraySeq), c.to(ArraySeq))
diff --git a/scala-stream/src/main/scala/scalastream/ParStream.scala b/scala-stream/src/main/scala/scalastream/ParStream.scala
new file mode 100644
index 0000000..bb146a2
--- /dev/null
+++ b/scala-stream/src/main/scala/scalastream/ParStream.scala
@@ -0,0 +1,47 @@
+package scalastream
+
+import scalastream.App.{Config, Data}
+
+import scala.collection.immutable.ArraySeq
+import scala.collection.parallel.CollectionConverters._
+import scala.reflect.ClassTag
+
+/** BabelStream kernels that walk the array indices as a Scala parallel collection (`.par`).
+  *
+  * @param config supplies the array size, the scalar and the initial values of `a`, `b` and `c`
+  */
+class ParStream[@specialized(Float, Double) A: Fractional: ClassTag](val config: Config[A])
+    extends ScalaStream[A]:
+
+  private var a: Array[A] = _
+  private var b: Array[A] = _
+  private var c: Array[A] = _
+  private val scalar: A = config.scalar
+
+  /** The indices `[0, arraysize)` as a parallel range, rebuilt on every use. */
+  inline private def indices = (0 until config.options.arraysize).par
+
+  /** Allocates `a`, `b` and `c` and fills them from `config.init` over the parallel range. */
+  override inline def initArrays(): Unit =
+    a = Array.ofDim(config.options.arraysize)
+    b = Array.ofDim(config.options.arraysize)
+    c = Array.ofDim(config.options.arraysize)
+
+    for i <- indices do
+      a(i) = config.init._1
+      b(i) = config.init._2
+      c(i) = config.init._3
+
+  override inline def copy(): Unit = for i <- indices do c(i) = a(i)
+  override inline def mul(): Unit = for i <- indices do b(i) = scalar * c(i)
+  override inline def add(): Unit = for i <- indices do c(i) = a(i) + b(i)
+  override inline def triad(): Unit = for i <- indices do a(i) = b(i) + scalar * c(i)
+  // nstream accumulates into `a`; App.validate replays it as `goldA += goldB + scalar * goldC`.
+  override inline def nstream(): Unit = for i <- indices do a(i) = a(i) + (b(i) + scalar * c(i))
+
+  /** Dot product of `a` and `b`, aggregated over the parallel range. */
+  override inline def dot(): A =
+    indices.aggregate[A](0.fractional)((acc, i) => acc + (a(i) * b(i)), _ + _)
+
+  /** Returns the three arrays converted to `ArraySeq`s. */
+  override inline def data(): Data[A] = Data(a.to(ArraySeq), b.to(ArraySeq), c.to(ArraySeq))
diff --git
a/scala-stream/src/main/scala/scalastream/PlainStream.scala b/scala-stream/src/main/scala/scalastream/PlainStream.scala
new file mode 100644
index 0000000..2b42571
--- /dev/null
+++ b/scala-stream/src/main/scala/scalastream/PlainStream.scala
@@ -0,0 +1,43 @@
+package scalastream
+
+import scalastream.App.{Config, Data}
+
+import scala.collection.immutable.ArraySeq
+import scala.reflect.ClassTag
+
+/** Single-threaded BabelStream kernels written as plain `for` loops over `Array`s.
+  *
+  * @param config supplies the array size, the scalar and the initial values of `a`, `b` and `c`
+  */
+class PlainStream[@specialized(Float, Double) A: Fractional: ClassTag](val config: Config[A])
+    extends ScalaStream[A]:
+
+  private var a: Array[A] = _
+  private var b: Array[A] = _
+  private var c: Array[A] = _
+  private val scalar: A = config.scalar
+
+  /** Allocates `a`, `b` and `c`, each filled with its value from `config.init`. */
+  override inline def initArrays(): Unit =
+    a = Array.fill(config.options.arraysize)(config.init._1)
+    b = Array.fill(config.options.arraysize)(config.init._2)
+    c = Array.fill(config.options.arraysize)(config.init._3)
+
+  private inline def indices = 0 until config.options.arraysize
+
+  override inline def copy(): Unit = for i <- indices do c(i) = a(i)
+  override inline def mul(): Unit = for i <- indices do b(i) = scalar * c(i)
+  override inline def add(): Unit = for i <- indices do c(i) = a(i) + b(i)
+  override inline def triad(): Unit = for i <- indices do a(i) = b(i) + (scalar * c(i))
+  // nstream accumulates into `a`: App.validate replays this update for Benchmark.NStream as
+  // `goldA += goldB + scalar * goldC`.
+  override inline def nstream(): Unit = for i <- indices do a(i) = a(i) + (b(i) + scalar * c(i))
+
+  /** Sequential dot product of `a` and `b`. */
+  override inline def dot(): A =
+    var acc: A = 0.fractional
+    for i <- indices do acc = acc + (a(i) * b(i))
+    acc
+
+  /** Returns the three arrays converted to `ArraySeq`s. */
+  override inline def data(): Data[A] = Data(a.to(ArraySeq), b.to(ArraySeq), c.to(ArraySeq))
diff --git a/scala-stream/src/main/scala/scalastream/ScalaStream.scala b/scala-stream/src/main/scala/scalastream/ScalaStream.scala
new file mode 100644
index 0000000..4ed90e4
--- /dev/null
+++ b/scala-stream/src/main/scala/scalastream/ScalaStream.scala
@@ -0,0 +1,398 @@
+package scalastream
+import scalastream.App.{Config, Data, Timings}
+
+import java.util.concurrent.TimeUnit
+import scala.collection.immutable.ArraySeq
+import scala.collection.mutable.ArrayBuffer
+import scala.concurrent.duration.{Duration, FiniteDuration, SECONDS}
+import scala.math.{Pi, pow}
+import scala.reflect.ClassTag
+import scopt.OParser
+
+/** Kernels every backend implements, plus the timing drivers shared by all of them. */
+transparent trait ScalaStream[@specialized(Float, Double) A]:
+
+  def config: Config[A]
+
+  def initArrays(): Unit
+  def copy(): Unit
+  def mul(): Unit
+  def add(): Unit
+  def triad(): Unit
+  def nstream(): Unit
+  def dot(): A
+
+  /** Evaluates `f` once and pairs the elapsed `System.nanoTime` interval with its result. */
+  transparent inline def timed[R](f: => R): (FiniteDuration, R) =
+    val start = System.nanoTime()
+    val r = f
+    val end = System.nanoTime()
+    FiniteDuration(end - start, TimeUnit.NANOSECONDS) -> r
+
+  /** Runs copy, mul, add, triad and dot, in that order, `times` times and times every call.
+    *
+    * @return the per-iteration timings of every kernel and the value of the last `dot()` call
+    */
+  inline def runAll(times: Int)(using Fractional[A]): (Timings[Vector[FiniteDuration]], A) =
+    val copy = ArrayBuffer.fill[FiniteDuration](times)(Duration.Zero)
+    val mul = ArrayBuffer.fill[FiniteDuration](times)(Duration.Zero)
+    val add = ArrayBuffer.fill[FiniteDuration](times)(Duration.Zero)
+    val triad = ArrayBuffer.fill[FiniteDuration](times)(Duration.Zero)
+    val dot = ArrayBuffer.fill[FiniteDuration](times)(Duration.Zero)
+
+    var lastSum: A = 0.fractional
+
+    for i <- 0 until times do
+      copy(i) = timed(this.copy())._1
+      mul(i) = timed(this.mul())._1
+      add(i) = timed(this.add())._1
+      triad(i) = timed(this.triad())._1
+      val (dot_, sum) = timed(this.dot())
+      dot(i) = dot_
+      lastSum = sum
+    val s = lastSum
+
+    (
+      Timings(
+        copy = copy.toVector,
+        mul = mul.toVector,
+        add = add.toVector,
+        triad = triad.toVector,
+        dot = dot.toVector
+      ),
+      s
+    )
+
+  /** Times one batch of `times` back-to-back triad calls. */
+  def runTriad(times: Int): FiniteDuration = timed(for _ <- 0 until times do triad())._1
+
+  /** Times `times` nstream calls one by one. */
+  def runNStream(times: Int): Vector[FiniteDuration] = Vector.fill(times)(timed(nstream())._1)
+
+  def data(): Data[A]
+
+
+/** Arithmetic the benchmark needs on its element type, with operator syntax as extensions. */
+trait Fractional[@specialized(Double, Float) A]:
+  def toFractional(f: Float): A
+  def toFractional(f: Double): A
+  def compare(x: A, y: A): Int
+  def add(x: A, y: A): A
+  def sub(x: A, y: A): A
+  def mul(x: A, y: A): A
+  def div(x: A, y: A): A
+  def abs(x: A): A
+  extension (x: Float) inline def fractional = toFractional(x)
+  extension (x: Double) inline def fractional = toFractional(x)
+  // Widen Int through Double, which holds every Int exactly; `toFloat` rounds above 2^24 and
+  // would skew the expected dot sum for Double runs (Float results are unchanged).
+  extension (x: Int) inline def fractional = toFractional(x.toDouble)
+  extension (x: Long) inline def fractional = toFractional(x.toDouble)
+  extension (x: A)
+    inline def +(y: A) = add(x, y)
+    inline def -(y: A) = sub(x, y)
+    inline def *(y: A) = mul(x, y)
+    inline def /(y: A) = div(x, y)
+    inline def >(y: A) = compare(x, y) > 0
+    inline def <(y: A) = compare(x, y) < 0
+    inline def abs_ = abs(x)
+end Fractional
+
+given FloatFractional: Fractional[Float] with
+  inline def toFractional(f: Float): Float = f
+  inline def toFractional(f: Double): Float = f.toFloat
+  inline def compare(x: Float, y: Float): Int = x.compare(y)
+  inline def add(x: Float, y: Float): Float = x + y
+  inline def sub(x: Float, y: Float): Float = x - y
+  inline def mul(x: Float, y: Float): Float = x * y
+  inline def div(x: Float, y: Float): Float = x / y
+  inline def abs(x: Float): Float = math.abs(x)
+
+given DoubleFractional: Fractional[Double] with
+  inline def toFractional(f: Float): Double = f.toDouble
+  inline def toFractional(f: Double): Double = f
+  inline def compare(x: Double, y: Double): Int = x.compare(y)
+  inline def add(x: Double, y: Double): Double = x + y
+  inline def sub(x: Double, y: Double): Double = x - y
+  inline def mul(x: Double, y: Double): Double = x * y
+  inline def div(x: Double, y: Double): Double = x / y
+  inline def abs(x: Double): Double = math.abs(x)
+
+object App:
+
+  final val Version: String = "3.4.0"
+
+  /** One run's settings: CLI options, benchmark, element size in bytes, tolerance unit, inputs. */
+  case class Config[@specialized(Double, Float) A](
+      options: Options,
+      benchmark: Benchmark,
+      typeSize: Int,
+      ulp: A,
+      scalar: A,
+      init: (A, A, A)
+  )
+
+  case class Timings[A](copy: A, mul: A, add: A, triad: A, dot: A)
+  case class Data[A](@specialized(Double, Float) a: ArraySeq[A], b: ArraySeq[A], c: ArraySeq[A])
+
+  case class Options(
+      list: Boolean = false,
+      device: Int = 0,
+      numtimes: Int = 100,
+      arraysize: Int = 33554432,
+      float: Boolean = false,
+      triad_only: Boolean = false,
+      nstream_only: Boolean = false,
+      csv: Boolean = false,
+      mibibytes: Boolean = false
+  )
+
+  object Options:
+    val Default = Options()
+    val builder = OParser.builder[Options]
+    val parser1 =
+      import builder._
+      OParser.sequence(
+        programName("scala-stream"),
+        head("ScalaStream", s"$Version"),
+        opt[Unit]('l', "list").text("List available devices").action((_, x) => x.copy(list = true)),
+        opt[Int]('d', "device")
+          .text(s"Select device at INDEX, defaults to ${Default.device}")
+          .action((v, x) => x.copy(device = v)),
+        opt[Int]('n', "numtimes")
+          .text(s"Run the test NUM times (NUM >= 2), defaults to ${Default.numtimes}")
+          .validate {
+            case n if n >= 2 => success
+            case n => failure(s"numtimes must be >= 2, got $n")
+          }
+          .action((n, x) => x.copy(numtimes = n)),
+        opt[Int]('a', "arraysize")
+          .text(s"Use SIZE elements in the array, defaults to ${Default.arraysize}")
+          .action((v, x) => x.copy(arraysize = v)),
+        opt[Unit]('f', "float")
+          .text("Use floats (rather than doubles)")
+          .action((_, x) => x.copy(float = true)),
+        opt[Unit]('t', "triad_only")
+          .text("Only run triad")
+          .action((_, x) => x.copy(triad_only = true)),
+        // long form only: the short name 'n' is already taken by numtimes above
+        opt[Unit]("nstream_only")
+          .text("Only run nstream")
+          .action((_, x) => x.copy(nstream_only = true)),
+        opt[Unit]('c', "csv").text("Output as csv table").action((_, x) => x.copy(csv = true)),
+        opt[Unit]('m', "mibibytes")
+          .text("Use MiB=2^20 for bandwidth calculation (default MB=10^6)")
+          .action((_, x) => x.copy(mibibytes = true)),
+        help('h', "help").text("prints this usage text")
+      )
+
+  enum Benchmark:
+    case All, NStream, Triad
+
+  implicit class RichDuration(private val d: Duration) extends AnyVal:
+    def seconds: Double = d.toUnit(SECONDS)
+
+  /** Replays the selected benchmark on three scalars and reports mismatches on stderr.
+    *
+    * @param vec    final contents of the three arrays
+    * @param config run settings; the array tolerance is `100 * config.ulp` on the average error
+    * @param dotSum last dot-product result; checked against a relative error of 1e-8 when given
+    */
+  def validate[A: Fractional](vec: Data[A], config: Config[A], dotSum: Option[A] = None): Unit =
+
+    var (goldA, goldB, goldC) = config.init
+    for _ <- 0 until config.options.numtimes do
+      config.benchmark match
+        case Benchmark.All =>
+          goldC = goldA
+          goldB = config.scalar * goldC
+          goldC = goldA + goldB
+          goldA = goldB + config.scalar * goldC
+        case Benchmark.Triad =>
+          goldA = goldB + config.scalar * goldC
+        case Benchmark.NStream =>
+          goldA += goldB + config.scalar * goldC
+
+    val tolerance = config.ulp * (100.fractional)
+    def validateXs(name: String, xs: Seq[A], from: A): Unit =
+      val error = xs.map(x => (x - from).abs_).fold(0.fractional)(_ + _) / xs.size.fractional
+      if error > tolerance then
+        Console.err.println(s"Validation failed on $name. Average error $error ")
+
+    validateXs("a", vec.a, goldA)
+    validateXs("b", vec.b, goldB)
+    validateXs("c", vec.c, goldC)
+
+    dotSum.foreach { sum =>
+      val goldSum = (goldA * goldB) * (config.options.arraysize).fractional
+      val error = ((sum - goldSum) / goldSum).abs_
+      if error > 1.fractional / 100000000.fractional then
+        Console.err.println(
+          s"Validation failed on sum. Error $error \nSum was $sum but should be $goldSum"
+        )
+    }
+
+  /** Prints the run banner (unless `--csv`), executes the configured benchmark and reports it.
+    *
+    * @param name     implementation name shown in the banner
+    * @param config   run settings
+    * @param mkStream builds the backend that is initialised and timed
+    */
+  inline def run[A: Fractional: ClassTag](
+      name: String,
+      config: Config[A],
+      mkStream: Config[A] => ScalaStream[A]
+  ): Unit =
+
+    val opt = config.options
+
+    // Byte counts are Long: 3 * arraysize * typeSize * numtimes exceeds Int at the default settings.
+    val arrayBytes = opt.arraysize.toLong * config.typeSize
+    val totalBytes = arrayBytes * 3
+    val (megaScale, megaSuffix, gigaScale, gigaSuffix) =
+      if !opt.mibibytes then (1.0e-6, "MB", 1.0e-9, "GB")
+      else (pow(2.0, -20), "MiB", pow(2.0, -30), "GiB")
+
+    if !opt.csv then
+
+      val vendor = System.getProperty("java.vendor")
+      val ver = System.getProperty("java.version")
+      val home = System.getProperty("java.home")
+      println(
+        s"""BabelStream
+           |Version: $Version
+           |Implementation: $name; Scala (Java $ver; $vendor; home=$home)""".stripMargin
+      )
+
+      val what = config.benchmark match
+        case Benchmark.All => "kernels"
+        case Benchmark.Triad => "triad"
+        case Benchmark.NStream => "nstream"
+      println(s"Running $what ${opt.numtimes} times")
+
+      if config.benchmark == Benchmark.Triad then println(s"Number of elements: ${opt.arraysize}")
+
+      println(s"Precision: ${if opt.float then "float" else "double"}")
+      println(
+        f"Array size: ${megaScale * arrayBytes}%.1f $megaSuffix (=${gigaScale * arrayBytes}%.1f $gigaSuffix)"
+      )
+      println(
+        f"Total size: ${megaScale * totalBytes}%.1f $megaSuffix (=${gigaScale * totalBytes}%.1f $gigaSuffix)"
+      )
+
+    def mkRow(xs: Vector[FiniteDuration], name: String, totalBytes: Long) =
+      val tail = xs.tail
+      (tail.minOption.map(_.seconds), tail.maxOption.map(_.seconds)) match
+        case (Some(min), Some(max)) =>
+          val avg = (tail.foldLeft(Duration.Zero)(_ + _) / tail.size.toDouble).seconds
+          val mbps = megaScale * totalBytes.toDouble / min
+          if opt.csv then
+            Vector(
+              "function" -> name,
+              "num_times" -> opt.numtimes.toString,
+              "n_elements" -> opt.arraysize.toString,
+              "sizeof" -> totalBytes.toString,
+              s"max_m${if opt.mibibytes then "i" else ""}bytes_per_sec" -> mbps.toString,
+              "min_runtime" -> min.toString,
+              "max_runtime" -> max.toString,
+              "avg_runtime" -> avg.toString
+            )
+          else
+            Vector(
+              "Function" -> name,
+              s"M${if opt.mibibytes then "i" else ""}Bytes/sec" -> f"$mbps%.3f",
+              "Min (sec)" -> f"$min%.5f",
+              "Max" -> f"$max%.5f",
+              "Average" -> f"$avg%.5f"
+            )
+        case (_, _) => sys.error(s"No min/max element for $name(size=$totalBytes)")
+
+    def tabulate(rows: Vector[(String, String)]*): Unit = rows.toList match
+      case Nil => sys.error(s"Empty tabulation")
+      case header :: _ =>
+        val padding = if opt.csv then 0 else 12
+        val sep = if opt.csv then "," else ""
+        println(header.map(_._1.padTo(padding, ' ')).mkString(sep))
+        println(rows.map(_.map(_._2.padTo(padding, ' ')).mkString(sep)).mkString("\n"))
+
+    val stream = mkStream(config)
+    stream.initArrays()
+    config.benchmark match
+      case Benchmark.All =>
+        val (results, sum) = stream.runAll(opt.numtimes)
+        validate(stream.data(), config, Some(sum))
+        tabulate(
+          mkRow(results.copy, "Copy", 2 * arrayBytes),
+          mkRow(results.mul, "Mul", 2 * arrayBytes),
+          mkRow(results.add, "Add", 3 * arrayBytes),
+          mkRow(results.triad, "Triad", 3 * arrayBytes),
+          mkRow(results.dot, "Dot", 2 * arrayBytes)
+        )
+      case Benchmark.NStream =>
+        val result = stream.runNStream(opt.numtimes)
+        validate(stream.data(), config)
+        tabulate(mkRow(result, "Nstream", 4 * arrayBytes))
+      case Benchmark.Triad =>
+        val results = stream.runTriad(opt.numtimes)
+        // The figure is labelled with gigaSuffix, so it has to be scaled with gigaScale.
+        val totalBytes = 3 * arrayBytes * opt.numtimes
+        val bandwidth = gigaScale * (totalBytes / results.seconds)
+        println(f"Runtime (seconds): ${results.seconds}%.5f")
+        println(f"Bandwidth ($gigaSuffix/s): $bandwidth%.3f ")
+
+  inline def devices[A: Fractional: ClassTag]: Vector[(String, Config[A] => ScalaStream[A])] =
+    Vector(
+      "Scala Parallel Collections" -> (ParStream(_)),
+      "Java 8 Stream" -> (J8SStream(_)),
+      "Threaded" -> (ThreadStream(_)),
+      "Serial" -> (PlainStream(_))
+    )
+
+  inline def runWith[A: Fractional: ClassTag](i: Int, config: Config[A]): Unit =
+    devices[A].lift(i) match
+      case None => println(s"Device index out of bounds: $i")
+      case Some((name, mkStream)) => run(name, config, mkStream)
+
+  def main(args: Array[String]): Unit =
+
+    def handleOpt(opt: Options) =
+      val benchmark = (opt.nstream_only, opt.triad_only) match
+        case (true, false) => Benchmark.NStream
+        case (false, true) => Benchmark.Triad
+        case (false, false) => Benchmark.All
+        case (true, true) =>
+          throw new RuntimeException(
+            "Both triad and nstream are enabled, pick one or omit both to run all benchmarks"
+          )
+
+      if opt.list then
+        devices[Float].zipWithIndex.foreach { case ((name, _), i) => println(s"$i: $name") }
+      else if opt.float then
+        runWith(
+          opt.device,
+          Config(
+            options = opt,
+            benchmark = benchmark,
+            typeSize = 4, // 32bit
+            // spacing of values at 1.0; ulp(MaxValue) is ~2e31 (Float) and disables validation
+            ulp = math.ulp(1.0f),
+            scalar = 0.4f,
+            init = (0.1f, 0.2f, 0.0f)
+          )
+        )
+      else
+        runWith(
+          opt.device,
+          Config(
+            options = opt,
+            benchmark = benchmark,
+            typeSize = 8, // 64bit
+            ulp = math.ulp(1.0),
+            scalar = 0.4,
+            init = (0.1, 0.2, 0.0)
+          )
+        )
+
+    OParser.parse(Options.parser1, args, Options.Default) match
+      case Some(config) => handleOpt(config)
+      case _ => sys.exit(1)
diff --git
a/scala-stream/src/main/scala/scalastream/ThreadStream.scala b/scala-stream/src/main/scala/scalastream/ThreadStream.scala
new file mode 100644
index 0000000..969a71f
--- /dev/null
+++ b/scala-stream/src/main/scala/scalastream/ThreadStream.scala
@@ -0,0 +1,85 @@
+package scalastream
+
+import net.openhft.affinity.{AffinityStrategies, AffinityThreadFactory}
+import scalastream.App.{Config, Data}
+
+import java.util.concurrent.{Callable, Executors}
+import scala.collection.immutable.ArraySeq
+import scala.reflect.ClassTag
+object ThreadStream {}
+
+/** BabelStream kernels run on a fixed thread pool, one contiguous index range per task.
+  *
+  * @param config supplies the array size, the scalar and the initial values of `a`, `b` and `c`
+  */
+class ThreadStream[@specialized(Float, Double) A: Fractional: ClassTag](val config: Config[A])
+    extends ScalaStream[A]:
+
+  private var a: Array[A] = _
+  private var b: Array[A] = _
+  private var c: Array[A] = _
+  private val scalar: A = config.scalar
+
+  private val chunks: Int = sys.runtime.availableProcessors()
+
+  private val pool = Executors.newFixedThreadPool(
+    chunks,
+    new AffinityThreadFactory("scala-stream", true, AffinityStrategies.DIFFERENT_CORE)
+  )
+
+  // `grouped` rejects a size of 0, which `arraysize / chunks` yields whenever the array has fewer
+  // elements than there are processors; clamp so that small arrays get one element per range.
+  private val indices = (0 until config.options.arraysize)
+    .grouped(math.max(1, config.options.arraysize / chunks))
+    .toSeq
+
+  /** Runs `f(ctx, i)` for every index, one pool task per range in `indices`, and waits for all.
+    *
+    * @param c evaluated once per task to create that task's context
+    * @return the context of every task
+    */
+  private inline def forEachAll[C](c: => C)(f: (C, Int) => Unit): Seq[C] =
+    import scala.jdk.CollectionConverters._
+    val xs = pool
+      .invokeAll(
+        indices.map { r =>
+          { () =>
+            val ctx = c
+            r.foreach(f(ctx, _))
+            ctx
+          }: Callable[C]
+        }.asJavaCollection
+      )
+      .asScala
+      .map(_.get())
+      .toSeq
+    xs
+
+  /** Allocates `a`, `b` and `c` and fills them from `config.init` on the pool. */
+  override inline def initArrays(): Unit =
+    a = Array.ofDim(config.options.arraysize)
+    b = Array.ofDim(config.options.arraysize)
+    c = Array.ofDim(config.options.arraysize)
+    forEachAll(()) { (_, i) =>
+      a(i) = config.init._1
+      b(i) = config.init._2
+      c(i) = config.init._3
+    }
+    ()
+
+  class Box(var value: A)
+  override inline def copy(): Unit = { forEachAll(())((_, i) => c(i) = a(i)); () }
+  override inline def mul(): Unit = { forEachAll(())((_, i) => b(i) = scalar * c(i)); () }
+  override inline def add(): Unit = { forEachAll(())((_, i) => c(i) = a(i) + b(i)); () }
+  override inline def triad(): Unit = { forEachAll(())((_, i) => a(i) = b(i) + scalar * c(i)); () }
+  // nstream accumulates into `a`; App.validate replays it as `goldA += goldB + scalar * goldC`.
+  override inline def nstream(): Unit = { forEachAll(())((_, i) => a(i) = a(i) + (b(i) + scalar * c(i))); () }
+
+  /** Dot product of `a` and `b`: each task sums its range into its own `Box`, then the partial sums are folded. */
+  override inline def dot(): A =
+    forEachAll(Box(0.fractional))((acc, i) => acc.value = acc.value + (a(i) * b(i)))
+      .map(_.value)
+      .fold(0.fractional)(_ + _)
+
+  /** Returns the three arrays converted to `ArraySeq`s. */
+  override inline def data(): Data[A] = Data(a.to(ArraySeq), b.to(ArraySeq), c.to(ArraySeq))
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index e38130f..81fa78b 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -8,7 +8,46 @@ project(BabelStream VERSION 3.5 LANGUAGES CXX) set(CMAKE_CXX_EXTENSIONS OFF) set(CMAKE_CXX_STANDARD_REQUIRED ON) - + +#set(MODEL SYCL) +#set(SYCL_COMPILER COMPUTECPP) +#set(SYCL_COMPILER_DIR /home/tom/Desktop/computecpp_archive/ComputeCpp-CE-2.3.0-x86_64-linux-gnu) +#set(MODEL RAJA) +#set(RAJA_IN_TREE /home/tom/Downloads/RAJA-v0.13.0/) +#set(ENABLE_CUDA ON) +#set(TARGET NVIDIA) +#set(CUDA_TOOLKIT_ROOT_DIR /opt/cuda-11.2) +#set(CUDA_ARCH sm_70) +#set(BLT_DIR /home/tom/Downloads/blt-0.3.6/) + +#set(MODEL STD) +#set(ARCH cc70) +#set(CXX_EXTRA_FLAGS -v) + +#set(MODEL CUDA) +#set(ARCH sm_70) +#set(CMAKE_CUDA_COMPILER /opt/cuda-11.2/bin/nvcc) + +#set(MODEL OCL) +#set(OpenCL_LIBRARY /opt/rocm-4.0.0/opencl/lib/libOpenCL.so) +#set(OpenCL_INCLUDE_DIR /opt/rocm-4.0.0/opencl/lib) +#set(RELEASE_FLAGS -Ofast) +#set(CXX_EXTRA_FLAGS -O2) + +#set(CMAKE_CXX_COMPILER /usr/lib/aomp/bin/clang++) +#set(MODEL OMP) +##set(OFFLOAD "AMD:gfx803") +#set(OFFLOAD "NVIDIA:sm_35") +#set(CXX_EXTRA_FLAGS --cuda-path=/opt/cuda-10.2/) + +#set(OFFLOAD "AMD:_70") +#set(CXX_EXTRA_FLAGS --cuda-path=/opt/cuda-10.2/ --gcc-toolchain=/home/tom/spack/opt/spack/linux-fedora33-zen2/gcc-10.2.1/gcc-8.3.0-latmjo2hl2yv53255xkwko7k3y7bx2vv) +#set(CXX_EXTRA_LINKER_FLAGS ) +#set(MODEL HIP) + +#set(MODEL KOKKOS) +#set(KOKKOS_IN_TREE /home/tom/Downloads/kokkos-3.3.00/) + # the final executable name
set(EXE_NAME babelstream) @@ -73,6 +112,8 @@ register_model(sycl SYCL SYCLStream.cpp) register_model(acc ACC ACCStream.cpp) # defining RAJA collides with the RAJA namespace so USE_RAJA register_model(raja USE_RAJA RAJAStream.cpp) +register_model(tbb TBB TBBStream.cpp) +register_model(thurst THRUST ThrustStream.cu) # Thrust uses cu, even for rocThrust set(USAGE ON CACHE BOOL "Whether to print all custom flags for the selected model") diff --git a/src/ci-prepare-bionic.sh b/src/ci-prepare-bionic.sh index 290e87b..7294905 100755 --- a/src/ci-prepare-bionic.sh +++ b/src/ci-prepare-bionic.sh @@ -134,20 +134,21 @@ setup_aocc() { setup_nvhpc() { echo "Preparing Nvidia HPC SDK" - local tarball="nvhpc.tar.gz" -# local url="http://localhost:8000/nvhpc_2021_212_Linux_x86_64_cuda_11.2.tar.gz" - local url="https://developer.download.nvidia.com/hpc-sdk/21.2/nvhpc_2021_212_Linux_x86_64_cuda_11.2.tar.gz" - +# local url="http://localhost:8000/nvhpc_2021_219_Linux_x86_64_cuda_11.4.tar.gz" + local url="https://developer.download.nvidia.com/hpc-sdk/21.9/nvhpc_2021_219_Linux_x86_64_cuda_11.4.tar.gz" get_and_untar "$tarball" "$url" - local sdk_dir="$PWD/nvhpc_2021_212_Linux_x86_64_cuda_11.2/install_components/Linux_x86_64/21.2" + local sdk_dir="$PWD/nvhpc_2021_219_Linux_x86_64_cuda_11.4/install_components/Linux_x86_64/21.9" local bin_dir="$sdk_dir/compilers/bin" "$bin_dir/makelocalrc" "$bin_dir" -x + export_var NVHPC_SDK_DIR "$sdk_dir" + export_var NVHPC_CUDA_DIR "$sdk_dir/cuda/11.4" + export_var NVHPC_NVCXX "$bin_dir/nvc++" - export_var NVHPC_NVCC "$sdk_dir/cuda/11.2/bin/nvcc" - export_var NVHPC_CUDA_DIR "$sdk_dir/cuda/11.2" + export_var NVHPC_NVCC "$sdk_dir/cuda/11.4/bin/nvcc" + echo "Installed CUDA versions:" ls "$sdk_dir/cuda" verify_bin_exists "$NVHPC_NVCXX" @@ -208,6 +209,20 @@ setup_raja() { check_size } +setup_tbb() { + echo "Preparing TBB" + local tbb_ver="2021.2.0" + local tarball="oneapi-tbb-$tbb_ver-lin.tgz" + + local 
url="https://github.com/oneapi-src/oneTBB/releases/download/v$tbb_ver/oneapi-tbb-$tbb_ver-lin.tgz" + # local url="http://localhost:8000/oneapi-tbb-$tbb_ver-lin.tgz" + + get_and_untar "$tarball" "$url" + export_var TBB_LIB "$PWD/oneapi-tbb-$tbb_ver" + verify_dir_exists "$TBB_LIB" + check_size +} + setup_clang_gcc() { echo "deb http://archive.ubuntu.com/ubuntu focal main universe" | sudo tee -a /etc/apt/sources.list @@ -237,10 +252,11 @@ setup_clang_gcc() { setup_rocm() { wget -q -O - "https://repo.radeon.com/rocm/rocm.gpg.key" | sudo apt-key add - - echo 'deb [arch=amd64] https://repo.radeon.com/rocm/apt/debian/ xenial main' | sudo tee /etc/apt/sources.list.d/rocm.list + echo 'deb [arch=amd64] https://repo.radeon.com/rocm/apt/4.5 ubuntu main' | sudo tee /etc/apt/sources.list.d/rocm.list sudo apt-get update -qq - sudo apt-get install -y -qq rocm-dev + sudo apt-get install -y -qq rocm-dev rocthrust-dev export_var ROCM_PATH "/opt/rocm" + export_var PATH "$ROCM_PATH/bin:$PATH" # ROCm needs this for many of their libraries' CMake build to work export_var HIP_CXX "$ROCM_PATH/bin/hipcc" verify_bin_exists "$HIP_CXX" "$HIP_CXX" --version @@ -354,6 +370,7 @@ if [ "$PARALLEL" = true ]; then setup_dpcpp & setup_kokkos & setup_raja & + setup_tbb & wait else setup_cmake @@ -364,6 +381,7 @@ else setup_dpcpp setup_kokkos setup_raja + setup_tbb # these need apt setup_clang_gcc setup_rocm diff --git a/src/ci-test-compile.sh b/src/ci-test-compile.sh index 85efd39..70303b3 100755 --- a/src/ci-test-compile.sh +++ b/src/ci-test-compile.sh @@ -57,7 +57,7 @@ run_build() { local cmake_code=$? "$CMAKE_BIN" --build "$build" -j "$(nproc)" &>>"$log" - "$CMAKE_BIN" --build "$build" --target install -j "$(nproc)" &>>"$log" + "$CMAKE_BIN" --build "$build" --target install -j "$(nproc)" &>>"$log" local cmake_code=$? 
set -e @@ -92,11 +92,11 @@ run_build() { # GCC_CXX="/usr/bin/g++" # CLANG_CXX="/usr/bin/clang++" -# NVSDK="/home/tom/Downloads/nvhpc_2021_212_Linux_x86_64_cuda_11.2/install_components/Linux_x86_64/21.2/" -# NVHPC_NVCXX="$NVSDK/compilers/bin/nvc++" -# NVHPC_NVCC="$NVSDK/cuda/11.2/bin/nvcc" -# NVHPC_CUDA_DIR="$NVSDK/cuda/11.2" -# "$NVSDK/compilers/bin/makelocalrc" "$NVSDK/compilers/bin/" -x +# NVHPC_SDK_DIR="/home/tom/Downloads/nvhpc_2021_212_Linux_x86_64_cuda_11.2/install_components/Linux_x86_64/21.2/" +# NVHPC_NVCXX="$NVHPC_SDK_DIR/compilers/bin/nvc++" +# NVHPC_NVCC="$NVHPC_SDK_DIR/cuda/11.2/bin/nvcc" +# NVHPC_CUDA_DIR="$NVHPC_SDK_DIR/cuda/11.2" +# "$NVHPC_SDK_DIR/compilers/bin/makelocalrc" "$NVHPC_SDK_DIR/compilers/bin/" -x # AOCC_CXX="/opt/AMD/aocc-compiler-2.3.0/bin/clang++" # AOMP_CXX="/usr/lib/aomp/bin/clang++" @@ -110,7 +110,7 @@ run_build() { # HIPSYCL_DIR="/opt/hipsycl/cff515c/" # ICPX_CXX="/opt/intel/oneapi/compiler/2021.1.2/linux/bin/icpx" -# ICPC_CXX="/opt/intel/oneapi/compiler/2021.1.2/linux/bin/intel64/icpc" +# ICPC_CXX="/opt/intel/oneapi/compiler/2021.1.2/linux/bin/intel64/icpc"# TBB_LIB="/home/tom/Downloads/oneapi-tbb-2021.1.1/" # GCC_STD_PAR_LIB="tbb" # CLANG_STD_PAR_LIB="tbb" @@ -122,7 +122,7 @@ run_build() { AMD_ARCH="gfx_903" NV_ARCH="sm_70" -NV_ARCH_CCXY="cuda11.2,cc80" +NV_ARCH_CCXY="cuda11.4,cc80" build_gcc() { local name="gcc_build" @@ -139,6 +139,9 @@ build_gcc() { run_build $name "${GCC_CXX:?}" std "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}" run_build $name "${GCC_CXX:?}" std20 "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}" + run_build $name "${GCC_CXX:?}" tbb "$cxx -DONE_TBB_DIR=$TBB_LIB" + run_build $name "${GCC_CXX:?}" tbb "$cxx" # build TBB again with the system TBB + if [ "${GCC_OMP_OFFLOAD_AMD:-false}" != "false" ]; then run_build "amd_$name" "${GCC_CXX:?}" acc "$cxx -DCXX_EXTRA_FLAGS=-foffload=amdgcn-amdhsa" run_build "amd_$name" "${GCC_CXX:?}" omp "$cxx -DOFFLOAD=AMD:$AMD_ARCH" @@ -166,6 +169,28 @@ build_gcc() { # 
-DCUDA_TOOLKIT_ROOT_DIR=${NVHPC_CUDA_DIR:?} \ # -DCUDA_ARCH=$NV_ARCH" + + # CMake >= 3.15 only due to Nvidia's Thrust CMake requirements + local current=$("$CMAKE_BIN" --version | head -n 1 | cut -d ' ' -f3) + local required="3.15.0" + if [ "$(printf '%s\n' "$required" "$current" | sort -V | head -n1)" = "$required" ]; then + run_build $name "${GCC_CXX:?}" thrust "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/include -DTHRUST_IMPL=CUDA -DBACKEND=CUDA" + run_build $name "${GCC_CXX:?}" thrust "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/include -DTHRUST_IMPL=CUDA -DBACKEND=OMP" + run_build $name "${GCC_CXX:?}" thrust "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/include -DTHRUST_IMPL=CUDA -DBACKEND=CPP" + + # FIXME CUDA Thrust + TBB throws the following error: + # /usr/lib/gcc/x86_64-linux-gnu/9/include/avx512fintrin.h(9146): error: identifier "__builtin_ia32_rndscaless_round" is undefined + # /usr/lib/gcc/x86_64-linux-gnu/9/include/avx512fintrin.h(9155): error: identifier "__builtin_ia32_rndscalesd_round" is undefined + # /usr/lib/gcc/x86_64-linux-gnu/9/include/avx512fintrin.h(14797): error: identifier "__builtin_ia32_rndscaless_round" is undefined + # /usr/lib/gcc/x86_64-linux-gnu/9/include/avx512fintrin.h(14806): error: identifier "__builtin_ia32_rndscalesd_round" is undefined + # /usr/lib/gcc/x86_64-linux-gnu/9/include/avx512dqintrin.h(1365): error: identifier "__builtin_ia32_fpclassss" is undefined + # /usr/lib/gcc/x86_64-linux-gnu/9/include/avx512dqintrin.h(1372): error: identifier "__builtin_ia32_fpclasssd" is undefined + + # run_build $name "${GCC_CXX:?}" THRUST "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/include -DTHRUST_IMPL=CUDA -DBACKEND=TBB" + else + echo "CMake version ${current} < ${required}, skipping Thrust models" + fi + } build_clang() { @@ -188,6 +213,18 @@ 
build_clang() { run_build $name "${CLANG_CXX:?}" ocl "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}" run_build $name "${CLANG_CXX:?}" std "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" # run_build $name "${LANG_CXX:?}" std20 "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" # not yet supported + run_build $name "${CLANG_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?}" + run_build $name "${CLANG_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH" + run_build $name "${CLANG_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DMEM=MANAGED" + run_build $name "${CLANG_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DMEM=PAGEFAULT" + run_build $name "${CLANG_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON" + run_build $name "${CLANG_CXX:?}" ocl "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}" + run_build $name "${CLANG_CXX:?}" std "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" + # run_build $name "${LANG_CXX:?}" std20 "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" # not yet supported + + run_build $name "${CLANG_CXX:?}" tbb "$cxx -DONE_TBB_DIR=$TBB_LIB" + run_build $name "${CLANG_CXX:?}" tbb "$cxx" # build TBB again with the system TBB + run_build $name "${CLANG_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?}" # no clang /w RAJA+cuda because it needs nvcc which needs gcc } @@ -210,7 +247,11 @@ build_aomp() { } build_hip() { - run_build hip_build "${HIP_CXX:?}" hip "-DCMAKE_CXX_COMPILER=${HIP_CXX:?}" + local name="hip_build" + + run_build $name "${HIP_CXX:?}" hip "-DCMAKE_CXX_COMPILER=${HIP_CXX:?}" + + run_build $name "${GCC_CXX:?}" thrust "-DCMAKE_CXX_COMPILER=${HIP_CXX:?} -DSDK_DIR=$ROCM_PATH -DTHRUST_IMPL=ROCM" } build_icpx() { diff --git a/src/main.cpp b/src/main.cpp index e78d7a1..2791bdc 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -25,6 +25,10 @@ #include "STDStream.h" #elif defined(STD20) #include "STD20Stream.hpp" +#elif defined(TBB) +#include 
"TBBStream.hpp" +#elif defined(THRUST) +#include "ThrustStream.h" #elif defined(HIP) #include "HIPStream.h" #elif defined(HC) @@ -266,6 +270,14 @@ void run() // Use the C++20 implementation stream = new STD20Stream(ARRAY_SIZE, deviceIndex); +#elif defined(TBB) + // Use the C++20 implementation + stream = new TBBStream(ARRAY_SIZE, deviceIndex); + +#elif defined(THRUST) + // Use the Thrust implementation + stream = new ThrustStream(ARRAY_SIZE, deviceIndex); + #elif defined(ACC) // Use the OpenACC implementation stream = new ACCStream(ARRAY_SIZE, deviceIndex); diff --git a/src/omp/Makefile b/src/omp/Makefile index 2d8545c..15bab8a 100644 --- a/src/omp/Makefile +++ b/src/omp/Makefile @@ -3,7 +3,8 @@ ifndef COMPILER define compiler_help Set COMPILER to change flags (defaulting to GNU). Available compilers are: - CLANG CRAY GNU GNU_PPC INTEL XL PGI NEC ARMCLANG AOMP + CLANG CRAY GNU GNU_PPC INTEL XL PGI + NEC ARMCLANG AOMP FUJITSU Note: GCC on PPC requires -mcpu=native instead of -march=native so we have a special case for it @@ -49,6 +50,7 @@ COMPILER_XL = xlc++ COMPILER_PGI = pgc++ COMPILER_NEC = /opt/nec/ve/bin/nc++ COMPILER_AOMP = clang++ +COMPILER_FUJITSU=FCC CXX = $(COMPILER_$(COMPILER)) FLAGS_GNU = -O3 -std=c++11 -march=native @@ -61,6 +63,7 @@ FLAGS_PGI = -O3 -std=c++11 FLAGS_NEC = -O4 -finline -std=c++11 FLAGS_ARMCLANG = -O3 -std=c++11 FLAGS_AOMP = -O3 -std=c++11 +FLAGS_FUJITSU=-Kfast -std=c++11 -KA64FX -KSVE -KARMV8_3_A -Kzfill=100 -Kprefetch_sequential=soft -Kprefetch_line=8 -Kprefetch_line_L2=16 CXXFLAGS = $(FLAGS_$(COMPILER)) # OpenMP flags for CPUs @@ -73,6 +76,7 @@ OMP_CLANG_CPU = -fopenmp=libomp OMP_XL_CPU = -qsmp=omp -qthreaded OMP_PGI_CPU = -mp OMP_NEC_CPU = -fopenmp +OMP_FUJITSU_CPU=-Kopenmp # OpenMP flags for NVIDIA OMP_CRAY_NVIDIA = -DOMP_TARGET_GPU diff --git a/src/omp/model.cmake b/src/omp/model.cmake index c8dde9f..1955ebc 100644 --- a/src/omp/model.cmake +++ b/src/omp/model.cmake @@ -30,6 +30,9 @@ # ARMClang = ARM Compiler based on Clang 
(arm.com) # These are only added in CMake 3.20: # NVHPC = NVIDIA HPC SDK Compiler (nvidia.com) +# These are only added in CMake 3.21 +# Fujitsu = Fujitsu HPC compiler (Trad mode) +# FujitsuClang = Fujitsu HPC compiler (Clang mode) # CMAKE_SYSTEM_PROCESSOR is set via `uname -p`, we have: @@ -39,8 +42,9 @@ # -#predefined offload flags based on compiler id - +# predefined offload flags based on compiler id and vendor, +# the format is (COMPILER and VENDOR must be UPPERCASE): +# Compiler: OMP_FLAGS_OFFLOAD__ set(OMP_FLAGS_OFFLOAD_INTEL -qnextgen -fiopenmp -fopenmp-targets=spir64) @@ -56,15 +60,25 @@ set(OMP_FLAGS_OFFLOAD_CLANG_ARCH_FLAG -march=) # prefix only, arch appended by the vendor:arch tuple +# for standard (non-offload) omp, the format is (COMPILER and ARCH must be UPPERCASE): +# Compiler: OMP_FLAGS_CPU__ +# Linker: OMP_LINK_FLAGS_CPU__ + +set(OMP_FLAGS_CPU_FUJITSU + -Kfast -std=c++11 -KA64FX -KSVE -KARMV8_3_A -Kzfill=100 -Kprefetch_sequential=soft -Kprefetch_line=8 -Kprefetch_line_L2=16) +set(OMP_LINK_FLAGS_CPU_FUJITSU + -Kopenmp) + set(OMP_FLAGS_CPU_INTEL -qopt-streaming-stores=always) + set(OMP_FLAGS_CPU_GNU_PPC64LE -mcpu=native) + set(OMP_FLAGS_CPU_XL -O5 -qarch=auto -qtune=auto) -# NEC -set(OMP_FLAGS_CPU_NEC -O4 -finline) +set(OMP_FLAGS_CPU_NEC -O4 -finline) # CMake doesn't detect this so it's meant to be chosen by register_flag_optional(ARCH) register_flag_optional(CMAKE_CXX_COMPILER "Any CXX compiler that supports OpenMP as per CMake detection (and offloading if enabled with `OFFLOAD`)" @@ -122,6 +136,12 @@ macro(setup) ${ARCH} ) + register_append_compiler_and_arch_specific_link_flags( + OMP_LINK_FLAGS_CPU + ${COMPILER} + ${ARCH} + ) + elseif ("${OFFLOAD}" STREQUAL ON) # offload but with custom flags register_definitions(OMP_TARGET_GPU) diff --git a/src/register_models.cmake b/src/register_models.cmake index 247612a..12e7a3d 100644 --- a/src/register_models.cmake +++ b/src/register_models.cmake @@ -20,7 +20,9 @@ # macro(wipe_gcc_style_optimisation_flags 
VAR) - string(REGEX REPLACE "([\\/\\-]O.)" "" ${VAR} ${${VAR}}) + if(${VAR}) + string(REGEX REPLACE "([\\/\\-]O.)" "" ${VAR} ${${VAR}}) + endif() endmacro() macro(register_link_library) @@ -41,7 +43,7 @@ macro(register_append_link_flags) list(APPEND LINK_FLAGS ${ARGN}) endmacro() -macro(register_append_compiler_and_arch_specific_cxx_flags PREFIX CXX ARCH) +function(bind_cxx_and_arch OUT PREFIX CXX ARCH) string(TOUPPER ${CXX} _CXX) string(TOUPPER ${ARCH} _ARCH) set(_CXX_ARCH_SPECIFIC_FLAGS "${${PREFIX}_${_CXX}_${_ARCH}}") @@ -52,6 +54,17 @@ macro(register_append_compiler_and_arch_specific_cxx_flags PREFIX CXX ARCH) if (_CXX_ARCH_SPECIFIC_FLAGS) register_append_cxx_flags(ANY ${_CXX_ARCH_SPECIFIC_FLAGS}) endif () + set(${OUT} "${_CXX_ARCH_SPECIFIC_FLAGS}" PARENT_SCOPE) +endfunction() + +macro(register_append_compiler_and_arch_specific_cxx_flags PREFIX CXX ARCH) + bind_cxx_and_arch(OUT ${PREFIX} ${CXX} ${ARCH}) + register_append_cxx_flags(ANY ${OUT}) +endmacro() + +macro(register_append_compiler_and_arch_specific_link_flags PREFIX CXX ARCH) + bind_cxx_and_arch(OUT ${PREFIX} ${CXX} ${ARCH}) + register_append_link_flags(${OUT}) endmacro() macro(register_definitions)