Merge branch 'main' into top-level-src

# Conflicts:
#	.github/workflows/main.yaml
#	.gitignore
#	README.md
#	src/CMakeLists.txt
#	src/ci-test-compile.sh
Tom Lin 2021-11-30 14:39:48 +00:00
commit 9326e98172
94 changed files with 8819 additions and 31 deletions

.github/workflows/main.yaml

@@ -3,7 +3,50 @@ on: [push, pull_request]
jobs:
  "test_compile_cpp":
  test-java:
    runs-on: ubuntu-18.04
    defaults:
      run:
        working-directory: ./java-stream
    steps:
      - uses: actions/checkout@v2
      - name: Test build project
        run: ./mvnw clean package
      - name: Test run
        if: ${{ ! cancelled() }}
        run: java -jar target/java-stream.jar --arraysize 2048
  test-julia:
    runs-on: ubuntu-18.04
    defaults:
      run:
        working-directory: ./JuliaStream.jl
    steps:
      - uses: actions/checkout@v2
      - name: Setup project
        run: julia --project -e 'import Pkg; Pkg.instantiate()'
      - name: Test run PlainStream.jl
        if: ${{ ! cancelled() }}
        run: julia --project src/PlainStream.jl --arraysize 2048
      - name: Test run ThreadedStream.jl
        if: ${{ ! cancelled() }}
        run: julia --threads 2 --project src/ThreadedStream.jl --arraysize 2048
      - name: Test run DistributedStream.jl (no flag)
        if: ${{ ! cancelled() }}
        run: julia --project src/DistributedStream.jl --arraysize 2048
      - name: Test run DistributedStream.jl (-p 2)
        if: ${{ ! cancelled() }}
        run: julia -p 2 --project src/DistributedStream.jl --arraysize 2048
      - name: Test run CUDAStream.jl
        if: ${{ ! cancelled() }}
        run: julia --project src/CUDAStream.jl --list
      - name: Test run AMDGPUStream.jl
        if: ${{ ! cancelled() }}
        run: julia --project src/AMDGPUStream.jl --list
  test-cpp:
    runs-on: ubuntu-18.04
    defaults:
      run:

.gitignore (vendored, 23 changed lines)

@@ -1,4 +1,27 @@
cuda-stream
ocl-stream
omp-stream
acc-stream
raja-stream
kokkos-stream
std-stream
sycl-stream
hip-stream
tbb-stream
*.o
*.bc
*.sycl
*.tar
*.gz
*.a
KokkosCore_config.*
.DS_Store
Makefile
build/
cmake-build-*/

@@ -14,6 +14,8 @@ All notable changes to this project will be documented in this file.
- Added nstream kernel from PRK with associated command line option.
- CMake build system added for all models.
- SYCL device check for FP64 support.
- New implementation using TBB.
- Compiler options for Fujitsu added to OpenMP.
### Changed
- Default branch renamed from `master` to `main`.

CITATION.cff (new file, 14 lines)

@@ -0,0 +1,14 @@
cff-version: 1.1.0
message: If you use this software, please cite it as below.
authors:
  - family-names: Deakin
    given-names: Tom
    affiliation: University of Bristol
    website: https://hpc.tomdeakin.com
  - family-names: McIntosh-Smith
    given-names: Simon
    affiliation: University of Bristol
    website: https://uob-hpc.github.io
title: BabelStream
version: 3.4
date-released: 2019-04-10

@@ -0,0 +1,2 @@
indent = 2
margin = 100

JuliaStream.jl/.gitignore (vendored, new file, 5 lines)

@@ -0,0 +1,5 @@
*.jl.cov
*.jl.*.cov
*.jl.mem
/docs/build/
/docs/Manifest.toml

@@ -0,0 +1,415 @@
# This file is machine-generated - editing it directly is not advised
[[AMDGPU]]
deps = ["AbstractFFTs", "Adapt", "BinaryProvider", "CEnum", "GPUArrays", "GPUCompiler", "HIP_jll", "LLVM", "Libdl", "LinearAlgebra", "MacroTools", "Pkg", "Printf", "ROCmDeviceLibs_jll", "Random", "Requires", "Setfield", "hsa_rocr_jll"]
git-tree-sha1 = "d64c97447a753cfbf0158d6c7be513f34526d559"
uuid = "21141c5a-9bdb-4563-92ae-f87d6854732e"
version = "0.2.12"
[[AbstractFFTs]]
deps = ["LinearAlgebra"]
git-tree-sha1 = "485ee0867925449198280d4af84bdb46a2a404d0"
uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c"
version = "1.0.1"
[[Adapt]]
deps = ["LinearAlgebra"]
git-tree-sha1 = "84918055d15b3114ede17ac6a7182f68870c16f7"
uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
version = "3.3.1"
[[ArgParse]]
deps = ["Logging", "TextWrap"]
git-tree-sha1 = "3102bce13da501c9104df33549f511cd25264d7d"
uuid = "c7e460c6-2fb9-53a9-8c5b-16f535851c63"
version = "1.1.4"
[[ArgTools]]
uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f"
[[Artifacts]]
uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33"
[[Base64]]
uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f"
[[BinaryProvider]]
deps = ["Libdl", "Logging", "SHA"]
git-tree-sha1 = "ecdec412a9abc8db54c0efc5548c64dfce072058"
uuid = "b99e7846-7c00-51b0-8f62-c81ae34c0232"
version = "0.5.10"
[[Bzip2_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "19a35467a82e236ff51bc17a3a44b69ef35185a2"
uuid = "6e34b625-4abd-537c-b88f-471c36dfa7a0"
version = "1.0.8+0"
[[CEnum]]
git-tree-sha1 = "215a9aa4a1f23fbd05b92769fdd62559488d70e9"
uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82"
version = "0.4.1"
[[ConstructionBase]]
deps = ["LinearAlgebra"]
git-tree-sha1 = "f74e9d5388b8620b4cee35d4c5a618dd4dc547f4"
uuid = "187b0558-2788-49d3-abe0-74a17ed4e7c9"
version = "1.3.0"
[[Dates]]
deps = ["Printf"]
uuid = "ade2ca70-3891-5945-98fb-dc099432e06a"
[[Downloads]]
deps = ["ArgTools", "LibCURL", "NetworkOptions"]
uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6"
[[Elfutils_jll]]
deps = ["Artifacts", "Bzip2_jll", "JLLWrappers", "Libdl", "Pkg", "XZ_jll", "Zlib_jll", "argp_standalone_jll", "fts_jll", "obstack_jll"]
git-tree-sha1 = "8f9fcde6d89b0a3ca51cb2028beab462705c5436"
uuid = "ab5a07f8-06af-567f-a878-e8bb879eba5a"
version = "0.182.0+0"
[[ExprTools]]
git-tree-sha1 = "b7e3d17636b348f005f11040025ae8c6f645fe92"
uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04"
version = "0.1.6"
[[Future]]
deps = ["Random"]
uuid = "9fa8497b-333b-5362-9e8d-4d0656e87820"
[[GPUArrays]]
deps = ["AbstractFFTs", "Adapt", "LinearAlgebra", "Printf", "Random", "Serialization", "Statistics"]
git-tree-sha1 = "ececbf05f8904c92814bdbd0aafd5540b0bf2e9a"
uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
version = "7.0.1"
[[GPUCompiler]]
deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "TimerOutputs", "UUIDs"]
git-tree-sha1 = "4ed2616d5e656c8716736b64da86755467f26cf5"
uuid = "61eb1bfa-7361-4325-ad38-22787b887f55"
version = "0.12.9"
[[HIP_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "ROCmCompilerSupport_jll", "ROCmDeviceLibs_jll", "ROCmOpenCLRuntime_jll", "hsa_rocr_jll"]
git-tree-sha1 = "5097d8f7b6842156ab0928371b3d03fefd8decab"
uuid = "2696aab5-0948-5276-aa9a-2a86a37016b8"
version = "4.0.0+1"
[[InteractiveUtils]]
deps = ["Markdown"]
uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
[[JLLWrappers]]
deps = ["Preferences"]
git-tree-sha1 = "642a199af8b68253517b80bd3bfd17eb4e84df6e"
uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210"
version = "1.3.0"
[[LLVM]]
deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Printf", "Unicode"]
git-tree-sha1 = "23a47d417a3cd9c2e73c854bac7dd4731c105ef7"
uuid = "929cbde3-209d-540e-8aea-75f648917ca0"
version = "4.4.0"
[[LLVMExtra_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "9c360e5ce980b88bb31a7b086dbb19469008154b"
uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab"
version = "0.0.10+0"
[[LibCURL]]
deps = ["LibCURL_jll", "MozillaCACerts_jll"]
uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21"
[[LibCURL_jll]]
deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"]
uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0"
[[LibGit2]]
deps = ["Base64", "NetworkOptions", "Printf", "SHA"]
uuid = "76f85450-5226-5b5a-8eaa-529ad045b433"
[[LibSSH2_jll]]
deps = ["Artifacts", "Libdl", "MbedTLS_jll"]
uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8"
[[Libdl]]
uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
[[Libgcrypt_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Libgpg_error_jll", "Pkg"]
git-tree-sha1 = "64613c82a59c120435c067c2b809fc61cf5166ae"
uuid = "d4300ac3-e22c-5743-9152-c294e39db1e4"
version = "1.8.7+0"
[[Libglvnd_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libX11_jll", "Xorg_libXext_jll"]
git-tree-sha1 = "7739f837d6447403596a75d19ed01fd08d6f56bf"
uuid = "7e76a0d4-f3c7-5321-8279-8d96eeed0f29"
version = "1.3.0+3"
[[Libgpg_error_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "c333716e46366857753e273ce6a69ee0945a6db9"
uuid = "7add5ba3-2f88-524e-9cd5-f83b8a55f7b8"
version = "1.42.0+0"
[[Libiconv_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "42b62845d70a619f063a7da093d995ec8e15e778"
uuid = "94ce4f54-9a6c-5748-9c1c-f9c7231a4531"
version = "1.16.1+1"
[[LinearAlgebra]]
deps = ["Libdl"]
uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
[[Logging]]
uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"
[[MacroTools]]
deps = ["Markdown", "Random"]
git-tree-sha1 = "0fb723cd8c45858c22169b2e42269e53271a6df7"
uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
version = "0.5.7"
[[Markdown]]
deps = ["Base64"]
uuid = "d6f4376e-aef5-505a-96c1-9c027394607a"
[[MbedTLS_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1"
[[MozillaCACerts_jll]]
uuid = "14a3606d-f60d-562e-9121-12d972cd8159"
[[NUMA_jll]]
deps = ["Libdl", "Pkg"]
git-tree-sha1 = "778f9bd14400cff2c32ed357e12766ac0e3d766e"
uuid = "7f51dc2b-bb24-59f8-b771-bb1490e4195d"
version = "2.0.13+1"
[[NetworkOptions]]
uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908"
[[OrderedCollections]]
git-tree-sha1 = "85f8e6578bf1f9ee0d11e7bb1b1456435479d47c"
uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
version = "1.4.1"
[[Parameters]]
deps = ["OrderedCollections", "UnPack"]
git-tree-sha1 = "2276ac65f1e236e0a6ea70baff3f62ad4c625345"
uuid = "d96e819e-fc66-5662-9728-84c9c7592b0a"
version = "0.12.2"
[[Pkg]]
deps = ["Artifacts", "Dates", "Downloads", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"]
uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
[[Preferences]]
deps = ["TOML"]
git-tree-sha1 = "00cfd92944ca9c760982747e9a1d0d5d86ab1e5a"
uuid = "21216c6a-2e73-6563-6e65-726566657250"
version = "1.2.2"
[[Printf]]
deps = ["Unicode"]
uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7"
[[REPL]]
deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"]
uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb"
[[ROCmCompilerSupport_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "ROCmDeviceLibs_jll", "hsa_rocr_jll"]
git-tree-sha1 = "56ddcfb5d8b60c9f8c1bc619886f8d363fd1926d"
uuid = "8fbdd1d2-db62-5cd0-981e-905da1486e17"
version = "4.0.0+1"
[[ROCmDeviceLibs_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Zlib_jll"]
git-tree-sha1 = "d764f0f28b5af89aa004871a6a38e5d061f77257"
uuid = "873c0968-716b-5aa7-bb8d-d1e2e2aeff2d"
version = "4.0.0+0"
[[ROCmOpenCLRuntime_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Libglvnd_jll", "Pkg", "ROCmCompilerSupport_jll", "ROCmDeviceLibs_jll", "Xorg_libX11_jll", "Xorg_xorgproto_jll", "hsa_rocr_jll"]
git-tree-sha1 = "f9e3e2cb40a7990535efa7da9b9dd0e0b458a973"
uuid = "10ae2a08-2eea-53f8-8c20-eec175020e9f"
version = "4.0.0+1"
[[Random]]
deps = ["Serialization"]
uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
[[Requires]]
deps = ["UUIDs"]
git-tree-sha1 = "4036a3bd08ac7e968e27c203d45f5fff15020621"
uuid = "ae029012-a4dd-5104-9daa-d747884805df"
version = "1.1.3"
[[SHA]]
uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce"
[[Serialization]]
uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
[[Setfield]]
deps = ["ConstructionBase", "Future", "MacroTools", "Requires"]
git-tree-sha1 = "fca29e68c5062722b5b4435594c3d1ba557072a3"
uuid = "efcf1570-3423-57d1-acb7-fd33fddbac46"
version = "0.7.1"
[[Sockets]]
uuid = "6462fe0b-24de-5631-8697-dd941f90decc"
[[SparseArrays]]
deps = ["LinearAlgebra", "Random"]
uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
[[Statistics]]
deps = ["LinearAlgebra", "SparseArrays"]
uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
[[TOML]]
deps = ["Dates"]
uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76"
[[Tar]]
deps = ["ArgTools", "SHA"]
uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e"
[[TextWrap]]
git-tree-sha1 = "9250ef9b01b66667380cf3275b3f7488d0e25faf"
uuid = "b718987f-49a8-5099-9789-dcd902bef87d"
version = "1.0.1"
[[TimerOutputs]]
deps = ["ExprTools", "Printf"]
git-tree-sha1 = "209a8326c4f955e2442c07b56029e88bb48299c7"
uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
version = "0.5.12"
[[UUIDs]]
deps = ["Random", "SHA"]
uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
[[UnPack]]
git-tree-sha1 = "387c1f73762231e86e0c9c5443ce3b4a0a9a0c2b"
uuid = "3a884ed6-31ef-47d7-9d2a-63182c4928ed"
version = "1.0.2"
[[Unicode]]
uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
[[XML2_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Libiconv_jll", "Pkg", "Zlib_jll"]
git-tree-sha1 = "1acf5bdf07aa0907e0a37d3718bb88d4b687b74a"
uuid = "02c8fc9c-b97f-50b9-bbe4-9be30ff0a78a"
version = "2.9.12+0"
[[XSLT_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Libgcrypt_jll", "Libgpg_error_jll", "Libiconv_jll", "Pkg", "XML2_jll", "Zlib_jll"]
git-tree-sha1 = "91844873c4085240b95e795f692c4cec4d805f8a"
uuid = "aed1982a-8fda-507f-9586-7b0439959a61"
version = "1.1.34+0"
[[XZ_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "a921669cd9a45c23031fd4eb904f5cc3d20de415"
uuid = "ffd25f8a-64ca-5728-b0f7-c24cf3aae800"
version = "5.2.5+2"
[[Xorg_libX11_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libxcb_jll", "Xorg_xtrans_jll"]
git-tree-sha1 = "5be649d550f3f4b95308bf0183b82e2582876527"
uuid = "4f6342f7-b3d2-589e-9d20-edeb45f2b2bc"
version = "1.6.9+4"
[[Xorg_libXau_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "4e490d5c960c314f33885790ed410ff3a94ce67e"
uuid = "0c0b7dd1-d40b-584c-a123-a41640f87eec"
version = "1.0.9+4"
[[Xorg_libXdmcp_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "4fe47bd2247248125c428978740e18a681372dd4"
uuid = "a3789734-cfe1-5b06-b2d0-1dd0d9d62d05"
version = "1.1.3+4"
[[Xorg_libXext_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libX11_jll"]
git-tree-sha1 = "b7c0aa8c376b31e4852b360222848637f481f8c3"
uuid = "1082639a-0dae-5f34-9b06-72781eeb8cb3"
version = "1.3.4+4"
[[Xorg_libpthread_stubs_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "6783737e45d3c59a4a4c4091f5f88cdcf0908cbb"
uuid = "14d82f49-176c-5ed1-bb49-ad3f5cbd8c74"
version = "0.1.0+3"
[[Xorg_libxcb_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "XSLT_jll", "Xorg_libXau_jll", "Xorg_libXdmcp_jll", "Xorg_libpthread_stubs_jll"]
git-tree-sha1 = "daf17f441228e7a3833846cd048892861cff16d6"
uuid = "c7cfdc94-dc32-55de-ac96-5a1b8d977c5b"
version = "1.13.0+3"
[[Xorg_xorgproto_jll]]
deps = ["Libdl", "Pkg"]
git-tree-sha1 = "9a9eb8ce756fe0bca01b4be16da770e18d264972"
uuid = "c4d99508-4286-5418-9131-c86396af500b"
version = "2019.2.0+2"
[[Xorg_xtrans_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "79c31e7844f6ecf779705fbc12146eb190b7d845"
uuid = "c5fb5394-a638-5e4d-96e5-b29de1b5cf10"
version = "1.4.0+3"
[[Zlib_jll]]
deps = ["Libdl"]
uuid = "83775a58-1f1d-513f-b197-d71354ab007a"
[[argp_standalone_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "feaf9f6293003c2bf53056fd6930d677ed340b34"
uuid = "c53206cc-00f7-50bf-ad1e-3ae1f6e49bc3"
version = "1.3.1+0"
[[fts_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "78732b942383d2cb521df8a1a0814911144e663d"
uuid = "d65627f6-89bd-53e8-8ab5-8b75ff535eee"
version = "1.2.7+1"
[[hsa_rocr_jll]]
deps = ["Artifacts", "Elfutils_jll", "JLLWrappers", "Libdl", "NUMA_jll", "Pkg", "Zlib_jll", "hsakmt_roct_jll"]
git-tree-sha1 = "df8d73efec8b1e53ad527d208f5343c0368f0fcd"
uuid = "dd59ff1a-a01a-568d-8b29-0669330f116a"
version = "4.0.0+0"
[[hsakmt_roct_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "NUMA_jll", "Pkg"]
git-tree-sha1 = "80e0c9940e15cfd6f1f1e9d9f3953ec4d48d3d4a"
uuid = "1cecccd7-a9b6-5045-9cdc-a44c19b16d76"
version = "4.0.0+0"
[[nghttp2_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d"
[[obstack_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "1c4a6b66e934fc6db4649cb2910c72f53bbfea7e"
uuid = "c88a4935-d25e-5644-aacc-5db6f1b8ef79"
version = "1.2.2+0"
[[p7zip_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0"

@@ -0,0 +1,7 @@
[deps]
AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
ArgParse = "c7e460c6-2fb9-53a9-8c5b-16f535851c63"
Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a"
[compat]
julia = "1.6"

@@ -0,0 +1,316 @@
# This file is machine-generated - editing it directly is not advised
[[AbstractFFTs]]
deps = ["LinearAlgebra"]
git-tree-sha1 = "485ee0867925449198280d4af84bdb46a2a404d0"
uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c"
version = "1.0.1"
[[Adapt]]
deps = ["LinearAlgebra"]
git-tree-sha1 = "84918055d15b3114ede17ac6a7182f68870c16f7"
uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
version = "3.3.1"
[[ArgParse]]
deps = ["Logging", "TextWrap"]
git-tree-sha1 = "3102bce13da501c9104df33549f511cd25264d7d"
uuid = "c7e460c6-2fb9-53a9-8c5b-16f535851c63"
version = "1.1.4"
[[ArgTools]]
uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f"
[[Artifacts]]
uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33"
[[BFloat16s]]
deps = ["LinearAlgebra", "Test"]
git-tree-sha1 = "4af69e205efc343068dc8722b8dfec1ade89254a"
uuid = "ab4f0b2a-ad5b-11e8-123f-65d77653426b"
version = "0.1.0"
[[Base64]]
uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f"
[[CEnum]]
git-tree-sha1 = "215a9aa4a1f23fbd05b92769fdd62559488d70e9"
uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82"
version = "0.4.1"
[[CUDA]]
deps = ["AbstractFFTs", "Adapt", "BFloat16s", "CEnum", "CompilerSupportLibraries_jll", "ExprTools", "GPUArrays", "GPUCompiler", "LLVM", "LazyArtifacts", "Libdl", "LinearAlgebra", "Logging", "Printf", "Random", "Random123", "RandomNumbers", "Reexport", "Requires", "SparseArrays", "SpecialFunctions", "TimerOutputs"]
git-tree-sha1 = "c583f3ccdce071b8a8bce9bf3d5d5409eaf36d2b"
uuid = "052768ef-5323-5732-b1bb-66c8b64840ba"
version = "3.4.1"
[[ChainRulesCore]]
deps = ["Compat", "LinearAlgebra", "SparseArrays"]
git-tree-sha1 = "bdc0937269321858ab2a4f288486cb258b9a0af7"
uuid = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
version = "1.3.0"
[[Compat]]
deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "SHA", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"]
git-tree-sha1 = "727e463cfebd0c7b999bbf3e9e7e16f254b94193"
uuid = "34da2185-b29b-5c13-b0c7-acf172513d20"
version = "3.34.0"
[[CompilerSupportLibraries_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae"
[[Dates]]
deps = ["Printf"]
uuid = "ade2ca70-3891-5945-98fb-dc099432e06a"
[[DelimitedFiles]]
deps = ["Mmap"]
uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab"
[[Distributed]]
deps = ["Random", "Serialization", "Sockets"]
uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b"
[[DocStringExtensions]]
deps = ["LibGit2"]
git-tree-sha1 = "a32185f5428d3986f47c2ab78b1f216d5e6cc96f"
uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae"
version = "0.8.5"
[[Downloads]]
deps = ["ArgTools", "LibCURL", "NetworkOptions"]
uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6"
[[ExprTools]]
git-tree-sha1 = "b7e3d17636b348f005f11040025ae8c6f645fe92"
uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04"
version = "0.1.6"
[[GPUArrays]]
deps = ["Adapt", "LinearAlgebra", "Printf", "Random", "Serialization", "Statistics"]
git-tree-sha1 = "8fac1cf7d6ce0f2249c7acaf25d22e1e85c4a07f"
uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
version = "8.0.2"
[[GPUCompiler]]
deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "TimerOutputs", "UUIDs"]
git-tree-sha1 = "4ed2616d5e656c8716736b64da86755467f26cf5"
uuid = "61eb1bfa-7361-4325-ad38-22787b887f55"
version = "0.12.9"
[[InteractiveUtils]]
deps = ["Markdown"]
uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
[[IrrationalConstants]]
git-tree-sha1 = "f76424439413893a832026ca355fe273e93bce94"
uuid = "92d709cd-6900-40b7-9082-c6be49f344b6"
version = "0.1.0"
[[JLLWrappers]]
deps = ["Preferences"]
git-tree-sha1 = "642a199af8b68253517b80bd3bfd17eb4e84df6e"
uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210"
version = "1.3.0"
[[LLVM]]
deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Printf", "Unicode"]
git-tree-sha1 = "23a47d417a3cd9c2e73c854bac7dd4731c105ef7"
uuid = "929cbde3-209d-540e-8aea-75f648917ca0"
version = "4.4.0"
[[LLVMExtra_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "9c360e5ce980b88bb31a7b086dbb19469008154b"
uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab"
version = "0.0.10+0"
[[LazyArtifacts]]
deps = ["Artifacts", "Pkg"]
uuid = "4af54fe1-eca0-43a8-85a7-787d91b784e3"
[[LibCURL]]
deps = ["LibCURL_jll", "MozillaCACerts_jll"]
uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21"
[[LibCURL_jll]]
deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"]
uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0"
[[LibGit2]]
deps = ["Base64", "NetworkOptions", "Printf", "SHA"]
uuid = "76f85450-5226-5b5a-8eaa-529ad045b433"
[[LibSSH2_jll]]
deps = ["Artifacts", "Libdl", "MbedTLS_jll"]
uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8"
[[Libdl]]
uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
[[LinearAlgebra]]
deps = ["Libdl"]
uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
[[LogExpFunctions]]
deps = ["DocStringExtensions", "IrrationalConstants", "LinearAlgebra"]
git-tree-sha1 = "3d682c07e6dd250ed082f883dc88aee7996bf2cc"
uuid = "2ab3a3ac-af41-5b50-aa03-7779005ae688"
version = "0.3.0"
[[Logging]]
uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"
[[Markdown]]
deps = ["Base64"]
uuid = "d6f4376e-aef5-505a-96c1-9c027394607a"
[[MbedTLS_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1"
[[Mmap]]
uuid = "a63ad114-7e13-5084-954f-fe012c677804"
[[MozillaCACerts_jll]]
uuid = "14a3606d-f60d-562e-9121-12d972cd8159"
[[NetworkOptions]]
uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908"
[[OpenSpecFun_jll]]
deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "13652491f6856acfd2db29360e1bbcd4565d04f1"
uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e"
version = "0.5.5+0"
[[OrderedCollections]]
git-tree-sha1 = "85f8e6578bf1f9ee0d11e7bb1b1456435479d47c"
uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
version = "1.4.1"
[[Parameters]]
deps = ["OrderedCollections", "UnPack"]
git-tree-sha1 = "2276ac65f1e236e0a6ea70baff3f62ad4c625345"
uuid = "d96e819e-fc66-5662-9728-84c9c7592b0a"
version = "0.12.2"
[[Pkg]]
deps = ["Artifacts", "Dates", "Downloads", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"]
uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
[[Preferences]]
deps = ["TOML"]
git-tree-sha1 = "00cfd92944ca9c760982747e9a1d0d5d86ab1e5a"
uuid = "21216c6a-2e73-6563-6e65-726566657250"
version = "1.2.2"
[[Printf]]
deps = ["Unicode"]
uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7"
[[REPL]]
deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"]
uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb"
[[Random]]
deps = ["Serialization"]
uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
[[Random123]]
deps = ["Libdl", "Random", "RandomNumbers"]
git-tree-sha1 = "0e8b146557ad1c6deb1367655e052276690e71a3"
uuid = "74087812-796a-5b5d-8853-05524746bad3"
version = "1.4.2"
[[RandomNumbers]]
deps = ["Random", "Requires"]
git-tree-sha1 = "043da614cc7e95c703498a491e2c21f58a2b8111"
uuid = "e6cf234a-135c-5ec9-84dd-332b85af5143"
version = "1.5.3"
[[Reexport]]
git-tree-sha1 = "45e428421666073eab6f2da5c9d310d99bb12f9b"
uuid = "189a3867-3050-52da-a836-e630ba90ab69"
version = "1.2.2"
[[Requires]]
deps = ["UUIDs"]
git-tree-sha1 = "4036a3bd08ac7e968e27c203d45f5fff15020621"
uuid = "ae029012-a4dd-5104-9daa-d747884805df"
version = "1.1.3"
[[SHA]]
uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce"
[[Serialization]]
uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
[[SharedArrays]]
deps = ["Distributed", "Mmap", "Random", "Serialization"]
uuid = "1a1011a3-84de-559e-8e89-a11a2f7dc383"
[[Sockets]]
uuid = "6462fe0b-24de-5631-8697-dd941f90decc"
[[SparseArrays]]
deps = ["LinearAlgebra", "Random"]
uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
[[SpecialFunctions]]
deps = ["ChainRulesCore", "LogExpFunctions", "OpenSpecFun_jll"]
git-tree-sha1 = "a322a9493e49c5f3a10b50df3aedaf1cdb3244b7"
uuid = "276daf66-3868-5448-9aa4-cd146d93841b"
version = "1.6.1"
[[Statistics]]
deps = ["LinearAlgebra", "SparseArrays"]
uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
[[TOML]]
deps = ["Dates"]
uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76"
[[Tar]]
deps = ["ArgTools", "SHA"]
uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e"
[[Test]]
deps = ["InteractiveUtils", "Logging", "Random", "Serialization"]
uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
[[TextWrap]]
git-tree-sha1 = "9250ef9b01b66667380cf3275b3f7488d0e25faf"
uuid = "b718987f-49a8-5099-9789-dcd902bef87d"
version = "1.0.1"
[[TimerOutputs]]
deps = ["ExprTools", "Printf"]
git-tree-sha1 = "209a8326c4f955e2442c07b56029e88bb48299c7"
uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
version = "0.5.12"
[[UUIDs]]
deps = ["Random", "SHA"]
uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
[[UnPack]]
git-tree-sha1 = "387c1f73762231e86e0c9c5443ce3b4a0a9a0c2b"
uuid = "3a884ed6-31ef-47d7-9d2a-63182c4928ed"
version = "1.0.2"
[[Unicode]]
uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
[[Zlib_jll]]
deps = ["Libdl"]
uuid = "83775a58-1f1d-513f-b197-d71354ab007a"
[[nghttp2_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d"
[[p7zip_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0"

@@ -0,0 +1,7 @@
[deps]
ArgParse = "c7e460c6-2fb9-53a9-8c5b-16f535851c63"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a"
[compat]
julia = "1.6"

@@ -0,0 +1,547 @@
# This file is machine-generated - editing it directly is not advised
[[AMDGPU]]
deps = ["AbstractFFTs", "Adapt", "BinaryProvider", "CEnum", "GPUArrays", "GPUCompiler", "HIP_jll", "LLVM", "Libdl", "LinearAlgebra", "MacroTools", "Pkg", "Printf", "ROCmDeviceLibs_jll", "Random", "Requires", "Setfield", "hsa_rocr_jll"]
git-tree-sha1 = "d64c97447a753cfbf0158d6c7be513f34526d559"
uuid = "21141c5a-9bdb-4563-92ae-f87d6854732e"
version = "0.2.12"
[[AbstractFFTs]]
deps = ["LinearAlgebra"]
git-tree-sha1 = "485ee0867925449198280d4af84bdb46a2a404d0"
uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c"
version = "1.0.1"
[[Adapt]]
deps = ["LinearAlgebra"]
git-tree-sha1 = "84918055d15b3114ede17ac6a7182f68870c16f7"
uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
version = "3.3.1"
[[ArgParse]]
deps = ["Logging", "TextWrap"]
git-tree-sha1 = "3102bce13da501c9104df33549f511cd25264d7d"
uuid = "c7e460c6-2fb9-53a9-8c5b-16f535851c63"
version = "1.1.4"
[[ArgTools]]
uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f"
[[Artifacts]]
uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33"
[[BFloat16s]]
deps = ["LinearAlgebra", "Test"]
git-tree-sha1 = "4af69e205efc343068dc8722b8dfec1ade89254a"
uuid = "ab4f0b2a-ad5b-11e8-123f-65d77653426b"
version = "0.1.0"
[[Base64]]
uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f"
[[BinaryProvider]]
deps = ["Libdl", "Logging", "SHA"]
git-tree-sha1 = "ecdec412a9abc8db54c0efc5548c64dfce072058"
uuid = "b99e7846-7c00-51b0-8f62-c81ae34c0232"
version = "0.5.10"
[[Bzip2_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "19a35467a82e236ff51bc17a3a44b69ef35185a2"
uuid = "6e34b625-4abd-537c-b88f-471c36dfa7a0"
version = "1.0.8+0"
[[CEnum]]
git-tree-sha1 = "215a9aa4a1f23fbd05b92769fdd62559488d70e9"
uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82"
version = "0.4.1"
[[CUDA]]
deps = ["AbstractFFTs", "Adapt", "BFloat16s", "CEnum", "CompilerSupportLibraries_jll", "DataStructures", "ExprTools", "GPUArrays", "GPUCompiler", "LLVM", "LazyArtifacts", "Libdl", "LinearAlgebra", "Logging", "Printf", "Random", "Random123", "RandomNumbers", "Reexport", "Requires", "SparseArrays", "SpecialFunctions", "TimerOutputs"]
git-tree-sha1 = "5e696e37e51b01ae07bd9f700afe6cbd55250bce"
uuid = "052768ef-5323-5732-b1bb-66c8b64840ba"
version = "3.3.4"
[[CUDAKernels]]
deps = ["Adapt", "CUDA", "Cassette", "KernelAbstractions", "SpecialFunctions", "StaticArrays"]
git-tree-sha1 = "81f76297b63c67723b1d60f5e7e002ae3393974b"
uuid = "72cfdca4-0801-4ab0-bf6a-d52aa10adc57"
version = "0.3.0"
[[Cassette]]
git-tree-sha1 = "b4b1d61ebbae2bc69a45e3a6b8439b4e411bc131"
uuid = "7057c7e9-c182-5462-911a-8362d720325c"
version = "0.3.8"
[[ChainRulesCore]]
deps = ["Compat", "LinearAlgebra", "SparseArrays"]
git-tree-sha1 = "bdc0937269321858ab2a4f288486cb258b9a0af7"
uuid = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
version = "1.3.0"
[[Compat]]
deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "SHA", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"]
git-tree-sha1 = "727e463cfebd0c7b999bbf3e9e7e16f254b94193"
uuid = "34da2185-b29b-5c13-b0c7-acf172513d20"
version = "3.34.0"
[[CompilerSupportLibraries_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae"
[[ConstructionBase]]
deps = ["LinearAlgebra"]
git-tree-sha1 = "f74e9d5388b8620b4cee35d4c5a618dd4dc547f4"
uuid = "187b0558-2788-49d3-abe0-74a17ed4e7c9"
version = "1.3.0"
[[DataStructures]]
deps = ["Compat", "InteractiveUtils", "OrderedCollections"]
git-tree-sha1 = "7d9d316f04214f7efdbb6398d545446e246eff02"
uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
version = "0.18.10"
[[Dates]]
deps = ["Printf"]
uuid = "ade2ca70-3891-5945-98fb-dc099432e06a"
[[DelimitedFiles]]
deps = ["Mmap"]
uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab"
[[Distributed]]
deps = ["Random", "Serialization", "Sockets"]
uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b"
[[DocStringExtensions]]
deps = ["LibGit2"]
git-tree-sha1 = "a32185f5428d3986f47c2ab78b1f216d5e6cc96f"
uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae"
version = "0.8.5"
[[Downloads]]
deps = ["ArgTools", "LibCURL", "NetworkOptions"]
uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6"
[[Elfutils_jll]]
deps = ["Artifacts", "Bzip2_jll", "JLLWrappers", "Libdl", "Pkg", "XZ_jll", "Zlib_jll", "argp_standalone_jll", "fts_jll", "obstack_jll"]
git-tree-sha1 = "8f9fcde6d89b0a3ca51cb2028beab462705c5436"
uuid = "ab5a07f8-06af-567f-a878-e8bb879eba5a"
version = "0.182.0+0"
[[ExprTools]]
git-tree-sha1 = "b7e3d17636b348f005f11040025ae8c6f645fe92"
uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04"
version = "0.1.6"
[[Future]]
deps = ["Random"]
uuid = "9fa8497b-333b-5362-9e8d-4d0656e87820"
[[GPUArrays]]
deps = ["AbstractFFTs", "Adapt", "LinearAlgebra", "Printf", "Random", "Serialization", "Statistics"]
git-tree-sha1 = "ececbf05f8904c92814bdbd0aafd5540b0bf2e9a"
uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
version = "7.0.1"
[[GPUCompiler]]
deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "TimerOutputs", "UUIDs"]
git-tree-sha1 = "4ed2616d5e656c8716736b64da86755467f26cf5"
uuid = "61eb1bfa-7361-4325-ad38-22787b887f55"
version = "0.12.9"
[[HIP_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "ROCmCompilerSupport_jll", "ROCmDeviceLibs_jll", "ROCmOpenCLRuntime_jll", "hsa_rocr_jll"]
git-tree-sha1 = "5097d8f7b6842156ab0928371b3d03fefd8decab"
uuid = "2696aab5-0948-5276-aa9a-2a86a37016b8"
version = "4.0.0+1"
[[InteractiveUtils]]
deps = ["Markdown"]
uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
[[IrrationalConstants]]
git-tree-sha1 = "f76424439413893a832026ca355fe273e93bce94"
uuid = "92d709cd-6900-40b7-9082-c6be49f344b6"
version = "0.1.0"
[[JLLWrappers]]
deps = ["Preferences"]
git-tree-sha1 = "642a199af8b68253517b80bd3bfd17eb4e84df6e"
uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210"
version = "1.3.0"
[[KernelAbstractions]]
deps = ["Adapt", "Cassette", "InteractiveUtils", "MacroTools", "SpecialFunctions", "StaticArrays", "UUIDs"]
git-tree-sha1 = "5e6c70389c1b1e40adb81664ca8cea6ce8127afc"
uuid = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
version = "0.7.0"
[[LLVM]]
deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Printf", "Unicode"]
git-tree-sha1 = "23a47d417a3cd9c2e73c854bac7dd4731c105ef7"
uuid = "929cbde3-209d-540e-8aea-75f648917ca0"
version = "4.4.0"
[[LLVMExtra_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "9c360e5ce980b88bb31a7b086dbb19469008154b"
uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab"
version = "0.0.10+0"
[[LazyArtifacts]]
deps = ["Artifacts", "Pkg"]
uuid = "4af54fe1-eca0-43a8-85a7-787d91b784e3"
[[LibCURL]]
deps = ["LibCURL_jll", "MozillaCACerts_jll"]
uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21"
[[LibCURL_jll]]
deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"]
uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0"
[[LibGit2]]
deps = ["Base64", "NetworkOptions", "Printf", "SHA"]
uuid = "76f85450-5226-5b5a-8eaa-529ad045b433"
[[LibSSH2_jll]]
deps = ["Artifacts", "Libdl", "MbedTLS_jll"]
uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8"
[[Libdl]]
uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
[[Libgcrypt_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Libgpg_error_jll", "Pkg"]
git-tree-sha1 = "64613c82a59c120435c067c2b809fc61cf5166ae"
uuid = "d4300ac3-e22c-5743-9152-c294e39db1e4"
version = "1.8.7+0"
[[Libglvnd_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libX11_jll", "Xorg_libXext_jll"]
git-tree-sha1 = "7739f837d6447403596a75d19ed01fd08d6f56bf"
uuid = "7e76a0d4-f3c7-5321-8279-8d96eeed0f29"
version = "1.3.0+3"
[[Libgpg_error_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "c333716e46366857753e273ce6a69ee0945a6db9"
uuid = "7add5ba3-2f88-524e-9cd5-f83b8a55f7b8"
version = "1.42.0+0"
[[Libiconv_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "42b62845d70a619f063a7da093d995ec8e15e778"
uuid = "94ce4f54-9a6c-5748-9c1c-f9c7231a4531"
version = "1.16.1+1"
[[LinearAlgebra]]
deps = ["Libdl"]
uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
[[LogExpFunctions]]
deps = ["DocStringExtensions", "IrrationalConstants", "LinearAlgebra"]
git-tree-sha1 = "3d682c07e6dd250ed082f883dc88aee7996bf2cc"
uuid = "2ab3a3ac-af41-5b50-aa03-7779005ae688"
version = "0.3.0"
[[Logging]]
uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"
[[MacroTools]]
deps = ["Markdown", "Random"]
git-tree-sha1 = "0fb723cd8c45858c22169b2e42269e53271a6df7"
uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
version = "0.5.7"
[[Markdown]]
deps = ["Base64"]
uuid = "d6f4376e-aef5-505a-96c1-9c027394607a"
[[MbedTLS_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1"
[[Mmap]]
uuid = "a63ad114-7e13-5084-954f-fe012c677804"
[[MozillaCACerts_jll]]
uuid = "14a3606d-f60d-562e-9121-12d972cd8159"
[[NUMA_jll]]
deps = ["Libdl", "Pkg"]
git-tree-sha1 = "778f9bd14400cff2c32ed357e12766ac0e3d766e"
uuid = "7f51dc2b-bb24-59f8-b771-bb1490e4195d"
version = "2.0.13+1"
[[NetworkOptions]]
uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908"
[[OpenSpecFun_jll]]
deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "13652491f6856acfd2db29360e1bbcd4565d04f1"
uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e"
version = "0.5.5+0"
[[OrderedCollections]]
git-tree-sha1 = "85f8e6578bf1f9ee0d11e7bb1b1456435479d47c"
uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
version = "1.4.1"
[[Parameters]]
deps = ["OrderedCollections", "UnPack"]
git-tree-sha1 = "2276ac65f1e236e0a6ea70baff3f62ad4c625345"
uuid = "d96e819e-fc66-5662-9728-84c9c7592b0a"
version = "0.12.2"
[[Pkg]]
deps = ["Artifacts", "Dates", "Downloads", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"]
uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
[[Preferences]]
deps = ["TOML"]
git-tree-sha1 = "00cfd92944ca9c760982747e9a1d0d5d86ab1e5a"
uuid = "21216c6a-2e73-6563-6e65-726566657250"
version = "1.2.2"
[[Printf]]
deps = ["Unicode"]
uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7"
[[REPL]]
deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"]
uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb"
[[ROCKernels]]
deps = ["AMDGPU", "Adapt", "Cassette", "KernelAbstractions", "SpecialFunctions", "StaticArrays"]
git-tree-sha1 = "41105b861342637dde17797bdd9aaa537aca646b"
uuid = "7eb9e9f0-4bd3-4c4c-8bef-26bd9629d9b9"
version = "0.2.0"
[[ROCmCompilerSupport_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "ROCmDeviceLibs_jll", "hsa_rocr_jll"]
git-tree-sha1 = "56ddcfb5d8b60c9f8c1bc619886f8d363fd1926d"
uuid = "8fbdd1d2-db62-5cd0-981e-905da1486e17"
version = "4.0.0+1"
[[ROCmDeviceLibs_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Zlib_jll"]
git-tree-sha1 = "d764f0f28b5af89aa004871a6a38e5d061f77257"
uuid = "873c0968-716b-5aa7-bb8d-d1e2e2aeff2d"
version = "4.0.0+0"
[[ROCmOpenCLRuntime_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Libglvnd_jll", "Pkg", "ROCmCompilerSupport_jll", "ROCmDeviceLibs_jll", "Xorg_libX11_jll", "Xorg_xorgproto_jll", "hsa_rocr_jll"]
git-tree-sha1 = "f9e3e2cb40a7990535efa7da9b9dd0e0b458a973"
uuid = "10ae2a08-2eea-53f8-8c20-eec175020e9f"
version = "4.0.0+1"
[[Random]]
deps = ["Serialization"]
uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
[[Random123]]
deps = ["Libdl", "Random", "RandomNumbers"]
git-tree-sha1 = "0e8b146557ad1c6deb1367655e052276690e71a3"
uuid = "74087812-796a-5b5d-8853-05524746bad3"
version = "1.4.2"
[[RandomNumbers]]
deps = ["Random", "Requires"]
git-tree-sha1 = "043da614cc7e95c703498a491e2c21f58a2b8111"
uuid = "e6cf234a-135c-5ec9-84dd-332b85af5143"
version = "1.5.3"
[[Reexport]]
git-tree-sha1 = "45e428421666073eab6f2da5c9d310d99bb12f9b"
uuid = "189a3867-3050-52da-a836-e630ba90ab69"
version = "1.2.2"
[[Requires]]
deps = ["UUIDs"]
git-tree-sha1 = "4036a3bd08ac7e968e27c203d45f5fff15020621"
uuid = "ae029012-a4dd-5104-9daa-d747884805df"
version = "1.1.3"
[[SHA]]
uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce"
[[Serialization]]
uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
[[Setfield]]
deps = ["ConstructionBase", "Future", "MacroTools", "Requires"]
git-tree-sha1 = "fca29e68c5062722b5b4435594c3d1ba557072a3"
uuid = "efcf1570-3423-57d1-acb7-fd33fddbac46"
version = "0.7.1"
[[SharedArrays]]
deps = ["Distributed", "Mmap", "Random", "Serialization"]
uuid = "1a1011a3-84de-559e-8e89-a11a2f7dc383"
[[Sockets]]
uuid = "6462fe0b-24de-5631-8697-dd941f90decc"
[[SparseArrays]]
deps = ["LinearAlgebra", "Random"]
uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
[[SpecialFunctions]]
deps = ["ChainRulesCore", "LogExpFunctions", "OpenSpecFun_jll"]
git-tree-sha1 = "a322a9493e49c5f3a10b50df3aedaf1cdb3244b7"
uuid = "276daf66-3868-5448-9aa4-cd146d93841b"
version = "1.6.1"
[[StaticArrays]]
deps = ["LinearAlgebra", "Random", "Statistics"]
git-tree-sha1 = "3240808c6d463ac46f1c1cd7638375cd22abbccb"
uuid = "90137ffa-7385-5640-81b9-e52037218182"
version = "1.2.12"
[[Statistics]]
deps = ["LinearAlgebra", "SparseArrays"]
uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
[[TOML]]
deps = ["Dates"]
uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76"
[[Tar]]
deps = ["ArgTools", "SHA"]
uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e"
[[Test]]
deps = ["InteractiveUtils", "Logging", "Random", "Serialization"]
uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
[[TextWrap]]
git-tree-sha1 = "9250ef9b01b66667380cf3275b3f7488d0e25faf"
uuid = "b718987f-49a8-5099-9789-dcd902bef87d"
version = "1.0.1"
[[TimerOutputs]]
deps = ["ExprTools", "Printf"]
git-tree-sha1 = "209a8326c4f955e2442c07b56029e88bb48299c7"
uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
version = "0.5.12"
[[UUIDs]]
deps = ["Random", "SHA"]
uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
[[UnPack]]
git-tree-sha1 = "387c1f73762231e86e0c9c5443ce3b4a0a9a0c2b"
uuid = "3a884ed6-31ef-47d7-9d2a-63182c4928ed"
version = "1.0.2"
[[Unicode]]
uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
[[XML2_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Libiconv_jll", "Pkg", "Zlib_jll"]
git-tree-sha1 = "1acf5bdf07aa0907e0a37d3718bb88d4b687b74a"
uuid = "02c8fc9c-b97f-50b9-bbe4-9be30ff0a78a"
version = "2.9.12+0"
[[XSLT_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Libgcrypt_jll", "Libgpg_error_jll", "Libiconv_jll", "Pkg", "XML2_jll", "Zlib_jll"]
git-tree-sha1 = "91844873c4085240b95e795f692c4cec4d805f8a"
uuid = "aed1982a-8fda-507f-9586-7b0439959a61"
version = "1.1.34+0"
[[XZ_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "a921669cd9a45c23031fd4eb904f5cc3d20de415"
uuid = "ffd25f8a-64ca-5728-b0f7-c24cf3aae800"
version = "5.2.5+2"
[[Xorg_libX11_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libxcb_jll", "Xorg_xtrans_jll"]
git-tree-sha1 = "5be649d550f3f4b95308bf0183b82e2582876527"
uuid = "4f6342f7-b3d2-589e-9d20-edeb45f2b2bc"
version = "1.6.9+4"
[[Xorg_libXau_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "4e490d5c960c314f33885790ed410ff3a94ce67e"
uuid = "0c0b7dd1-d40b-584c-a123-a41640f87eec"
version = "1.0.9+4"
[[Xorg_libXdmcp_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "4fe47bd2247248125c428978740e18a681372dd4"
uuid = "a3789734-cfe1-5b06-b2d0-1dd0d9d62d05"
version = "1.1.3+4"
[[Xorg_libXext_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libX11_jll"]
git-tree-sha1 = "b7c0aa8c376b31e4852b360222848637f481f8c3"
uuid = "1082639a-0dae-5f34-9b06-72781eeb8cb3"
version = "1.3.4+4"
[[Xorg_libpthread_stubs_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "6783737e45d3c59a4a4c4091f5f88cdcf0908cbb"
uuid = "14d82f49-176c-5ed1-bb49-ad3f5cbd8c74"
version = "0.1.0+3"
[[Xorg_libxcb_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "XSLT_jll", "Xorg_libXau_jll", "Xorg_libXdmcp_jll", "Xorg_libpthread_stubs_jll"]
git-tree-sha1 = "daf17f441228e7a3833846cd048892861cff16d6"
uuid = "c7cfdc94-dc32-55de-ac96-5a1b8d977c5b"
version = "1.13.0+3"
[[Xorg_xorgproto_jll]]
deps = ["Libdl", "Pkg"]
git-tree-sha1 = "9a9eb8ce756fe0bca01b4be16da770e18d264972"
uuid = "c4d99508-4286-5418-9131-c86396af500b"
version = "2019.2.0+2"
[[Xorg_xtrans_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "79c31e7844f6ecf779705fbc12146eb190b7d845"
uuid = "c5fb5394-a638-5e4d-96e5-b29de1b5cf10"
version = "1.4.0+3"
[[Zlib_jll]]
deps = ["Libdl"]
uuid = "83775a58-1f1d-513f-b197-d71354ab007a"
[[argp_standalone_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "feaf9f6293003c2bf53056fd6930d677ed340b34"
uuid = "c53206cc-00f7-50bf-ad1e-3ae1f6e49bc3"
version = "1.3.1+0"
[[fts_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "78732b942383d2cb521df8a1a0814911144e663d"
uuid = "d65627f6-89bd-53e8-8ab5-8b75ff535eee"
version = "1.2.7+1"
[[hsa_rocr_jll]]
deps = ["Artifacts", "Elfutils_jll", "JLLWrappers", "Libdl", "NUMA_jll", "Pkg", "Zlib_jll", "hsakmt_roct_jll"]
git-tree-sha1 = "df8d73efec8b1e53ad527d208f5343c0368f0fcd"
uuid = "dd59ff1a-a01a-568d-8b29-0669330f116a"
version = "4.0.0+0"
[[hsakmt_roct_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "NUMA_jll", "Pkg"]
git-tree-sha1 = "80e0c9940e15cfd6f1f1e9d9f3953ec4d48d3d4a"
uuid = "1cecccd7-a9b6-5045-9cdc-a44c19b16d76"
version = "4.0.0+0"
[[nghttp2_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d"
[[obstack_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "1c4a6b66e934fc6db4649cb2910c72f53bbfea7e"
uuid = "c88a4935-d25e-5644-aacc-5db6f1b8ef79"
version = "1.2.2+0"
[[p7zip_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0"

@@ -0,0 +1,11 @@
[deps]
AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
ArgParse = "c7e460c6-2fb9-53a9-8c5b-16f535851c63"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
CUDAKernels = "72cfdca4-0801-4ab0-bf6a-d52aa10adc57"
KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a"
ROCKernels = "7eb9e9f0-4bd3-4c4c-8bef-26bd9629d9b9"
[compat]
julia = "1.6"

@@ -0,0 +1,493 @@
# This file is machine-generated - editing it directly is not advised
[[AMDGPU]]
deps = ["AbstractFFTs", "Adapt", "BinaryProvider", "CEnum", "GPUArrays", "GPUCompiler", "LLVM", "Libdl", "LinearAlgebra", "MacroTools", "Printf", "Random", "Requires", "Setfield", "hsa_rocr_jll", "hsakmt_roct_jll"]
git-tree-sha1 = "04fdb3923ac6f55fa7347dce0f0f6f10e321e2e9"
uuid = "21141c5a-9bdb-4563-92ae-f87d6854732e"
version = "0.2.7"
[[AbstractFFTs]]
deps = ["LinearAlgebra"]
git-tree-sha1 = "485ee0867925449198280d4af84bdb46a2a404d0"
uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c"
version = "1.0.1"
[[Adapt]]
deps = ["LinearAlgebra"]
git-tree-sha1 = "84918055d15b3114ede17ac6a7182f68870c16f7"
uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
version = "3.3.1"
[[ArgParse]]
deps = ["Logging", "TextWrap"]
git-tree-sha1 = "3102bce13da501c9104df33549f511cd25264d7d"
uuid = "c7e460c6-2fb9-53a9-8c5b-16f535851c63"
version = "1.1.4"
[[ArgTools]]
uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f"
[[Artifacts]]
uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33"
[[BFloat16s]]
deps = ["LinearAlgebra", "Test"]
git-tree-sha1 = "4af69e205efc343068dc8722b8dfec1ade89254a"
uuid = "ab4f0b2a-ad5b-11e8-123f-65d77653426b"
version = "0.1.0"
[[Base64]]
uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f"
[[BinaryProvider]]
deps = ["Libdl", "Logging", "SHA"]
git-tree-sha1 = "ecdec412a9abc8db54c0efc5548c64dfce072058"
uuid = "b99e7846-7c00-51b0-8f62-c81ae34c0232"
version = "0.5.10"
[[Bzip2_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "19a35467a82e236ff51bc17a3a44b69ef35185a2"
uuid = "6e34b625-4abd-537c-b88f-471c36dfa7a0"
version = "1.0.8+0"
[[CEnum]]
git-tree-sha1 = "215a9aa4a1f23fbd05b92769fdd62559488d70e9"
uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82"
version = "0.4.1"
[[CUDA]]
deps = ["AbstractFFTs", "Adapt", "BFloat16s", "CEnum", "CompilerSupportLibraries_jll", "DataStructures", "ExprTools", "GPUArrays", "GPUCompiler", "LLVM", "LazyArtifacts", "Libdl", "LinearAlgebra", "Logging", "MacroTools", "Memoize", "Printf", "Random", "Random123", "RandomNumbers", "Reexport", "Requires", "SparseArrays", "SpecialFunctions", "TimerOutputs"]
git-tree-sha1 = "364179416eabc34c9ca32126a6bdb431680c3bad"
uuid = "052768ef-5323-5732-b1bb-66c8b64840ba"
version = "3.2.1"
[[CUDAKernels]]
deps = ["Adapt", "CUDA", "Cassette", "KernelAbstractions", "SpecialFunctions", "StaticArrays"]
git-tree-sha1 = "81f76297b63c67723b1d60f5e7e002ae3393974b"
uuid = "72cfdca4-0801-4ab0-bf6a-d52aa10adc57"
version = "0.3.0"
[[Cassette]]
git-tree-sha1 = "b4b1d61ebbae2bc69a45e3a6b8439b4e411bc131"
uuid = "7057c7e9-c182-5462-911a-8362d720325c"
version = "0.3.8"
[[ChainRulesCore]]
deps = ["Compat", "LinearAlgebra", "SparseArrays"]
git-tree-sha1 = "bdc0937269321858ab2a4f288486cb258b9a0af7"
uuid = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
version = "1.3.0"
[[Compat]]
deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "SHA", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"]
git-tree-sha1 = "727e463cfebd0c7b999bbf3e9e7e16f254b94193"
uuid = "34da2185-b29b-5c13-b0c7-acf172513d20"
version = "3.34.0"
[[CompilerSupportLibraries_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae"
[[ConstructionBase]]
deps = ["LinearAlgebra"]
git-tree-sha1 = "f74e9d5388b8620b4cee35d4c5a618dd4dc547f4"
uuid = "187b0558-2788-49d3-abe0-74a17ed4e7c9"
version = "1.3.0"
[[DataStructures]]
deps = ["Compat", "InteractiveUtils", "OrderedCollections"]
git-tree-sha1 = "7d9d316f04214f7efdbb6398d545446e246eff02"
uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
version = "0.18.10"
[[Dates]]
deps = ["Printf"]
uuid = "ade2ca70-3891-5945-98fb-dc099432e06a"
[[DelimitedFiles]]
deps = ["Mmap"]
uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab"
[[Distributed]]
deps = ["Random", "Serialization", "Sockets"]
uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b"
[[DocStringExtensions]]
deps = ["LibGit2"]
git-tree-sha1 = "a32185f5428d3986f47c2ab78b1f216d5e6cc96f"
uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae"
version = "0.8.5"
[[Downloads]]
deps = ["ArgTools", "LibCURL", "NetworkOptions"]
uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6"
[[Elfutils_jll]]
deps = ["Artifacts", "Bzip2_jll", "JLLWrappers", "Libdl", "Pkg", "XZ_jll", "Zlib_jll", "argp_standalone_jll", "fts_jll", "obstack_jll"]
git-tree-sha1 = "8f9fcde6d89b0a3ca51cb2028beab462705c5436"
uuid = "ab5a07f8-06af-567f-a878-e8bb879eba5a"
version = "0.182.0+0"
[[ExprTools]]
git-tree-sha1 = "b7e3d17636b348f005f11040025ae8c6f645fe92"
uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04"
version = "0.1.6"
[[Future]]
deps = ["Random"]
uuid = "9fa8497b-333b-5362-9e8d-4d0656e87820"
[[GPUArrays]]
deps = ["AbstractFFTs", "Adapt", "LinearAlgebra", "Printf", "Random", "Serialization", "Statistics"]
git-tree-sha1 = "df5b8569904c5c10e84c640984cfff054b18c086"
uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
version = "6.4.1"
[[GPUCompiler]]
deps = ["DataStructures", "ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "Scratch", "Serialization", "TimerOutputs", "UUIDs"]
git-tree-sha1 = "42d635f6d87af125b86288df3819f805fb4d851a"
uuid = "61eb1bfa-7361-4325-ad38-22787b887f55"
version = "0.11.5"
[[InteractiveUtils]]
deps = ["Markdown"]
uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
[[IrrationalConstants]]
git-tree-sha1 = "f76424439413893a832026ca355fe273e93bce94"
uuid = "92d709cd-6900-40b7-9082-c6be49f344b6"
version = "0.1.0"
[[JLLWrappers]]
deps = ["Preferences"]
git-tree-sha1 = "642a199af8b68253517b80bd3bfd17eb4e84df6e"
uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210"
version = "1.3.0"
[[KernelAbstractions]]
deps = ["Adapt", "Cassette", "InteractiveUtils", "MacroTools", "SpecialFunctions", "StaticArrays", "UUIDs"]
git-tree-sha1 = "5e6c70389c1b1e40adb81664ca8cea6ce8127afc"
uuid = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
version = "0.7.0"
[[LLVM]]
deps = ["CEnum", "Libdl", "Printf", "Unicode"]
git-tree-sha1 = "f57ac3fd2045b50d3db081663837ac5b4096947e"
uuid = "929cbde3-209d-540e-8aea-75f648917ca0"
version = "3.9.0"
[[LazyArtifacts]]
deps = ["Artifacts", "Pkg"]
uuid = "4af54fe1-eca0-43a8-85a7-787d91b784e3"
[[LibCURL]]
deps = ["LibCURL_jll", "MozillaCACerts_jll"]
uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21"
[[LibCURL_jll]]
deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"]
uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0"
[[LibGit2]]
deps = ["Base64", "NetworkOptions", "Printf", "SHA"]
uuid = "76f85450-5226-5b5a-8eaa-529ad045b433"
[[LibSSH2_jll]]
deps = ["Artifacts", "Libdl", "MbedTLS_jll"]
uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8"
[[Libdl]]
uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
[[LinearAlgebra]]
deps = ["Libdl"]
uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
[[LogExpFunctions]]
deps = ["DocStringExtensions", "IrrationalConstants", "LinearAlgebra"]
git-tree-sha1 = "3d682c07e6dd250ed082f883dc88aee7996bf2cc"
uuid = "2ab3a3ac-af41-5b50-aa03-7779005ae688"
version = "0.3.0"
[[Logging]]
uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"
[[MacroTools]]
deps = ["Markdown", "Random"]
git-tree-sha1 = "0fb723cd8c45858c22169b2e42269e53271a6df7"
uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
version = "0.5.7"
[[Markdown]]
deps = ["Base64"]
uuid = "d6f4376e-aef5-505a-96c1-9c027394607a"
[[MbedTLS_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1"
[[Memoize]]
deps = ["MacroTools"]
git-tree-sha1 = "2b1dfcba103de714d31c033b5dacc2e4a12c7caa"
uuid = "c03570c3-d221-55d1-a50c-7939bbd78826"
version = "0.4.4"
[[Mmap]]
uuid = "a63ad114-7e13-5084-954f-fe012c677804"
[[MozillaCACerts_jll]]
uuid = "14a3606d-f60d-562e-9121-12d972cd8159"
[[NEO_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "gmmlib_jll", "libigc_jll", "oneAPI_Level_Zero_Headers_jll"]
git-tree-sha1 = "c753dd029eb0837658bf8eaee041c19e4ce5bb8c"
uuid = "700fe977-ac61-5f37-bbc8-c6c4b2b6a9fd"
version = "21.12.19358+0"
[[NUMA_jll]]
deps = ["Libdl", "Pkg"]
git-tree-sha1 = "778f9bd14400cff2c32ed357e12766ac0e3d766e"
uuid = "7f51dc2b-bb24-59f8-b771-bb1490e4195d"
version = "2.0.13+1"
[[NetworkOptions]]
uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908"
[[OpenSpecFun_jll]]
deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "13652491f6856acfd2db29360e1bbcd4565d04f1"
uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e"
version = "0.5.5+0"
[[OrderedCollections]]
git-tree-sha1 = "85f8e6578bf1f9ee0d11e7bb1b1456435479d47c"
uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
version = "1.4.1"
[[Parameters]]
deps = ["OrderedCollections", "UnPack"]
git-tree-sha1 = "2276ac65f1e236e0a6ea70baff3f62ad4c625345"
uuid = "d96e819e-fc66-5662-9728-84c9c7592b0a"
version = "0.12.2"
[[Pkg]]
deps = ["Artifacts", "Dates", "Downloads", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"]
uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
[[Preferences]]
deps = ["TOML"]
git-tree-sha1 = "00cfd92944ca9c760982747e9a1d0d5d86ab1e5a"
uuid = "21216c6a-2e73-6563-6e65-726566657250"
version = "1.2.2"
[[Printf]]
deps = ["Unicode"]
uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7"
[[REPL]]
deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"]
uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb"
[[ROCKernels]]
deps = ["AMDGPU", "Adapt", "Cassette", "KernelAbstractions", "SpecialFunctions", "StaticArrays"]
git-tree-sha1 = "41105b861342637dde17797bdd9aaa537aca646b"
uuid = "7eb9e9f0-4bd3-4c4c-8bef-26bd9629d9b9"
version = "0.2.0"
[[Random]]
deps = ["Serialization"]
uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
[[Random123]]
deps = ["Libdl", "Random", "RandomNumbers"]
git-tree-sha1 = "0e8b146557ad1c6deb1367655e052276690e71a3"
uuid = "74087812-796a-5b5d-8853-05524746bad3"
version = "1.4.2"
[[RandomNumbers]]
deps = ["Random", "Requires"]
git-tree-sha1 = "043da614cc7e95c703498a491e2c21f58a2b8111"
uuid = "e6cf234a-135c-5ec9-84dd-332b85af5143"
version = "1.5.3"
[[Reexport]]
git-tree-sha1 = "45e428421666073eab6f2da5c9d310d99bb12f9b"
uuid = "189a3867-3050-52da-a836-e630ba90ab69"
version = "1.2.2"
[[Requires]]
deps = ["UUIDs"]
git-tree-sha1 = "4036a3bd08ac7e968e27c203d45f5fff15020621"
uuid = "ae029012-a4dd-5104-9daa-d747884805df"
version = "1.1.3"
[[SHA]]
uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce"
[[SPIRV_LLVM_Translator_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "8cca87d57f6ddf19373cc9791fddc741406c8fbf"
uuid = "4a5d46fc-d8cf-5151-a261-86b458210efb"
version = "11.0.0+2"
[[SPIRV_Tools_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "c0324b7e07bc4649f755bfe7e00f7c6ed6aa353f"
uuid = "6ac6d60f-d740-5983-97d7-a4482c0689f4"
version = "2021.2.0+0"
[[Scratch]]
deps = ["Dates"]
git-tree-sha1 = "0b4b7f1393cff97c33891da2a0bf69c6ed241fda"
uuid = "6c6a2e73-6563-6170-7368-637461726353"
version = "1.1.0"
[[Serialization]]
uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
[[Setfield]]
deps = ["ConstructionBase", "Future", "MacroTools", "Requires"]
git-tree-sha1 = "fca29e68c5062722b5b4435594c3d1ba557072a3"
uuid = "efcf1570-3423-57d1-acb7-fd33fddbac46"
version = "0.7.1"
[[SharedArrays]]
deps = ["Distributed", "Mmap", "Random", "Serialization"]
uuid = "1a1011a3-84de-559e-8e89-a11a2f7dc383"
[[Sockets]]
uuid = "6462fe0b-24de-5631-8697-dd941f90decc"
[[SparseArrays]]
deps = ["LinearAlgebra", "Random"]
uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
[[SpecialFunctions]]
deps = ["ChainRulesCore", "LogExpFunctions", "OpenSpecFun_jll"]
git-tree-sha1 = "a322a9493e49c5f3a10b50df3aedaf1cdb3244b7"
uuid = "276daf66-3868-5448-9aa4-cd146d93841b"
version = "1.6.1"
[[StaticArrays]]
deps = ["LinearAlgebra", "Random", "Statistics"]
git-tree-sha1 = "3240808c6d463ac46f1c1cd7638375cd22abbccb"
uuid = "90137ffa-7385-5640-81b9-e52037218182"
version = "1.2.12"
[[Statistics]]
deps = ["LinearAlgebra", "SparseArrays"]
uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
[[TOML]]
deps = ["Dates"]
uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76"
[[Tar]]
deps = ["ArgTools", "SHA"]
uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e"
[[Test]]
deps = ["InteractiveUtils", "Logging", "Random", "Serialization"]
uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
[[TextWrap]]
git-tree-sha1 = "9250ef9b01b66667380cf3275b3f7488d0e25faf"
uuid = "b718987f-49a8-5099-9789-dcd902bef87d"
version = "1.0.1"
[[TimerOutputs]]
deps = ["ExprTools", "Printf"]
git-tree-sha1 = "209a8326c4f955e2442c07b56029e88bb48299c7"
uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
version = "0.5.12"
[[UUIDs]]
deps = ["Random", "SHA"]
uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
[[UnPack]]
git-tree-sha1 = "387c1f73762231e86e0c9c5443ce3b4a0a9a0c2b"
uuid = "3a884ed6-31ef-47d7-9d2a-63182c4928ed"
version = "1.0.2"
[[Unicode]]
uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
[[XZ_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "a921669cd9a45c23031fd4eb904f5cc3d20de415"
uuid = "ffd25f8a-64ca-5728-b0f7-c24cf3aae800"
version = "5.2.5+2"
[[Zlib_jll]]
deps = ["Libdl"]
uuid = "83775a58-1f1d-513f-b197-d71354ab007a"
[[argp_standalone_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "feaf9f6293003c2bf53056fd6930d677ed340b34"
uuid = "c53206cc-00f7-50bf-ad1e-3ae1f6e49bc3"
version = "1.3.1+0"
[[fts_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "78732b942383d2cb521df8a1a0814911144e663d"
uuid = "d65627f6-89bd-53e8-8ab5-8b75ff535eee"
version = "1.2.7+1"
[[gmmlib_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "4067ef455d4fa67febe26efc3f9565a9bb7ba911"
uuid = "09858cae-167c-5acb-9302-fddc6874d481"
version = "20.3.2+0"
[[hsa_rocr_jll]]
deps = ["Artifacts", "Elfutils_jll", "JLLWrappers", "Libdl", "NUMA_jll", "Pkg", "Zlib_jll", "hsakmt_roct_jll"]
git-tree-sha1 = "42189f176d6ae4f37c0c0e652fec339bb0bfab5d"
uuid = "dd59ff1a-a01a-568d-8b29-0669330f116a"
version = "3.7.0+1"
[[hsakmt_roct_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "NUMA_jll", "Pkg"]
git-tree-sha1 = "8a9ee6c091e952e4ea6585d15131d43f789ae041"
uuid = "1cecccd7-a9b6-5045-9cdc-a44c19b16d76"
version = "3.8.0+0"
[[libigc_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "6140dbf267f7ab57fb791b49f2114374218b5c20"
uuid = "94295238-5935-5bd7-bb0f-b00942e9bdd5"
version = "1.0.6712+0"
[[nghttp2_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d"
[[obstack_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "1c4a6b66e934fc6db4649cb2910c72f53bbfea7e"
uuid = "c88a4935-d25e-5644-aacc-5db6f1b8ef79"
version = "1.2.2+0"
[[oneAPI]]
deps = ["Adapt", "CEnum", "ExprTools", "GPUArrays", "GPUCompiler", "LLVM", "LinearAlgebra", "NEO_jll", "Printf", "Random", "SPIRV_LLVM_Translator_jll", "SPIRV_Tools_jll", "SpecialFunctions", "oneAPI_Level_Zero_Loader_jll"]
git-tree-sha1 = "b4a4b84c864e75fe885a1643525f0c97ce310dd9"
uuid = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b"
version = "0.1.3"
[[oneAPI_Level_Zero_Headers_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "48982fbfd2f3d0a30d644563dcf96892d252b395"
uuid = "f4bc562b-d309-54f8-9efb-476e56f0410d"
version = "1.1.2+1"
[[oneAPI_Level_Zero_Loader_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "oneAPI_Level_Zero_Headers_jll"]
git-tree-sha1 = "1fa53dfdd32a732f09c254c86403e1abab653fb2"
uuid = "13eca655-d68d-5b81-8367-6d99d727ab01"
version = "1.3.6+0"
[[p7zip_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0"

View File

@ -0,0 +1,19 @@
name = "JuliaStream"
uuid = "1bdcc9b7-f5ed-4705-bc7b-be1b748ec681"
authors = ["Wei-Chen Lin <wl14928@bristol.ac.uk>"]
version = "3.4.0"
[deps]
AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
ArgParse = "c7e460c6-2fb9-53a9-8c5b-16f535851c63"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
CUDAKernels = "72cfdca4-0801-4ab0-bf6a-d52aa10adc57"
Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b"
ExprTools = "e2ba6199-217a-4e67-a87a-7c52f15ade04"
KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a"
ROCKernels = "7eb9e9f0-4bd3-4c4c-8bef-26bd9629d9b9"
oneAPI = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b"
[compat]
julia = "1.6"

67
JuliaStream.jl/README.md Normal file
View File

@ -0,0 +1,67 @@
JuliaStream.jl
==============
This is an implementation of BabelStream in Julia, with the following variants:
* `PlainStream.jl` - Single-threaded implementation using plain `for` loops
* `ThreadedStream.jl` - Threaded implementation using the `Threads.@threads` macro
* `DistributedStream.jl` - Process-based parallelism using the `@distributed` macro
* `CUDAStream.jl` - Direct port of BabelStream's native CUDA implementation using [CUDA.jl](https://github.com/JuliaGPU/CUDA.jl)
* `AMDGPUStream.jl` - Direct port of BabelStream's native HIP implementation using [AMDGPU.jl](https://github.com/JuliaGPU/AMDGPU.jl)
* `oneAPIStream.jl` - Direct port of BabelStream's native SYCL implementation using [oneAPI.jl](https://github.com/JuliaGPU/oneAPI.jl)
* `KernelAbstractionsStream.jl` - Direct port of BabelStream's native CUDA implementation using [KernelAbstractions.jl](https://github.com/JuliaGPU/KernelAbstractions.jl)
### Build & Run
Prerequisites:
* Julia >= 1.6
A set of reduced-dependency projects is available for the following backends and implementations:
* `AMDGPU` supports:
- `AMDGPUStream.jl`
* `CUDA` supports:
- `CUDAStream.jl`
* `oneAPI` supports:
- `oneAPIStream.jl`
* `KernelAbstractions` supports:
- `KernelAbstractionsStream.jl`
* `Threaded` supports:
- `PlainStream.jl`
- `ThreadedStream.jl`
- `DistributedStream.jl`
With Julia on the `PATH`, run your selected benchmark with:
```shell
> cd JuliaStream.jl
> julia --project=<BACKEND> -e 'import Pkg; Pkg.instantiate()' # only required on first run
> julia --project=<BACKEND> src/<IMPL>Stream.jl
```
For example, to run the CUDA implementation:
```shell
> cd JuliaStream.jl
> julia --project=CUDA -e 'import Pkg; Pkg.instantiate()'
> julia --project=CUDA src/CUDAStream.jl
```
**Important:**
* Julia is 1-indexed, so N >= 1 in `--device N`.
* Thread count for `ThreadedStream` must be set via the `JULIA_NUM_THREADS` environment variable (e.g. `export JULIA_NUM_THREADS=$(nproc)`), otherwise it defaults to 1.
* Worker count for `DistributedStream` is set with `-p <N>`, as per the [documentation](https://docs.julialang.org/en/v1/manual/distributed-computing); see the example after this list.
* Certain implementations such as CUDA and AMDGPU will do hardware detection at runtime and may download and/or compile further software packages for the platform.
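For example, to run the threaded and distributed variants (the thread and worker counts below are illustrative; match them to your hardware):
```shell
> cd JuliaStream.jl
> export JULIA_NUM_THREADS=8                       # threads used by ThreadedStream.jl
> julia --project=Threaded src/ThreadedStream.jl
> julia -p 8 --project=Threaded src/DistributedStream.jl
```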
***
Alternatively, the top-level project `Project.toml` contains all dependencies needed to run all implementations in `src`.
There may be instances where some packages are locked to an older version because of transitive dependency requirements.
To run the benchmark using the top-level project:
```shell
> cd JuliaStream.jl
> julia --project -e 'import Pkg; Pkg.instantiate()'
> julia --project src/<IMPL>Stream.jl
```

View File

@ -0,0 +1,31 @@
# This file is machine-generated - editing it directly is not advised
[[ArgParse]]
deps = ["Logging", "TextWrap"]
git-tree-sha1 = "3102bce13da501c9104df33549f511cd25264d7d"
uuid = "c7e460c6-2fb9-53a9-8c5b-16f535851c63"
version = "1.1.4"
[[Logging]]
uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"
[[OrderedCollections]]
git-tree-sha1 = "85f8e6578bf1f9ee0d11e7bb1b1456435479d47c"
uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
version = "1.4.1"
[[Parameters]]
deps = ["OrderedCollections", "UnPack"]
git-tree-sha1 = "2276ac65f1e236e0a6ea70baff3f62ad4c625345"
uuid = "d96e819e-fc66-5662-9728-84c9c7592b0a"
version = "0.12.2"
[[TextWrap]]
git-tree-sha1 = "9250ef9b01b66667380cf3275b3f7488d0e25faf"
uuid = "b718987f-49a8-5099-9789-dcd902bef87d"
version = "1.0.1"
[[UnPack]]
git-tree-sha1 = "387c1f73762231e86e0c9c5443ce3b4a0a9a0c2b"
uuid = "3a884ed6-31ef-47d7-9d2a-63182c4928ed"
version = "1.0.2"

View File

@ -0,0 +1,6 @@
[deps]
ArgParse = "c7e460c6-2fb9-53a9-8c5b-16f535851c63"
Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a"
[compat]
julia = "1.6"

View File

@ -0,0 +1,319 @@
# This file is machine-generated - editing it directly is not advised
[[Adapt]]
deps = ["LinearAlgebra"]
git-tree-sha1 = "84918055d15b3114ede17ac6a7182f68870c16f7"
uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
version = "3.3.1"
[[ArgParse]]
deps = ["Logging", "TextWrap"]
git-tree-sha1 = "3102bce13da501c9104df33549f511cd25264d7d"
uuid = "c7e460c6-2fb9-53a9-8c5b-16f535851c63"
version = "1.1.4"
[[ArgTools]]
uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f"
[[Artifacts]]
uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33"
[[Base64]]
uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f"
[[CEnum]]
git-tree-sha1 = "215a9aa4a1f23fbd05b92769fdd62559488d70e9"
uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82"
version = "0.4.1"
[[ChainRulesCore]]
deps = ["Compat", "LinearAlgebra", "SparseArrays"]
git-tree-sha1 = "bdc0937269321858ab2a4f288486cb258b9a0af7"
uuid = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
version = "1.3.0"
[[Compat]]
deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "SHA", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"]
git-tree-sha1 = "727e463cfebd0c7b999bbf3e9e7e16f254b94193"
uuid = "34da2185-b29b-5c13-b0c7-acf172513d20"
version = "3.34.0"
[[CompilerSupportLibraries_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae"
[[Dates]]
deps = ["Printf"]
uuid = "ade2ca70-3891-5945-98fb-dc099432e06a"
[[DelimitedFiles]]
deps = ["Mmap"]
uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab"
[[Distributed]]
deps = ["Random", "Serialization", "Sockets"]
uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b"
[[DocStringExtensions]]
deps = ["LibGit2"]
git-tree-sha1 = "a32185f5428d3986f47c2ab78b1f216d5e6cc96f"
uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae"
version = "0.8.5"
[[Downloads]]
deps = ["ArgTools", "LibCURL", "NetworkOptions"]
uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6"
[[ExprTools]]
git-tree-sha1 = "b7e3d17636b348f005f11040025ae8c6f645fe92"
uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04"
version = "0.1.6"
[[GPUArrays]]
deps = ["Adapt", "LinearAlgebra", "Printf", "Random", "Serialization", "Statistics"]
git-tree-sha1 = "8fac1cf7d6ce0f2249c7acaf25d22e1e85c4a07f"
uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
version = "8.0.2"
[[GPUCompiler]]
deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "TimerOutputs", "UUIDs"]
git-tree-sha1 = "4ed2616d5e656c8716736b64da86755467f26cf5"
uuid = "61eb1bfa-7361-4325-ad38-22787b887f55"
version = "0.12.9"
[[InteractiveUtils]]
deps = ["Markdown"]
uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
[[IrrationalConstants]]
git-tree-sha1 = "f76424439413893a832026ca355fe273e93bce94"
uuid = "92d709cd-6900-40b7-9082-c6be49f344b6"
version = "0.1.0"
[[JLLWrappers]]
deps = ["Preferences"]
git-tree-sha1 = "642a199af8b68253517b80bd3bfd17eb4e84df6e"
uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210"
version = "1.3.0"
[[LLVM]]
deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Printf", "Unicode"]
git-tree-sha1 = "23a47d417a3cd9c2e73c854bac7dd4731c105ef7"
uuid = "929cbde3-209d-540e-8aea-75f648917ca0"
version = "4.4.0"
[[LLVMExtra_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "9c360e5ce980b88bb31a7b086dbb19469008154b"
uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab"
version = "0.0.10+0"
[[LibCURL]]
deps = ["LibCURL_jll", "MozillaCACerts_jll"]
uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21"
[[LibCURL_jll]]
deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"]
uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0"
[[LibGit2]]
deps = ["Base64", "NetworkOptions", "Printf", "SHA"]
uuid = "76f85450-5226-5b5a-8eaa-529ad045b433"
[[LibSSH2_jll]]
deps = ["Artifacts", "Libdl", "MbedTLS_jll"]
uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8"
[[Libdl]]
uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
[[LinearAlgebra]]
deps = ["Libdl"]
uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
[[LogExpFunctions]]
deps = ["DocStringExtensions", "IrrationalConstants", "LinearAlgebra"]
git-tree-sha1 = "3d682c07e6dd250ed082f883dc88aee7996bf2cc"
uuid = "2ab3a3ac-af41-5b50-aa03-7779005ae688"
version = "0.3.0"
[[Logging]]
uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"
[[Markdown]]
deps = ["Base64"]
uuid = "d6f4376e-aef5-505a-96c1-9c027394607a"
[[MbedTLS_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1"
[[Mmap]]
uuid = "a63ad114-7e13-5084-954f-fe012c677804"
[[MozillaCACerts_jll]]
uuid = "14a3606d-f60d-562e-9121-12d972cd8159"
[[NEO_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "gmmlib_jll", "libigc_jll", "oneAPI_Level_Zero_Headers_jll"]
git-tree-sha1 = "2bfc354b5684821dcc88f1e477cefd0dd03c60b5"
uuid = "700fe977-ac61-5f37-bbc8-c6c4b2b6a9fd"
version = "21.31.20514+0"
[[NetworkOptions]]
uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908"
[[OpenSpecFun_jll]]
deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "13652491f6856acfd2db29360e1bbcd4565d04f1"
uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e"
version = "0.5.5+0"
[[OrderedCollections]]
git-tree-sha1 = "85f8e6578bf1f9ee0d11e7bb1b1456435479d47c"
uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
version = "1.4.1"
[[Parameters]]
deps = ["OrderedCollections", "UnPack"]
git-tree-sha1 = "2276ac65f1e236e0a6ea70baff3f62ad4c625345"
uuid = "d96e819e-fc66-5662-9728-84c9c7592b0a"
version = "0.12.2"
[[Pkg]]
deps = ["Artifacts", "Dates", "Downloads", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"]
uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
[[Preferences]]
deps = ["TOML"]
git-tree-sha1 = "00cfd92944ca9c760982747e9a1d0d5d86ab1e5a"
uuid = "21216c6a-2e73-6563-6e65-726566657250"
version = "1.2.2"
[[Printf]]
deps = ["Unicode"]
uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7"
[[REPL]]
deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"]
uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb"
[[Random]]
deps = ["Serialization"]
uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
[[SHA]]
uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce"
[[SPIRV_LLVM_Translator_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "8cca87d57f6ddf19373cc9791fddc741406c8fbf"
uuid = "4a5d46fc-d8cf-5151-a261-86b458210efb"
version = "11.0.0+2"
[[SPIRV_Tools_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "c0324b7e07bc4649f755bfe7e00f7c6ed6aa353f"
uuid = "6ac6d60f-d740-5983-97d7-a4482c0689f4"
version = "2021.2.0+0"
[[Serialization]]
uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
[[SharedArrays]]
deps = ["Distributed", "Mmap", "Random", "Serialization"]
uuid = "1a1011a3-84de-559e-8e89-a11a2f7dc383"
[[Sockets]]
uuid = "6462fe0b-24de-5631-8697-dd941f90decc"
[[SparseArrays]]
deps = ["LinearAlgebra", "Random"]
uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
[[SpecialFunctions]]
deps = ["ChainRulesCore", "LogExpFunctions", "OpenSpecFun_jll"]
git-tree-sha1 = "a322a9493e49c5f3a10b50df3aedaf1cdb3244b7"
uuid = "276daf66-3868-5448-9aa4-cd146d93841b"
version = "1.6.1"
[[Statistics]]
deps = ["LinearAlgebra", "SparseArrays"]
uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
[[TOML]]
deps = ["Dates"]
uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76"
[[Tar]]
deps = ["ArgTools", "SHA"]
uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e"
[[Test]]
deps = ["InteractiveUtils", "Logging", "Random", "Serialization"]
uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
[[TextWrap]]
git-tree-sha1 = "9250ef9b01b66667380cf3275b3f7488d0e25faf"
uuid = "b718987f-49a8-5099-9789-dcd902bef87d"
version = "1.0.1"
[[TimerOutputs]]
deps = ["ExprTools", "Printf"]
git-tree-sha1 = "209a8326c4f955e2442c07b56029e88bb48299c7"
uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
version = "0.5.12"
[[UUIDs]]
deps = ["Random", "SHA"]
uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
[[UnPack]]
git-tree-sha1 = "387c1f73762231e86e0c9c5443ce3b4a0a9a0c2b"
uuid = "3a884ed6-31ef-47d7-9d2a-63182c4928ed"
version = "1.0.2"
[[Unicode]]
uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
[[Zlib_jll]]
deps = ["Libdl"]
uuid = "83775a58-1f1d-513f-b197-d71354ab007a"
[[gmmlib_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "0d5e5461d21b14853b4c332045c57d2601c403bd"
uuid = "09858cae-167c-5acb-9302-fddc6874d481"
version = "21.2.1+0"
[[libigc_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "b30a895e7ea52991a3f984ab0302c42858d766c0"
uuid = "94295238-5935-5bd7-bb0f-b00942e9bdd5"
version = "1.0.8173+0"
[[nghttp2_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d"
[[oneAPI]]
deps = ["Adapt", "CEnum", "ExprTools", "GPUArrays", "GPUCompiler", "LLVM", "LinearAlgebra", "NEO_jll", "Printf", "Random", "SPIRV_LLVM_Translator_jll", "SPIRV_Tools_jll", "SpecialFunctions", "oneAPI_Level_Zero_Headers_jll", "oneAPI_Level_Zero_Loader_jll"]
git-tree-sha1 = "92e8eefdd4694597994590230ab329545804bdb3"
uuid = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b"
version = "0.2.0"
[[oneAPI_Level_Zero_Headers_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "e1d123ff9ada6c469a1eaf57e33a74c3cb26a5a4"
uuid = "f4bc562b-d309-54f8-9efb-476e56f0410d"
version = "1.2.13+0"
[[oneAPI_Level_Zero_Loader_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "oneAPI_Level_Zero_Headers_jll"]
git-tree-sha1 = "50124857f7e87420655929a9c8ca86749826af11"
uuid = "13eca655-d68d-5b81-8367-6d99d727ab01"
version = "1.4.1+0"
[[p7zip_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0"

View File

@ -0,0 +1,7 @@
[deps]
ArgParse = "c7e460c6-2fb9-53a9-8c5b-16f535851c63"
Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a"
oneAPI = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b"
[compat]
julia = "1.6"

View File

@ -0,0 +1,167 @@
# AMDGPU.jl doesn't support CPU agents, so this isn't a feature-complete ROCmStream, only AMD GPUs
include("Stream.jl")
using AMDGPU
const ROCData = StreamData{T,ROCArray{T}} where {T}
const TBSize = 1024::Int
const DotBlocks = 256::Int
function devices()::Vector{DeviceWithRepr}
try
# AMDGPU.agents()'s internal iteration order isn't stable
sorted = sort(AMDGPU.get_agents(:gpu), by = repr)
map(x -> (x, repr(x), "AMDGPU.jl"), sorted)
catch
# probably unsupported
String[]
end
end
function make_stream(
arraysize::Int,
scalar::T,
device::DeviceWithRepr,
silent::Bool,
)::Tuple{ROCData{T},Nothing} where {T}
if arraysize % TBSize != 0
error("arraysize ($(arraysize)) must be divisible by $(TBSize)!")
end
# XXX AMDGPU doesn't expose an API for setting the default like CUDA.device!()
# but AMDGPU.get_default_agent returns DEFAULT_AGENT so we can do it by hand
AMDGPU.DEFAULT_AGENT[] = device[1]
selected = AMDGPU.get_default_agent()
if !silent
println("Using GPU HSA device: $(AMDGPU.get_name(selected)) ($(repr(selected)))")
println("Kernel parameters : <<<$(arraysize),$(TBSize)>>>")
end
return (
ROCData{T}(
ROCArray{T}(undef, arraysize),
ROCArray{T}(undef, arraysize),
ROCArray{T}(undef, arraysize),
scalar,
arraysize,
),
nothing,
)
end
function init_arrays!(data::ROCData{T}, _, init::Tuple{T,T,T}) where {T}
AMDGPU.fill!(data.a, init[1])
AMDGPU.fill!(data.b, init[2])
AMDGPU.fill!(data.c, init[3])
end
function copy!(data::ROCData{T}, _) where {T}
function kernel(a::AbstractArray{T}, c::AbstractArray{T})
i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x # only workgroupIdx starts at 1
@inbounds c[i] = a[i]
return
end
AMDGPU.wait(
@roc groupsize = TBSize gridsize = data.size kernel(data.a, data.c)
)
end
function mul!(data::ROCData{T}, _) where {T}
function kernel(b::AbstractArray{T}, c::AbstractArray{T}, scalar::T)
i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x # only workgroupIdx starts at 1
@inbounds b[i] = scalar * c[i]
return
end
AMDGPU.wait(
@roc groupsize = TBSize gridsize = data.size kernel(data.b, data.c, data.scalar)
)
end
function add!(data::ROCData{T}, _) where {T}
function kernel(a::AbstractArray{T}, b::AbstractArray{T}, c::AbstractArray{T})
i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x # only workgroupIdx starts at 1
@inbounds c[i] = a[i] + b[i]
return
end
AMDGPU.wait(
@roc groupsize = TBSize gridsize = data.size kernel(data.a, data.b, data.c)
)
end
function triad!(data::ROCData{T}, _) where {T}
function kernel(a::AbstractArray{T}, b::AbstractArray{T}, c::AbstractArray{T}, scalar::T)
i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x # only workgroupIdx starts at 1
@inbounds a[i] = b[i] + (scalar * c[i])
return
end
AMDGPU.wait(
@roc groupsize = TBSize gridsize = data.size kernel(
data.a,
data.b,
data.c,
data.scalar,
)
)
end
function nstream!(data::ROCData{T}, _) where {T}
function kernel(a::AbstractArray{T}, b::AbstractArray{T}, c::AbstractArray{T}, scalar::T)
i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x # only workgroupIdx starts at 1
@inbounds a[i] += b[i] + scalar * c[i]
return
end
AMDGPU.wait(
@roc groupsize = TBSize gridsize = data.size kernel(
data.a,
data.b,
data.c,
data.scalar,
)
)
end
function dot(data::ROCData{T}, _) where {T}
function kernel(a::AbstractArray{T}, b::AbstractArray{T}, size::Int, partial::AbstractArray{T})
tb_sum = ROCDeviceArray((TBSize,), alloc_local(:reduce, T, TBSize))
local_i = workitemIdx().x
@inbounds tb_sum[local_i] = 0.0
# do dot first
i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x # only workgroupIdx starts at 1
while i <= size
@inbounds tb_sum[local_i] += a[i] * b[i]
i += TBSize * DotBlocks # XXX don't use (workgroupDim().x * gridDimWG().x) here
end
# then tree reduction
offset = workgroupDim().x ÷ 2
while offset > 0
sync_workgroup()
if (local_i - 1) < offset
@inbounds tb_sum[local_i] += tb_sum[local_i+offset]
end
offset ÷= 2
end
if (local_i == 1)
@inbounds partial[workgroupIdx().x] = tb_sum[local_i]
end
return
end
partial_sum = ROCArray{T}(undef, DotBlocks)
AMDGPU.wait(
@roc groupsize = TBSize gridsize = TBSize * DotBlocks kernel(
data.a,
data.b,
data.size,
partial_sum,
)
)
return sum(partial_sum)
end
function read_data(data::ROCData{T}, _)::VectorData{T} where {T}
return VectorData{T}(data.a, data.b, data.c, data.scalar, data.size)
end
main()

View File

@ -0,0 +1,152 @@
include("Stream.jl")
using CUDA
const CuData = StreamData{T,CuArray{T}} where {T}
const TBSize = 1024::Int
const DotBlocks = 256::Int
function devices()::Vector{DeviceWithRepr}
return !CUDA.functional(false) ? String[] :
map(d -> (d, "$(CUDA.name(d)) ($(repr(d)))", "CUDA.jl"), CUDA.devices())
end
function make_stream(
arraysize::Int,
scalar::T,
device::DeviceWithRepr,
silent::Bool,
)::Tuple{CuData{T},Nothing} where {T}
if arraysize % TBSize != 0
error("arraysize ($(arraysize)) must be divisible by $(TBSize)!")
end
CUDA.device!(device[1])
selected = CUDA.device()
# show_reason is set to true here so it dumps CUDA info
# for us regardless of whether it's functional
if !CUDA.functional(true)
error("Non-functional CUDA configuration")
end
if !silent
println("Using CUDA device: $(CUDA.name(selected)) ($(repr(selected)))")
println("Kernel parameters: <<<$(arraysize ÷ TBSize),$(TBSize)>>>")
end
return (
CuData{T}(
CuArray{T}(undef, arraysize),
CuArray{T}(undef, arraysize),
CuArray{T}(undef, arraysize),
scalar,
arraysize,
),
nothing,
)
end
function init_arrays!(data::CuData{T}, _, init::Tuple{T,T,T}) where {T}
fill!(data.a, init[1])
fill!(data.b, init[2])
fill!(data.c, init[3])
end
function copy!(data::CuData{T}, _) where {T}
function kernel(a::AbstractArray{T}, c::AbstractArray{T})
i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
@inbounds c[i] = a[i]
return
end
@cuda blocks = data.size ÷ TBSize threads = TBSize kernel(data.a, data.c)
CUDA.synchronize()
end
function mul!(data::CuData{T}, _) where {T}
function kernel(b::AbstractArray{T}, c::AbstractArray{T}, scalar::T)
i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
@inbounds b[i] = scalar * c[i]
return
end
@cuda blocks = data.size ÷ TBSize threads = TBSize kernel(data.b, data.c, data.scalar)
CUDA.synchronize()
end
function add!(data::CuData{T}, _) where {T}
function kernel(a::AbstractArray{T}, b::AbstractArray{T}, c::AbstractArray{T})
i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
@inbounds c[i] = a[i] + b[i]
return
end
@cuda blocks = data.size ÷ TBSize threads = TBSize kernel(data.a, data.b, data.c)
CUDA.synchronize()
end
function triad!(data::CuData{T}, _) where {T}
function kernel(a::AbstractArray{T}, b::AbstractArray{T}, c::AbstractArray{T}, scalar::T)
i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
@inbounds a[i] = b[i] + (scalar * c[i])
return
end
@cuda blocks = data.size ÷ TBSize threads = TBSize kernel(
data.a,
data.b,
data.c,
data.scalar,
)
CUDA.synchronize()
end
function nstream!(data::CuData{T}, _) where {T}
function kernel(a::AbstractArray{T}, b::AbstractArray{T}, c::AbstractArray{T}, scalar::T)
i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
@inbounds a[i] += b[i] + scalar * c[i]
return
end
@cuda blocks = data.size ÷ TBSize threads = TBSize kernel(
data.a,
data.b,
data.c,
data.scalar,
)
CUDA.synchronize()
end
function dot(data::CuData{T}, _) where {T}
# direct port of the reduction in CUDAStream.cu
function kernel(a::AbstractArray{T}, b::AbstractArray{T}, size::Int, partial::AbstractArray{T})
tb_sum = @cuStaticSharedMem(T, TBSize)
local_i = threadIdx().x
@inbounds tb_sum[local_i] = 0.0
# do dot first
i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
while i <= size
@inbounds tb_sum[local_i] += a[i] * b[i]
i += blockDim().x * gridDim().x
end
# then tree reduction
offset = blockDim().x ÷ 2
while offset > 0
sync_threads()
if (local_i - 1) < offset
@inbounds tb_sum[local_i] += tb_sum[local_i+offset]
end
offset ÷= 2
end
if (local_i == 1)
@inbounds partial[blockIdx().x] = tb_sum[local_i]
end
return
end
partial_sum = CuArray{T}(undef, DotBlocks)
@cuda blocks = DotBlocks threads = TBSize kernel(data.a, data.b, data.size, partial_sum)
return sum(partial_sum)
end
function read_data(data::CuData{T}, _)::VectorData{T} where {T}
return VectorData{T}(data.a, data.b, data.c, data.scalar, data.size)
end
main()

View File

@ -0,0 +1,85 @@
using Distributed
@everywhere using Pkg
@everywhere Pkg.activate("."; io = devnull) # don't spam `Activating environment at...`
@everywhere include("StreamData.jl")
@everywhere include("Stream.jl")
@everywhere using SharedArrays
@everywhere const SharedArrayData = StreamData{T,SharedArray{T}} where {T}
function devices()::Vector{DeviceWithRepr}
return [(undef, "CPU (localhost) $(nworkers())P", "Distributed.jl")]
end
function make_stream(
arraysize::Int,
scalar::T,
_::DeviceWithRepr,
silent::Bool,
)::Tuple{SharedArrayData{T},Nothing} where {T}
if !silent
println("Using max $(nworkers()) process(es) + 1 master")
end
return (
SharedArrayData{T}(
SharedArray{T}(arraysize),
SharedArray{T}(arraysize),
SharedArray{T}(arraysize),
scalar,
arraysize,
),
nothing,
)
end
function init_arrays!(data::SharedArrayData{T}, _, init::Tuple{T,T,T}) where {T}
@sync @distributed for i = 1:data.size
@inbounds data.a[i] = init[1]
@inbounds data.b[i] = init[2]
@inbounds data.c[i] = init[3]
end
end
function copy!(data::SharedArrayData{T}, _) where {T}
@sync @distributed for i = 1:data.size
@inbounds data.c[i] = data.a[i]
end
end
function mul!(data::SharedArrayData{T}, _) where {T}
@sync @distributed for i = 1:data.size
@inbounds data.b[i] = data.scalar * data.c[i]
end
end
function add!(data::SharedArrayData{T}, _) where {T}
@sync @distributed for i = 1:data.size
@inbounds data.c[i] = data.a[i] + data.b[i]
end
end
function triad!(data::SharedArrayData{T}, _) where {T}
@sync @distributed for i = 1:data.size
@inbounds data.a[i] = data.b[i] + (data.scalar * data.c[i])
end
end
function nstream!(data::SharedArrayData{T}, _) where {T}
@sync @distributed for i = 1:data.size
@inbounds data.a[i] += data.b[i] + data.scalar * data.c[i]
end
end
function dot(data::SharedArrayData{T}, _) where {T}
return @distributed (+) for i = 1:data.size
@inbounds data.a[i] * data.b[i]
end
end
function read_data(data::SharedArrayData{T}, _)::VectorData{T} where {T}
return VectorData{T}(data.a, data.b, data.c, data.scalar, data.size)
end
main()

View File

@ -0,0 +1,4 @@
module JuliaStream
end
println("Please run benchmarks directly via `julia --project src/<IMPL>Stream.jl`")

View File

@ -0,0 +1,255 @@
using ROCKernels, CUDAKernels, KernelAbstractions, CUDA, AMDGPU
include("Stream.jl")
const CuData = StreamData{T,CUDA.CuArray{T}} where {T}
const ROCData = StreamData{T,AMDGPU.ROCArray{T}} where {T}
const TBSize = 1024::Int
const DotBlocks = 256::Int
@enum Backend cuda rocm cpu
struct Context
backend::Backend
device::Device
end
function list_rocm_devices()::Vector{DeviceWithRepr}
try
# AMDGPU.agents()'s internal iteration order isn't stable
sorted = sort(AMDGPU.get_agents(:gpu), by = repr)
map(x -> (x, repr(x), rocm), sorted)
catch
# probably unsupported
[]
end
end
function list_cuda_devices()::Vector{DeviceWithRepr}
return !CUDA.functional(false) ? String[] :
map(d -> (d, "$(CUDA.name(d)) ($(repr(d)))", cuda), CUDA.devices())
end
function devices()::Vector{DeviceWithRepr}
cudas = list_cuda_devices()
rocms = list_rocm_devices()
cpus = [(undef, "$(Sys.cpu_info()[1].model) ($(Threads.nthreads())T)", cpu)]
vcat(cpus, cudas, rocms)
end
function make_stream(
arraysize::Int,
scalar::T,
device::DeviceWithRepr,
silent::Bool,
) where {T}
if arraysize % TBSize != 0
error("arraysize ($(arraysize)) must be divisible by $(TBSize)!")
end
(selected, _, backend) = device
if backend == cpu
if !silent
println("Using CPU with max $(Threads.nthreads()) threads")
end
partialsum = Vector{T}(undef, DotBlocks)
data = VectorData{T}(
Vector{T}(undef, arraysize),
Vector{T}(undef, arraysize),
Vector{T}(undef, arraysize),
scalar,
arraysize,
)
backenddevice = CPU()
elseif backend == cuda
CUDA.device!(selected)
if CUDA.device() != selected
error("Cannot select CUDA device, expecting $selected, but got $(CUDA.device())")
end
if !CUDA.functional(true)
error("Non-functional CUDA configuration")
end
if !silent
println("Using CUDA device: $(CUDA.name(selected)) ($(repr(selected)))")
end
partialsum = CuArray{T}(undef, DotBlocks)
data = CuData{T}(
CuArray{T}(undef, arraysize),
CuArray{T}(undef, arraysize),
CuArray{T}(undef, arraysize),
scalar,
arraysize,
)
backenddevice = CUDADevice()
elseif backend == rocm
AMDGPU.DEFAULT_AGENT[] = selected
if AMDGPU.get_default_agent() != selected
error(
"Cannot select HSA device, expecting $selected, but got $(AMDGPU.get_default_agent())",
)
end
if !silent
println("Using GPU HSA device: $(AMDGPU.get_name(selected)) ($(repr(selected)))")
end
partialsum = ROCArray{T}(undef, DotBlocks)
data = ROCData{T}(
ROCArray{T}(undef, arraysize),
ROCArray{T}(undef, arraysize),
ROCArray{T}(undef, arraysize),
scalar,
arraysize,
)
backenddevice = ROCDevice()
else
error("unsupported backend $(backend)")
end
if !silent
println("Kernel parameters : <<<$(data.size),$(TBSize)>>>")
end
return (data, Context(backend, backenddevice))
end
function init_arrays!(
data::StreamData{T,C},
context::Context,
init::Tuple{T,T,T},
) where {T,C}
if context.backend == cpu
Threads.@threads for i = 1:data.size
@inbounds data.a[i] = init[1]
@inbounds data.b[i] = init[2]
@inbounds data.c[i] = init[3]
end
elseif context.backend == cuda
CUDA.fill!(data.a, init[1])
CUDA.fill!(data.b, init[2])
CUDA.fill!(data.c, init[3])
elseif context.backend == rocm
AMDGPU.fill!(data.a, init[1])
AMDGPU.fill!(data.b, init[2])
AMDGPU.fill!(data.c, init[3])
else
error("unsupported backend $(backend)")
end
end
function copy!(data::StreamData{T,C}, context::Context) where {T,C}
@kernel function kernel(@Const(a::AbstractArray{T}), c)
i = @index(Global)
@inbounds c[i] = a[i]
end
wait(kernel(context.device, TBSize)(data.a, data.c, ndrange = data.size))
end
function mul!(data::StreamData{T,C}, context::Context) where {T,C}
@kernel function kernel(b::AbstractArray{T}, @Const(c::AbstractArray{T}), scalar::T)
i = @index(Global)
@inbounds b[i] = scalar * c[i]
end
wait(kernel(context.device, TBSize)(data.b, data.c, data.scalar, ndrange = data.size))
end
function add!(data::StreamData{T,C}, context::Context) where {T,C}
@kernel function kernel(@Const(a::AbstractArray{T}), @Const(b::AbstractArray{T}), c)
i = @index(Global)
@inbounds c[i] = a[i] + b[i]
end
wait(kernel(context.device, TBSize)(data.a, data.b, data.c, ndrange = data.size))
end
function triad!(data::StreamData{T,C}, context::Context) where {T,C}
@kernel function kernel(a::AbstractArray{T}, @Const(b::AbstractArray{T}), @Const(c), scalar::T)
i = @index(Global)
@inbounds a[i] = b[i] + (scalar * c[i])
end
wait(
kernel(context.device, TBSize)(
data.a,
data.b,
data.c,
data.scalar,
ndrange = data.size,
),
)
end
function nstream!(data::StreamData{T,C}, context::Context) where {T,C}
@kernel function kernel(a::AbstractArray{T}, @Const(b::AbstractArray{T}), @Const(c), scalar::T)
i = @index(Global)
@inbounds a[i] += b[i] + scalar * c[i]
end
wait(
kernel(context.device, TBSize)(
data.a,
data.b,
data.c,
data.scalar,
ndrange = data.size,
),
)
end
function dot(data::StreamData{T,C}, context::Context) where {T,C}
@kernel function kernel(@Const(a::AbstractArray{T}), @Const(b::AbstractArray{T}), size::Int, partial::AbstractArray{T})
local_i = @index(Local)
group_i = @index(Group)
tb_sum = @localmem T TBSize
@inbounds tb_sum[local_i] = 0.0
# do dot first
i = @index(Global)
while i <= size
@inbounds tb_sum[local_i] += a[i] * b[i]
i += TBSize * DotBlocks
end
# then tree reduction
# FIXME this does not compile when targeting CPUs:
# see https://github.com/JuliaGPU/KernelAbstractions.jl/issues/262
offset = @private Int64 (1,)
@inbounds begin
offset[1] = @groupsize()[1] ÷ 2
while offset[1] > 0
@synchronize
if (local_i - 1) < offset[1]
tb_sum[local_i] += tb_sum[local_i+offset[1]]
end
offset[1] ÷= 2
end
end
if (local_i == 1)
@inbounds partial[group_i] = tb_sum[local_i]
end
end
if context.backend == cpu
partial_sum = Vector{T}(undef, DotBlocks)
elseif context.backend == cuda
partial_sum = CuArray{T}(undef, DotBlocks)
elseif context.backend == rocm
partial_sum = ROCArray{T}(undef, DotBlocks)
else
error("unsupported backend $(backend)")
end
wait(
kernel(context.device, TBSize)(
data.a,
data.b,
data.size,
partial_sum,
ndrange = TBSize * DotBlocks,
),
)
return sum(partial_sum)
end
function read_data(data::StreamData{T,C}, _::Context)::VectorData{T} where {T,C}
return VectorData{T}(data.a, data.b, data.c, data.scalar, data.size)
end
main()

View File

@ -0,0 +1,75 @@
include("Stream.jl")
function devices()::Vector{DeviceWithRepr}
return [(undef, "CPU", "Palin")]
end
function make_stream(
arraysize::Int,
scalar::T,
_::DeviceWithRepr,
silent::Bool,
)::Tuple{VectorData{T},Nothing} where {T}
return (
VectorData{T}(
Vector{T}(undef, arraysize),
Vector{T}(undef, arraysize),
Vector{T}(undef, arraysize),
scalar,
arraysize,
),
nothing
)
end
function init_arrays!(data::VectorData{T}, _, init::Tuple{T,T,T}) where {T}
for i = 1:data.size
@inbounds data.a[i] = init[1]
@inbounds data.b[i] = init[2]
@inbounds data.c[i] = init[3]
end
end
function copy!(data::VectorData{T}, _) where {T}
for i = 1:data.size
@inbounds data.c[i] = data.a[i]
end
end
function mul!(data::VectorData{T}, _) where {T}
for i = 1:data.size
@inbounds data.b[i] = data.scalar * data.c[i]
end
end
function add!(data::VectorData{T}, _) where {T}
for i = 1:data.size
@inbounds data.c[i] = data.a[i] + data.b[i]
end
end
function triad!(data::VectorData{T}, _) where {T}
for i = 1:data.size
@inbounds data.a[i] = data.b[i] + (data.scalar * data.c[i])
end
end
function nstream!(data::VectorData{T}, _) where {T}
for i = 1:data.size
@inbounds data.a[i] += data.b[i] + data.scalar * data.c[i]
end
end
function dot(data::VectorData{T}, _) where {T}
sum = zero(T)
for i = 1:data.size
@inbounds sum += data.a[i] * data.b[i]
end
return sum
end
function read_data(data::VectorData{T}, _)::VectorData{T} where {T}
return data
end
main()

View File

@ -0,0 +1,300 @@
using ArgParse
using Parameters
using Printf
using Base: Float64, Int
include("StreamData.jl")
const VectorData = StreamData{T,Vector{T}} where {T}
const DeviceWithRepr = Tuple{Any,String,Any}
struct Timings
copy::Vector{Float64}
mul::Vector{Float64}
add::Vector{Float64}
triad::Vector{Float64}
dot::Vector{Float64}
Timings(n) = new(zeros(n), zeros(n), zeros(n), zeros(n), zeros(n))
end
@enum Benchmark All Triad Nstream
function run_all!(data::StreamData{T,C}, context, times::Int)::Tuple{Timings,T} where {T,C}
timings = Timings(times)
lastSum::T = 0
for i = 1:times
@inbounds timings.copy[i] = @elapsed copy!(data, context)
@inbounds timings.mul[i] = @elapsed mul!(data, context)
@inbounds timings.add[i] = @elapsed add!(data, context)
@inbounds timings.triad[i] = @elapsed triad!(data, context)
@inbounds timings.dot[i] = @elapsed lastSum = dot(data, context)
end
return (timings, lastSum)
end
function run_triad!(data::StreamData{T,C}, context, times::Int)::Float64 where {T,C}
return @elapsed for _ = 1:times
triad!(data, context)
end
end
function run_nstream!(
data::StreamData{T,C},
context,
times::Int,
)::Vector{Float64} where {T,C}
timings::Vector{Float64} = zeros(times)
for i = 1:times
@inbounds timings[i] = @elapsed nstream!(data, context)
end
return timings
end
function check_solutions(
data::StreamData{T,C},
times::Int,
init::Tuple{T,T,T},
benchmark::Benchmark,
dot::Union{T,Nothing},
) where {T,C}
(gold_a, gold_b, gold_c) = init
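  # replay each kernel's arithmetic on scalar "gold" values to compute the expected results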
for _ = 1:times
if benchmark == All
gold_c = gold_a
gold_b = data.scalar * gold_c
gold_c = gold_a + gold_b
gold_a = gold_b + data.scalar * gold_c
elseif benchmark == Triad
gold_a = gold_b + data.scalar * gold_c
elseif benchmark == Nstream
gold_a += gold_b + data.scalar * gold_c
else
error("Unknown benchmark", benchmark)
end
end
tolerance = eps(T) * 100
function validate_xs(name::String, xs::AbstractArray{T}, from::T)
error = (map(x -> abs(x - from), xs) |> sum) / length(xs)
failed = error > tolerance
if failed
println("Validation failed on $name. Average error $error")
end
!failed
end
a_valid = validate_xs("a", data.a, gold_a)
b_valid = validate_xs("b", data.b, gold_b)
c_valid = validate_xs("c", data.c, gold_c)
dot_valid =
dot !== nothing ?
begin
gold_sum = gold_a * gold_b * data.size
error = abs((dot - gold_sum) / gold_sum)
failed = error > 1.0e-8
if failed
println(
"Validation failed on sum. Error $error \nSum was $dot but should be $gold_sum",
)
end
!failed
end : true
a_valid && b_valid && c_valid && dot_valid
end
@with_kw mutable struct Config
list::Bool = false
device::Int = 1
numtimes::Int = 100
arraysize::Int = 33554432
float::Bool = false
triad_only::Bool = false
nstream_only::Bool = false
csv::Bool = false
mibibytes::Bool = false
end
function parse_options(given::Config)
s = ArgParseSettings()
@add_arg_table s begin
"--list"
help = "List available devices"
action = :store_true
"--device", "-d"
help = "Select device at DEVICE, NOTE: Julia is 1-indexed"
arg_type = Int
default = given.device
"--numtimes", "-n"
help = "Run the test NUMTIMES times (NUM >= 2)"
arg_type = Int
default = given.numtimes
"--arraysize", "-s"
help = "Use ARRAYSIZE elements in the array"
arg_type = Int
default = given.arraysize
"--float"
help = "Use floats (rather than doubles)"
action = :store_true
"--triad_only"
help = "Only run triad"
action = :store_true
"--nstream_only"
help = "Only run nstream"
action = :store_true
"--csv"
help = "Output as csv table"
action = :store_true
"--mibibytes"
help = "Use MiB=2^20 for bandwidth calculation (default MB=10^6)"
action = :store_true
end
args = parse_args(s)
# surely there's a better way than doing this:
for (arg, val) in args
setproperty!(given, Symbol(arg), val)
end
end
const DefaultInit = (0.1, 0.2, 0.0)
const DefaultScalar = 0.4
const Version = "3.4.0"
function main()
config::Config = Config()
parse_options(config)
if config.list
    for (i, (_, repr, impl)) in enumerate(devices())
println("[$i] ($impl) $repr")
end
exit(0)
end
ds = devices()
# TODO implement substring device match
if config.device < 1 || config.device > length(ds)
error(
"Device $(config.device) out of range (1..$(length(ds))), NOTE: Julia is 1-indexed",
)
else
device = ds[config.device]
end
type = config.float ? Float32 : Float64
if config.nstream_only && !config.triad_only
benchmark = Nstream
elseif !config.nstream_only && config.triad_only
benchmark = Triad
elseif !config.nstream_only && !config.triad_only
benchmark = All
elseif config.nstream_only && config.triad_only
error("Both triad and nstream are enabled, pick one or omit both to run all benchmarks")
else
error("Invalid config: $(repr(config))")
end
array_bytes = config.arraysize * sizeof(type)
total_bytes = array_bytes * 3
(mega_scale, mega_suffix, giga_scale, giga_suffix) =
!config.mibibytes ? (1.0e-6, "MB", 1.0e-9, "GB") : (2^-20, "MiB", 2^-30, "GiB")
if !config.csv
println("""BabelStream
Version: $Version
Implementation: Julia; $(PROGRAM_FILE)""")
println("Running kernels $(config.numtimes) times")
if benchmark == Triad
println("Number of elements: $(config.arraysize)")
end
println("Precision: $(config.float ? "float" : "double")")
r1 = n -> round(n; digits = 1)
println(
"Array size: $(r1(mega_scale * array_bytes)) $mega_suffix(=$(r1(giga_scale * array_bytes)) $giga_suffix)",
)
println(
"Total size: $(r1(mega_scale * total_bytes)) $mega_suffix(=$(r1(giga_scale * total_bytes)) $giga_suffix)",
)
end
function mk_row(xs::Vector{Float64}, name::String, total_bytes::Int)
tail = Base.rest(xs)
min = Iterators.minimum(tail)
max = Iterators.maximum(tail)
avg = Iterators.sum(tail) / Iterators.length(tail)
mbps = mega_scale * total_bytes / min
if config.csv
return [
("function", name),
("num_times", config.numtimes),
("n_elements", config.arraysize),
("sizeof", total_bytes),
("max_m$( config.mibibytes ? "i" : "")bytes_per_sec", mbps),
("min_runtime", min),
("max_runtime", max),
("avg_runtime", avg),
]
else
return [
("Function", name),
("M$(config.mibibytes ? "i" : "")Bytes/sec", round(mbps; digits = 3)),
("Min (sec)", round(min; digits = 5)),
("Max", round(max; digits = 5)),
("Average", round(avg; digits = 5)),
]
end
end
function tabulate(rows::Vector{Tuple{String,Any}}...)
header = Base.first(rows)
padding = config.csv ? 0 : 12
sep = config.csv ? "," : ""
map(x -> rpad(x[1], padding), header) |> x -> join(x, sep) |> println
for row in rows
map(x -> rpad(x[2], padding), row) |> x -> join(x, sep) |> println
end
end
init::Tuple{type,type,type} = DefaultInit
scalar::type = DefaultScalar
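  # disable GC for the duration of the benchmark so collections don't skew the kernel timings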
GC.enable(false)
(data, context) = make_stream(config.arraysize, scalar, device, config.csv)
init_arrays!(data, context, init)
if benchmark == All
(timings, sum) = run_all!(data, context, config.numtimes)
valid = check_solutions(read_data(data, context), config.numtimes, init, benchmark, sum)
tabulate(
mk_row(timings.copy, "Copy", 2 * array_bytes),
mk_row(timings.mul, "Mul", 2 * array_bytes),
mk_row(timings.add, "Add", 3 * array_bytes),
mk_row(timings.triad, "Triad", 3 * array_bytes),
mk_row(timings.dot, "Dot", 2 * array_bytes),
)
elseif benchmark == Nstream
timings = run_nstream!(data, context, config.numtimes)
valid =
check_solutions(read_data(data, context), config.numtimes, init, benchmark, nothing)
tabulate(mk_row(timings, "Nstream", 4 * array_bytes))
elseif benchmark == Triad
elapsed = run_triad!(data, context, config.numtimes)
valid =
check_solutions(read_data(data, context), config.numtimes, init, benchmark, nothing)
total_bytes = 3 * array_bytes * config.numtimes
    bandwidth = giga_scale * (total_bytes / elapsed)
    println("Runtime (seconds): $(round(elapsed; digits=5))")
    println("Bandwidth ($giga_suffix/s): $(round(bandwidth; digits=3))")
else
error("Bad benchmark $(benchmark)")
end
GC.enable(true)
if !valid
exit(1)
end
end

View File

@ -0,0 +1,7 @@
struct StreamData{T,C<:AbstractArray{T}}
a::C
b::C
c::C
scalar::T
size::Int
end

View File

@ -0,0 +1,112 @@
include("Stream.jl")
function devices()::Vector{DeviceWithRepr}
return [(undef, "$(Sys.cpu_info()[1].model) ($(Threads.nthreads())T)", "Threaded")]
end
function make_stream(
arraysize::Int,
scalar::T,
_::DeviceWithRepr,
silent::Bool,
)::Tuple{VectorData{T},Nothing} where {T}
if !silent
println("Using max $(Threads.nthreads()) threads")
end
return (
VectorData{T}(
Vector{T}(undef, arraysize),
Vector{T}(undef, arraysize),
Vector{T}(undef, arraysize),
scalar,
arraysize,
),
nothing
)
end
function init_arrays!(data::VectorData{T}, _, init::Tuple{T,T,T}) where {T}
Threads.@threads for i = 1:data.size
@inbounds data.a[i] = init[1]
@inbounds data.b[i] = init[2]
@inbounds data.c[i] = init[3]
end
end
function copy!(data::VectorData{T}, _) where {T}
Threads.@threads for i = 1:data.size
@inbounds data.c[i] = data.a[i]
end
end
function mul!(data::VectorData{T}, _) where {T}
Threads.@threads for i = 1:data.size
@inbounds data.b[i] = data.scalar * data.c[i]
end
end
function add!(data::VectorData{T}, _) where {T}
Threads.@threads for i = 1:data.size
@inbounds data.c[i] = data.a[i] + data.b[i]
end
end
function triad!(data::VectorData{T}, _) where {T}
Threads.@threads for i = 1:data.size
@inbounds data.a[i] = data.b[i] + (data.scalar * data.c[i])
end
end
function nstream!(data::VectorData{T}, _) where {T}
Threads.@threads for i = 1:data.size
@inbounds data.a[i] += data.b[i] + data.scalar * data.c[i]
end
end
# Threads.@threads/Threads.@spawn doesn't support OpenMP's firstprivate, etc
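# static_par_ranged splits 1:range into n contiguous chunks and runs f(group, firstidx, lastidx) on a sticky task pinned to each thread (a hand-rolled static schedule)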
function static_par_ranged(f::Function, range::Int, n::Int)
stride = range ÷ n
rem = range % n
strides = map(0:n) do i
width = stride + (i < rem ? 1 : 0)
offset = i < rem ? (stride + 1) * i : ((stride + 1) * rem) + (stride * (i - rem))
(offset, width)
end
ccall(:jl_enter_threaded_region, Cvoid, ())
try
foreach(wait, map(1:n) do group
(offset, size) = strides[group]
task = Task(() -> f(group, offset+1, offset+size))
task.sticky = true
ccall(:jl_set_task_tid, Cvoid, (Any, Cint), task, group-1) # ccall, so 0-based for group
schedule(task)
end)
finally
ccall(:jl_exit_threaded_region, Cvoid, ())
end
end
function dot(data::VectorData{T}, _) where {T}
partial = Vector{T}(undef, Threads.nthreads())
static_par_ranged(data.size, Threads.nthreads()) do group, startidx, endidx
acc = zero(T)
@simd for i = startidx:endidx
@inbounds acc += data.a[i] * data.b[i]
end
@inbounds partial[group] = acc
end
return sum(partial)
# This doesn't do well on aarch64 because of the excessive Threads.threadid() ccall
# and inhibited vectorisation from the lack of @simd
# partial = zeros(T, Threads.nthreads())
# Threads.@threads for i = 1:data.size
# @inbounds partial[Threads.threadid()] += (data.a[i] * data.b[i])
# end
# return sum(partial)
end
function read_data(data::VectorData{T}, _)::VectorData{T} where {T}
return data
end
main()

View File

@ -0,0 +1,170 @@
include("Stream.jl")
using oneAPI
const oneData = StreamData{T,oneArray{T}} where {T}
const DotWGSize = 256::Int
function devices()::Vector{DeviceWithRepr}
all = map(oneL0.devices, oneL0.drivers()) |> Iterators.flatten |> Iterators.collect
map(dev -> (dev, repr("text/plain", dev), "oneAPi.jl"), all)
end
function make_stream(
arraysize::Int,
scalar::T,
device::DeviceWithRepr,
silent::Bool,
)::Tuple{oneData{T},Int} where {T}
oneAPI.allowscalar(false)
oneAPI.device!(device[1])
props = oneL0.compute_properties(oneAPI.device())
groupsize = min(props.maxTotalGroupSize, arraysize)
if arraysize % groupsize != 0
error("arraysize ($(arraysize)) must be divisible by $(groupsize)!")
end
if !silent
println("Using L0 device: $(repr("text/plain",device[1]))")
println("Kernel parameters : <<<$(arraysize),$(groupsize)>>>")
end
return (
oneData{T}(
oneArray{T}(undef, arraysize),
oneArray{T}(undef, arraysize),
oneArray{T}(undef, arraysize),
scalar,
arraysize,
),
groupsize,
)
end
function init_arrays!(data::oneData{T}, _, init::Tuple{T,T,T}) where {T}
oneAPI.fill!(data.a, init[1])
oneAPI.fill!(data.b, init[2])
oneAPI.fill!(data.c, init[3])
end
function copy!(data::oneData{T}, groupsize::Int) where {T}
function kernel(a::AbstractArray{T}, c::AbstractArray{T})
i = get_global_id()
@inbounds c[i] = a[i]
return
end
@oneapi items = groupsize groups = data.size ÷ groupsize kernel( #
data.a,
data.c,
)
oneAPI.synchronize()
end
function mul!(data::oneData{T}, groupsize::Int) where {T}
function kernel(b::AbstractArray{T}, c::AbstractArray{T}, scalar::T)
i = get_global_id()
@inbounds b[i] = scalar * c[i]
return
end
@oneapi items = groupsize groups = data.size ÷ groupsize kernel( #
data.b,
data.c,
data.scalar,
)
oneAPI.synchronize()
end
function add!(data::oneData{T}, groupsize::Int) where {T}
function kernel(a::AbstractArray{T}, b::AbstractArray{T}, c::AbstractArray{T})
i = get_global_id()
@inbounds c[i] = a[i] + b[i]
return
end
@oneapi items = groupsize groups = data.size ÷ groupsize kernel( #
data.a,
data.b,
data.c,
)
oneAPI.synchronize()
end
function triad!(data::oneData{T}, groupsize::Int) where {T}
function kernel(a::AbstractArray{T}, b::AbstractArray{T}, c::AbstractArray{T}, scalar::T)
i = get_global_id()
@inbounds a[i] = b[i] + (scalar * c[i])
return
end
@oneapi items = groupsize groups = data.size ÷ groupsize kernel( #
data.a,
data.b,
data.c,
data.scalar,
)
oneAPI.synchronize()
end
function nstream!(data::oneData{T}, groupsize::Int) where {T}
function kernel(a::AbstractArray{T}, b::AbstractArray{T}, c::AbstractArray{T}, scalar::T)
i = get_global_id()
@inbounds a[i] += b[i] + scalar * c[i]
return
end
@oneapi items = groupsize groups = data.size ÷ groupsize kernel( #
data.a,
data.b,
data.c,
data.scalar,
)
oneAPI.synchronize()
end
function dot(data::oneData{T}, groupsize::Int) where {T}
function kernel(a::AbstractArray{T}, b::AbstractArray{T}, size::Int, partial::AbstractArray{T})
wg_sum = @LocalMemory(T, (DotWGSize,))
li = get_local_id()
@inbounds wg_sum[li] = 0.0
# do dot first
i = get_global_id()
while i <= size
@inbounds wg_sum[li] += a[i] * b[i]
i += get_global_size()
end
# then tree reduction
offset = get_local_size() ÷ 2
while offset > 0
barrier()
if li <= offset
@inbounds wg_sum[li] += wg_sum[li+offset]
end
offset ÷= 2
end
if li == 1
@inbounds partial[get_group_id()] = wg_sum[li]
end
return
end
  partial_sum = oneArray{T}(undef, DotWGSize)
@oneapi items = groupsize groups = DotWGSize kernel(
data.a,
data.b,
data.size,
partial_sum,
)
oneAPI.synchronize()
return sum(partial_sum)
end
function read_data(data::oneData{T}, _)::VectorData{T} where {T}
return VectorData{T}(data.a, data.b, data.c, data.scalar, data.size)
end
main()

7
JuliaStream.jl/update_all.sh Executable file
View File

@ -0,0 +1,7 @@
#!/bin/bash
# shellcheck disable=SC2034 disable=SC2153
for BACKEND in "." "AMDGPU" "CUDA" "oneAPI" "Threaded" "KernelAbstractions"
do
julia --project="$BACKEND" -e 'import Pkg; Pkg.resolve(); Pkg.instantiate(); Pkg.update(); Pkg.gc();'
done

View File

@ -19,9 +19,12 @@ Currently implemented are:
- Kokkos
- RAJA
- SYCL
- TBB
This code was previously called GPU-STREAM.
This project also contains implementations in alternative languages with different build systems:
* Scala - [scala-stream](./scala-stream)
How is this different to STREAM?
--------------------------------
@ -92,7 +95,7 @@ For example:
Use this for linking extra libraries (e.g `-lmylib`, or simply `mylib`)
-- CXX_EXTRA_LINKER_FLAGS:
Append to linker flags (i.e GCC's `-Wl` or equivalent)
-- Available models: omp;ocl;std;std20;hip;cuda;kokkos;sycl;acc;raja
-- Available models: omp;ocl;std;std20;hip;cuda;kokkos;sycl;acc;raja;tbb
-- Selected model : ocl
-- Supported flags:
@ -160,10 +163,12 @@ Citing
Please cite BabelStream via this reference:
> Deakin T, Price J, Martineau M, McIntosh-Smith S. GPU-STREAM v2.0: Benchmarking the achievable memory bandwidth of many-core processors across diverse parallel programming models. 2016. Paper presented at P^3MA Workshop at ISC High Performance, Frankfurt, Germany.
> Deakin T, Price J, Martineau M, McIntosh-Smith S. GPU-STREAM v2.0: Benchmarking the achievable memory bandwidth of many-core processors across diverse parallel programming models. 2016. Paper presented at P^3MA Workshop at ISC High Performance, Frankfurt, Germany. DOI: 10.1007/978-3-319-46079-6_34
**Other BabelStream publications:**
> Deakin T, Price J, Martineau M, McIntosh-Smith S. Evaluating attainable memory bandwidth of parallel programming models via BabelStream. International Journal of Computational Science and Engineering. Special issue. Vol. 17, No. 3, pp. 247–262. 2018. DOI: 10.1504/IJCSE.2018.095847
> Deakin T, McIntosh-Smith S. GPU-STREAM: Benchmarking the achievable memory bandwidth of Graphics Processing Units. 2015. Poster session presented at IEEE/ACM SuperComputing, Austin, United States.
You can view the [Poster and Extended Abstract](http://sc15.supercomputing.org/sites/all/themes/SC15images/tech_poster/tech_poster_pages/post150.html).
@ -172,8 +177,6 @@ You can view the [Poster and Extended Abstract](http://sc16.supercomputing.org/s
> Raman K, Deakin T, Price J, McIntosh-Smith S. Improving achieved memory bandwidth from C++ codes on Intel Xeon Phi Processor (Knights Landing). IXPUG Spring Meeting, Cambridge, UK, 2017.
> Deakin T, Price J, Martineau M, McIntosh-Smith S. Evaluating attainable memory bandwidth of parallel programming models via BabelStream. International Journal of Computational Science and Engineering. Special issue (in press). 2017.
> Deakin T, Price J, McIntosh-Smith S. Portable methods for measuring cache hierarchy performance. 2017. Poster sessions presented at IEEE/ACM SuperComputing, Denver, United States.
You can view the [Poster and Extended Abstract](http://sc17.supercomputing.org/SC17%20Archive/tech_poster/tech_poster_pages/post155.html)

29
TBB.cmake Normal file
View File

@ -0,0 +1,29 @@
register_flag_optional(ONE_TBB_DIR
"Absolute path to oneTBB (with header `onetbb/tbb.h`) distribution, the directory should contain at least `include/` and `lib/.
If unspecified, the system TBB (with header `tbb/tbb.h`) will be used via CMake's find_package(TBB)."
"")
register_flag_optional(PARTITIONER
"Partitioner specifies how a loop template should partition its work among threads.
Possible values are:
AUTO - Optimize range subdivision based on work-stealing events.
AFFINITY - Proportional splitting that optimizes for cache affinity.
STATIC - Distribute work uniformly with no additional load balancing.
SIMPLE - Recursively split its range until it cannot be further subdivided.
See https://spec.oneapi.com/versions/latest/elements/oneTBB/source/algorithms.html#partitioners for more details."
"AUTO")
macro(setup)
if(ONE_TBB_DIR)
set(TBB_ROOT "${ONE_TBB_DIR}") # see https://github.com/Kitware/VTK/blob/0a31a9a3c1531ae238ac96a372fec4be42282863/CMake/FindTBB.cmake#L34
        # docs on Intel's website refer to TBB_DIR, which is not correct
endif()
# see https://github.com/oneapi-src/oneTBB/blob/master/cmake/README.md#tbbconfig---integration-of-binary-packages
find_package(TBB REQUIRED)
register_link_library(TBB::tbb)
register_definitions(PARTITIONER_${PARTITIONER})
endmacro()
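As a usage sketch, a configure line for this model might look like the one below; the `MODEL=tbb` selector is taken from the CMake output quoted in the README, while the build directory and the oneTBB path are illustrative:
```shell
# pick the static partitioner and an out-of-tree oneTBB distribution
> cmake -Bbuild -H. -DMODEL=tbb -DPARTITIONER=STATIC -DONE_TBB_DIR=/opt/intel/oneapi/tbb/latest
> cmake --build build
```
If `ONE_TBB_DIR` is omitted, the system TBB is picked up through `find_package(TBB)`, as described above.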

56
TBB.make Normal file
View File

@ -0,0 +1,56 @@
ifndef COMPILER
define compiler_help
Set COMPILER to change flags (defaulting to GNU).
Available compilers are:
GNU INTEL INTEL_LEGACY
endef
$(info $(compiler_help))
COMPILER=GNU
endif
CXX_GNU = g++
CXX_INTEL = icpx
CXX_INTEL_LEGACY = icpc
CXXFLAGS_GNU = -march=native
CXXFLAGS_INTEL = -march=native
CXXFLAGS_INTEL_LEGACY = -qopt-streaming-stores=always
CXX = $(CXX_$(COMPILER))
CXXFLAGS = -std=c++11 -O3 $(CXXFLAGS_$(COMPILER))
ifndef PARTITIONER
define partitioner_help
Set PARTITIONER to select TBB's partitioner.
Partitioner specifies how a loop template should partition its work among threads.
Available options:
AUTO - Optimize range subdivision based on work-stealing events.
AFFINITY - Proportional splitting that optimizes for cache affinity.
STATIC - Distribute work uniformly with no additional load balancing.
SIMPLE - Recursively split its range until it cannot be further subdivided.
See https://spec.oneapi.com/versions/latest/elements/oneTBB/source/algorithms.html#partitioners
for more details.
endef
$(info $(partitioner_help))
PARTITIONER=AUTO
endif
PARTITIONER_MODE = -DPARTITIONER_$(PARTITIONER)
tbb-stream: main.cpp TBBStream.cpp
$(CXX) -DTBB $(PARTITIONER_MODE) $(CXXFLAGS) $^ $(EXTRA_FLAGS) -I$(TBB_DIR)/include -Wl,-rpath,$(TBB_DIR)/lib/intel64/gcc4.8 $(TBB_DIR)/lib/intel64/gcc4.8/libtbb.so -o $@
.PHONY: clean
clean:
rm -f tbb-stream
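A possible invocation of this makefile is sketched below; the `TBB_DIR` path is illustrative and must point at a TBB distribution with the `include/` and `lib/intel64/gcc4.8/` layout that the link line above expects:
```shell
> make -f TBB.make COMPILER=GNU PARTITIONER=AUTO TBB_DIR=/opt/intel/oneapi/tbb/latest
> ./tbb-stream
```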

134
TBBStream.cpp Normal file
View File

@ -0,0 +1,134 @@
// Copyright (c) 2020 Tom Deakin
// University of Bristol HPC
//
// For full license terms please see the LICENSE file distributed with this
// source code
#include "TBBStream.hpp"
template <class T>
TBBStream<T>::TBBStream(const int ARRAY_SIZE, int device)
: partitioner(), range(0, ARRAY_SIZE), a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE)
{
if(device != 0){
throw std::runtime_error("Device != 0 is not supported by TBB");
}
std::cout << "Using TBB partitioner: " PARTITIONER_NAME << std::endl;
}
template <class T>
void TBBStream<T>::init_arrays(T initA, T initB, T initC)
{
tbb::parallel_for(range, [&](const tbb::blocked_range<size_t>& r) {
for (size_t i = r.begin(); i < r.end(); ++i) {
a[i] = initA;
b[i] = initB;
c[i] = initC;
}
}, partitioner);
}
template <class T>
void TBBStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
{
// Element-wise copy.
h_a = a;
h_b = b;
h_c = c;
}
template <class T>
void TBBStream<T>::copy()
{
tbb::parallel_for(range, [&](const tbb::blocked_range<size_t>& r) {
for (size_t i = r.begin(); i < r.end(); ++i) {
c[i] = a[i];
}
}, partitioner);
}
template <class T>
void TBBStream<T>::mul()
{
const T scalar = startScalar;
tbb::parallel_for(range, [&](const tbb::blocked_range<size_t>& r) {
for (size_t i = r.begin(); i < r.end(); ++i) {
b[i] = scalar * c[i];
}
}, partitioner);
}
template <class T>
void TBBStream<T>::add()
{
tbb::parallel_for(range, [&](const tbb::blocked_range<size_t>& r) {
for (size_t i = r.begin(); i < r.end(); ++i) {
c[i] = a[i] + b[i];
}
}, partitioner);
}
template <class T>
void TBBStream<T>::triad()
{
const T scalar = startScalar;
tbb::parallel_for(range, [&](const tbb::blocked_range<size_t>& r) {
for (size_t i = r.begin(); i < r.end(); ++i) {
a[i] = b[i] + scalar * c[i];
}
}, partitioner);
}
template <class T>
void TBBStream<T>::nstream()
{
const T scalar = startScalar;
tbb::parallel_for(range, [&](const tbb::blocked_range<size_t>& r) {
for (size_t i = r.begin(); i < r.end(); ++i) {
a[i] += b[i] + scalar * c[i];
}
}, partitioner);
}
template <class T>
T TBBStream<T>::dot()
{
// sum += a[i] * b[i];
return
tbb::parallel_reduce(range, T{}, [&](const tbb::blocked_range<size_t>& r, T acc) {
for (size_t i = r.begin(); i < r.end(); ++i) {
acc += a[i] * b[i];
}
return acc;
}, std::plus<T>(), partitioner);
}
void listDevices(void)
{
std::cout << "Listing devices is not supported by TBB" << std::endl;
}
std::string getDeviceName(const int device)
{
return std::string("Device name unavailable");
}
std::string getDeviceDriver(const int)
{
return std::string("Device driver unavailable");
}
template class TBBStream<float>;
template class TBBStream<double>;

62
TBBStream.hpp Normal file
View File

@ -0,0 +1,62 @@
// Copyright (c) 2020 Tom Deakin
// University of Bristol HPC
//
// For full license terms please see the LICENSE file distributed with this
// source code
#pragma once
#include <iostream>
#include <vector>
#include "tbb/tbb.h"
#include "Stream.h"
#define IMPLEMENTATION_STRING "TBB"
#if defined(PARTITIONER_AUTO)
using tbb_partitioner = tbb::auto_partitioner;
#define PARTITIONER_NAME "auto_partitioner"
#elif defined(PARTITIONER_AFFINITY)
using tbb_partitioner = tbb::affinity_partitioner;
#define PARTITIONER_NAME "affinity_partitioner"
#elif defined(PARTITIONER_STATIC)
using tbb_partitioner = tbb::static_partitioner;
#define PARTITIONER_NAME "static_partitioner"
#elif defined(PARTITIONER_SIMPLE)
using tbb_partitioner = tbb::simple_partitioner;
#define PARTITIONER_NAME "simple_partitioner"
#else
// default to auto
using tbb_partitioner = tbb::auto_partitioner;
#define PARTITIONER_NAME "auto_partitioner"
#endif
template <class T>
class TBBStream : public Stream<T>
{
protected:
tbb_partitioner partitioner;
tbb::blocked_range<size_t> range;
// Device side pointers
std::vector<T> a;
std::vector<T> b;
std::vector<T> c;
public:
TBBStream(const int, int);
~TBBStream() = default;
virtual void copy() override;
virtual void add() override;
virtual void mul() override;
virtual void triad() override;
virtual void nstream() override;
virtual T dot() override;
virtual void init_arrays(T initA, T initB, T initC) override;
virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
};

91
THRUST.cmake Normal file
View File

@ -0,0 +1,91 @@
register_flag_optional(THRUST_IMPL
"Which Thrust implementation to use, supported options include:
- CUDA (via https://github.com/NVIDIA/thrust)
- ROCM (via https://github.com/ROCmSoftwarePlatform/rocThrust)
"
"CUDA")
register_flag_optional(SDK_DIR
"Path to the selected Thrust implementation (e.g. `/opt/nvidia/hpc_sdk/Linux_x86_64/21.9/cuda/include` for NVHPC, `/opt/rocm` for ROCm)"
"")
register_flag_optional(BACKEND
"[THRUST_IMPL==CUDA] CUDA's Thrust implementation supports the following backends:
- CUDA
- OMP
- TBB
"
"CUDA")
register_flag_optional(CMAKE_CUDA_COMPILER
"[THRUST_IMPL==CUDA] Path to the CUDA nvcc compiler"
"")
# XXX we may want to drop this eventually and use CMAKE_CUDA_ARCHITECTURES directly
register_flag_optional(CUDA_ARCH
"[THRUST_IMPL==CUDA] Nvidia architecture, passed to nvcc via `-arch=` (e.g. `sm_70`)"
"")
register_flag_optional(CUDA_EXTRA_FLAGS
"[THRUST_IMPL==CUDA] Additional CUDA flags passed to nvcc; appended after `CUDA_ARCH`"
"")
macro(setup)
set(CMAKE_CXX_STANDARD 14)
if (${THRUST_IMPL} STREQUAL "CUDA")
# see CUDA.cmake, we're only adding a few Thrust related libraries here
if (POLICY CMP0104)
cmake_policy(SET CMP0104 OLD)
endif ()
# add -forward-unknown-to-host-compiler so that nvcc forwards flags it does not recognise to the host compiler
set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "--expt-extended-lambda -forward-unknown-to-host-compiler -arch=${CUDA_ARCH}" ${CUDA_EXTRA_FLAGS})
enable_language(CUDA)
# CMake defaults to -O2 for CUDA at Release, let's wipe that and use the global RELEASE_FLAG
# appended later
wipe_gcc_style_optimisation_flags(CMAKE_CUDA_FLAGS_${BUILD_TYPE})
message(STATUS "NVCC flags: ${CMAKE_CUDA_FLAGS} ${CMAKE_CUDA_FLAGS_${BUILD_TYPE}}")
if (SDK_DIR)
find_package(CUB REQUIRED CONFIG PATHS ${SDK_DIR}/cub)
find_package(Thrust REQUIRED CONFIG PATHS ${SDK_DIR}/thrust)
else ()
find_package(CUB REQUIRED CONFIG)
find_package(Thrust REQUIRED CONFIG)
endif ()
message(STATUS "Using Thrust backend: ${BACKEND}")
# this creates the interface that we can link to
thrust_create_target(Thrust HOST CPP DEVICE ${BACKEND})
register_link_library(Thrust)
elseif (${THRUST_IMPL} STREQUAL "ROCM")
if (SDK_DIR)
find_package(rocprim REQUIRED CONFIG PATHS ${SDK_DIR}/rocprim)
find_package(rocthrust REQUIRED CONFIG PATHS ${SDK_DIR}/rocthrust)
else ()
find_package(rocprim REQUIRED CONFIG)
find_package(rocthrust REQUIRED CONFIG)
endif ()
# for HIP we treat *.cu files as CXX otherwise CMake doesn't compile them
set_source_files_properties(${IMPL_SOURCES} PROPERTIES LANGUAGE CXX)
register_link_library(roc::rocthrust)
else ()
message(FATAL_ERROR "Unsupported THRUST_IMPL provided: ${THRUST_IMPL}")
endif ()
endmacro()
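# Illustrative configure lines; this assumes the model is selected via the project's top-level MODEL
# option (check the main CMakeLists for the exact name and case) and the SDK paths are placeholders:
#   cmake -Bbuild -H. -DMODEL=Thrust -DTHRUST_IMPL=CUDA -DBACKEND=CUDA -DCUDA_ARCH=sm_70
#   cmake -Bbuild -H. -DMODEL=Thrust -DTHRUST_IMPL=ROCM -DSDK_DIR=/opt/rocm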

235
ThrustStream.cu Normal file
View File

@ -0,0 +1,235 @@
// Copyright (c) 2020 Tom Deakin
// University of Bristol HPC
//
// For full license terms please see the LICENSE file distributed with this
// source code
#include "ThrustStream.h"
#include <thrust/inner_product.h>
#include <thrust/device_vector.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/zip_function.h>
static inline void synchronise()
{
// rocThrust doesn't synchronise between thrust calls
#if defined(THRUST_DEVICE_SYSTEM_HIP) && THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_HIP
hipDeviceSynchronize();
#endif
}
template <class T>
ThrustStream<T>::ThrustStream(const int ARRAY_SIZE, int device)
: array_size{ARRAY_SIZE}, a(array_size), b(array_size), c(array_size) {
std::cout << "Using CUDA device: " << getDeviceName(device) << std::endl;
std::cout << "Driver: " << getDeviceDriver(device) << std::endl;
std::cout << "Thrust version: " << THRUST_VERSION << std::endl;
#if THRUST_DEVICE_SYSTEM == 0
// as per Thrust docs, 0 is reserved for undefined backend
std::cout << "Thrust backend: undefined" << std::endl;
#elif THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
std::cout << "Thrust backend: CUDA" << std::endl;
#elif THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_OMP
std::cout << "Thrust backend: OMP" << std::endl;
#elif THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_TBB
std::cout << "Thrust backend: TBB" << std::endl;
#elif THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CPP
std::cout << "Thrust backend: CPP" << std::endl;
#else
#if defined(THRUST_DEVICE_SYSTEM_HIP) && THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_HIP
std::cout << "Thrust backend: HIP" << std::endl;
#else
std::cout << "Thrust backend: " << THRUST_DEVICE_SYSTEM << "(unknown)" << std::endl;
#endif
#endif
}
template <class T>
void ThrustStream<T>::init_arrays(T initA, T initB, T initC)
{
thrust::fill(a.begin(), a.end(), initA);
thrust::fill(b.begin(), b.end(), initB);
thrust::fill(c.begin(), c.end(), initC);
synchronise();
}
template <class T>
void ThrustStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
{
thrust::copy(a.begin(), a.end(), h_a.begin());
thrust::copy(b.begin(), b.end(), h_b.begin());
thrust::copy(c.begin(), c.end(), h_c.begin());
}
template <class T>
void ThrustStream<T>::copy()
{
thrust::copy(a.begin(), a.end(),c.begin());
synchronise();
}
template <class T>
void ThrustStream<T>::mul()
{
const T scalar = startScalar;
thrust::transform(
c.begin(),
c.end(),
b.begin(),
[=] __device__ __host__ (const T &ci){
return ci * scalar;
}
);
synchronise();
}
template <class T>
void ThrustStream<T>::add()
{
thrust::transform(
thrust::make_zip_iterator(thrust::make_tuple(a.begin(), b.begin())),
thrust::make_zip_iterator(thrust::make_tuple(a.end(), b.end())),
c.begin(),
thrust::make_zip_function(
[] __device__ __host__ (const T& ai, const T& bi){
return ai + bi;
})
);
synchronise();
}
template <class T>
void ThrustStream<T>::triad()
{
const T scalar = startScalar;
thrust::transform(
thrust::make_zip_iterator(thrust::make_tuple(b.begin(), c.begin())),
thrust::make_zip_iterator(thrust::make_tuple(b.end(), c.end())),
a.begin(),
thrust::make_zip_function(
[=] __device__ __host__ (const T& bi, const T& ci){
return bi + scalar * ci;
})
);
synchronise();
}
template <class T>
void ThrustStream<T>::nstream()
{
const T scalar = startScalar;
thrust::transform(
thrust::make_zip_iterator(thrust::make_tuple(a.begin(), b.begin(), c.begin())),
thrust::make_zip_iterator(thrust::make_tuple(a.end(), b.end(), c.end())),
a.begin(),
thrust::make_zip_function(
[=] __device__ __host__ (const T& ai, const T& bi, const T& ci){
return ai + bi + scalar * ci;
})
);
synchronise();
}
template <class T>
T ThrustStream<T>::dot()
{
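// inner_product performs the multiply-add reduction on the device and returns the scalar to the
// host, so the call blocks until the reduction completes and no explicit synchronise() is needed.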
return thrust::inner_product(a.begin(), a.end(), b.begin(), T{});
}
#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA || \
(defined(THRUST_DEVICE_SYSTEM_HIP) && THRUST_DEVICE_SYSTEM_HIP == THRUST_DEVICE_SYSTEM)
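// Map the device-management helpers below onto either the CUDA or HIP runtime API,
// depending on which compiler is driving Thrust.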
#ifdef __NVCC__
#define IMPL_FN__(fn) cuda ## fn
#define IMPL_TYPE__(tpe) cuda ## tpe
#elif defined(__HIP_PLATFORM_HCC__)
#define IMPL_FN__(fn) hip ## fn
#define IMPL_TYPE__(tpe) hip ## tpe ## _t
#else
# error Unsupported compiler for Thrust
#endif
void check_error(void)
{
IMPL_FN__(Error_t) err = IMPL_FN__(GetLastError());
if (err != IMPL_FN__(Success))
{
std::cerr << "Error: " << IMPL_FN__(GetErrorString(err)) << std::endl;
exit(err);
}
}
void listDevices(void)
{
// Get number of devices
int count;
IMPL_FN__(GetDeviceCount(&count));
check_error();
// Print device names
if (count == 0)
{
std::cerr << "No devices found." << std::endl;
}
else
{
std::cout << std::endl;
std::cout << "Devices:" << std::endl;
for (int i = 0; i < count; i++)
{
std::cout << i << ": " << getDeviceName(i) << std::endl;
}
std::cout << std::endl;
}
}
std::string getDeviceName(const int device)
{
IMPL_TYPE__(DeviceProp) props = {};
IMPL_FN__(GetDeviceProperties(&props, device));
check_error();
return std::string(props.name);
}
std::string getDeviceDriver(const int device)
{
IMPL_FN__(SetDevice(device));
check_error();
int driver;
IMPL_FN__(DriverGetVersion(&driver));
check_error();
return std::to_string(driver);
}
#undef IMPL_FN__
#undef IMPL_TYPE__
#else
void listDevices(void)
{
std::cout << "0: CPU" << std::endl;
}
std::string getDeviceName(const int)
{
return std::string("(device name unavailable)");
}
std::string getDeviceDriver(const int)
{
return std::string("(device driver unavailable)");
}
#endif
template class ThrustStream<float>;
template class ThrustStream<double>;

43
ThrustStream.h Normal file
View File

@ -0,0 +1,43 @@
// Copyright (c) 2020 Tom Deakin
// University of Bristol HPC
//
// For full license terms please see the LICENSE file distributed with this
// source code
#pragma once
#include <iostream>
#include <vector>
#include <thrust/device_vector.h>
#include "Stream.h"
#define IMPLEMENTATION_STRING "Thrust"
template <class T>
class ThrustStream : public Stream<T>
{
protected:
// Size of arrays
int array_size;
thrust::device_vector<T> a;
thrust::device_vector<T> b;
thrust::device_vector<T> c;
public:
ThrustStream(const int, int);
~ThrustStream() = default;
virtual void copy() override;
virtual void add() override;
virtual void mul() override;
virtual void triad() override;
virtual void nstream() override;
virtual T dot() override;
virtual void init_arrays(T initA, T initB, T initC) override;
virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
};

128
java-stream/.gitignore vendored Normal file
View File

@ -0,0 +1,128 @@
## File-based project format:
.idea
*.iws
*.iml
## Plugin-specific files:
# IntelliJ
/out/
# mpeltonen/sbt-idea plugin
.idea_modules/
# JIRA plugin
atlassian-ide-plugin.xml
# Crashlytics plugin (for Android Studio and IntelliJ)
com_crashlytics_export_strings.xml
crashlytics.properties
crashlytics-build.properties
fabric.properties
### VisualStudioCode template
.vscode/*
!.vscode/settings.json
!.vscode/tasks.json
!.vscode/launch.json
!.vscode/extensions.json
### Linux template
*~
# temporary files which can be created if a process still has a handle open of a deleted file
.fuse_hidden*
# KDE directory preferences
.directory
# Linux trash folder which might appear on any partition or disk
.Trash-*
# .nfs files are created when an open file is removed but is still being accessed
.nfs*
# Windows thumbnail cache files
Thumbs.db
ehthumbs.db
ehthumbs_vista.db
# Folder config file
Desktop.ini
# Recycle Bin used on file shares
$RECYCLE.BIN/
# Windows Installer files
*.cab
*.msi
*.msm
*.msp
# Windows shortcuts
*.lnk
### Maven template
target/
pom.xml.tag
pom.xml.releaseBackup
pom.xml.versionsBackup
pom.xml.next
release.properties
dependency-reduced-pom.xml
buildNumber.properties
.mvn/timing.properties
# Avoid ignoring Maven wrapper jar file (.jar files are usually ignored)
!/.mvn/wrapper/maven-wrapper.jar
### Java template
# Compiled class file
*.class
# Log file
*.log
# BlueJ files
*.ctxt
# Mobile Tools for Java (J2ME)
.mtj.tmp/
# Package Files #
*.jar
*.war
*.ear
*.zip
*.tar.gz
*.rar
# virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
hs_err_pid*
### macOS template
*.DS_Store
.AppleDouble
.LSOverride
# Icon must end with two \r
Icon
# Thumbnails
._*
# Files that might appear in the root of a volume
.DocumentRevisions-V100
.fseventsd
.Spotlight-V100
.TemporaryItems
.Trashes
.VolumeIcon.icns
.com.apple.timemachine.donotpresent
# Directories potentially created on remote AFP share
.AppleDB
.AppleDesktop
Network Trash Folder
Temporary Items
.apdisk
!.mvn/**/*
settings.xml

Binary file not shown.

View File

@ -0,0 +1 @@
distributionUrl=https://repo1.maven.org/maven2/org/apache/maven/apache-maven/3.5.0/apache-maven-3.5.0-bin.zip

172
java-stream/README.md Normal file
View File

@ -0,0 +1,172 @@
java-stream
===========
This is an implementation of BabelStream in Java 8. It contains the following implementations, selectable with `--impl`:
* `jdk-plain` - Single-threaded `for` loops
* `jdk-stream` - Threaded implementation using JDK8's parallel stream API
* `tornadovm` - A [TornadoVM](https://github.com/beehive-lab/TornadoVM) implementation for
PTX/OpenCL
* `aparapi` - An [Aparapi](https://git.qoto.org/aparapi/aparapi) implementation for OpenCL
### Build & Run
Prerequisites
* JDK >= 8
To run the benchmark, first build the jar:
```shell
> cd java-stream
> ./mvnw clean package
```
The jar will be located at `./target/java-stream.jar`. Run it with:
```shell
> java -version
openjdk version "11.0.11" 2021-04-20
OpenJDK Runtime Environment GraalVM CE 21.1.0 (build 11.0.11+8-jvmci-21.1-b05)
OpenJDK 64-Bit Server VM GraalVM CE 21.1.0 (build 11.0.11+8-jvmci-21.1-b05, mixed mode)
> java -jar target/java-stream.jar --help
```
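To see what is available on your machine and then run a specific implementation, something like the
following should work (the device indices reported by `--list` are machine-specific, and the sizes
here are only illustrative):
```shell
> java -jar target/java-stream.jar --list
> java -jar target/java-stream.jar --impl jdk-stream --device 0 --arraysize 2048 --numtimes 10
```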
For best results, benchmark with the following JVM flags:
```
-XX:-UseOnStackReplacement # disable OSR, not useful for this benchmark as we are measuring peak performance
-XX:-TieredCompilation # disable C1, go straight to C2
-XX:ReservedCodeCacheSize=512m # don't flush compiled code out of cache at any point
```
Worked example:
```shell
> java -XX:-UseOnStackReplacement -XX:-TieredCompilation -XX:ReservedCodeCacheSize=512m -jar target/java-stream.jar
BabelStream
Version: 3.4
Implementation: jdk-stream; (Java 11.0.11;Red Hat, Inc.; home=/usr/lib/jvm/java-11-openjdk-11.0.11.0.9-4.fc33.x86_64)
Running all 100 times
Precision: double
Array size: 268.4 MB (=0.3 GB)
Total size: 805.3 MB (=0.8 GB)
Function MBytes/sec Min (sec) Max Average
Copy 17145.538 0.03131 0.04779 0.03413
Mul 16759.092 0.03203 0.04752 0.03579
Add 19431.954 0.04144 0.05866 0.04503
Triad 19763.970 0.04075 0.05388 0.04510
Dot 26646.894 0.02015 0.03013 0.02259
```
If your OpenCL/CUDA installation is not at the default location, TornadoVM and Aparapi may fail to
detect your devices. In those cases, you may specify the library directly, for example:
```shell
> LD_PRELOAD=/opt/rocm-4.0.0/opencl/lib/libOpenCL.so.1.2 java -jar target/java-stream.jar ...
```
### Instructions for TornadoVM
The TornadoVM implementation requires running the jar on a GraalVM-based JDK with the TornadoVM
runtime installed. Follow the
official [instructions](https://github.com/beehive-lab/TornadoVM/blob/master/assembly/src/docs/10_INSTALL_WITH_GRAALVM.md)
or use the simplified steps below:
Prerequisites
* CMake >= 3.6
* GCC or clang/LLVM (GCC >= 5.5)
* Python >= 2.7
* Maven >= 3.6.3
* OpenCL headers >= 1.2 and/or CUDA SDK >= 9.0
First, get a copy of the TornadoVM source:
```shell
> cd
> git clone https://github.com/beehive-lab/TornadoVM tornadovm
```
Take note of the required GraalVM version
in `tornadovm/assembly/src/docs/10_INSTALL_WITH_GRAALVM.md`. We'll use `21.1.0` in this example.
Now, obtain a copy of GraalVM and make sure the version matches the one required by TornadoVM:
```shell
> wget https://github.com/graalvm/graalvm-ce-builds/releases/download/vm-21.1.0/graalvm-ce-java11-linux-amd64-21.1.0.tar.gz
> tar -xf graalvm-ce-java11-linux-amd64-21.1.0.tar.gz
```
Next, create `~/tornadovm/etc/sources.env` and populate the file with the following:
```shell
#!/bin/bash
export JAVA_HOME=<path to GraalVM 21.1.0 jdk>
export PATH=$PWD/bin/bin:$PATH
export TORNADO_SDK=$PWD/bin/sdk
export CMAKE_ROOT=/usr # path to CMake binary
```
Proceed to compile TornadoVM:
```shell
> cd ~/tornadovm
> . etc/sources.env
> make graal-jdk-11-plus BACKEND={ptx,opencl}
```
To test your build, source the environment file:
```shell
> source ~/tornadovm/etc/sources.env
> LD_PRELOAD=/opt/rocm-4.0.0/opencl/lib/libOpenCL.so.1.2 tornado --devices
Number of Tornado drivers: 1
Total number of OpenCL devices : 3
Tornado device=0:0
AMD Accelerated Parallel Processing -- gfx1012
Global Memory Size: 4.0 GB
Local Memory Size: 64.0 KB
Workgroup Dimensions: 3
Max WorkGroup Configuration: [1024, 1024, 1024]
Device OpenCL C version: OpenCL C 2.0
Tornado device=0:1
Portable Computing Language -- pthread-AMD Ryzen 9 3900X 12-Core Processor
Global Memory Size: 60.7 GB
Local Memory Size: 8.0 MB
Workgroup Dimensions: 3
Max WorkGroup Configuration: [4096, 4096, 4096]
Device OpenCL C version: OpenCL C 1.2 pocl
Tornado device=0:2
NVIDIA CUDA -- NVIDIA GeForce GT 710
Global Memory Size: 981.3 MB
Local Memory Size: 48.0 KB
Workgroup Dimensions: 3
Max WorkGroup Configuration: [1024, 1024, 64]
Device OpenCL C version: OpenCL C 1.2
```
You can now use TornadoVM to run java-stream:
```shell
> tornado -jar ~/java-stream/target/java-stream.jar --impl tornadovm --arraysize 65536
BabelStream
Version: 3.4
Implementation: tornadovm; (Java 11.0.11;GraalVM Community; home=~/graalvm-ce-java11-21.1.0)
Running all 100 times
Precision: double
Array size: 0.5 MB (=0.0 GB)
Total size: 1.6 MB (=0.0 GB)
Using TornadoVM device:
- Name : NVIDIA GeForce GT 710 CL_DEVICE_TYPE_GPU (available)
- Id : opencl-0-0
- Platform : NVIDIA CUDA
- Backend : OpenCL
Function MBytes/sec Min (sec) Max Average
Copy 8791.100 0.00012 0.00079 0.00015
Mul 8774.107 0.00012 0.00061 0.00014
Add 9903.313 0.00016 0.00030 0.00018
Triad 9861.031 0.00016 0.00030 0.00018
Dot 2799.465 0.00037 0.00056 0.00041
```

225
java-stream/mvnw vendored Executable file
View File

@ -0,0 +1,225 @@
#!/bin/sh
# ----------------------------------------------------------------------------
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# ----------------------------------------------------------------------------
# ----------------------------------------------------------------------------
# Maven2 Start Up Batch script
#
# Required ENV vars:
# ------------------
# JAVA_HOME - location of a JDK home dir
#
# Optional ENV vars
# -----------------
# M2_HOME - location of maven2's installed home dir
# MAVEN_OPTS - parameters passed to the Java VM when running Maven
# e.g. to debug Maven itself, use
# set MAVEN_OPTS=-Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=y,address=8000
# MAVEN_SKIP_RC - flag to disable loading of mavenrc files
# ----------------------------------------------------------------------------
if [ -z "$MAVEN_SKIP_RC" ] ; then
if [ -f /etc/mavenrc ] ; then
. /etc/mavenrc
fi
if [ -f "$HOME/.mavenrc" ] ; then
. "$HOME/.mavenrc"
fi
fi
# OS specific support. $var _must_ be set to either true or false.
cygwin=false;
darwin=false;
mingw=false
case "`uname`" in
CYGWIN*) cygwin=true ;;
MINGW*) mingw=true;;
Darwin*) darwin=true
# Use /usr/libexec/java_home if available, otherwise fall back to /Library/Java/Home
# See https://developer.apple.com/library/mac/qa/qa1170/_index.html
if [ -z "$JAVA_HOME" ]; then
if [ -x "/usr/libexec/java_home" ]; then
export JAVA_HOME="`/usr/libexec/java_home`"
else
export JAVA_HOME="/Library/Java/Home"
fi
fi
;;
esac
if [ -z "$JAVA_HOME" ] ; then
if [ -r /etc/gentoo-release ] ; then
JAVA_HOME=`java-config --jre-home`
fi
fi
if [ -z "$M2_HOME" ] ; then
## resolve links - $0 may be a link to maven's home
PRG="$0"
# need this for relative symlinks
while [ -h "$PRG" ] ; do
ls=`ls -ld "$PRG"`
link=`expr "$ls" : '.*-> \(.*\)$'`
if expr "$link" : '/.*' > /dev/null; then
PRG="$link"
else
PRG="`dirname "$PRG"`/$link"
fi
done
saveddir=`pwd`
M2_HOME=`dirname "$PRG"`/..
# make it fully qualified
M2_HOME=`cd "$M2_HOME" && pwd`
cd "$saveddir"
# echo Using m2 at $M2_HOME
fi
# For Cygwin, ensure paths are in UNIX format before anything is touched
if $cygwin ; then
[ -n "$M2_HOME" ] &&
M2_HOME=`cygpath --unix "$M2_HOME"`
[ -n "$JAVA_HOME" ] &&
JAVA_HOME=`cygpath --unix "$JAVA_HOME"`
[ -n "$CLASSPATH" ] &&
CLASSPATH=`cygpath --path --unix "$CLASSPATH"`
fi
# For MinGW, ensure paths are in UNIX format before anything is touched
if $mingw ; then
[ -n "$M2_HOME" ] &&
M2_HOME="`(cd "$M2_HOME"; pwd)`"
[ -n "$JAVA_HOME" ] &&
JAVA_HOME="`(cd "$JAVA_HOME"; pwd)`"
# TODO classpath?
fi
if [ -z "$JAVA_HOME" ]; then
javaExecutable="`which javac`"
if [ -n "$javaExecutable" ] && ! [ "`expr \"$javaExecutable\" : '\([^ ]*\)'`" = "no" ]; then
# readlink(1) is not available as standard on Solaris 10.
readLink=`which readlink`
if [ ! `expr "$readLink" : '\([^ ]*\)'` = "no" ]; then
if $darwin ; then
javaHome="`dirname \"$javaExecutable\"`"
javaExecutable="`cd \"$javaHome\" && pwd -P`/javac"
else
javaExecutable="`readlink -f \"$javaExecutable\"`"
fi
javaHome="`dirname \"$javaExecutable\"`"
javaHome=`expr "$javaHome" : '\(.*\)/bin'`
JAVA_HOME="$javaHome"
export JAVA_HOME
fi
fi
fi
if [ -z "$JAVACMD" ] ; then
if [ -n "$JAVA_HOME" ] ; then
if [ -x "$JAVA_HOME/jre/sh/java" ] ; then
# IBM's JDK on AIX uses strange locations for the executables
JAVACMD="$JAVA_HOME/jre/sh/java"
else
JAVACMD="$JAVA_HOME/bin/java"
fi
else
JAVACMD="`which java`"
fi
fi
if [ ! -x "$JAVACMD" ] ; then
echo "Error: JAVA_HOME is not defined correctly." >&2
echo " We cannot execute $JAVACMD" >&2
exit 1
fi
if [ -z "$JAVA_HOME" ] ; then
echo "Warning: JAVA_HOME environment variable is not set."
fi
CLASSWORLDS_LAUNCHER=org.codehaus.plexus.classworlds.launcher.Launcher
# traverses directory structure from process work directory to filesystem root
# first directory with .mvn subdirectory is considered project base directory
find_maven_basedir() {
if [ -z "$1" ]
then
echo "Path not specified to find_maven_basedir"
return 1
fi
basedir="$1"
wdir="$1"
while [ "$wdir" != '/' ] ; do
if [ -d "$wdir"/.mvn ] ; then
basedir=$wdir
break
fi
# workaround for JBEAP-8937 (on Solaris 10/Sparc)
if [ -d "${wdir}" ]; then
wdir=`cd "$wdir/.."; pwd`
fi
# end of workaround
done
echo "${basedir}"
}
# concatenates all lines of a file
concat_lines() {
if [ -f "$1" ]; then
echo "$(tr -s '\n' ' ' < "$1")"
fi
}
BASE_DIR=`find_maven_basedir "$(pwd)"`
if [ -z "$BASE_DIR" ]; then
exit 1;
fi
export MAVEN_PROJECTBASEDIR=${MAVEN_BASEDIR:-"$BASE_DIR"}
echo $MAVEN_PROJECTBASEDIR
MAVEN_OPTS="$(concat_lines "$MAVEN_PROJECTBASEDIR/.mvn/jvm.config") $MAVEN_OPTS"
# For Cygwin, switch paths to Windows format before running java
if $cygwin; then
[ -n "$M2_HOME" ] &&
M2_HOME=`cygpath --path --windows "$M2_HOME"`
[ -n "$JAVA_HOME" ] &&
JAVA_HOME=`cygpath --path --windows "$JAVA_HOME"`
[ -n "$CLASSPATH" ] &&
CLASSPATH=`cygpath --path --windows "$CLASSPATH"`
[ -n "$MAVEN_PROJECTBASEDIR" ] &&
MAVEN_PROJECTBASEDIR=`cygpath --path --windows "$MAVEN_PROJECTBASEDIR"`
fi
WRAPPER_LAUNCHER=org.apache.maven.wrapper.MavenWrapperMain
exec "$JAVACMD" \
$MAVEN_OPTS \
-classpath "$MAVEN_PROJECTBASEDIR/.mvn/wrapper/maven-wrapper.jar" \
"-Dmaven.home=${M2_HOME}" "-Dmaven.multiModuleProjectDirectory=${MAVEN_PROJECTBASEDIR}" \
${WRAPPER_LAUNCHER} $MAVEN_CONFIG "$@"

143
java-stream/mvnw.cmd vendored Normal file
View File

@ -0,0 +1,143 @@
@REM ----------------------------------------------------------------------------
@REM Licensed to the Apache Software Foundation (ASF) under one
@REM or more contributor license agreements. See the NOTICE file
@REM distributed with this work for additional information
@REM regarding copyright ownership. The ASF licenses this file
@REM to you under the Apache License, Version 2.0 (the
@REM "License"); you may not use this file except in compliance
@REM with the License. You may obtain a copy of the License at
@REM
@REM http://www.apache.org/licenses/LICENSE-2.0
@REM
@REM Unless required by applicable law or agreed to in writing,
@REM software distributed under the License is distributed on an
@REM "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@REM KIND, either express or implied. See the License for the
@REM specific language governing permissions and limitations
@REM under the License.
@REM ----------------------------------------------------------------------------
@REM ----------------------------------------------------------------------------
@REM Maven2 Start Up Batch script
@REM
@REM Required ENV vars:
@REM JAVA_HOME - location of a JDK home dir
@REM
@REM Optional ENV vars
@REM M2_HOME - location of maven2's installed home dir
@REM MAVEN_BATCH_ECHO - set to 'on' to enable the echoing of the batch commands
@REM MAVEN_BATCH_PAUSE - set to 'on' to wait for a key stroke before ending
@REM MAVEN_OPTS - parameters passed to the Java VM when running Maven
@REM e.g. to debug Maven itself, use
@REM set MAVEN_OPTS=-Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=y,address=8000
@REM MAVEN_SKIP_RC - flag to disable loading of mavenrc files
@REM ----------------------------------------------------------------------------
@REM Begin all REM lines with '@' in case MAVEN_BATCH_ECHO is 'on'
@echo off
@REM enable echoing by setting MAVEN_BATCH_ECHO to 'on'
@if "%MAVEN_BATCH_ECHO%" == "on" echo %MAVEN_BATCH_ECHO%
@REM set %HOME% to equivalent of $HOME
if "%HOME%" == "" (set "HOME=%HOMEDRIVE%%HOMEPATH%")
@REM Execute a user defined script before this one
if not "%MAVEN_SKIP_RC%" == "" goto skipRcPre
@REM check for pre script, once with legacy .bat ending and once with .cmd ending
if exist "%HOME%\mavenrc_pre.bat" call "%HOME%\mavenrc_pre.bat"
if exist "%HOME%\mavenrc_pre.cmd" call "%HOME%\mavenrc_pre.cmd"
:skipRcPre
@setlocal
set ERROR_CODE=0
@REM To isolate internal variables from possible post scripts, we use another setlocal
@setlocal
@REM ==== START VALIDATION ====
if not "%JAVA_HOME%" == "" goto OkJHome
echo.
echo Error: JAVA_HOME not found in your environment. >&2
echo Please set the JAVA_HOME variable in your environment to match the >&2
echo location of your Java installation. >&2
echo.
goto error
:OkJHome
if exist "%JAVA_HOME%\bin\java.exe" goto init
echo.
echo Error: JAVA_HOME is set to an invalid directory. >&2
echo JAVA_HOME = "%JAVA_HOME%" >&2
echo Please set the JAVA_HOME variable in your environment to match the >&2
echo location of your Java installation. >&2
echo.
goto error
@REM ==== END VALIDATION ====
:init
@REM Find the project base dir, i.e. the directory that contains the folder ".mvn".
@REM Fallback to current working directory if not found.
set MAVEN_PROJECTBASEDIR=%MAVEN_BASEDIR%
IF NOT "%MAVEN_PROJECTBASEDIR%"=="" goto endDetectBaseDir
set EXEC_DIR=%CD%
set WDIR=%EXEC_DIR%
:findBaseDir
IF EXIST "%WDIR%"\.mvn goto baseDirFound
cd ..
IF "%WDIR%"=="%CD%" goto baseDirNotFound
set WDIR=%CD%
goto findBaseDir
:baseDirFound
set MAVEN_PROJECTBASEDIR=%WDIR%
cd "%EXEC_DIR%"
goto endDetectBaseDir
:baseDirNotFound
set MAVEN_PROJECTBASEDIR=%EXEC_DIR%
cd "%EXEC_DIR%"
:endDetectBaseDir
IF NOT EXIST "%MAVEN_PROJECTBASEDIR%\.mvn\jvm.config" goto endReadAdditionalConfig
@setlocal EnableExtensions EnableDelayedExpansion
for /F "usebackq delims=" %%a in ("%MAVEN_PROJECTBASEDIR%\.mvn\jvm.config") do set JVM_CONFIG_MAVEN_PROPS=!JVM_CONFIG_MAVEN_PROPS! %%a
@endlocal & set JVM_CONFIG_MAVEN_PROPS=%JVM_CONFIG_MAVEN_PROPS%
:endReadAdditionalConfig
SET MAVEN_JAVA_EXE="%JAVA_HOME%\bin\java.exe"
set WRAPPER_JAR="%MAVEN_PROJECTBASEDIR%\.mvn\wrapper\maven-wrapper.jar"
set WRAPPER_LAUNCHER=org.apache.maven.wrapper.MavenWrapperMain
%MAVEN_JAVA_EXE% %JVM_CONFIG_MAVEN_PROPS% %MAVEN_OPTS% %MAVEN_DEBUG_OPTS% -classpath %WRAPPER_JAR% "-Dmaven.multiModuleProjectDirectory=%MAVEN_PROJECTBASEDIR%" %WRAPPER_LAUNCHER% %MAVEN_CONFIG% %*
if ERRORLEVEL 1 goto error
goto end
:error
set ERROR_CODE=1
:end
@endlocal & set ERROR_CODE=%ERROR_CODE%
if not "%MAVEN_SKIP_RC%" == "" goto skipRcPost
@REM check for post script, once with legacy .bat ending and once with .cmd ending
if exist "%HOME%\mavenrc_post.bat" call "%HOME%\mavenrc_post.bat"
if exist "%HOME%\mavenrc_post.cmd" call "%HOME%\mavenrc_post.cmd"
:skipRcPost
@REM pause the script if MAVEN_BATCH_PAUSE is set to 'on'
if "%MAVEN_BATCH_PAUSE%" == "on" pause
if "%MAVEN_TERMINATE_CMD%" == "on" exit %ERROR_CODE%
exit /B %ERROR_CODE%

133
java-stream/pom.xml Normal file
View File

@ -0,0 +1,133 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<artifactId>java-stream</artifactId>
<groupId>javastream</groupId>
<version>3.4.0</version>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
<junit.version>5.7.2</junit.version>
</properties>
<repositories>
<repository>
<id>universityOfManchester-graal</id>
<url>https://raw.githubusercontent.com/beehive-lab/tornado/maven-tornadovm</url>
</repository>
</repositories>
<dependencies>
<dependency>
<groupId>com.beust</groupId>
<artifactId>jcommander</artifactId>
<version>1.81</version>
</dependency>
<dependency>
<groupId>tornado</groupId>
<artifactId>tornado-api</artifactId>
<version>0.9</version>
</dependency>
<dependency>
<groupId>com.aparapi</groupId>
<artifactId>aparapi</artifactId>
<version>2.0.0</version>
<exclusions>
<!-- don't pull in the entire Scala ecosystem! -->
<exclusion>
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter-engine</artifactId>
<version>${junit.version}</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter-params</artifactId>
<version>${junit.version}</version>
<scope>test</scope>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.8.1</version>
<configuration>
<target>1.8</target>
<source>1.8</source>
<compilerArgument>-Xlint:all</compilerArgument>
<showWarnings>true</showWarnings>
<showDeprecation>true</showDeprecation>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<version>3.0.0-M5</version>
</plugin>
<plugin>
<artifactId>maven-shade-plugin</artifactId>
<version>3.2.4</version>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
<configuration>
<transformers>
<transformer
implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
<mainClass>javastream.Main</mainClass>
</transformer>
</transformers>
<filters>
<filter>
<artifact>*:*</artifact>
<excludes>
<exclude>META-INF/*.MF</exclude>
</excludes>
</filter>
</filters>
<finalName>${project.artifactId}</finalName>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<groupId>com.coveo</groupId>
<artifactId>fmt-maven-plugin</artifactId>
<version>2.9.1</version>
<executions>
<execution>
<goals>
<goal>format</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>

View File

@ -0,0 +1,45 @@
package javastream;
/**
* This class represents our Fractional typeclass. Java's type system isn't unified so we have to do
* insane things for parametric operations on fractional types.
*/
@SuppressWarnings("unchecked")
public final class FractionalMaths {
private FractionalMaths() {
throw new AssertionError();
}
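// Every operation dispatches on the boxed runtime type; only Double and Float are supported, and
// anything else throws IllegalArgumentException.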
public static <T extends Number> T from(Class<T> evidence, Number n) {
if (evidence == Double.TYPE || evidence == Double.class)
return (T) Double.valueOf(n.doubleValue());
else if (evidence == Float.TYPE || evidence == Float.class)
return (T) Float.valueOf(n.floatValue());
throw new IllegalArgumentException();
}
public static <T extends Number> T plus(T x, T y) {
if (x instanceof Double) return (T) Double.valueOf(x.doubleValue() + y.doubleValue());
else if (x instanceof Float) return (T) Float.valueOf(x.floatValue() + y.floatValue());
throw new IllegalArgumentException();
}
static <T extends Number> T minus(T x, T y) {
if (x instanceof Double) return (T) Double.valueOf(x.doubleValue() - y.doubleValue());
else if (x instanceof Float) return (T) Float.valueOf(x.floatValue() - y.floatValue());
throw new IllegalArgumentException();
}
public static <T extends Number> T times(T x, T y) {
if (x instanceof Double) return (T) Double.valueOf(x.doubleValue() * y.doubleValue());
else if (x instanceof Float) return (T) Float.valueOf(x.floatValue() * y.floatValue());
throw new IllegalArgumentException();
}
static <T extends Number> T divide(T x, T y) {
if (x instanceof Double) return (T) Double.valueOf(x.doubleValue() / y.doubleValue());
else if (x instanceof Float) return (T) Float.valueOf(x.floatValue() / y.floatValue());
throw new IllegalArgumentException();
}
}

View File

@ -0,0 +1,172 @@
package javastream;
import java.time.Duration;
import java.util.AbstractMap;
import java.util.AbstractMap.SimpleImmutableEntry;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map.Entry;
import java.util.Objects;
import java.util.function.Function;
import java.util.function.Supplier;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import javastream.Main.Config;
public abstract class JavaStream<T> {
public static final class Data<T> {
final T[] a, b, c;
public Data(T[] a, T[] b, T[] c) {
this.a = Objects.requireNonNull(a);
this.b = Objects.requireNonNull(b);
this.c = Objects.requireNonNull(c);
}
}
static final class Timings<T> {
final List<T> copy = new ArrayList<>();
final List<T> mul = new ArrayList<>();
final List<T> add = new ArrayList<>();
final List<T> triad = new ArrayList<>();
final List<T> dot = new ArrayList<>();
}
protected final Config<T> config;
protected JavaStream(Config<T> config) {
this.config = config;
}
protected abstract List<String> listDevices();
protected abstract void initArrays();
protected abstract void copy();
protected abstract void mul();
protected abstract void add();
protected abstract void triad();
protected abstract void nstream();
protected abstract T dot();
protected abstract Data<T> data();
public static class EnumeratedStream<T> extends JavaStream<T> {
protected final JavaStream<T> actual;
private final Entry<String, Function<Config<T>, JavaStream<T>>>[] options;
@SafeVarargs
@SuppressWarnings("varargs")
public EnumeratedStream(
Config<T> config, Entry<String, Function<Config<T>, JavaStream<T>>>... options) {
super(config);
this.actual = options[config.options.device].getValue().apply(config);
this.options = options;
}
@Override
protected List<String> listDevices() {
return Arrays.stream(options).map(Entry::getKey).collect(Collectors.toList());
}
@Override
public void initArrays() {
actual.initArrays();
}
@Override
public void copy() {
actual.copy();
}
@Override
public void mul() {
actual.mul();
}
@Override
public void add() {
actual.add();
}
@Override
public void triad() {
actual.triad();
}
@Override
public void nstream() {
actual.nstream();
}
@Override
public T dot() {
return actual.dot();
}
@Override
public Data<T> data() {
return actual.data();
}
}
public static Double[] boxed(double[] xs) {
return Arrays.stream(xs).boxed().toArray(Double[]::new);
}
public static Float[] boxed(float[] xs) {
return IntStream.range(0, xs.length).mapToObj(i -> xs[i]).toArray(Float[]::new);
}
private static <T> AbstractMap.SimpleImmutableEntry<Duration, T> timed(Supplier<T> f) {
long start = System.nanoTime();
T r = f.get();
long end = System.nanoTime();
return new AbstractMap.SimpleImmutableEntry<>(Duration.ofNanos(end - start), r);
}
private static Duration timed(Runnable f) {
long start = System.nanoTime();
f.run();
long end = System.nanoTime();
return Duration.ofNanos(end - start);
}
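// Runs the copy, mul, add, triad and dot kernels `times` times, recording per-kernel durations;
// the dot value from the final iteration is kept for validation.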
final SimpleImmutableEntry<Timings<Duration>, T> runAll(int times) {
Timings<Duration> timings = new Timings<>();
T lastSum = null;
for (int i = 0; i < times; i++) {
timings.copy.add(timed(this::copy));
timings.mul.add(timed(this::mul));
timings.add.add(timed(this::add));
timings.triad.add(timed(this::triad));
SimpleImmutableEntry<Duration, T> dot = timed(this::dot);
timings.dot.add(dot.getKey());
lastSum = dot.getValue();
}
return new SimpleImmutableEntry<>(timings, lastSum);
}
final Duration runTriad(int times) {
return timed(
() -> {
for (int i = 0; i < times; i++) {
triad();
}
});
}
final List<Duration> runNStream(int times) {
return IntStream.range(0, times)
.mapToObj(i -> timed(this::nstream))
.collect(Collectors.toList());
}
}

View File

@ -0,0 +1,425 @@
package javastream;
import static javastream.FractionalMaths.divide;
import static javastream.FractionalMaths.from;
import static javastream.FractionalMaths.minus;
import static javastream.FractionalMaths.plus;
import static javastream.FractionalMaths.times;
import com.beust.jcommander.JCommander;
import com.beust.jcommander.Parameter;
import java.time.Duration;
import java.util.AbstractMap.SimpleImmutableEntry;
import java.util.Arrays;
import java.util.DoubleSummaryStatistics;
import java.util.List;
import java.util.Map.Entry;
import java.util.Objects;
import java.util.Optional;
import java.util.concurrent.TimeUnit;
import java.util.function.Function;
import java.util.stream.Collectors;
import javastream.JavaStream.Data;
import javastream.JavaStream.Timings;
import javastream.aparapi.AparapiStreams;
import javastream.jdk.JdkStreams;
import javastream.jdk.PlainStream;
import javastream.tornadovm.TornadoVMStreams;
public class Main {
enum Benchmark {
NSTREAM,
TRIAD,
ALL
}
public static class Options {
@Parameter(names = "--list", description = "List available devices for all implementations")
boolean list = false;
@Parameter(
names = "--device",
description = "Select device at <device>, see --list for options")
public int device = 0;
@Parameter(
names = "--impl",
description = "Select implementation at <impl>, see --list for options")
public String impl = "";
@Parameter(
names = {"--numtimes", "-n"},
description = "Run the test <numtimes> times (NUM >= 2)")
public int numtimes = 100;
@Parameter(
names = {"--arraysize", "-s"},
description = "Use <arraysize> elements in the array")
public int arraysize = 33554432;
@Parameter(names = "--float", description = "Use floats (rather than doubles)")
public boolean useFloat = false;
@Parameter(names = "--triad-only", description = "Only run triad")
public boolean triadOnly = false;
@Parameter(names = "--nstream-only", description = "Only run nstream")
public boolean nstreamOnly = false;
@Parameter(names = "--csv", description = "Output as csv table")
public boolean csv = false;
@Parameter(
names = "--mibibytes",
description = "Use MiB=2^20 for bandwidth calculation (default MB=10^6)")
public boolean mibibytes = false;
@Parameter(names = "--dot-tolerance", description = "Tolerance for dot kernel verification")
public double dotTolerance = 1.0e-8;
public boolean isVerboseBenchmark() {
return !list && !csv;
}
}
public static final class Config<T> {
public final Options options;
public final Benchmark benchmark;
public final int typeSize;
public final Class<T> evidence;
public final T ulp, scalar, initA, initB, initC;
public Config(
Options options,
Benchmark benchmark,
int typeSize,
Class<T> evidence,
T ulp,
T scalar,
T initA,
T initB,
T initC) {
this.options = Objects.requireNonNull(options);
this.benchmark = Objects.requireNonNull(benchmark);
this.typeSize = typeSize;
this.evidence = Objects.requireNonNull(evidence);
this.ulp = Objects.requireNonNull(ulp);
this.scalar = Objects.requireNonNull(scalar);
this.initA = Objects.requireNonNull(initA);
this.initB = Objects.requireNonNull(initB);
this.initC = Objects.requireNonNull(initC);
}
}
static final class Implementation {
final String name;
final Function<Config<Float>, JavaStream<Float>> makeFloat;
final Function<Config<Double>, JavaStream<Double>> makeDouble;
Implementation(
String name,
Function<Config<Float>, JavaStream<Float>> makeFloat,
Function<Config<Double>, JavaStream<Double>> makeDouble) {
this.name = Objects.requireNonNull(name);
this.makeFloat = Objects.requireNonNull(makeFloat);
this.makeDouble = Objects.requireNonNull(makeDouble);
}
}
static <T extends Number> boolean run(
String name, Config<T> config, Function<Config<T>, JavaStream<T>> mkStream) {
Options opt = config.options;
int arrayBytes = opt.arraysize * config.typeSize;
int totalBytes = arrayBytes * 3;
String megaSuffix = opt.mibibytes ? "MiB" : "MB";
String gigaSuffix = opt.mibibytes ? "GiB" : "GB";
double megaScale = opt.mibibytes ? Math.pow(2.0, -20) : 1.0e-6;
double gigaScale = opt.mibibytes ? Math.pow(2.0, -30) : 1.0e-9;
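// MB/GB use decimal scaling by default; --mibibytes switches the report to binary MiB/GiB scaling.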
if (!opt.csv) {
String vendor = System.getProperty("java.vendor");
String ver = System.getProperty("java.version");
String home = System.getProperty("java.home");
System.out.println("BabelStream");
System.out.printf("Version: %s%n", VERSION);
System.out.printf(
"Implementation: %s (Java %s; %s; JAVA_HOME=%s)%n", name, ver, vendor, home);
final String benchmarkName;
switch (config.benchmark) {
case NSTREAM:
benchmarkName = "nstream";
break;
case TRIAD:
benchmarkName = "triad";
break;
case ALL:
benchmarkName = "all";
break;
default:
throw new AssertionError("Unexpected value: " + config.benchmark);
}
System.out.println("Running " + benchmarkName + " " + opt.numtimes + " times");
if (config.benchmark == Benchmark.TRIAD) {
System.out.println("Number of elements: " + opt.arraysize);
}
System.out.println("Precision: " + (opt.useFloat ? "float" : "double"));
System.out.printf(
"Array size: %.1f %s (=%.1f %s)%n",
(megaScale * arrayBytes), megaSuffix, (gigaScale * arrayBytes), gigaSuffix);
System.out.printf(
"Total size: %.1f %s (=%.1f %s)%n",
(megaScale * totalBytes), megaSuffix, (gigaScale * totalBytes), gigaSuffix);
}
JavaStream<T> stream = mkStream.apply(config);
stream.initArrays();
final boolean ok;
switch (config.benchmark) {
case ALL:
Entry<Timings<Duration>, T> results = stream.runAll(opt.numtimes);
ok = checkSolutions(stream.data(), config, Optional.of(results.getValue()));
Timings<Duration> timings = results.getKey();
tabulateCsv(
opt.csv,
mkCsvRow(timings.copy, "Copy", 2 * arrayBytes, megaScale, opt),
mkCsvRow(timings.mul, "Mul", 2 * arrayBytes, megaScale, opt),
mkCsvRow(timings.add, "Add", 3 * arrayBytes, megaScale, opt),
mkCsvRow(timings.triad, "Triad", 3 * arrayBytes, megaScale, opt),
mkCsvRow(timings.dot, "Dot", 2 * arrayBytes, megaScale, opt));
break;
case NSTREAM:
List<Duration> nstreamResults = stream.runNStream(opt.numtimes);
ok = checkSolutions(stream.data(), config, Optional.empty());
tabulateCsv(opt.csv, mkCsvRow(nstreamResults, "Nstream", 4 * arrayBytes, megaScale, opt));
break;
case TRIAD:
Duration triadResult = stream.runTriad(opt.numtimes);
ok = checkSolutions(stream.data(), config, Optional.empty());
long triadTotalBytes = 3L * (long) arrayBytes * opt.numtimes;
double bandwidth = gigaScale * (triadTotalBytes / durationToSeconds(triadResult));
System.out.printf("Runtime (seconds): %.5f%n", durationToSeconds(triadResult));
System.out.printf("Bandwidth (%s/s): %.3f%n", gigaSuffix, bandwidth);
break;
default:
throw new AssertionError();
}
return ok;
}
private static <T extends Number> boolean checkWithinTolerance(
String name, T[] xs, T gold, T tolerance) {
// it's ok to default to double for error calculation
double error =
Arrays.stream(xs)
.mapToDouble(x -> Math.abs(minus(x, gold).doubleValue()))
.summaryStatistics()
.getAverage();
boolean failed = error > tolerance.doubleValue();
if (failed) {
System.err.printf("Validation failed on %s. Average error %s%n", name, error);
}
return !failed;
}
@SuppressWarnings("OptionalUsedAsFieldOrParameterType")
static <T extends Number> boolean checkSolutions(
Data<T> data, Config<T> config, Optional<T> dotSum) {
T goldA = config.initA;
T goldB = config.initB;
T goldC = config.initC;
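// Replay the kernel sequence on scalar "gold" values to derive the expected final contents of a, b and c.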
for (int i = 0; i < config.options.numtimes; i++) {
switch (config.benchmark) {
case ALL:
goldC = goldA;
goldB = times(config.scalar, goldC);
goldC = plus(goldA, goldB);
goldA = plus(goldB, times(config.scalar, goldC));
break;
case TRIAD:
goldA = plus(goldB, times(config.scalar, goldC));
break;
case NSTREAM:
goldA = plus(goldA, plus(goldB, times(config.scalar, goldC)));
break;
}
}
T tolerance = times(config.ulp, from(config.evidence, 100));
boolean aValid = checkWithinTolerance("a", data.a, goldA, tolerance);
boolean bValid = checkWithinTolerance("b", data.b, goldB, tolerance);
boolean cValid = checkWithinTolerance("c", data.c, goldC, tolerance);
final T finalGoldA = goldA;
final T finalGoldB = goldB;
boolean sumValid =
dotSum
.map(
actual -> {
T goldSum =
times(
times(finalGoldA, finalGoldB),
from(config.evidence, config.options.arraysize));
double error = Math.abs(divide(minus(actual, goldSum), goldSum).doubleValue());
boolean failed = error > config.options.dotTolerance;
if (failed) {
System.err.printf(
"Validation failed on sum. Error %s \nSum was %s but should be %s%n",
error, actual, goldSum);
}
return !failed;
})
.orElse(true);
return aValid && bValid && cValid && sumValid;
}
private static double durationToSeconds(Duration d) {
return d.toNanos() / (double) TimeUnit.SECONDS.toNanos(1);
}
private static List<Entry<String, String>> mkCsvRow(
List<Duration> xs, String name, int totalBytes, double megaScale, Options opt) {
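// The first iteration is skipped as a warm-up when computing min/max/average.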
DoubleSummaryStatistics stats =
xs.stream().skip(1).mapToDouble(Main::durationToSeconds).summaryStatistics();
if (stats.getCount() <= 0) {
throw new IllegalArgumentException("No min/max for " + name + "(size=" + totalBytes + ")");
}
double mbps = megaScale * (double) totalBytes / stats.getMin();
return opt.csv
? Arrays.asList(
new SimpleImmutableEntry<>("function", name),
new SimpleImmutableEntry<>("num_times", opt.numtimes + ""),
new SimpleImmutableEntry<>("n_elements", opt.arraysize + ""),
new SimpleImmutableEntry<>("sizeof", totalBytes + ""),
new SimpleImmutableEntry<>(
"max_m" + (opt.mibibytes ? "i" : "") + "bytes_per_sec", mbps + ""),
new SimpleImmutableEntry<>("min_runtime", stats.getMin() + ""),
new SimpleImmutableEntry<>("max_runtime", stats.getMax() + ""),
new SimpleImmutableEntry<>("avg_runtime", stats.getAverage() + ""))
: Arrays.asList(
new SimpleImmutableEntry<>("Function", name),
new SimpleImmutableEntry<>(
"M" + (opt.mibibytes ? "i" : "") + "Bytes/sec", String.format("%.3f", mbps)),
new SimpleImmutableEntry<>("Min (sec)", String.format("%.5f", stats.getMin())),
new SimpleImmutableEntry<>("Max", String.format("%.5f", stats.getMax())),
new SimpleImmutableEntry<>("Average", String.format("%.5f", stats.getAverage())));
}
private static String padSpace(String s, int length) {
if (length == 0) return s;
return String.format("%1$-" + length + "s", s);
}
@SafeVarargs
@SuppressWarnings("varargs")
private static void tabulateCsv(boolean csv, List<Entry<String, String>>... rows) {
if (rows.length == 0) throw new IllegalArgumentException("Empty tabulation");
int padding = csv ? 0 : 12;
String sep = csv ? "," : "";
System.out.println(
rows[0].stream().map(x -> padSpace(x.getKey(), padding)).collect(Collectors.joining(sep)));
for (List<Entry<String, String>> row : rows) {
System.out.println(
row.stream().map(x -> padSpace(x.getValue(), padding)).collect(Collectors.joining(sep)));
}
}
private static final String VERSION = "3.4";
private static final float START_SCALAR = 0.4f;
private static final float START_A = 0.1f;
private static final float START_B = 0.2f;
private static final float START_C = 0.0f;
private static final List<Implementation> IMPLEMENTATIONS =
Arrays.asList(
new Implementation("jdk-stream", JdkStreams.FLOAT, JdkStreams.DOUBLE),
new Implementation("jdk-plain", PlainStream.FLOAT, PlainStream.DOUBLE),
new Implementation("tornadovm", TornadoVMStreams.FLOAT, TornadoVMStreams.DOUBLE),
new Implementation("aparapi", AparapiStreams.FLOAT, AparapiStreams.DOUBLE));
public static int run(String[] args) {
Options opt = new Options();
JCommander.newBuilder().addObject(opt).build().parse(args);
final Benchmark benchmark;
if (opt.nstreamOnly && opt.triadOnly)
throw new RuntimeException(
"Both triad and nstream are enabled, pick one or omit both to run all benchmarks");
else if (opt.nstreamOnly) benchmark = Benchmark.NSTREAM;
else if (opt.triadOnly) benchmark = Benchmark.TRIAD;
else benchmark = Benchmark.ALL;
final Config<Float> floatConfig =
new Config<>(
opt,
benchmark,
Float.BYTES,
Float.class, // XXX not Float.TYPE, we want the boxed one
Math.ulp(1.f),
START_SCALAR,
START_A,
START_B,
START_C);
final Config<Double> doubleConfig =
new Config<>(
opt,
benchmark,
Double.BYTES,
Double.class, // XXX not Double.TYPE, we want the boxed one
Math.ulp(1.d),
(double) START_SCALAR,
(double) START_A,
(double) START_B,
(double) START_C);
if (opt.list) {
System.out.println("Set implementation with --impl <IMPL> and device with --device <N>:");
for (Implementation entry : IMPLEMENTATIONS) {
System.out.println("Implementation: " + entry.name);
try {
List<String> devices = entry.makeDouble.apply(doubleConfig).listDevices();
for (int i = 0; i < devices.size(); i++) {
System.out.println("\t[" + i + "] " + devices.get(i));
}
} catch (Exception e) {
System.out.println("\t(Unsupported: " + e.getMessage() + ")");
}
}
return 0;
}
String implName = (opt.impl.isEmpty()) ? IMPLEMENTATIONS.get(0).name : opt.impl;
Implementation impl =
IMPLEMENTATIONS.stream()
.filter(x -> implName.compareToIgnoreCase(x.name) == 0)
.findFirst()
.orElseThrow(
() ->
new IllegalArgumentException("Implementation " + opt.impl + " does not exist"));
boolean ok =
opt.useFloat
? run(impl.name, floatConfig, impl.makeFloat)
: run(impl.name, doubleConfig, impl.makeDouble);
return ok ? 0 : 1;
}
public static void main(String[] args) {
System.exit(run(args));
}
}

View File

@ -0,0 +1,129 @@
package javastream.aparapi;
import com.aparapi.device.Device;
import com.aparapi.device.Device.TYPE;
import com.aparapi.device.JavaDevice;
import com.aparapi.device.OpenCLDevice;
import com.aparapi.internal.kernel.KernelManager;
import java.util.Collection;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import javastream.JavaStream;
import javastream.Main.Config;
public final class AparapiStreams {
private AparapiStreams() {}
public static final Function<Config<Double>, JavaStream<Double>> DOUBLE =
config -> new Generic<>(config, SpecialisedDoubleKernel::new);
public static final Function<Config<Float>, JavaStream<Float>> FLOAT =
config -> new Generic<>(config, SpecialisedFloatKernel::new);
private static List<Device> enumerateDevices() {
// JavaDevice.SEQUENTIAL doesn't work when arraysize > 1, so we omit it entirely
Stream<JavaDevice> cpuDevices = Stream.of(JavaDevice.ALTERNATIVE_ALGORITHM);
Stream<OpenCLDevice> clDevices =
Stream.of(TYPE.values()).map(OpenCLDevice::listDevices).flatMap(Collection::stream);
return Stream.concat(clDevices, cpuDevices).collect(Collectors.toList());
}
private static String deviceName(Device device) {
return device.toString();
}
private static final class Generic<T extends Number> extends JavaStream<T> {
private final GenericAparapiStreamKernel<T> kernels;
Generic(Config<T> config, GenericAparapiStreamKernel.Factory<T> factory) {
super(config);
Device device = enumerateDevices().get(config.options.device);
final int numGroups;
final int workGroupSize;
if (device instanceof JavaDevice) {
numGroups = Runtime.getRuntime().availableProcessors();
workGroupSize =
config.typeSize * 2; // closest thing to CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE
} else if (device instanceof OpenCLDevice) {
numGroups = ((OpenCLDevice) device).getMaxComputeUnits();
workGroupSize = device.getMaxWorkGroupSize();
} else {
throw new AssertionError("Unknown device type " + device.getClass());
}
if (config.options.isVerboseBenchmark()) {
System.out.println("Using Aparapi OpenCL device: " + device);
System.out.println(" - numGroups : " + numGroups);
System.out.println(" - workGroupSize : " + workGroupSize);
String showCL = System.getProperty("com.aparapi.enableShowGeneratedOpenCL");
if (showCL == null || !showCL.equals("true")) {
System.out.println(
"(Add `-Dcom.aparapi.enableShowGeneratedOpenCL=true` to show generated OpenCL source)");
}
}
LinkedHashSet<Device> candidate = new LinkedHashSet<>();
candidate.add(device);
kernels = factory.create(config, numGroups, workGroupSize);
KernelManager.instance().setPreferredDevices(kernels, candidate);
}
@Override
public List<String> listDevices() {
return enumerateDevices().stream()
.map(AparapiStreams::deviceName)
.collect(Collectors.toList());
}
@Override
public void initArrays() {
kernels.init();
}
@Override
public void copy() {
kernels.copy();
}
@Override
public void mul() {
kernels.mul();
}
@Override
public void add() {
kernels.add();
}
@Override
public void triad() {
kernels.triad();
}
@Override
public void nstream() {
kernels.nstream();
}
@Override
public T dot() {
return kernels.dot();
}
@Override
public Data<T> data() {
return kernels.syncAndDispose();
}
}
}

View File

@ -0,0 +1,68 @@
package javastream.aparapi;
import com.aparapi.Kernel;
import com.aparapi.Range;
import javastream.JavaStream.Data;
import javastream.Main.Config;
abstract class GenericAparapiStreamKernel<T> extends Kernel {
protected static final int FN_COPY = 1;
protected static final int FN_MUL = 2;
protected static final int FN_ADD = 3;
protected static final int FN_TRIAD = 4;
protected static final int FN_NSTREAM = 5;
protected static final int FN_DOT = 6;
protected final Config<T> config;
protected final int arraysize, numGroups, workGroupSize;
interface Factory<T> {
GenericAparapiStreamKernel<T> create(Config<T> config, int numGroups, int workGroupSize);
}
GenericAparapiStreamKernel(Config<T> config, int numGroups, int workGroupSize) {
this.config = config;
this.arraysize = config.options.arraysize;
this.numGroups = numGroups;
this.workGroupSize = workGroupSize;
setExplicit(true);
}
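// Aparapi translates the single run() body into one OpenCL kernel, so the operation to perform is
// selected through this flag before each execute() call.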
protected int function;
public abstract void init();
public void copy() {
function = FN_COPY;
execute(arraysize);
}
public void mul() {
function = FN_MUL;
execute(arraysize);
}
public void add() {
function = FN_ADD;
execute(arraysize);
}
public void triad() {
function = FN_TRIAD;
execute(arraysize);
}
public void nstream() {
function = FN_NSTREAM;
execute(arraysize);
}
protected Kernel partialDot() {
function = FN_DOT;
return execute(Range.create(numGroups * workGroupSize, workGroupSize));
}
abstract T dot();
abstract Data<T> syncAndDispose();
}

View File

@ -0,0 +1,74 @@
package javastream.aparapi;
import java.util.Arrays;
import javastream.JavaStream;
import javastream.JavaStream.Data;
import javastream.Main.Config;
final class SpecialisedDoubleKernel extends GenericAparapiStreamKernel<Double> {
private final double scalar;
final double[] a, b, c;
private final double[] partialSum;
@Local private final double[] workGroupSum;
SpecialisedDoubleKernel(Config<Double> config, int numGroups, int workGroupSize) {
super(config, numGroups, workGroupSize);
this.scalar = config.scalar;
this.a = new double[this.arraysize];
this.b = new double[this.arraysize];
this.c = new double[this.arraysize];
this.partialSum = new double[numGroups];
this.workGroupSum = new double[workGroupSize];
}
@SuppressWarnings("DuplicatedCode")
@Override
public void run() {
int i = getGlobalId();
if (function == FN_COPY) {
c[i] = a[i];
} else if (function == FN_MUL) {
b[i] = scalar * c[i];
} else if (function == FN_ADD) {
c[i] = a[i] + b[i];
} else if (function == FN_TRIAD) {
a[i] = b[i] + scalar * c[i];
} else if (function == FN_NSTREAM) {
a[i] += b[i] + scalar * c[i];
} else if (function == FN_DOT) {
int localId = getLocalId(0);
workGroupSum[localId] = 0.0;
for (; i < arraysize; i += getGlobalSize(0)) workGroupSum[localId] += a[i] * b[i];
for (int offset = getLocalSize(0) / 2; offset > 0; offset /= 2) {
localBarrier();
if (localId < offset) {
workGroupSum[localId] += workGroupSum[localId + offset];
}
}
if (localId == 0) partialSum[getGroupId(0)] = workGroupSum[localId];
}
}
@Override
public void init() {
Arrays.fill(a, config.initA);
Arrays.fill(b, config.initB);
Arrays.fill(c, config.initC);
put(a).put(b).put(c);
}
@Override
public Double dot() {
partialDot().get(partialSum);
double sum = 0;
for (double v : partialSum) sum += v;
return sum;
}
@Override
public Data<Double> syncAndDispose() {
get(a).get(b).get(c).dispose();
return new Data<>(JavaStream.boxed(a), JavaStream.boxed(b), JavaStream.boxed(c));
}
}


@ -0,0 +1,75 @@
package javastream.aparapi;
import static javastream.JavaStream.boxed;
import java.util.Arrays;
import javastream.JavaStream.Data;
import javastream.Main.Config;
final class SpecialisedFloatKernel extends GenericAparapiStreamKernel<Float> {
private final float scalar;
final float[] a, b, c;
private final float[] partialSum;
@Local private final float[] workGroupSum;
SpecialisedFloatKernel(Config<Float> config, int numGroups, int workGroupSize) {
super(config, numGroups, workGroupSize);
this.scalar = config.scalar;
this.a = new float[this.arraysize];
this.b = new float[this.arraysize];
this.c = new float[this.arraysize];
this.partialSum = new float[numGroups];
this.workGroupSum = new float[workGroupSize];
}
@SuppressWarnings("DuplicatedCode")
@Override
public void run() {
int i = getGlobalId();
if (function == FN_COPY) {
c[i] = a[i];
} else if (function == FN_MUL) {
b[i] = scalar * c[i];
} else if (function == FN_ADD) {
c[i] = a[i] + b[i];
} else if (function == FN_TRIAD) {
a[i] = b[i] + scalar * c[i];
} else if (function == FN_NSTREAM) {
a[i] += b[i] + scalar * c[i];
} else if (function == FN_DOT) {
int localId = getLocalId(0);
workGroupSum[localId] = 0.f;
for (; i < arraysize; i += getGlobalSize(0)) workGroupSum[localId] += a[i] * b[i];
for (int offset = getLocalSize(0) / 2; offset > 0; offset /= 2) {
localBarrier();
if (localId < offset) {
workGroupSum[localId] += workGroupSum[localId + offset];
}
}
if (localId == 0) partialSum[getGroupId(0)] = workGroupSum[localId];
}
}
@Override
public void init() {
Arrays.fill(a, config.initA);
Arrays.fill(b, config.initB);
Arrays.fill(c, config.initC);
put(a).put(b).put(c);
}
@Override
public Float dot() {
partialDot().get(partialSum);
float sum = 0;
for (float v : partialSum) sum += v;
return sum;
}
@Override
public Data<Float> syncAndDispose() {
get(a).get(b).get(c).dispose();
return new Data<>(boxed(a), boxed(b), boxed(c));
}
}


@ -0,0 +1,92 @@
package javastream.jdk;
import static javastream.FractionalMaths.from;
import static javastream.FractionalMaths.plus;
import static javastream.FractionalMaths.times;
import java.lang.reflect.Array;
import java.util.Collections;
import java.util.List;
import javastream.JavaStream;
import javastream.Main.Config;
final class GenericPlainStream<T extends Number> extends JavaStream<T> {
private final T[] a;
private final T[] b;
private final T[] c;
@SuppressWarnings("unchecked")
GenericPlainStream(Config<T> config) {
super(config);
this.a = (T[]) Array.newInstance(config.evidence, config.options.arraysize);
this.b = (T[]) Array.newInstance(config.evidence, config.options.arraysize);
this.c = (T[]) Array.newInstance(config.evidence, config.options.arraysize);
}
@Override
public List<String> listDevices() {
return Collections.singletonList("JVM");
}
@Override
public void initArrays() {
for (int i = 0; i < config.options.arraysize; i++) {
a[i] = config.initA;
b[i] = config.initB;
c[i] = config.initC;
}
}
@SuppressWarnings("ManualArrayCopy")
@Override
public void copy() {
for (int i = 0; i < config.options.arraysize; i++) {
c[i] = a[i];
}
}
@Override
public void mul() {
for (int i = 0; i < config.options.arraysize; i++) {
b[i] = times(config.scalar, c[i]);
}
}
@Override
public void add() {
for (int i = 0; i < config.options.arraysize; i++) {
c[i] = plus(a[i], b[i]);
}
}
@Override
public void triad() {
for (int i = 0; i < config.options.arraysize; i++) {
a[i] = plus(b[i], times(config.scalar, c[i]));
}
}
@Override
public void nstream() {
for (int i = 0; i < config.options.arraysize; i++) {
a[i] = plus(a[i], plus(b[i], times(config.scalar, c[i])));
}
}
@Override
public T dot() {
T acc = from(config.evidence, 0);
for (int i = 0; i < config.options.arraysize; i++) {
acc = plus(acc, times(a[i], b[i]));
}
return acc;
}
@Override
public Data<T> data() {
return new Data<>(a, b, c);
}
}


@ -0,0 +1,86 @@
package javastream.jdk;
import static javastream.FractionalMaths.from;
import static javastream.FractionalMaths.plus;
import static javastream.FractionalMaths.times;
import java.lang.reflect.Array;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.stream.IntStream;
import javastream.FractionalMaths;
import javastream.JavaStream;
import javastream.Main.Config;
/**
* We use
*
* <pre>Arrays.parallelSetAll</pre>
*
* <p>here as it internally calls
*
* <pre>IntStream.range(0, array.length).parallel().forEach(...)</pre>
*/
final class GenericStream<T extends Number> extends JavaStream<T> {
private final T[] a, b, c;
@SuppressWarnings("unchecked")
GenericStream(Config<T> config) {
super(config);
this.a = (T[]) Array.newInstance(config.evidence, config.options.arraysize);
this.b = (T[]) Array.newInstance(config.evidence, config.options.arraysize);
this.c = (T[]) Array.newInstance(config.evidence, config.options.arraysize);
}
@Override
public List<String> listDevices() {
return Collections.singletonList("JVM");
}
@Override
public void initArrays() {
Arrays.parallelSetAll(a, i -> config.initA);
Arrays.parallelSetAll(b, i -> config.initB);
Arrays.parallelSetAll(c, i -> config.initC);
}
@Override
public void copy() {
Arrays.parallelSetAll(c, i -> a[i]);
}
@Override
public void mul() {
Arrays.parallelSetAll(b, i -> times(config.scalar, c[i]));
}
@Override
public void add() {
Arrays.parallelSetAll(c, i -> plus(a[i], b[i]));
}
@Override
public void triad() {
Arrays.parallelSetAll(a, i -> plus(b[i], times(config.scalar, c[i])));
}
@Override
public void nstream() {
Arrays.parallelSetAll(a, i -> plus(a[i], plus(b[i], times(config.scalar, c[i]))));
}
@Override
public T dot() {
return IntStream.range(0, config.options.arraysize)
.parallel()
.mapToObj(i -> times(a[i], b[i]))
.reduce(from(config.evidence, 0), FractionalMaths::plus);
}
@Override
public Data<T> data() {
return new Data<>(a, b, c);
}
}


@ -0,0 +1,26 @@
package javastream.jdk;
import java.util.AbstractMap.SimpleImmutableEntry;
import java.util.function.Function;
import javastream.JavaStream;
import javastream.JavaStream.EnumeratedStream;
import javastream.Main.Config;
public final class JdkStreams {
private JdkStreams() {}
public static final Function<Config<Float>, JavaStream<Float>> FLOAT =
config ->
new EnumeratedStream<>(
config,
new SimpleImmutableEntry<>("specialised", SpecialisedFloatStream::new),
new SimpleImmutableEntry<>("generic", GenericStream::new));
public static final Function<Config<Double>, JavaStream<Double>> DOUBLE =
config ->
new EnumeratedStream<>(
config,
new SimpleImmutableEntry<>("specialised", SpecialisedDoubleStream::new),
new SimpleImmutableEntry<>("generic", GenericStream::new));
}


@ -0,0 +1,26 @@
package javastream.jdk;
import java.util.AbstractMap.SimpleImmutableEntry;
import java.util.function.Function;
import javastream.JavaStream;
import javastream.JavaStream.EnumeratedStream;
import javastream.Main.Config;
public final class PlainStream {
private PlainStream() {}
public static final Function<Config<Float>, JavaStream<Float>> FLOAT =
config ->
new EnumeratedStream<>(
config,
new SimpleImmutableEntry<>("specialised", SpecialisedPlainFloatStream::new),
new SimpleImmutableEntry<>("generic", GenericPlainStream::new));
public static final Function<Config<Double>, JavaStream<Double>> DOUBLE =
config ->
new EnumeratedStream<>(
config,
new SimpleImmutableEntry<>("specialised", SpecialisedPlainDoubleStream::new),
new SimpleImmutableEntry<>("generic", GenericPlainStream::new));
}


@ -0,0 +1,84 @@
package javastream.jdk;
import java.util.Collections;
import java.util.List;
import java.util.stream.IntStream;
import javastream.JavaStream;
import javastream.Main.Config;
final class SpecialisedDoubleStream extends JavaStream<Double> {
private final double[] a, b, c;
SpecialisedDoubleStream(Config<Double> config) {
super(config);
this.a = new double[config.options.arraysize];
this.b = new double[config.options.arraysize];
this.c = new double[config.options.arraysize];
}
@Override
public List<String> listDevices() {
return Collections.singletonList("JVM");
}
@Override
public void initArrays() {
IntStream.range(0, config.options.arraysize) //
.parallel()
.forEach(
i -> {
a[i] = config.initA;
b[i] = config.initB;
c[i] = config.initC;
});
}
@Override
public void copy() {
IntStream.range(0, config.options.arraysize) //
.parallel()
.forEach(i -> c[i] = a[i]);
}
@Override
public void mul() {
IntStream.range(0, config.options.arraysize) //
.parallel()
.forEach(i -> b[i] = config.scalar * c[i]);
}
@Override
public void add() {
IntStream.range(0, config.options.arraysize) //
.parallel()
.forEach(i -> c[i] = a[i] + b[i]);
}
@Override
public void triad() {
IntStream.range(0, config.options.arraysize) //
.parallel()
.forEach(i -> a[i] = b[i] + config.scalar * c[i]);
}
@Override
public void nstream() {
IntStream.range(0, config.options.arraysize) //
.parallel()
.forEach(i -> a[i] += b[i] + config.scalar * c[i]);
}
@Override
public Double dot() {
return IntStream.range(0, config.options.arraysize)
.parallel()
.mapToDouble(i -> a[i] * b[i])
.reduce(0.0, Double::sum);
}
@Override
public Data<Double> data() {
return new Data<>(boxed(a), boxed(b), boxed(c));
}
}


@ -0,0 +1,84 @@
package javastream.jdk;
import java.util.Collections;
import java.util.List;
import java.util.stream.IntStream;
import javastream.JavaStream;
import javastream.Main.Config;
final class SpecialisedFloatStream extends JavaStream<Float> {
private final float[] a, b, c;
SpecialisedFloatStream(Config<Float> config) {
super(config);
this.a = new float[config.options.arraysize];
this.b = new float[config.options.arraysize];
this.c = new float[config.options.arraysize];
}
@Override
public List<String> listDevices() {
return Collections.singletonList("JVM");
}
@Override
public void initArrays() {
IntStream.range(0, config.options.arraysize) //
.parallel()
.forEach(
i -> {
a[i] = config.initA;
b[i] = config.initB;
c[i] = config.initC;
});
}
@Override
public void copy() {
IntStream.range(0, config.options.arraysize) //
.parallel()
.forEach(i -> c[i] = a[i]);
}
@Override
public void mul() {
IntStream.range(0, config.options.arraysize) //
.parallel()
.forEach(i -> b[i] = config.scalar * c[i]);
}
@Override
public void add() {
IntStream.range(0, config.options.arraysize) //
.parallel()
.forEach(i -> c[i] = a[i] + b[i]);
}
@Override
public void triad() {
IntStream.range(0, config.options.arraysize) //
.parallel()
.forEach(i -> a[i] = b[i] + config.scalar * c[i]);
}
@Override
public void nstream() {
IntStream.range(0, config.options.arraysize) //
.parallel()
.forEach(i -> a[i] += b[i] + config.scalar * c[i]);
}
@Override
public Float dot() {
return IntStream.range(0, config.options.arraysize) //
.parallel()
.mapToObj(i -> a[i] * b[i]) // XXX there isn't a specialised Stream for floats
.reduce(0f, Float::sum);
}
@Override
public Data<Float> data() {
return new Data<>(boxed(a), boxed(b), boxed(c));
}
}


@ -0,0 +1,84 @@
package javastream.jdk;
import java.util.Collections;
import java.util.List;
import javastream.JavaStream;
import javastream.Main.Config;
final class SpecialisedPlainDoubleStream extends JavaStream<Double> {
private final double[] a;
private final double[] b;
private final double[] c;
SpecialisedPlainDoubleStream(Config<Double> config) {
super(config);
this.a = new double[config.options.arraysize];
this.b = new double[config.options.arraysize];
this.c = new double[config.options.arraysize];
}
@Override
public List<String> listDevices() {
return Collections.singletonList("JVM");
}
@Override
public void initArrays() {
for (int i = 0; i < config.options.arraysize; i++) {
a[i] = config.initA;
b[i] = config.initB;
c[i] = config.initC;
}
}
@SuppressWarnings("ManualArrayCopy")
@Override
public void copy() {
for (int i = 0; i < config.options.arraysize; i++) {
c[i] = a[i];
}
}
@Override
public void mul() {
for (int i = 0; i < config.options.arraysize; i++) {
b[i] = config.scalar * c[i];
}
}
@Override
public void add() {
for (int i = 0; i < config.options.arraysize; i++) {
c[i] = a[i] + b[i];
}
}
@Override
public void triad() {
for (int i = 0; i < config.options.arraysize; i++) {
a[i] = b[i] + config.scalar * c[i];
}
}
@Override
public void nstream() {
for (int i = 0; i < config.options.arraysize; i++) {
a[i] += b[i] + config.scalar * c[i];
}
}
@Override
public Double dot() {
double acc = 0.0;
for (int i = 0; i < config.options.arraysize; i++) {
acc += a[i] * b[i];
}
return acc;
}
@Override
public Data<Double> data() {
return new Data<>(boxed(a), boxed(b), boxed(c));
}
}


@ -0,0 +1,84 @@
package javastream.jdk;
import java.util.Collections;
import java.util.List;
import javastream.JavaStream;
import javastream.Main.Config;
final class SpecialisedPlainFloatStream extends JavaStream<Float> {
private final float[] a;
private final float[] b;
private final float[] c;
SpecialisedPlainFloatStream(Config<Float> config) {
super(config);
this.a = new float[config.options.arraysize];
this.b = new float[config.options.arraysize];
this.c = new float[config.options.arraysize];
}
@Override
public List<String> listDevices() {
return Collections.singletonList("JVM");
}
@Override
public void initArrays() {
for (int i = 0; i < config.options.arraysize; i++) {
a[i] = config.initA;
b[i] = config.initB;
c[i] = config.initC;
}
}
@SuppressWarnings("ManualArrayCopy")
@Override
public void copy() {
for (int i = 0; i < config.options.arraysize; i++) {
c[i] = a[i];
}
}
@Override
public void mul() {
for (int i = 0; i < config.options.arraysize; i++) {
b[i] = config.scalar * c[i];
}
}
@Override
public void add() {
for (int i = 0; i < config.options.arraysize; i++) {
c[i] = a[i] + b[i];
}
}
@Override
public void triad() {
for (int i = 0; i < config.options.arraysize; i++) {
a[i] = b[i] + config.scalar * c[i];
}
}
@Override
public void nstream() {
for (int i = 0; i < config.options.arraysize; i++) {
a[i] += b[i] + config.scalar * c[i];
}
}
@Override
public Float dot() {
float acc = 0f;
for (int i = 0; i < config.options.arraysize; i++) {
acc += a[i] * b[i];
}
return acc;
}
@Override
public Data<Float> data() {
return new Data<>(boxed(a), boxed(b), boxed(c));
}
}


@ -0,0 +1,98 @@
package javastream.tornadovm;
import java.util.List;
import java.util.stream.Collectors;
import javastream.JavaStream;
import javastream.Main.Config;
import uk.ac.manchester.tornado.api.TaskSchedule;
import uk.ac.manchester.tornado.api.TornadoRuntimeCI;
import uk.ac.manchester.tornado.api.common.TornadoDevice;
import uk.ac.manchester.tornado.api.runtime.TornadoRuntime;
abstract class GenericTornadoVMStream<T> extends JavaStream<T> {
protected final TornadoDevice device;
protected TaskSchedule copyTask;
protected TaskSchedule mulTask;
protected TaskSchedule addTask;
protected TaskSchedule triadTask;
protected TaskSchedule nstreamTask;
protected TaskSchedule dotTask;
GenericTornadoVMStream(Config<T> config) {
super(config);
try {
TornadoRuntimeCI runtime = TornadoRuntime.getTornadoRuntime();
List<TornadoDevice> devices = TornadoVMStreams.enumerateDevices(runtime);
device = devices.get(config.options.device);
if (config.options.isVerboseBenchmark()) {
System.out.println("Using TornadoVM device:");
System.out.println(" - Name : " + device.getDescription());
System.out.println(" - Id : " + device.getDeviceName());
System.out.println(" - Platform : " + device.getPlatformName());
System.out.println(" - Backend : " + device.getTornadoVMBackend().name());
}
} catch (Throwable e) {
throw new RuntimeException(
"Unable to initialise TornadoVM, make sure you are running the binary with the `tornado -jar ...` wrapper and not `java -jar ...`",
e);
}
}
protected static TaskSchedule mkSchedule() {
return new TaskSchedule("");
}
@Override
public List<String> listDevices() {
return TornadoVMStreams.enumerateDevices(TornadoRuntime.getTornadoRuntime()).stream()
.map(d -> d.getDescription() + "(" + d.getDeviceName() + ")")
.collect(Collectors.toList());
}
@Override
public void initArrays() {
this.copyTask.warmup();
this.mulTask.warmup();
this.addTask.warmup();
this.triadTask.warmup();
this.nstreamTask.warmup();
this.dotTask.warmup();
}
@Override
public void copy() {
this.copyTask.execute();
}
@Override
public void mul() {
this.mulTask.execute();
}
@Override
public void add() {
this.addTask.execute();
}
@Override
public void triad() {
this.triadTask.execute();
}
@Override
public void nstream() {
this.nstreamTask.execute();
}
protected abstract T getSum();
@Override
public T dot() {
this.dotTask.execute();
return getSum();
}
}


@ -0,0 +1,88 @@
package javastream.tornadovm;
import java.util.Arrays;
import javastream.Main.Config;
import uk.ac.manchester.tornado.api.annotations.Parallel;
import uk.ac.manchester.tornado.api.annotations.Reduce;
final class SpecialisedDouble extends GenericTornadoVMStream<Double> {
@SuppressWarnings("ManualArrayCopy")
private static void copy(int size, double[] a, double[] c) {
for (@Parallel int i = 0; i < size; i++) {
c[i] = a[i];
}
}
private static void mul(int size, double[] b, double[] c, double scalar) {
for (@Parallel int i = 0; i < size; i++) {
b[i] = scalar * c[i];
}
}
private static void add(int size, double[] a, double[] b, double[] c) {
for (@Parallel int i = 0; i < size; i++) {
c[i] = a[i] + b[i];
}
}
private static void triad(int size, double[] a, double[] b, double[] c, double scalar) {
for (@Parallel int i = 0; i < size; i++) {
a[i] = b[i] + scalar * c[i];
}
}
private static void nstream(int size, double[] a, double[] b, double[] c, double scalar) {
for (@Parallel int i = 0; i < size; i++) {
a[i] += b[i] + scalar * c[i];
}
}
private static void dot_(
double[] a, double[] b, @Reduce double[] acc) { // prevent name clash with CL's dot
acc[0] = 0;
for (@Parallel int i = 0; i < a.length; i++) {
acc[0] += a[i] * b[i];
}
}
private final double[] a, b, c;
private final double[] dotSum;
@SuppressWarnings({"PrimitiveArrayArgumentToVarargsMethod", "DuplicatedCode"})
SpecialisedDouble(Config<Double> config) {
super(config);
final int size = config.options.arraysize;
final double scalar = config.scalar;
a = new double[size];
b = new double[size];
c = new double[size];
dotSum = new double[1];
this.copyTask = mkSchedule().task("", SpecialisedDouble::copy, size, a, c);
this.mulTask = mkSchedule().task("", SpecialisedDouble::mul, size, b, c, scalar);
this.addTask = mkSchedule().task("", SpecialisedDouble::add, size, a, b, c);
this.triadTask = mkSchedule().task("", SpecialisedDouble::triad, size, a, b, c, scalar);
this.nstreamTask = mkSchedule().task("", SpecialisedDouble::nstream, size, a, b, c, scalar);
this.dotTask = mkSchedule().task("", SpecialisedDouble::dot_, a, b, dotSum).streamOut(dotSum);
}
@Override
public void initArrays() {
super.initArrays();
Arrays.fill(a, config.initA);
Arrays.fill(b, config.initB);
Arrays.fill(c, config.initC);
TornadoVMStreams.xferToDevice(device, a, b, c);
}
@Override
protected Double getSum() {
return dotSum[0];
}
@Override
public Data<Double> data() {
TornadoVMStreams.xferFromDevice(device, a, b, c);
return new Data<>(boxed(a), boxed(b), boxed(c));
}
}


@ -0,0 +1,88 @@
package javastream.tornadovm;
import java.util.Arrays;
import javastream.Main.Config;
import uk.ac.manchester.tornado.api.annotations.Parallel;
import uk.ac.manchester.tornado.api.annotations.Reduce;
final class SpecialisedFloat extends GenericTornadoVMStream<Float> {
@SuppressWarnings("ManualArrayCopy")
private static void copy(int size, float[] a, float[] c) {
for (@Parallel int i = 0; i < size; i++) {
c[i] = a[i];
}
}
private static void mul(int size, float[] b, float[] c, float scalar) {
for (@Parallel int i = 0; i < size; i++) {
b[i] = scalar * c[i];
}
}
private static void add(int size, float[] a, float[] b, float[] c) {
for (@Parallel int i = 0; i < size; i++) {
c[i] = a[i] + b[i];
}
}
private static void triad(int size, float[] a, float[] b, float[] c, float scalar) {
for (@Parallel int i = 0; i < size; i++) {
a[i] = b[i] + scalar * c[i];
}
}
private static void nstream(int size, float[] a, float[] b, float[] c, float scalar) {
for (@Parallel int i = 0; i < size; i++) {
a[i] += b[i] + scalar * c[i];
}
}
private static void dot_(
float[] a, float[] b, @Reduce float[] acc) { // prevent name clash with CL's dot
acc[0] = 0;
for (@Parallel int i = 0; i < a.length; i++) {
acc[0] += a[i] * b[i];
}
}
private final float[] a, b, c;
private final float[] dotSum;
@SuppressWarnings({"PrimitiveArrayArgumentToVarargsMethod", "DuplicatedCode"})
SpecialisedFloat(Config<Float> config) {
super(config);
final int size = config.options.arraysize;
final float scalar = config.scalar;
a = new float[size];
b = new float[size];
c = new float[size];
dotSum = new float[1];
this.copyTask = mkSchedule().task("", SpecialisedFloat::copy, size, a, c);
this.mulTask = mkSchedule().task("", SpecialisedFloat::mul, size, b, c, scalar);
this.addTask = mkSchedule().task("", SpecialisedFloat::add, size, a, b, c);
this.triadTask = mkSchedule().task("", SpecialisedFloat::triad, size, a, b, c, scalar);
this.nstreamTask = mkSchedule().task("", SpecialisedFloat::nstream, size, a, b, c, scalar);
this.dotTask = mkSchedule().task("", SpecialisedFloat::dot_, a, b, dotSum).streamOut(dotSum);
}
@Override
public void initArrays() {
super.initArrays();
Arrays.fill(a, config.initA);
Arrays.fill(b, config.initB);
Arrays.fill(c, config.initC);
TornadoVMStreams.xferToDevice(device, a, b, c);
}
@Override
protected Float getSum() {
return dotSum[0];
}
@Override
public Data<Float> data() {
TornadoVMStreams.xferFromDevice(device, a, b, c);
return new Data<>(boxed(a), boxed(b), boxed(c));
}
}


@ -0,0 +1,42 @@
package javastream.tornadovm;
import java.util.List;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import javastream.JavaStream;
import javastream.Main.Config;
import uk.ac.manchester.tornado.api.TornadoRuntimeCI;
import uk.ac.manchester.tornado.api.common.TornadoDevice;
import uk.ac.manchester.tornado.api.mm.TornadoGlobalObjectState;
import uk.ac.manchester.tornado.api.runtime.TornadoRuntime;
public final class TornadoVMStreams {
private TornadoVMStreams() {}
static void xferToDevice(TornadoDevice device, Object... xs) {
for (Object x : xs) {
TornadoGlobalObjectState state = TornadoRuntime.getTornadoRuntime().resolveObject(x);
List<Integer> writeEvent = device.ensurePresent(x, state.getDeviceState(device), null, 0, 0);
if (writeEvent != null) writeEvent.forEach(e -> device.resolveEvent(e).waitOn());
}
}
static void xferFromDevice(TornadoDevice device, Object... xs) {
for (Object x : xs) {
TornadoGlobalObjectState state = TornadoRuntime.getTornadoRuntime().resolveObject(x);
device.resolveEvent(device.streamOut(x, 0, state.getDeviceState(device), null)).waitOn();
}
}
static List<TornadoDevice> enumerateDevices(TornadoRuntimeCI runtime) {
return IntStream.range(0, runtime.getNumDrivers())
.mapToObj(runtime::getDriver)
.flatMap(d -> IntStream.range(0, d.getDeviceCount()).mapToObj(d::getDevice))
.collect(Collectors.toList());
}
public static final Function<Config<Float>, JavaStream<Float>> FLOAT = SpecialisedFloat::new;
public static final Function<Config<Double>, JavaStream<Double>> DOUBLE = SpecialisedDouble::new;
}


@ -0,0 +1,93 @@
package javastream;
import java.io.ByteArrayOutputStream;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.Arguments;
import org.junit.jupiter.params.provider.MethodSource;
public class SmokeTest {
// taken from https://stackoverflow.com/a/32146095/896997
private static <T> Stream<List<T>> ofCombinations(
List<? extends Collection<T>> collections, List<T> current) {
return collections.isEmpty()
? Stream.of(current)
: collections.get(0).stream()
.flatMap(
e -> {
List<T> list = new ArrayList<>(current);
list.add(e);
return ofCombinations(collections.subList(1, collections.size()), list);
});
}
@SuppressWarnings("unused")
private static Stream<Arguments> options() {
LinkedHashMap<String, List<Integer>> impls = new LinkedHashMap<>();
impls.put("jdk-stream", Arrays.asList(0, 1));
impls.put("jdk-plain", Arrays.asList(0, 1));
// skip aparapi as none of the jdk fallbacks work correctly
// skip tornadovm as it has no jdk fallback
List<String> configs =
impls.entrySet().stream()
.flatMap(
e ->
Stream.concat(Stream.of(""), e.getValue().stream().map(i -> "--device " + i))
.map(d -> "--impl " + e.getKey() + " " + d))
.collect(Collectors.toList());
return ofCombinations(
new ArrayList<>(
Arrays.asList(
configs,
Arrays.asList("", "--csv"),
// XXX floats usually have a 1.0e-5 error which misses the default 1.0e-8
Arrays.asList("", "--float --dot-tolerance 1.0e-5"),
Arrays.asList("", "--triad-only", "--nstream-only"),
Arrays.asList("", "--mibibytes"))),
Collections.emptyList())
.map(
xs ->
Arguments.of(
xs.stream() //
.map(String::trim) //
.collect(Collectors.joining(" "))
.trim()));
}
@ParameterizedTest
@MethodSource("options")
void testIt(String args) {
String line = "--arraysize 2048 " + args;
// redirect stdout/stderr and only print if anything fails
ByteArrayOutputStream outContent = new ByteArrayOutputStream();
ByteArrayOutputStream errContent = new ByteArrayOutputStream();
PrintStream originalOut = System.out;
PrintStream originalErr = System.err;
System.setOut(new PrintStream(outContent));
System.setErr(new PrintStream(errContent));
int run = Main.run(line.split("\\s+"));
System.setOut(originalOut);
System.setErr(originalErr);
if (run != 0) {
System.out.println(outContent);
System.err.println(errContent);
Assertions.assertEquals(0, run, "`" + line + "` did not return 0");
}
}
}


@ -0,0 +1 @@
{"name":"sbt","version":"1.5.2","bspVersion":"2.0.0-M5","languages":["scala"],"argv":["/usr/lib/jvm/java-11-openjdk-11.0.11.0.9-2.fc33.x86_64/bin/java","-Xms100m","-Xmx100m","-classpath","/home/tom/.local/share/JetBrains/Toolbox/apps/IDEA-U/ch-0/211.7142.45.plugins/Scala/launcher/sbt-launch.jar","xsbt.boot.Boot","-bsp","--sbt-launch-jar=/home/tom/.local/share/JetBrains/Toolbox/apps/IDEA-U/ch-0/211.7142.45.plugins/Scala/launcher/sbt-launch.jar"]}

scala-stream/.gitignore vendored Normal file

@ -0,0 +1 @@
target/

scala-stream/.jvmopts Normal file

@ -0,0 +1,2 @@
-Xmx4096m
-Xss4m


@ -0,0 +1,34 @@
version = "3.0.0-RC2"
runner.dialect = scala3
style = defaultWithAlign
maxColumn = 100
align.preset = more
rewrite.rules = [
AvoidInfix
RedundantBraces
RedundantParens
AsciiSortImports
PreferCurlyFors
]
rewrite.neverInfix.excludeFilters = [until
to
by
eq
ne
"should.*"
"contain.*"
"must.*"
in
be
taggedAs
thrownBy
synchronized
have
when
size
theSameElementsAs]

scala-stream/README.md Normal file

@ -0,0 +1,102 @@
ScalaStream
===========
This is an implementation of BabelStream
in [Scala 3](https://docs.scala-lang.org/scala3/new-in-scala3.html) on the JVM. In theory, this
implementation also covers Java. Scala and Java, like most other programming languages, have their
own ecosystems of library-supported parallel programming frameworks; we currently implement the
following:
* Parallel streams (introduced in Java 8) - `src/main/scala/scalastream/J8SStream.scala`
* [Scala Parallel Collections](https://github.com/scala/scala-parallel-collections)
- `src/main/scala/scalastream/ParStream.scala`
As the benchmark is relatively simple, we also implement some baselines:
* Single-threaded Scala `for` (i.e. `foreach` sugar) - `src/main/scala/scalastream/PlainStream.scala`
* Manual parallelism with Java executors - `src/main/scala/scalastream/ThreadedStream.scala`
### Performance considerations
As Scala 3 defaults to Scala 2.13's standard library, we roll our own `Fractional` typeclass with
liberal use of inlining and specialisation. This is motivated by the 2.13 stdlib's lack of
specialisation for primitive types on the default `Fractional` and `Numeric` typeclasses.
The use of [Spire](https://github.com/typelevel/spire) to mitigate this was attempted; however, due
to its use of Scala 2 macros, it does not currently compile with Scala 3.
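For illustration, here is a minimal sketch of the approach (the names `Frac`, `plus` and `times`
are hypothetical and not the project's actual typeclass): the instances are specialised for
`Float`/`Double` and the operators are `inline`, so kernel bodies such as
`a(i) = b(i) + scalar * c(i)` can compile down to primitive arithmetic rather than boxed
`Numeric` calls.
```scala
// Hypothetical sketch only; the typeclass shipped with this port differs in naming and scope.
trait Frac[@specialized(Float, Double) A]:
  def plus(x: A, y: A): A
  def times(x: A, y: A): A

object Frac:
  given Frac[Double] with
    def plus(x: Double, y: Double): Double  = x + y
    def times(x: Double, y: Double): Double = x * y
  given Frac[Float] with
    def plus(x: Float, y: Float): Float  = x + y
    def times(x: Float, y: Float): Float = x * y

// Inline operators so kernels keep natural arithmetic syntax on the generic element type.
extension [A](x: A)(using F: Frac[A])
  inline def +(y: A): A = F.plus(x, y)
  inline def *(y: A): A = F.times(x, y)

// Example kernel written against the typeclass.
def triad[A: Frac](a: Array[A], b: Array[A], c: Array[A], scalar: A): Unit =
  var i = 0
  while i < a.length do
    a(i) = b(i) + scalar * c(i)
    i += 1
```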
### Build & Run
Prerequisites
* JDK >= 8 on any of its supported platforms; known working implementations:
- OpenJDK
distributions ([Amazon Corretto](https://docs.aws.amazon.com/corretto/latest/corretto-11-ug/downloads-list.html)
, [Azul](https://www.azul.com/downloads/?version=java-11-lts&package=jdk)
, [AdoptOpenJDK](https://adoptopenjdk.net/), etc)
- Oracle Graal CE/EE 8+
To run the benchmark, first create a binary:
```shell
> ./sbt assembly
```
The binary will be located at `./target/scala-3.0.0/scala-stream.jar`. Run it with:
```shell
> java -version
openjdk version "11.0.11" 2021-04-20
OpenJDK Runtime Environment 18.9 (build 11.0.11+9)
OpenJDK 64-Bit Server VM 18.9 (build 11.0.11+9, mixed mode, sharing)
> java -jar target/scala-3.0.0/scala-stream.jar --help
```
For best results, benchmark with the following JVM flags:
```
-XX:-UseOnStackReplacement # disable OSR, not useful for this benchmark as we are measuring peak performance
-XX:-TieredCompilation # disable C1, go straight to C2
-XX:ReservedCodeCacheSize=512m # don't flush compiled code out of cache at any point
```
Worked example:
```shell
> java -XX:-UseOnStackReplacement -XX:-TieredCompilation -XX:ReservedCodeCacheSize=512m -jar target/scala-3.0.0/scala-stream.jar
BabelStream
Version: 3.4.0
Implementation: Scala Parallel Collections; Scala (Java 11.0.11; Red Hat, Inc.; home=/usr/lib/jvm/java-11-openjdk-11.0.11.0.9-2.fc33.x86_64)
Running kernels 100 times
Precision: double
Array size: 268.4 MB (=0.3 GB)
Total size: 805.3 MB (=0.8 GB)
Function    MBytes/sec  Min (sec)  Max      Average
Copy        4087.077    0.13136    0.24896  0.15480
Mul         2934.709    0.18294    0.28706  0.21627
Add         3016.342    0.26698    0.39835  0.31119
Triad       3016.496    0.26697    0.37612  0.31040
Dot         2216.096    0.24226    0.41235  0.28264
```
### Graal Native Image
The port has partial support for Graal Native Image; to generate one, run:
```shell
> ./sbt nativeImage
```
The ELF binary will be located at `./target/native-image/scala-stream`; the binary is relocatable
and should run on other machines with the same architecture as the one it was built on.
There's an ongoing bug with Scala 3's use of `lazy val`s where the program crashes at the declaration
site. Currently, Scala Parallel Collections uses this feature internally, so selecting this device
will crash at runtime.
The bug originates from the use of `Unsafe` in `lazy val` for thread safety guarantees. It seems
that Graal only supports limited uses of this JVM implementation detail, and Scala 3 happens to be on
the unsupported side.

scala-stream/build.sbt Normal file

@ -0,0 +1,29 @@
lazy val mainCls = Some("scalastream.App")
lazy val root = (project in file("."))
.enablePlugins(NativeImagePlugin)
.settings(
scalaVersion := "3.0.0",
version := "3.4.0",
organization := "uk.ac.bristol.uob-hpc",
organizationName := "University of Bristol",
Compile / mainClass := mainCls,
assembly / mainClass := mainCls,
scalacOptions ~= filterConsoleScalacOptions,
assembly / assemblyJarName := "scala-stream.jar",
nativeImageOptions := Seq(
"--no-fallback",
"-H:ReflectionConfigurationFiles=../../reflect-config.json"
),
nativeImageVersion := "21.1.0",
(Global / excludeLintKeys) += nativeImageVersion,
name := "scala-stream",
libraryDependencies ++= Seq(
// Lazy val implementation in Scala 3 triggers an exception in nativeImage; use the 2_13 cross-build for arg parsing for now, otherwise we can't get to the benchmarking part
("com.github.scopt" %% "scopt" % "4.0.1").cross(CrossVersion.for3Use2_13),
// par also uses lazy val at some point, so it doesn't work in nativeImage
"org.scala-lang.modules" %% "scala-parallel-collections" % "1.0.3",
"net.openhft" % "affinity" % "3.21ea1",
"org.slf4j" % "slf4j-simple" % "1.7.30" // for affinity
)
)


@ -0,0 +1 @@
sbt.version=1.5.2


@ -0,0 +1,6 @@
addSbtPlugin("com.timushev.sbt" % "sbt-updates" % "0.5.3")
addSbtPlugin("io.github.davidgregory084" % "sbt-tpolecat" % "0.1.17")
addSbtPlugin("org.scalameta" % "sbt-native-image" % "0.3.0")
addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.15.0")
addSbtPlugin("ch.epfl.scala" % "sbt-scalafix" % "0.9.27")
addSbtPlugin("org.scalameta" % "sbt-scalafmt" % "2.4.2")


@ -0,0 +1,11 @@
[
{
"name": "sun.misc.Unsafe",
"fields": [
{
"name": "theUnsafe",
"allowUnsafeAccess": true
}
]
}
]

scala-stream/sbt Executable file

@ -0,0 +1,3 @@
#!/usr/bin/env bash
./sbt-dist/bin/sbt "$@"

scala-stream/sbt-dist/bin/sbt Executable file

@ -0,0 +1,177 @@
#!/usr/bin/env bash
### ------------------------------- ###
### Helper methods for BASH scripts ###
### ------------------------------- ###
realpath () {
(
TARGET_FILE="$1"
FIX_CYGPATH="$2"
cd "$(dirname "$TARGET_FILE")"
TARGET_FILE=$(basename "$TARGET_FILE")
COUNT=0
while [ -L "$TARGET_FILE" -a $COUNT -lt 100 ]
do
TARGET_FILE=$(readlink "$TARGET_FILE")
cd "$(dirname "$TARGET_FILE")"
TARGET_FILE=$(basename "$TARGET_FILE")
COUNT=$(($COUNT + 1))
done
# make sure we grab the actual windows path, instead of cygwin's path.
if [[ "x$FIX_CYGPATH" != "x" ]]; then
echo "$(cygwinpath "$(pwd -P)/$TARGET_FILE")"
else
echo "$(pwd -P)/$TARGET_FILE"
fi
)
}
# Uses uname to detect if we're in the odd cygwin environment.
is_cygwin() {
local os=$(uname -s)
case "$os" in
CYGWIN*) return 0 ;;
MINGW*) return 0 ;;
MSYS*) return 0 ;;
*) return 1 ;;
esac
}
# TODO - Use nicer bash-isms here.
CYGWIN_FLAG=$(if is_cygwin; then echo true; else echo false; fi)
# This can fix cygwin style /cygdrive paths so we get the
# windows style paths.
cygwinpath() {
local file="$1"
if [[ "$CYGWIN_FLAG" == "true" ]]; then
echo $(cygpath -w $file)
else
echo $file
fi
}
. "$(dirname "$(realpath "$0")")/sbt-launch-lib.bash"
declare -r noshare_opts="-Dsbt.global.base=project/.sbtboot -Dsbt.boot.directory=project/.boot -Dsbt.ivy.home=project/.ivy"
declare -r sbt_opts_file=".sbtopts"
declare -r etc_sbt_opts_file="/etc/sbt/sbtopts"
declare -r dist_sbt_opts_file="${sbt_home}/conf/sbtopts"
declare -r win_sbt_opts_file="${sbt_home}/conf/sbtconfig.txt"
usage() {
cat <<EOM
Usage: `basename "$0"` [options]
-h | -help print this message
-v | -verbose this runner is chattier
-d | -debug set sbt log level to debug
-no-colors disable ANSI color codes
-sbt-create start sbt even if current directory contains no sbt project
-sbt-dir <path> path to global settings/plugins directory (default: ~/.sbt)
-sbt-boot <path> path to shared boot directory (default: ~/.sbt/boot in 0.11 series)
-ivy <path> path to local Ivy repository (default: ~/.ivy2)
-mem <integer> set memory options (default: $sbt_default_mem, which is $(get_mem_opts))
-no-share use all local caches; no sharing
-no-global uses global caches, but does not use global ~/.sbt directory.
-jvm-debug <port> Turn on JVM debugging, open at the given port.
-batch Disable interactive mode
# sbt version (default: from project/build.properties if present, else latest release)
-sbt-version <version> use the specified version of sbt
-sbt-jar <path> use the specified jar as the sbt launcher
-sbt-rc use an RC version of sbt
-sbt-snapshot use a snapshot version of sbt
# java version (default: java from PATH, currently $(java -version 2>&1 | grep version))
-java-home <path> alternate JAVA_HOME
# jvm options and output control
JAVA_OPTS environment variable, if unset uses "$java_opts"
.jvmopts if this file exists in the current directory, its contents
are appended to JAVA_OPTS
SBT_OPTS environment variable, if unset uses "$default_sbt_opts"
.sbtopts if this file exists in the current directory, its contents
are prepended to the runner args
/etc/sbt/sbtopts if this file exists, it is prepended to the runner args
-Dkey=val pass -Dkey=val directly to the java runtime
-J-X pass option -X directly to the java runtime
(-J is stripped)
-S-X add -X to sbt's scalacOptions (-S is stripped)
In the case of duplicated or conflicting options, the order above
shows precedence: JAVA_OPTS lowest, command line options highest.
EOM
}
process_my_args () {
while [[ $# -gt 0 ]]; do
case "$1" in
-no-colors) addJava "-Dsbt.log.noformat=true" && shift ;;
-no-share) addJava "$noshare_opts" && shift ;;
-no-global) addJava "-Dsbt.global.base=$(pwd)/project/.sbtboot" && shift ;;
-sbt-boot) require_arg path "$1" "$2" && addJava "-Dsbt.boot.directory=$2" && shift 2 ;;
-sbt-dir) require_arg path "$1" "$2" && addJava "-Dsbt.global.base=$2" && shift 2 ;;
-debug-inc) addJava "-Dxsbt.inc.debug=true" && shift ;;
-batch) exec </dev/null && shift ;;
-sbt-create) sbt_create=true && shift ;;
new) sbt_new=true && addResidual "$1" && shift ;;
*) addResidual "$1" && shift ;;
esac
done
# Now, ensure sbt version is used.
[[ "${sbt_version}XXX" != "XXX" ]] && addJava "-Dsbt.version=$sbt_version"
# Confirm a user's intent if the current directory does not look like an sbt
# top-level directory and neither the -sbt-create option nor the "new"
# command was given.
[[ -f ./build.sbt || -d ./project || -n "$sbt_create" || -n "$sbt_new" ]] || {
echo "[warn] Neither build.sbt nor a 'project' directory in the current directory: $(pwd)"
while true; do
echo 'c) continue'
echo 'q) quit'
read -p '? ' || exit 1
case "$REPLY" in
c|C) break ;;
q|Q) exit 1 ;;
esac
done
}
}
loadConfigFile() {
# Make sure the last line is read even if it doesn't have a terminating \n
cat "$1" | sed $'/^\#/d;s/\r$//' | while read -r line || [[ -n "$line" ]]; do
eval echo $line
done
}
# Here we pull in the default settings configuration.
[[ -f "$dist_sbt_opts_file" ]] && set -- $(loadConfigFile "$dist_sbt_opts_file") "$@"
# Here we pull in the global settings configuration.
[[ -f "$etc_sbt_opts_file" ]] && set -- $(loadConfigFile "$etc_sbt_opts_file") "$@"
# Pull in the project-level config file, if it exists.
[[ -f "$sbt_opts_file" ]] && set -- $(loadConfigFile "$sbt_opts_file") "$@"
# Pull in the project-level java config, if it exists.
[[ -f ".jvmopts" ]] && export JAVA_OPTS="$JAVA_OPTS $(loadConfigFile .jvmopts)"
run "$@"


@ -0,0 +1,363 @@
#!/usr/bin/env bash
#
# A library to simplify using the SBT launcher from other packages.
# Note: This should be used by tools like giter8/conscript etc.
# TODO - Should we merge the main SBT script with this library?
declare -a residual_args
declare -a java_args
declare -a scalac_args
declare -a sbt_commands
declare java_cmd=java
declare java_version
declare init_sbt_version="1.2.8"
declare sbt_default_mem=1024
declare SCRIPT=$0
while [ -h "$SCRIPT" ] ; do
ls=$(ls -ld "$SCRIPT")
# Drop everything prior to ->
link=$(expr "$ls" : '.*-> \(.*\)$')
if expr "$link" : '/.*' > /dev/null; then
SCRIPT="$link"
else
SCRIPT=$(dirname "$SCRIPT")/"$link"
fi
done
declare -r sbt_bin_dir="$(dirname "$SCRIPT")"
declare -r sbt_home="$(dirname "$sbt_bin_dir")"
echoerr () {
echo 1>&2 "$@"
}
vlog () {
[[ $verbose || $debug ]] && echoerr "$@"
}
dlog () {
[[ $debug ]] && echoerr "$@"
}
jar_file () {
echo "$(cygwinpath "${sbt_home}/bin/sbt-launch.jar")"
}
acquire_sbt_jar () {
sbt_jar="$(jar_file)"
if [[ ! -f "$sbt_jar" ]]; then
echoerr "Could not find launcher jar: $sbt_jar"
exit 2
fi
}
rt_export_file () {
echo "${sbt_bin_dir}/java9-rt-export.jar"
}
execRunner () {
# print the arguments one to a line, quoting any containing spaces
[[ $verbose || $debug ]] && echo "# Executing command line:" && {
for arg; do
if printf "%s\n" "$arg" | grep -q ' '; then
printf "\"%s\"\n" "$arg"
else
printf "%s\n" "$arg"
fi
done
echo ""
}
# This used to be exec, but we lose the ability to re-hook stty then
# for cygwin... Maybe we should flag the feature here...
"$@"
}
addJava () {
dlog "[addJava] arg = '$1'"
java_args=( "${java_args[@]}" "$1" )
}
addSbt () {
dlog "[addSbt] arg = '$1'"
sbt_commands=( "${sbt_commands[@]}" "$1" )
}
addResidual () {
dlog "[residual] arg = '$1'"
residual_args=( "${residual_args[@]}" "$1" )
}
addDebugger () {
addJava "-agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=$1"
}
get_mem_opts () {
# if we detect any of these settings in ${JAVA_OPTS} or ${JAVA_TOOL_OPTIONS} we need to NOT output our settings.
# The reason is that the Xms/Xmx settings, if they don't line up, cause errors.
if [[ "${JAVA_OPTS}" == *-Xmx* ]] || [[ "${JAVA_OPTS}" == *-Xms* ]] || [[ "${JAVA_OPTS}" == *-XX:MaxPermSize* ]] || [[ "${JAVA_OPTS}" == *-XX:MaxMetaspaceSize* ]] || [[ "${JAVA_OPTS}" == *-XX:ReservedCodeCacheSize* ]]; then
echo ""
elif [[ "${JAVA_TOOL_OPTIONS}" == *-Xmx* ]] || [[ "${JAVA_TOOL_OPTIONS}" == *-Xms* ]] || [[ "${JAVA_TOOL_OPTIONS}" == *-XX:MaxPermSize* ]] || [[ "${JAVA_TOOL_OPTIONS}" == *-XX:MaxMetaspaceSize* ]] || [[ "${JAVA_TOOL_OPTIONS}" == *-XX:ReservedCodeCacheSize* ]]; then
echo ""
elif [[ "${SBT_OPTS}" == *-Xmx* ]] || [[ "${SBT_OPTS}" == *-Xms* ]] || [[ "${SBT_OPTS}" == *-XX:MaxPermSize* ]] || [[ "${SBT_OPTS}" == *-XX:MaxMetaspaceSize* ]] || [[ "${SBT_OPTS}" == *-XX:ReservedCodeCacheSize* ]]; then
echo ""
else
# a ham-fisted attempt to move some memory settings in concert
# so they need not be messed around with individually.
local mem=${1:-$sbt_default_mem}
local codecache=$(( $mem / 8 ))
(( $codecache > 128 )) || codecache=128
(( $codecache < 512 )) || codecache=512
local class_metadata_size=$(( $codecache * 2 ))
if [[ -z $java_version ]]; then
java_version=$(jdk_version)
fi
local class_metadata_opt=$((( $java_version < 8 )) && echo "MaxPermSize" || echo "MaxMetaspaceSize")
local arg_xms=$([[ "${java_args[@]}" == *-Xms* ]] && echo "" || echo "-Xms${mem}m")
local arg_xmx=$([[ "${java_args[@]}" == *-Xmx* ]] && echo "" || echo "-Xmx${mem}m")
local arg_rccs=$([[ "${java_args[@]}" == *-XX:ReservedCodeCacheSize* ]] && echo "" || echo "-XX:ReservedCodeCacheSize=${codecache}m")
local arg_meta=$([[ "${java_args[@]}" == *-XX:${class_metadata_opt}* && ! (( $java_version < 8 )) ]] && echo "" || echo "-XX:${class_metadata_opt}=${class_metadata_size}m")
echo "${arg_xms} ${arg_xmx} ${arg_rccs} ${arg_meta}"
fi
}
get_gc_opts () {
local older_than_9=$(( $java_version < 9 ))
if [[ "$older_than_9" == "1" ]]; then
# don't need to worry about gc
echo ""
elif [[ "${JAVA_OPTS}" =~ Use.*GC ]] || [[ "${JAVA_TOOL_OPTIONS}" =~ Use.*GC ]] || [[ "${SBT_OPTS}" =~ Use.*GC ]] ; then
# GC arg has been passed in - don't change
echo ""
else
# Java 9+ so revert to old
echo "-XX:+UseParallelGC"
fi
}
require_arg () {
local type="$1"
local opt="$2"
local arg="$3"
if [[ -z "$arg" ]] || [[ "${arg:0:1}" == "-" ]]; then
echo "$opt requires <$type> argument"
exit 1
fi
}
is_function_defined() {
declare -f "$1" > /dev/null
}
# parses JDK version from the -version output line.
# 8 for 1.8.0_nn, 9 for 9-ea etc, and "no_java" for undetected
jdk_version() {
local result
local lines=$("$java_cmd" -Xms32M -Xmx32M -version 2>&1 | tr '\r' '\n')
local IFS=$'\n'
for line in $lines; do
if [[ (-z $result) && ($line = *"version \""*) ]]
then
local ver=$(echo $line | sed -e 's/.*version "\(.*\)"\(.*\)/\1/; 1q')
# on macOS sed doesn't support '?'
if [[ $ver = "1."* ]]
then
result=$(echo $ver | sed -e 's/1\.\([0-9]*\)\(.*\)/\1/; 1q')
else
result=$(echo $ver | sed -e 's/\([0-9]*\)\(.*\)/\1/; 1q')
fi
fi
done
if [[ -z $result ]]
then
result=no_java
fi
echo "$result"
}
process_args () {
while [[ $# -gt 0 ]]; do
case "$1" in
-h|-help) usage; exit 1 ;;
-v|-verbose) verbose=1 && shift ;;
-d|-debug) debug=1 && addSbt "-debug" && shift ;;
-ivy) require_arg path "$1" "$2" && addJava "-Dsbt.ivy.home=$2" && shift 2 ;;
-mem) require_arg integer "$1" "$2" && sbt_mem="$2" && shift 2 ;;
-jvm-debug) require_arg port "$1" "$2" && addDebugger $2 && shift 2 ;;
-batch) exec </dev/null && shift ;;
-sbt-jar) require_arg path "$1" "$2" && sbt_jar="$2" && shift 2 ;;
-sbt-version) require_arg version "$1" "$2" && sbt_version="$2" && shift 2 ;;
-java-home) require_arg path "$1" "$2" &&
java_cmd="$2/bin/java" &&
export JAVA_HOME="$2" &&
export JDK_HOME="$2" &&
export PATH="$2/bin:$PATH" &&
shift 2 ;;
"-D*") addJava "$1" && shift ;;
-J*) addJava "${1:2}" && shift ;;
*) addResidual "$1" && shift ;;
esac
done
is_function_defined process_my_args && {
myargs=("${residual_args[@]}")
residual_args=()
process_my_args "${myargs[@]}"
}
java_version="$(jdk_version)"
vlog "[process_args] java_version = '$java_version'"
}
# Extracts the preloaded directory from either -Dsbt.preloaded or -Dsbt.global.base
# properties by looking at:
# - _JAVA_OPTIONS environment variable,
# - SBT_OPTS environment variable,
# - JAVA_OPTS environment variable and
# - properties set by command-line options
# in that order. The last one will be chosen such that `sbt.preloaded` is
# always preferred over `sbt.global.base`.
getPreloaded() {
local -a _java_options_array
local -a sbt_opts_array
local -a java_opts_array
read -a _java_options_array <<< "$_JAVA_OPTIONS"
read -a sbt_opts_array <<< "$SBT_OPTS"
read -a java_opts_array <<< "$JAVA_OPTS"
local args_to_check=(
"${_java_options_array[@]}"
"${sbt_opts_array[@]}"
"${java_opts_array[@]}"
"${java_args[@]}")
local via_global_base="$HOME/.sbt/preloaded"
local via_explicit=""
for opt in "${args_to_check[@]}"; do
if [[ "$opt" == -Dsbt.preloaded=* ]]; then
via_explicit="${opt#-Dsbt.preloaded=}"
elif [[ "$opt" == -Dsbt.global.base=* ]]; then
via_global_base="${opt#-Dsbt.global.base=}/preloaded"
fi
done
echo "${via_explicit:-${via_global_base}}"
}
syncPreloaded() {
local source_preloaded="$sbt_home/lib/local-preloaded/"
local target_preloaded="$(getPreloaded)"
if [[ "$init_sbt_version" == "" ]]; then
# FIXME: better $init_sbt_version detection
init_sbt_version="$(ls -1 "$source_preloaded/org.scala-sbt/sbt/")"
fi
[[ -f "$target_preloaded/org.scala-sbt/sbt/$init_sbt_version/jars/sbt.jar" ]] || {
# lib/local-preloaded exists (This is optional)
[[ -d "$source_preloaded" ]] && {
command -v rsync >/dev/null 2>&1 && {
mkdir -p "$target_preloaded"
rsync -a --ignore-existing "$source_preloaded" "$target_preloaded"
}
}
}
}
# Detect that we have java installed.
checkJava() {
local required_version="$1"
# Now check to see if it's a good enough version
local good_enough="$(expr $java_version ">=" $required_version)"
if [[ "$java_version" == "" ]]; then
echo
echo "No Java Development Kit (JDK) installation was detected."
echo Please go to http://www.oracle.com/technetwork/java/javase/downloads/ and download.
echo
exit 1
elif [[ "$good_enough" != "1" ]]; then
echo
echo "The Java Development Kit (JDK) installation you have is not up to date."
echo $script_name requires at least version $required_version+, you have
echo version $java_version
echo
echo Please go to http://www.oracle.com/technetwork/java/javase/downloads/ and download
echo a valid JDK and install before running $script_name.
echo
exit 1
fi
}
copyRt() {
local at_least_9="$(expr $java_version ">=" 9)"
if [[ "$at_least_9" == "1" ]]; then
rtexport=$(rt_export_file)
# The grep for java9-rt-ext- matches the filename prefix printed in Export.java
java9_ext=$("$java_cmd" ${JAVA_OPTS} ${SBT_OPTS:-$default_sbt_opts} ${java_args[@]} \
-jar "$rtexport" --rt-ext-dir | grep java9-rt-ext-)
java9_rt=$(echo "$java9_ext/rt.jar")
vlog "[copyRt] java9_rt = '$java9_rt'"
if [[ ! -f "$java9_rt" ]]; then
echo Copying runtime jar.
mkdir -p "$java9_ext"
execRunner "$java_cmd" \
${JAVA_OPTS} \
${SBT_OPTS:-$default_sbt_opts} \
${java_args[@]} \
-jar "$rtexport" \
"${java9_rt}"
fi
addJava "-Dscala.ext.dirs=${java9_ext}"
fi
}
run() {
# process the combined args, then reset "$@" to the residuals
process_args "$@"
set -- "${residual_args[@]}"
argumentCount=$#
# Copy preloaded repo to user's preloaded directory
syncPreloaded
# no jar? download it.
[[ -f "$sbt_jar" ]] || acquire_sbt_jar "$sbt_version" || {
# still no jar? uh-oh.
echo "Download failed. Obtain the sbt-launch.jar manually and place it at $sbt_jar"
exit 1
}
# TODO - java check should be configurable...
checkJava "6"
# Java 9 support
copyRt
#If we're in cygwin, we should use the windows config, and terminal hacks
if [[ "$CYGWIN_FLAG" == "true" ]]; then
stty -icanon min 1 -echo > /dev/null 2>&1
addJava "-Djline.terminal=jline.UnixTerminal"
addJava "-Dsbt.cygwin=true"
fi
# run sbt
execRunner "$java_cmd" \
$(get_mem_opts $sbt_mem) \
$(get_gc_opts) \
${JAVA_OPTS} \
${SBT_OPTS:-$default_sbt_opts} \
${java_args[@]} \
-jar "$sbt_jar" \
"${sbt_commands[@]}" \
"${residual_args[@]}"
exit_code=$?
# Clean up the terminal from cygwin hacks.
if [[ "$CYGWIN_FLAG" == "true" ]]; then
stty icanon echo > /dev/null 2>&1
fi
exit $exit_code
}


@ -0,0 +1,212 @@
@REM SBT launcher script
@REM
@REM Environment:
@REM JAVA_HOME - location of a JDK home dir (mandatory)
@REM SBT_OPTS - JVM options (optional)
@REM Configuration:
@REM sbtconfig.txt found in the SBT_HOME.
@REM ZOMG! We need delayed expansion to build up CFG_OPTS later
@setlocal enabledelayedexpansion
@echo off
set SBT_HOME=%~dp0
set SBT_ARGS=
rem FIRST we load the config file of extra options.
set FN=%SBT_HOME%\..\conf\sbtconfig.txt
set CFG_OPTS=
FOR /F "tokens=* eol=# usebackq delims=" %%i IN ("%FN%") DO (
set DO_NOT_REUSE_ME=%%i
rem ZOMG (Part #2) WE use !! here to delay the expansion of
rem CFG_OPTS, otherwise it remains "" for this loop.
set CFG_OPTS=!CFG_OPTS! !DO_NOT_REUSE_ME!
)
rem poor man's jenv (which is not available on Windows)
IF DEFINED JAVA_HOMES (
IF EXIST .java-version FOR /F %%A IN (.java-version) DO (
SET JAVA_HOME=%JAVA_HOMES%\%%A
SET JDK_HOME=%JAVA_HOMES%\%%A
)
)
rem must set PATH or wrong javac is used for java projects
IF DEFINED JAVA_HOME SET "PATH=%JAVA_HOME%\bin;%PATH%"
rem users can set JAVA_OPTS via .jvmopts (sbt-extras style)
IF EXIST .jvmopts FOR /F %%A IN (.jvmopts) DO (
SET _jvmopts_line=%%A
IF NOT "!_jvmopts_line:~0,1!"=="#" (
SET JAVA_OPTS=%%A !JAVA_OPTS!
)
)
rem We use the value of the JAVACMD environment variable if defined
set _JAVACMD=%JAVACMD%
if "%_JAVACMD%"=="" (
if not "%JAVA_HOME%"=="" (
if exist "%JAVA_HOME%\bin\java.exe" set "_JAVACMD=%JAVA_HOME%\bin\java.exe"
)
)
if "%_JAVACMD%"=="" set _JAVACMD=java
rem We use the value of the JAVA_OPTS environment variable if defined, rather than the config.
set _JAVA_OPTS=%JAVA_OPTS%
if "%_JAVA_OPTS%"=="" set _JAVA_OPTS=%CFG_OPTS%
set INIT_SBT_VERSION=1.2.8
:args_loop
if "%~1" == "" goto args_end
if "%~1" == "-jvm-debug" (
set JVM_DEBUG=true
set /a JVM_DEBUG_PORT=5005 2>nul >nul
) else if "!JVM_DEBUG!" == "true" (
set /a JVM_DEBUG_PORT=%1 2>nul >nul
if not "%~1" == "!JVM_DEBUG_PORT!" (
set SBT_ARGS=!SBT_ARGS! %1
)
) else if /I "%~1" == "new" (
set sbt_new=true
set SBT_ARGS=!SBT_ARGS! %1
) else (
set SBT_ARGS=!SBT_ARGS! %1
)
shift
goto args_loop
:args_end
rem Confirm a user's intent if the current directory does not look like an sbt
rem top-level directory and the "new" command was not given.
if not exist build.sbt (
if not exist project\ (
if not defined sbt_new (
echo [warn] Neither build.sbt nor a 'project' directory in the current directory: %CD%
setlocal
:confirm
echo c^) continue
echo q^) quit
set /P reply=?^
if /I "!reply!" == "c" (
goto confirm_end
) else if /I "!reply!" == "q" (
exit /B 1
)
goto confirm
:confirm_end
endlocal
)
)
)
call :process
call :checkjava
call :copyrt
if defined JVM_DEBUG_PORT (
set _JAVA_OPTS=!_JAVA_OPTS! -agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=!JVM_DEBUG_PORT!
)
call :sync_preloaded
call :run %SBT_ARGS%
if ERRORLEVEL 1 goto error
goto end
:run
"%_JAVACMD%" %_JAVA_OPTS% %SBT_OPTS% -cp "%SBT_HOME%sbt-launch.jar" xsbt.boot.Boot %*
goto :eof
:process
rem Parses x out of 1.x; for example 8 out of java version 1.8.0_xx
rem Otherwise, parses the major version; 9 out of java version 9-ea
set JAVA_VERSION=0
for /f "tokens=3" %%g in ('"%_JAVACMD%" -Xms32M -Xmx32M -version 2^>^&1 ^| findstr /i version') do (
set JAVA_VERSION=%%g
)
set JAVA_VERSION=%JAVA_VERSION:"=%
for /f "delims=.-_ tokens=1-2" %%v in ("%JAVA_VERSION%") do (
if /I "%%v" EQU "1" (
set JAVA_VERSION=%%w
) else (
set JAVA_VERSION=%%v
)
)
exit /B 0
:checkjava
set required_version=6
if /I %JAVA_VERSION% GEQ %required_version% (
exit /B 0
)
echo.
echo The Java Development Kit (JDK) installation you have is not up to date.
echo sbt requires at least version %required_version%+, you have
echo version %JAVA_VERSION%
echo.
echo Please go to http://www.oracle.com/technetwork/java/javase/downloads/ and download
echo a valid JDK and install before running sbt.
echo.
exit /B 1
:copyrt
if /I %JAVA_VERSION% GEQ 9 (
set rtexport=!SBT_HOME!java9-rt-export.jar
"%_JAVACMD%" %_JAVA_OPTS% %SBT_OPTS% -jar "!rtexport!" --rt-ext-dir > "%TEMP%.\rtext.txt"
set /p java9_ext= < "%TEMP%.\rtext.txt"
set java9_rt=!java9_ext!\rt.jar
if not exist "!java9_rt!" (
mkdir "!java9_ext!"
"%_JAVACMD%" %_JAVA_OPTS% %SBT_OPTS% -jar "!rtexport!" "!java9_rt!"
)
set _JAVA_OPTS=!_JAVA_OPTS! -Dscala.ext.dirs="!java9_ext!"
rem check to see if a GC has been set in the opts
echo !_JAVA_OPTS! | findstr /r "Use.*GC" >nul
if ERRORLEVEL 1 (
rem don't have a GC set - revert to old GC
set _JAVA_OPTS=!_JAVA_OPTS! -XX:+UseParallelGC
)
)
exit /B 0
:sync_preloaded
if "%INIT_SBT_VERSION%"=="" (
rem FIXME: better %INIT_SBT_VERSION% detection
FOR /F "tokens=* USEBACKQ" %%F IN (`dir /b "%SBT_HOME%\..\lib\local-preloaded\org.scala-sbt\sbt" /B`) DO (
SET INIT_SBT_VERSION=%%F
)
)
set PRELOAD_SBT_JAR="%UserProfile%\.sbt\preloaded\org.scala-sbt\sbt\%INIT_SBT_VERSION%\jars\sbt.jar"
if /I %JAVA_VERSION% GEQ 8 (
where robocopy >nul 2>nul
if %ERRORLEVEL% equ 0 (
REM echo %PRELOAD_SBT_JAR%
if not exist %PRELOAD_SBT_JAR% (
if exist "%SBT_HOME%\..\lib\local-preloaded\" (
echo "about to robocopy"
robocopy "%SBT_HOME%\..\lib\local-preloaded" "%UserProfile%\.sbt\preloaded" /E
)
)
)
)
exit /B 0
:error
@endlocal
exit /B 1
:end
@endlocal
exit /B 0


@ -0,0 +1,14 @@
# Set the java args to high
-Xmx512M
-XX:MaxPermSize=256m
-XX:ReservedCodeCacheSize=128m
# Set the extra SBT options
-Dsbt.log.format=true


@ -0,0 +1,49 @@
# ------------------------------------------------ #
# The SBT Configuration file. #
# ------------------------------------------------ #
# Disable ANSI color codes
#
#-no-colors
# Starts sbt even if the current directory contains no sbt project.
#
-sbt-create
# Path to global settings/plugins directory (default: ~/.sbt)
#
#-sbt-dir /etc/sbt
# Path to shared boot directory (default: ~/.sbt/boot in 0.11 series)
#
#-sbt-boot ~/.sbt/boot
# Path to local Ivy repository (default: ~/.ivy2)
#
#-ivy ~/.ivy2
# set memory options
#
#-mem <integer>
# Use local caches for projects, no sharing.
#
#-no-share
# Put SBT in offline mode.
#
#-offline
# Sets the SBT version to use.
#-sbt-version 0.11.3
# Scala version (default: latest release)
#
#-scala-home <path>
#-scala-version <version>
# java version (default: java from PATH, currently $(java -version |& grep version))
#
#-java-home <path>


@ -0,0 +1,44 @@
package scalastream
import scalastream.App.{Config, Data}
import scala.collection.immutable.ArraySeq
import scala.reflect.{ClassTag, classTag}
class J8SStream[@specialized(Float, Double) A: Fractional: ClassTag](val config: Config[A])
extends ScalaStream[A]:
private var a: Array[A] = _
private var b: Array[A] = _
private var c: Array[A] = _
private val scalar: A = config.scalar
inline private def stream =
java.util.stream.IntStream.range(0, config.options.arraysize).parallel()
override inline def initArrays(): Unit =
a = Array.ofDim(config.options.arraysize)
b = Array.ofDim(config.options.arraysize)
c = Array.ofDim(config.options.arraysize)
stream.forEach { i =>
a(i) = config.init._1
b(i) = config.init._2
c(i) = config.init._3
}
override inline def copy(): Unit = stream.forEach(i => c(i) = a(i))
override inline def mul(): Unit = stream.forEach(i => b(i) = scalar * c(i))
override inline def add(): Unit = stream.forEach(i => c(i) = a(i) + b(i))
override inline def triad(): Unit = stream.forEach(i => a(i) = b(i) + scalar * c(i))
override inline def nstream(): Unit = stream.forEach(i => a(i) = b(i) * scalar * c(i))
override inline def dot(): A =
// horrible special-case for double, there isn't a mapToFloat so we give up on that
val cls = classTag[A].runtimeClass
if java.lang.Double.TYPE == cls then
stream
.mapToDouble(i => (a(i) * b(i)).asInstanceOf[Double])
.reduce(0, (l: Double, r: Double) => l + r)
.asInstanceOf[A]
else stream.mapToObj[A](i => a(i) * b(i)).reduce(0.fractional, (l: A, r: A) => l + r)
override inline def data(): Data[A] = Data(a.to(ArraySeq), b.to(ArraySeq), c.to(ArraySeq))
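
As a minimal standalone sketch of the parallel dot-product pattern that J8SStream.dot relies on (the object and method names here are illustrative, not part of the patch):

// Illustrative sketch only: parallel dot product over two Double arrays via
// java.util.stream, mirroring the mapToDouble/reduce path used above.
object DotSketch:
  def dot(a: Array[Double], b: Array[Double]): Double =
    java.util.stream.IntStream
      .range(0, a.length)
      .parallel()
      .mapToDouble(i => a(i) * b(i)) // IntStream offers mapToDouble but no mapToFloat
      .reduce(0.0, (l, r) => l + r)  // associative sum, safe under parallel execution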

View File

@ -0,0 +1,36 @@
package scalastream
import scalastream.App.{Config, Data}
import scala.collection.immutable.ArraySeq
import scala.collection.parallel.CollectionConverters._
import scala.reflect.ClassTag
class ParStream[@specialized(Float, Double) A: Fractional: ClassTag](val config: Config[A])
extends ScalaStream[A]:
private var a: Array[A] = _
private var b: Array[A] = _
private var c: Array[A] = _
private val scalar: A = config.scalar
inline private def indices = (0 until config.options.arraysize).par
override inline def initArrays(): Unit =
a = Array.ofDim(config.options.arraysize)
b = Array.ofDim(config.options.arraysize)
c = Array.ofDim(config.options.arraysize)
for i <- indices do
a(i) = config.init._1
b(i) = config.init._2
c(i) = config.init._3
override inline def copy(): Unit = for i <- indices do c(i) = a(i)
override inline def mul(): Unit = for i <- indices do b(i) = scalar * c(i)
override inline def add(): Unit = for i <- indices do c(i) = a(i) + b(i)
override inline def triad(): Unit = for i <- indices do a(i) = b(i) + scalar * c(i)
override inline def nstream(): Unit = for i <- indices do a(i) = b(i) * scalar * c(i)
override inline def dot(): A =
indices.aggregate[A](0.fractional)((acc, i) => acc + (a(i) * b(i)), _ + _)
override inline def data(): Data[A] = Data(a.to(ArraySeq), b.to(ArraySeq), c.to(ArraySeq))
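
The .par/aggregate combination above is the usual scala-parallel-collections reduction idiom; a minimal sketch with plain Doubles instead of the Fractional typeclass (the helper name is illustrative only):

// Illustrative sketch only: parallel fold over an index range with
// scala-parallel-collections, as in ParStream.dot but without the typeclass.
import scala.collection.parallel.CollectionConverters._

def dotPar(a: Array[Double], b: Array[Double]): Double =
  (0 until a.length).par
    .aggregate(0.0)((acc, i) => acc + a(i) * b(i), _ + _) // seqop per chunk, combop merges chunks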

View File

@ -0,0 +1,31 @@
package scalastream
import scalastream.App.{Config, Data}
import scala.collection.immutable.ArraySeq
import scala.reflect.ClassTag
class PlainStream[@specialized(Float, Double) A: Fractional: ClassTag](val config: Config[A])
extends ScalaStream[A]:
private var a: Array[A] = _
private var b: Array[A] = _
private var c: Array[A] = _
private val scalar: A = config.scalar
override inline def initArrays(): Unit =
a = Array.fill(config.options.arraysize)(config.init._1)
b = Array.fill(config.options.arraysize)(config.init._2)
c = Array.fill(config.options.arraysize)(config.init._3)
private inline def indices = 0 until config.options.arraysize
override inline def copy(): Unit = for i <- indices do c(i) = a(i)
override inline def mul(): Unit = for i <- indices do b(i) = scalar * c(i)
override inline def add(): Unit = for i <- indices do c(i) = a(i) + b(i)
override inline def triad(): Unit = for i <- indices do a(i) = b(i) + (scalar * c(i))
override inline def nstream(): Unit = for i <- indices do a(i) = b(i) * scalar * c(i)
override inline def dot(): A =
var acc: A = 0.fractional
for i <- indices do acc = acc + (a(i) * b(i))
acc
override inline def data(): Data[A] = Data(a.to(ArraySeq), b.to(ArraySeq), c.to(ArraySeq))

View File

@ -0,0 +1,369 @@
package scalastream
import scalastream.App.{Config, Data, Timings}
import java.util.concurrent.TimeUnit
import scala.collection.immutable.ArraySeq
import scala.collection.mutable.ArrayBuffer
import scala.concurrent.duration.{Duration, FiniteDuration, SECONDS}
import scala.math.{Pi, pow}
import scala.reflect.ClassTag
import scopt.OParser
transparent trait ScalaStream[@specialized(Float, Double) A]:
def config: Config[A]
def initArrays(): Unit
def copy(): Unit
def mul(): Unit
def add(): Unit
def triad(): Unit
def nstream(): Unit
def dot(): A
transparent inline def timed[R](f: => R): (FiniteDuration, R) =
val start = System.nanoTime()
val r = f
val end = System.nanoTime()
FiniteDuration(end - start, TimeUnit.NANOSECONDS) -> r
inline def runAll(times: Int)(using Fractional[A]): (Timings[Vector[FiniteDuration]], A) =
val copy = ArrayBuffer.fill[FiniteDuration](times)(Duration.Zero)
val mul = ArrayBuffer.fill[FiniteDuration](times)(Duration.Zero)
val add = ArrayBuffer.fill[FiniteDuration](times)(Duration.Zero)
val triad = ArrayBuffer.fill[FiniteDuration](times)(Duration.Zero)
val dot = ArrayBuffer.fill[FiniteDuration](times)(Duration.Zero)
var lastSum: A = 0.fractional
for i <- 0 until times do
copy(i) = timed(this.copy())._1
mul(i) = timed(this.mul())._1
add(i) = timed(this.add())._1
triad(i) = timed(this.triad())._1
val (dot_, sum) = timed(this.dot())
dot(i) = dot_
lastSum = sum
val s = lastSum
(
Timings(
copy = copy.toVector,
mul = mul.toVector,
add = add.toVector,
triad = triad.toVector,
dot = dot.toVector
),
s
)
def runTriad(times: Int): FiniteDuration = timed(for _ <- 0 until times do triad())._1
def runNStream(times: Int): Vector[FiniteDuration] = Vector.fill(times)(timed(nstream())._1)
def data(): Data[A]
trait Fractional[@specialized(Double, Float) A]:
def toFractional(f: Float): A
def toFractional(f: Double): A
def compare(x: A, y: A): Int
def add(x: A, y: A): A
def sub(x: A, y: A): A
def mul(x: A, y: A): A
def div(x: A, y: A): A
def abs(x: A): A
extension (x: Float) inline def fractional = toFractional(x)
extension (x: Double) inline def fractional = toFractional(x)
extension (x: Int) inline def fractional = toFractional(x.toFloat)
extension (x: Long) inline def fractional = toFractional(x.toDouble)
extension (x: A)
inline def +(y: A) = add(x, y)
inline def -(y: A) = sub(x, y)
inline def *(y: A) = mul(x, y)
inline def /(y: A) = div(x, y)
inline def >(y: A) = compare(x, y) > 0
inline def <(y: A) = compare(x, y) < 0
inline def abs_ = abs(x)
end Fractional
given FloatFractional: Fractional[Float] with
inline def toFractional(f: Float): Float = f
inline def toFractional(f: Double): Float = f.toFloat
inline def compare(x: Float, y: Float): Int = x.compare(y)
inline def add(x: Float, y: Float): Float = x + y
inline def sub(x: Float, y: Float): Float = x - y
inline def mul(x: Float, y: Float): Float = x * y
inline def div(x: Float, y: Float): Float = x / y
inline def abs(x: Float): Float = math.abs(x)
given DoubleFractional: Fractional[Double] with
inline def toFractional(f: Float): Double = f.toDouble
inline def toFractional(f: Double): Double = f
inline def compare(x: Double, y: Double): Int = x.compare(y)
inline def add(x: Double, y: Double): Double = x + y
inline def sub(x: Double, y: Double): Double = x - y
inline def mul(x: Double, y: Double): Double = x * y
inline def div(x: Double, y: Double): Double = x / y
inline def abs(x: Double): Double = math.abs(x)
object App:
final val Version: String = "3.4.0"
case class Config[@specialized(Double, Float) A](
options: Options,
benchmark: Benchmark,
typeSize: Int,
ulp: A,
scalar: A,
init: (A, A, A)
)
case class Timings[A](copy: A, mul: A, add: A, triad: A, dot: A)
case class Data[@specialized(Double, Float) A](a: ArraySeq[A], b: ArraySeq[A], c: ArraySeq[A])
case class Options(
list: Boolean = false,
device: Int = 0,
numtimes: Int = 100,
arraysize: Int = 33554432,
float: Boolean = false,
triad_only: Boolean = false,
nstream_only: Boolean = false,
csv: Boolean = false,
mibibytes: Boolean = false
)
object Options:
val Default = Options()
val builder = OParser.builder[Options]
val parser1 =
import builder._
OParser.sequence(
programName("scala-stream"),
head("ScalaStream", s"$Version"),
opt[Unit]('l', "list").text("List available devices").action((_, x) => x.copy(list = true)),
opt[Int]('d', "device")
.text(s"Select device at <device>, defaults to ${Default.device}")
.action((v, x) => x.copy(device = v)),
opt[Int]('n', "numtimes")
.text(s"Run the test <numtimes> times (NUM >= 2), defaults to ${Default.numtimes}")
.validate {
case n if n >= 2 => success
case n => failure(s"$n < 2")
}
.action((n, x) => x.copy(numtimes = n)),
opt[Int]('a', "arraysize")
.text(s"Use <arraysize> elements in the array, defaults to ${Default.arraysize}")
.action((v, x) => x.copy(arraysize = v)),
opt[Unit]('f', "float")
.text("Use floats (rather than doubles)")
.action((_, x) => x.copy(float = true)),
opt[Unit]('t', "triad_only")
.text("Only run triad")
.action((_, x) => x.copy(triad_only = true)),
opt[Unit]('n', "nstream_only")
.text("Only run nstream")
.action((_, x) => x.copy(nstream_only = true)),
opt[Unit]('c', "csv").text("Output as csv table").action((_, x) => x.copy(csv = true)),
opt[Unit]('m', "mibibytes")
.text("Use MiB=2^20 for bandwidth calculation (default MB=10^6)")
.action((_, x) => x.copy(mibibytes = true)),
help('h', "help").text("prints this usage text")
)
enum Benchmark:
case All, NStream, Triad
implicit class RichDuration(private val d: Duration) extends AnyVal:
def seconds: Double = d.toUnit(SECONDS)
def validate[A: Fractional](vec: Data[A], config: Config[A], dotSum: Option[A] = None): Unit =
var (goldA, goldB, goldC) = config.init
for _ <- 0 until config.options.numtimes do
config.benchmark match
case Benchmark.All =>
goldC = goldA
goldB = config.scalar * goldC
goldC = goldA + goldB
goldA = goldB + config.scalar * goldC
case Benchmark.Triad =>
goldA = goldB + config.scalar * goldC
case Benchmark.NStream =>
goldA += goldB + config.scalar * goldC
val tolerance = config.ulp * (100.fractional)
def validateXs(name: String, xs: Seq[A], from: A): Unit =
val error = xs.map(x => (x - from).abs_).fold(0.fractional)(_ + _) / xs.size.fractional
if error > tolerance then
Console.err.println(s"Validation failed on $name. Average error $error ")
validateXs("a", vec.a, goldA)
validateXs("b", vec.b, goldB)
validateXs("c", vec.c, goldC)
dotSum.foreach { sum =>
val goldSum = (goldA * goldB) * (config.options.arraysize).fractional
val error = ((sum - goldSum) / goldSum).abs_
if error > 1.fractional / 100000000.fractional then
Console.err.println(
s"Validation failed on sum. Error $error \nSum was $sum but should be $goldSum"
)
}
inline def run[A: Fractional: ClassTag](
name: String,
config: Config[A],
mkStream: Config[A] => ScalaStream[A]
): Unit =
val opt = config.options
val arrayBytes = opt.arraysize * config.typeSize
val totalBytes = arrayBytes * 3
val (megaScale, megaSuffix, gigaScale, gigaSuffix) =
if !opt.mibibytes then (1.0e-6, "MB", 1.0e-9, "GB")
else (pow(2.0, -20), "MiB", pow(2.0, -30), "GiB")
if !opt.csv then
val vendor = System.getProperty("java.vendor")
val ver = System.getProperty("java.version")
val home = System.getProperty("java.home")
println(
s"""BabelStream
|Version: $Version
|Implementation: $name; Scala (Java $ver; $vendor; home=$home)""".stripMargin
)
println(s"Running ${config.benchmark match {
case Benchmark.All => "kernels"
case Benchmark.Triad => "triad"
case Benchmark.NStream => "nstream"
}} ${opt.numtimes} times")
if config.benchmark == Benchmark.Triad then println(s"Number of elements: ${opt.arraysize}")
println(s"Precision: ${if opt.float then "float" else "double"}")
println(
f"Array size: ${megaScale * arrayBytes}%.1f $megaSuffix (=${gigaScale * arrayBytes}%.1f $gigaSuffix)"
)
println(
f"Total size: ${megaScale * totalBytes}%.1f $megaSuffix (=${gigaScale * totalBytes}%.1f $gigaSuffix)"
)
def mkRow(xs: Vector[FiniteDuration], name: String, totalBytes: Int) =
val tail = xs.tail
(tail.minOption.map(_.seconds), tail.maxOption.map(_.seconds)) match
case (Some(min), Some(max)) =>
val avg = (tail.foldLeft(Duration.Zero)(_ + _) / tail.size.toDouble).seconds
val mbps = megaScale * totalBytes.toDouble / min
if opt.csv then
Vector(
"function" -> name,
"num_times" -> opt.numtimes.toString,
"n_elements" -> opt.arraysize.toString,
"sizeof" -> totalBytes.toString,
s"max_m${if opt.mibibytes then "i" else ""}bytes_per_sec" -> mbps.toString,
"min_runtime" -> min.toString,
"max_runtime" -> max.toString,
"avg_runtime" -> avg.toString
)
else
Vector(
"Function" -> name,
s"M${if opt.mibibytes then "i" else ""}Bytes/sec" -> f"$mbps%.3f",
"Min (sec)" -> f"$min%.5f",
"Max" -> f"$max%.5f",
"Average" -> f"$avg%.5f"
)
case (_, _) => sys.error(s"No min/max element for $name(size=$totalBytes)")
def tabulate(rows: Vector[(String, String)]*): Unit = rows.toList match
case Nil => sys.error(s"Empty tabulation")
case header :: _ =>
val padding = if opt.csv then 0 else 12
val sep = if opt.csv then "," else ""
println(header.map(_._1.padTo(padding, ' ')).mkString(sep))
println(rows.map(_.map(_._2.padTo(padding, ' ')).mkString(sep)).mkString("\n"))
val stream = mkStream(config)
stream.initArrays()
config.benchmark match
case Benchmark.All =>
val (results, sum) = stream.runAll(opt.numtimes)
validate(stream.data(), config, Some(sum))
tabulate(
mkRow(results.copy, "Copy", 2 * arrayBytes),
mkRow(results.mul, "Mul", 2 * arrayBytes),
mkRow(results.add, "Add", 3 * arrayBytes),
mkRow(results.triad, "Triad", 3 * arrayBytes),
mkRow(results.dot, "Dot", 2 * arrayBytes)
)
case Benchmark.NStream =>
val result = stream.runNStream(opt.numtimes)
validate(stream.data(), config)
tabulate(mkRow(result, "Nstream", 4 * arrayBytes))
case Benchmark.Triad =>
val results = stream.runTriad(opt.numtimes)
val totalBytes = 3 * arrayBytes * opt.numtimes
val bandwidth = gigaScale * (totalBytes / results.seconds)
println(f"Runtime (seconds): ${results.seconds}%.5f")
println(f"Bandwidth ($gigaSuffix/s): $bandwidth%.3f ")
inline def devices[A: Fractional: ClassTag]: Vector[(String, Config[A] => ScalaStream[A])] =
Vector(
"Scala Parallel Collections" -> (ParStream(_)),
"Java 8 Stream" -> (J8SStream(_)),
"Threaded" -> (ThreadStream(_)),
"Serial" -> (PlainStream(_))
)
inline def runWith[A: Fractional: ClassTag](i: Int, config: Config[A]): Unit =
devices[A].lift(i) match
case None => println(s"Device index out of bounds: $i")
case Some((name, mkStream)) => run(name, config, mkStream)
def main(args: Array[String]): Unit =
def handleOpt(opt: Options) =
val benchmark = (opt.nstream_only, opt.triad_only) match
case (true, false) => Benchmark.NStream
case (false, true) => Benchmark.Triad
case (false, false) => Benchmark.All
case (true, true) =>
throw new RuntimeException(
"Both triad and nstream are enabled, pick one or omit both to run all benchmarks"
)
if opt.list then
devices[Float].zipWithIndex.foreach { case ((name, _), i) => println(s"$i: $name") }
else if opt.float then
runWith(
opt.device,
Config(
options = opt,
benchmark = benchmark,
typeSize = 4, // 32bit
ulp = math.ulp(Float.MaxValue),
scalar = 0.4f,
init = (0.1f, 0.2f, 0.0f)
)
)
else
runWith(
opt.device,
Config(
options = opt,
benchmark = benchmark,
typeSize = 8, // 64bit
ulp = math.ulp(Double.MaxValue),
scalar = 0.4,
init = (0.1, 0.2, 0.0)
)
)
OParser.parse(Options.parser1, args, Options.Default) match
case Some(config) => handleOpt(config)
case _ => sys.exit(1)
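
To make the bandwidth formula in mkRow concrete, a worked example under assumed numbers (the 0.05 s best runtime below is hypothetical): the reported rate is megaScale * totalBytes / min, so with the default arraysize of 33554432 doubles the Copy kernel comes out as follows.

// Illustrative arithmetic only: Copy-kernel bandwidth with the default array
// size and a hypothetical best runtime of 0.05 s.
val arrayBytes = 33554432L * 8              // one array of doubles: 268435456 bytes
val copyBytes  = 2 * arrayBytes             // Copy reads a and writes c
val mbps       = 1.0e-6 * copyBytes / 0.05  // ≈ 10737.4 MB/s (MiB mode scales by 2^-20 instead)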

View File

@ -0,0 +1,68 @@
package scalastream
import net.openhft.affinity.{AffinityStrategies, AffinityThreadFactory}
import scalastream.App.{Config, Data}
import java.util.concurrent.{Callable, Executors}
import scala.collection.immutable.ArraySeq
import scala.reflect.ClassTag
object ThreadStream {}
class ThreadStream[@specialized(Float, Double) A: Fractional: ClassTag](val config: Config[A])
extends ScalaStream[A]:
private var a: Array[A] = _
private var b: Array[A] = _
private var c: Array[A] = _
private val scalar: A = config.scalar
private val chunks: Int = sys.runtime.availableProcessors()
private val pool = Executors.newFixedThreadPool(
chunks,
new AffinityThreadFactory("scala-stream", true, AffinityStrategies.DIFFERENT_CORE)
)
private val indices = (0 until config.options.arraysize)
.grouped(config.options.arraysize / chunks)
.toSeq
private inline def forEachAll[C](c: => C)(f: (C, Int) => Unit): Seq[C] =
import scala.jdk.CollectionConverters._
val xs = pool
.invokeAll(
indices.map { r =>
{ () =>
val ctx = c
r.foreach(f(ctx, _))
ctx
}: Callable[C]
}.asJavaCollection
)
.asScala
.map(_.get())
.toSeq
xs
override inline def initArrays(): Unit =
a = Array.ofDim(config.options.arraysize)
b = Array.ofDim(config.options.arraysize)
c = Array.ofDim(config.options.arraysize)
forEachAll(()) { (_, i) =>
a(i) = config.init._1
b(i) = config.init._2
c(i) = config.init._3
}
()
class Box(var value: A)
override inline def copy(): Unit = { forEachAll(())((_, i) => c(i) = a(i)); () }
override inline def mul(): Unit = { forEachAll(())((_, i) => b(i) = scalar * c(i)); () }
override inline def add(): Unit = { forEachAll(())((_, i) => c(i) = a(i) + b(i)); () }
override inline def triad(): Unit = { forEachAll(())((_, i) => a(i) = b(i) + scalar * c(i)); () }
override inline def nstream(): Unit = { forEachAll(())((_, i) => a(i) = b(i) * scalar * c(i)); () }
override inline def dot(): A =
forEachAll(Box(0.fractional))((acc, i) => acc.value = acc.value + (a(i) * b(i)))
.map(_.value)
.fold(0.fractional)(_ + _)
override inline def data(): Data[A] = Data(a.to(ArraySeq), b.to(ArraySeq), c.to(ArraySeq))
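
ThreadStream.forEachAll boils down to a chunk-per-core fork/join over a fixed thread pool; a minimal sketch of that pattern without the affinity pinning (the helper name and pool sizing are illustrative only):

// Illustrative sketch only: split an index range into roughly one chunk per
// core, submit each chunk as a Callable to a fixed pool, and wait for all of them.
import java.util.concurrent.{Callable, Executors}
import scala.jdk.CollectionConverters._

def forEachChunked(n: Int)(f: Int => Unit): Unit =
  val cores = Runtime.getRuntime.availableProcessors()
  val pool  = Executors.newFixedThreadPool(cores)
  try
    val tasks = (0 until n)
      .grouped(math.max(1, n / cores))
      .map(range => (() => range.foreach(f)): Callable[Unit])
      .toSeq
    pool.invokeAll(tasks.asJavaCollection).asScala.foreach(_.get()) // blocks until every chunk is done
  finally pool.shutdown()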

View File

@ -8,7 +8,46 @@ project(BabelStream VERSION 3.5 LANGUAGES CXX)
set(CMAKE_CXX_EXTENSIONS OFF)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
#set(MODEL SYCL)
#set(SYCL_COMPILER COMPUTECPP)
#set(SYCL_COMPILER_DIR /home/tom/Desktop/computecpp_archive/ComputeCpp-CE-2.3.0-x86_64-linux-gnu)
#set(MODEL RAJA)
#set(RAJA_IN_TREE /home/tom/Downloads/RAJA-v0.13.0/)
#set(ENABLE_CUDA ON)
#set(TARGET NVIDIA)
#set(CUDA_TOOLKIT_ROOT_DIR /opt/cuda-11.2)
#set(CUDA_ARCH sm_70)
#set(BLT_DIR /home/tom/Downloads/blt-0.3.6/)
#set(MODEL STD)
#set(ARCH cc70)
#set(CXX_EXTRA_FLAGS -v)
#set(MODEL CUDA)
#set(ARCH sm_70)
#set(CMAKE_CUDA_COMPILER /opt/cuda-11.2/bin/nvcc)
#set(MODEL OCL)
#set(OpenCL_LIBRARY /opt/rocm-4.0.0/opencl/lib/libOpenCL.so)
#set(OpenCL_INCLUDE_DIR /opt/rocm-4.0.0/opencl/lib)
#set(RELEASE_FLAGS -Ofast)
#set(CXX_EXTRA_FLAGS -O2)
#set(CMAKE_CXX_COMPILER /usr/lib/aomp/bin/clang++)
#set(MODEL OMP)
##set(OFFLOAD "AMD:gfx803")
#set(OFFLOAD "NVIDIA:sm_35")
#set(CXX_EXTRA_FLAGS --cuda-path=/opt/cuda-10.2/)
#set(OFFLOAD "AMD:_70")
#set(CXX_EXTRA_FLAGS --cuda-path=/opt/cuda-10.2/ --gcc-toolchain=/home/tom/spack/opt/spack/linux-fedora33-zen2/gcc-10.2.1/gcc-8.3.0-latmjo2hl2yv53255xkwko7k3y7bx2vv)
#set(CXX_EXTRA_LINKER_FLAGS )
#set(MODEL HIP)
#set(MODEL KOKKOS)
#set(KOKKOS_IN_TREE /home/tom/Downloads/kokkos-3.3.00/)
# the final executable name
set(EXE_NAME babelstream)
@ -73,6 +112,8 @@ register_model(sycl SYCL SYCLStream.cpp)
register_model(acc ACC ACCStream.cpp)
# defining RAJA collides with the RAJA namespace so USE_RAJA
register_model(raja USE_RAJA RAJAStream.cpp)
register_model(tbb TBB TBBStream.cpp)
register_model(thrust THRUST ThrustStream.cu) # Thrust uses .cu, even for rocThrust
set(USAGE ON CACHE BOOL "Whether to print all custom flags for the selected model")

View File

@ -134,20 +134,21 @@ setup_aocc() {
setup_nvhpc() {
echo "Preparing Nvidia HPC SDK"
local tarball="nvhpc.tar.gz"
# local url="http://localhost:8000/nvhpc_2021_212_Linux_x86_64_cuda_11.2.tar.gz"
local url="https://developer.download.nvidia.com/hpc-sdk/21.2/nvhpc_2021_212_Linux_x86_64_cuda_11.2.tar.gz"
# local url="http://localhost:8000/nvhpc_2021_219_Linux_x86_64_cuda_11.4.tar.gz"
local url="https://developer.download.nvidia.com/hpc-sdk/21.9/nvhpc_2021_219_Linux_x86_64_cuda_11.4.tar.gz"
get_and_untar "$tarball" "$url"
local sdk_dir="$PWD/nvhpc_2021_212_Linux_x86_64_cuda_11.2/install_components/Linux_x86_64/21.2"
local sdk_dir="$PWD/nvhpc_2021_219_Linux_x86_64_cuda_11.4/install_components/Linux_x86_64/21.9"
local bin_dir="$sdk_dir/compilers/bin"
"$bin_dir/makelocalrc" "$bin_dir" -x
export_var NVHPC_SDK_DIR "$sdk_dir"
export_var NVHPC_CUDA_DIR "$sdk_dir/cuda/11.4"
export_var NVHPC_NVCXX "$bin_dir/nvc++"
export_var NVHPC_NVCC "$sdk_dir/cuda/11.2/bin/nvcc"
export_var NVHPC_CUDA_DIR "$sdk_dir/cuda/11.2"
export_var NVHPC_NVCC "$sdk_dir/cuda/11.4/bin/nvcc"
echo "Installed CUDA versions:"
ls "$sdk_dir/cuda"
verify_bin_exists "$NVHPC_NVCXX"
@ -208,6 +209,20 @@ setup_raja() {
check_size
}
setup_tbb() {
echo "Preparing TBB"
local tbb_ver="2021.2.0"
local tarball="oneapi-tbb-$tbb_ver-lin.tgz"
local url="https://github.com/oneapi-src/oneTBB/releases/download/v$tbb_ver/oneapi-tbb-$tbb_ver-lin.tgz"
# local url="http://localhost:8000/oneapi-tbb-$tbb_ver-lin.tgz"
get_and_untar "$tarball" "$url"
export_var TBB_LIB "$PWD/oneapi-tbb-$tbb_ver"
verify_dir_exists "$TBB_LIB"
check_size
}
setup_clang_gcc() {
echo "deb http://archive.ubuntu.com/ubuntu focal main universe" | sudo tee -a /etc/apt/sources.list
@ -237,10 +252,11 @@ setup_clang_gcc() {
setup_rocm() {
wget -q -O - "https://repo.radeon.com/rocm/rocm.gpg.key" | sudo apt-key add -
echo 'deb [arch=amd64] https://repo.radeon.com/rocm/apt/debian/ xenial main' | sudo tee /etc/apt/sources.list.d/rocm.list
echo 'deb [arch=amd64] https://repo.radeon.com/rocm/apt/4.5 ubuntu main' | sudo tee /etc/apt/sources.list.d/rocm.list
sudo apt-get update -qq
sudo apt-get install -y -qq rocm-dev
sudo apt-get install -y -qq rocm-dev rocthrust-dev
export_var ROCM_PATH "/opt/rocm"
export_var PATH "$ROCM_PATH/bin:$PATH" # ROCm needs this for many of their libraries' CMake build to work
export_var HIP_CXX "$ROCM_PATH/bin/hipcc"
verify_bin_exists "$HIP_CXX"
"$HIP_CXX" --version
@ -354,6 +370,7 @@ if [ "$PARALLEL" = true ]; then
setup_dpcpp &
setup_kokkos &
setup_raja &
setup_tbb &
wait
else
setup_cmake
@ -364,6 +381,7 @@ else
setup_dpcpp
setup_kokkos
setup_raja
setup_tbb
# these need apt
setup_clang_gcc
setup_rocm

View File

@ -57,7 +57,7 @@ run_build() {
local cmake_code=$?
"$CMAKE_BIN" --build "$build" -j "$(nproc)" &>>"$log"
"$CMAKE_BIN" --build "$build" --target install -j "$(nproc)" &>>"$log"
"$CMAKE_BIN" --build "$build" --target install -j "$(nproc)" &>>"$log"
local cmake_code=$?
set -e
@ -92,11 +92,11 @@ run_build() {
# GCC_CXX="/usr/bin/g++"
# CLANG_CXX="/usr/bin/clang++"
# NVSDK="/home/tom/Downloads/nvhpc_2021_212_Linux_x86_64_cuda_11.2/install_components/Linux_x86_64/21.2/"
# NVHPC_NVCXX="$NVSDK/compilers/bin/nvc++"
# NVHPC_NVCC="$NVSDK/cuda/11.2/bin/nvcc"
# NVHPC_CUDA_DIR="$NVSDK/cuda/11.2"
# "$NVSDK/compilers/bin/makelocalrc" "$NVSDK/compilers/bin/" -x
# NVHPC_SDK_DIR="/home/tom/Downloads/nvhpc_2021_212_Linux_x86_64_cuda_11.2/install_components/Linux_x86_64/21.2/"
# NVHPC_NVCXX="$NVHPC_SDK_DIR/compilers/bin/nvc++"
# NVHPC_NVCC="$NVHPC_SDK_DIR/cuda/11.2/bin/nvcc"
# NVHPC_CUDA_DIR="$NVHPC_SDK_DIR/cuda/11.2"
# "$NVHPC_SDK_DIR/compilers/bin/makelocalrc" "$NVHPC_SDK_DIR/compilers/bin/" -x
# AOCC_CXX="/opt/AMD/aocc-compiler-2.3.0/bin/clang++"
# AOMP_CXX="/usr/lib/aomp/bin/clang++"
@ -110,7 +110,7 @@ run_build() {
# HIPSYCL_DIR="/opt/hipsycl/cff515c/"
# ICPX_CXX="/opt/intel/oneapi/compiler/2021.1.2/linux/bin/icpx"
# ICPC_CXX="/opt/intel/oneapi/compiler/2021.1.2/linux/bin/intel64/icpc"
# ICPC_CXX="/opt/intel/oneapi/compiler/2021.1.2/linux/bin/intel64/icpc"
# TBB_LIB="/home/tom/Downloads/oneapi-tbb-2021.1.1/"
# GCC_STD_PAR_LIB="tbb"
# CLANG_STD_PAR_LIB="tbb"
@ -122,7 +122,7 @@ run_build() {
AMD_ARCH="gfx_903"
NV_ARCH="sm_70"
NV_ARCH_CCXY="cuda11.2,cc80"
NV_ARCH_CCXY="cuda11.4,cc80"
build_gcc() {
local name="gcc_build"
@ -139,6 +139,9 @@ build_gcc() {
run_build $name "${GCC_CXX:?}" std "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}"
run_build $name "${GCC_CXX:?}" std20 "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}"
run_build $name "${GCC_CXX:?}" tbb "$cxx -DONE_TBB_DIR=$TBB_LIB"
run_build $name "${GCC_CXX:?}" tbb "$cxx" # build TBB again with the system TBB
if [ "${GCC_OMP_OFFLOAD_AMD:-false}" != "false" ]; then
run_build "amd_$name" "${GCC_CXX:?}" acc "$cxx -DCXX_EXTRA_FLAGS=-foffload=amdgcn-amdhsa"
run_build "amd_$name" "${GCC_CXX:?}" omp "$cxx -DOFFLOAD=AMD:$AMD_ARCH"
@ -166,6 +169,28 @@ build_gcc() {
# -DCUDA_TOOLKIT_ROOT_DIR=${NVHPC_CUDA_DIR:?} \
# -DCUDA_ARCH=$NV_ARCH"
# CMake >= 3.15 only due to Nvidia's Thrust CMake requirements
local current=$("$CMAKE_BIN" --version | head -n 1 | cut -d ' ' -f3)
local required="3.15.0"
if [ "$(printf '%s\n' "$required" "$current" | sort -V | head -n1)" = "$required" ]; then
run_build $name "${GCC_CXX:?}" thrust "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/include -DTHRUST_IMPL=CUDA -DBACKEND=CUDA"
run_build $name "${GCC_CXX:?}" thrust "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/include -DTHRUST_IMPL=CUDA -DBACKEND=OMP"
run_build $name "${GCC_CXX:?}" thrust "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/include -DTHRUST_IMPL=CUDA -DBACKEND=CPP"
# FIXME CUDA Thrust + TBB throws the following error:
# /usr/lib/gcc/x86_64-linux-gnu/9/include/avx512fintrin.h(9146): error: identifier "__builtin_ia32_rndscaless_round" is undefined
# /usr/lib/gcc/x86_64-linux-gnu/9/include/avx512fintrin.h(9155): error: identifier "__builtin_ia32_rndscalesd_round" is undefined
# /usr/lib/gcc/x86_64-linux-gnu/9/include/avx512fintrin.h(14797): error: identifier "__builtin_ia32_rndscaless_round" is undefined
# /usr/lib/gcc/x86_64-linux-gnu/9/include/avx512fintrin.h(14806): error: identifier "__builtin_ia32_rndscalesd_round" is undefined
# /usr/lib/gcc/x86_64-linux-gnu/9/include/avx512dqintrin.h(1365): error: identifier "__builtin_ia32_fpclassss" is undefined
# /usr/lib/gcc/x86_64-linux-gnu/9/include/avx512dqintrin.h(1372): error: identifier "__builtin_ia32_fpclasssd" is undefined
# run_build $name "${GCC_CXX:?}" THRUST "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/include -DTHRUST_IMPL=CUDA -DBACKEND=TBB"
else
echo "CMake version ${current} < ${required}, skipping Thrust models"
fi
}
build_clang() {
@ -188,6 +213,18 @@ build_clang() {
run_build $name "${CLANG_CXX:?}" ocl "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}"
run_build $name "${CLANG_CXX:?}" std "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}"
# run_build $name "${CLANG_CXX:?}" std20 "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" # not yet supported
run_build $name "${CLANG_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?}"
run_build $name "${CLANG_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH"
run_build $name "${CLANG_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DMEM=MANAGED"
run_build $name "${CLANG_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DMEM=PAGEFAULT"
run_build $name "${CLANG_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON"
run_build $name "${CLANG_CXX:?}" ocl "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}"
run_build $name "${CLANG_CXX:?}" std "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}"
# run_build $name "${CLANG_CXX:?}" std20 "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" # not yet supported
run_build $name "${CLANG_CXX:?}" tbb "$cxx -DONE_TBB_DIR=$TBB_LIB"
run_build $name "${CLANG_CXX:?}" tbb "$cxx" # build TBB again with the system TBB
run_build $name "${CLANG_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?}"
# no clang /w RAJA+cuda because it needs nvcc which needs gcc
}
@ -210,7 +247,11 @@ build_aomp() {
}
build_hip() {
run_build hip_build "${HIP_CXX:?}" hip "-DCMAKE_CXX_COMPILER=${HIP_CXX:?}"
local name="hip_build"
run_build $name "${HIP_CXX:?}" hip "-DCMAKE_CXX_COMPILER=${HIP_CXX:?}"
run_build $name "${GCC_CXX:?}" thrust "-DCMAKE_CXX_COMPILER=${HIP_CXX:?} -DSDK_DIR=$ROCM_PATH -DTHRUST_IMPL=ROCM"
}
build_icpx() {

View File

@ -25,6 +25,10 @@
#include "STDStream.h"
#elif defined(STD20)
#include "STD20Stream.hpp"
#elif defined(TBB)
#include "TBBStream.hpp"
#elif defined(THRUST)
#include "ThrustStream.h"
#elif defined(HIP)
#include "HIPStream.h"
#elif defined(HC)
@ -266,6 +270,14 @@ void run()
// Use the C++20 implementation
stream = new STD20Stream<T>(ARRAY_SIZE, deviceIndex);
#elif defined(TBB)
// Use the TBB implementation
stream = new TBBStream<T>(ARRAY_SIZE, deviceIndex);
#elif defined(THRUST)
// Use the Thrust implementation
stream = new ThrustStream<T>(ARRAY_SIZE, deviceIndex);
#elif defined(ACC)
// Use the OpenACC implementation
stream = new ACCStream<T>(ARRAY_SIZE, deviceIndex);

View File

@ -3,7 +3,8 @@ ifndef COMPILER
define compiler_help
Set COMPILER to change flags (defaulting to GNU).
Available compilers are:
CLANG CRAY GNU GNU_PPC INTEL XL PGI NEC ARMCLANG AOMP
CLANG CRAY GNU GNU_PPC INTEL XL PGI
NEC ARMCLANG AOMP FUJITSU
Note: GCC on PPC requires -mcpu=native instead of -march=native so we have a special case for it
@ -49,6 +50,7 @@ COMPILER_XL = xlc++
COMPILER_PGI = pgc++
COMPILER_NEC = /opt/nec/ve/bin/nc++
COMPILER_AOMP = clang++
COMPILER_FUJITSU=FCC
CXX = $(COMPILER_$(COMPILER))
FLAGS_GNU = -O3 -std=c++11 -march=native
@ -61,6 +63,7 @@ FLAGS_PGI = -O3 -std=c++11
FLAGS_NEC = -O4 -finline -std=c++11
FLAGS_ARMCLANG = -O3 -std=c++11
FLAGS_AOMP = -O3 -std=c++11
FLAGS_FUJITSU=-Kfast -std=c++11 -KA64FX -KSVE -KARMV8_3_A -Kzfill=100 -Kprefetch_sequential=soft -Kprefetch_line=8 -Kprefetch_line_L2=16
CXXFLAGS = $(FLAGS_$(COMPILER))
# OpenMP flags for CPUs
@ -73,6 +76,7 @@ OMP_CLANG_CPU = -fopenmp=libomp
OMP_XL_CPU = -qsmp=omp -qthreaded
OMP_PGI_CPU = -mp
OMP_NEC_CPU = -fopenmp
OMP_FUJITSU_CPU=-Kopenmp
# OpenMP flags for NVIDIA
OMP_CRAY_NVIDIA = -DOMP_TARGET_GPU

View File

@ -30,6 +30,9 @@
# ARMClang = ARM Compiler based on Clang (arm.com)
# These are only added in CMake 3.20:
# NVHPC = NVIDIA HPC SDK Compiler (nvidia.com)
# These are only added in CMake 3.21
# Fujitsu = Fujitsu HPC compiler (Trad mode)
# FujitsuClang = Fujitsu HPC compiler (Clang mode)
# CMAKE_SYSTEM_PROCESSOR is set via `uname -p`, we have:
@ -39,8 +42,9 @@
#
#predefined offload flags based on compiler id
# predefined offload flags based on compiler id and vendor,
# the format is (COMPILER and VENDOR must be UPPERCASE):
# Compiler: OMP_FLAGS_OFFLOAD_<COMPILER?>_<VENDOR?>
set(OMP_FLAGS_OFFLOAD_INTEL
-qnextgen -fiopenmp -fopenmp-targets=spir64)
@ -56,15 +60,25 @@ set(OMP_FLAGS_OFFLOAD_CLANG_ARCH_FLAG
-march=) # prefix only, arch appended by the vendor:arch tuple
# for standard (non-offload) omp, the format is (COMPILER and ARCH must be UPPERCASE):
# Compiler: OMP_FLAGS_CPU_<COMPILER?>_<ARCH?>
# Linker: OMP_LINK_FLAGS_CPU_<COMPILER?>_<ARCH?>
set(OMP_FLAGS_CPU_FUJITSU
-Kfast -std=c++11 -KA64FX -KSVE -KARMV8_3_A -Kzfill=100 -Kprefetch_sequential=soft -Kprefetch_line=8 -Kprefetch_line_L2=16)
set(OMP_LINK_FLAGS_CPU_FUJITSU
-Kopenmp)
set(OMP_FLAGS_CPU_INTEL
-qopt-streaming-stores=always)
set(OMP_FLAGS_CPU_GNU_PPC64LE
-mcpu=native)
set(OMP_FLAGS_CPU_XL
-O5 -qarch=auto -qtune=auto)
# NEC
set(OMP_FLAGS_CPU_NEC -O4 -finline)
set(OMP_FLAGS_CPU_NEC -O4 -finline) # CMake doesn't detect this so it's meant to be chosen by register_flag_optional(ARCH)
register_flag_optional(CMAKE_CXX_COMPILER
"Any CXX compiler that supports OpenMP as per CMake detection (and offloading if enabled with `OFFLOAD`)"
@ -122,6 +136,12 @@ macro(setup)
${ARCH}
)
register_append_compiler_and_arch_specific_link_flags(
OMP_LINK_FLAGS_CPU
${COMPILER}
${ARCH}
)
elseif ("${OFFLOAD}" STREQUAL ON)
# offload but with custom flags
register_definitions(OMP_TARGET_GPU)

View File

@ -20,7 +20,9 @@
#
macro(wipe_gcc_style_optimisation_flags VAR)
string(REGEX REPLACE "([\\/\\-]O.)" "" ${VAR} ${${VAR}})
if(${VAR})
string(REGEX REPLACE "([\\/\\-]O.)" "" ${VAR} ${${VAR}})
endif()
endmacro()
macro(register_link_library)
@ -41,7 +43,7 @@ macro(register_append_link_flags)
list(APPEND LINK_FLAGS ${ARGN})
endmacro()
macro(register_append_compiler_and_arch_specific_cxx_flags PREFIX CXX ARCH)
function(bind_cxx_and_arch OUT PREFIX CXX ARCH)
string(TOUPPER ${CXX} _CXX)
string(TOUPPER ${ARCH} _ARCH)
set(_CXX_ARCH_SPECIFIC_FLAGS "${${PREFIX}_${_CXX}_${_ARCH}}")
@ -52,6 +54,17 @@ macro(register_append_compiler_and_arch_specific_cxx_flags PREFIX CXX ARCH)
if (_CXX_ARCH_SPECIFIC_FLAGS)
register_append_cxx_flags(ANY ${_CXX_ARCH_SPECIFIC_FLAGS})
endif ()
set(${OUT} "${_CXX_ARCH_SPECIFIC_FLAGS}" PARENT_SCOPE)
endfunction()
macro(register_append_compiler_and_arch_specific_cxx_flags PREFIX CXX ARCH)
bind_cxx_and_arch(OUT ${PREFIX} ${CXX} ${ARCH})
register_append_cxx_flags(ANY ${OUT})
endmacro()
macro(register_append_compiler_and_arch_specific_link_flags PREFIX CXX ARCH)
bind_cxx_and_arch(OUT ${PREFIX} ${CXX} ${ARCH})
register_append_link_flags(${OUT})
endmacro()
macro(register_definitions)