From 00d0bc15ba2964c43218e1bc053d62137b6f6a81 Mon Sep 17 00:00:00 2001 From: Keichi Takahashi Date: Mon, 14 Feb 2022 13:37:50 +0900 Subject: [PATCH 01/24] Fix a bug in the CMake script where override flags are ignored --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6769952..54034ee 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -125,7 +125,7 @@ message(STATUS "Default ${CMAKE_BUILD_TYPE} flags are `${DEFAULT_${BUILD_TYPE}_F # setup common build flag defaults if there are no overrides if (NOT DEFINED ${BUILD_TYPE}_FLAGS) set(ACTUAL_${BUILD_TYPE}_FLAGS ${DEFAULT_${BUILD_TYPE}_FLAGS}) -elseif () +else () set(ACTUAL_${BUILD_TYPE}_FLAGS ${${BUILD_TYPE}_FLAGS}) endif () From 7b8a561f58f543ae80a3a9a06dfd27156c667698 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 27 Apr 2022 12:20:10 +0100 Subject: [PATCH 02/24] Update preferred Citation in README --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index df95582..e4c2a15 100644 --- a/README.md +++ b/README.md @@ -163,11 +163,11 @@ Pull Requests should be made against the `develop` branch. Please cite BabelStream via this reference: -Deakin T, Price J, Martineau M, McIntosh-Smith S. GPU-STREAM v2.0: Benchmarking the achievable memory bandwidth of many-core processors across diverse parallel programming models. 2016. Paper presented at P^3MA Workshop at ISC High Performance, Frankfurt, Germany. DOI: 10.1007/978- 3-319-46079-6_34 +Deakin T, Price J, Martineau M, McIntosh-Smith S. Evaluating attainable memory bandwidth of parallel programming models via BabelStream. International Journal of Computational Science and Engineering. Special issue. Vol. 17, No. 3, pp. 247–262. 2018. DOI: 10.1504/IJCSE.2018.095847 ### Other BabelStream publications -* Deakin T, Price J, Martineau M, McIntosh-Smith S. Evaluating attainable memory bandwidth of parallel programming models via BabelStream. 
International Journal of Computational Science and Engineering. Special issue. Vol. 17, No. 3, pp. 247–262. 2018.DOI: 10.1504/IJCSE.2018.095847 +* Deakin T, Price J, Martineau M, McIntosh-Smith S. GPU-STREAM v2.0: Benchmarking the achievable memory bandwidth of many-core processors across diverse parallel programming models. 2016. Paper presented at P^3MA Workshop at ISC High Performance, Frankfurt, Germany. DOI: 10.1007/978- 3-319-46079-6_34 * Deakin T, McIntosh-Smith S. GPU-STREAM: Benchmarking the achievable memory bandwidth of Graphics Processing Units. 2015. Poster session presented at IEEE/ACM SuperComputing, Austin, United States. You can view the [Poster and Extended Abstract](http://sc15.supercomputing.org/sites/all/themes/SC15images/tech_poster/tech_poster_pages/post150.html). From 7c3f7c9220368fc610a3a069734d51683d4f72e7 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sun, 12 Jun 2022 09:10:22 +0000 Subject: [PATCH 03/24] Bump crossbeam-utils from 0.8.5 to 0.8.8 in /src/rust/rust-stream Bumps [crossbeam-utils](https://github.com/crossbeam-rs/crossbeam) from 0.8.5 to 0.8.8. - [Release notes](https://github.com/crossbeam-rs/crossbeam/releases) - [Changelog](https://github.com/crossbeam-rs/crossbeam/blob/master/CHANGELOG.md) - [Commits](https://github.com/crossbeam-rs/crossbeam/compare/crossbeam-utils-0.8.5...crossbeam-utils-0.8.8) --- updated-dependencies: - dependency-name: crossbeam-utils dependency-type: indirect ... 
Signed-off-by: dependabot[bot] --- src/rust/rust-stream/Cargo.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/rust/rust-stream/Cargo.lock b/src/rust/rust-stream/Cargo.lock index 5f225f0..cb86dab 100644 --- a/src/rust/rust-stream/Cargo.lock +++ b/src/rust/rust-stream/Cargo.lock @@ -136,9 +136,9 @@ dependencies = [ [[package]] name = "crossbeam-utils" -version = "0.8.5" +version = "0.8.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d82cfc11ce7f2c3faef78d8a684447b40d503d9681acebed6cb728d45940c4db" +checksum = "0bf124c720b7686e3c2663cf54062ab0f68a88af2fb6a030e87e30bf721fcb38" dependencies = [ "cfg-if", "lazy_static", @@ -426,7 +426,7 @@ dependencies = [ [[package]] name = "rust-stream" -version = "3.4.0" +version = "4.0.0" dependencies = [ "colour", "core_affinity", From 1d9cde42b00a42428b7e3b1043c0c1acc0af2b22 Mon Sep 17 00:00:00 2001 From: NoseKnowsAll Date: Wed, 20 Jul 2022 18:10:15 -0500 Subject: [PATCH 04/24] Reflect updated model options in README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index df95582..7be3550 100644 --- a/README.md +++ b/README.md @@ -100,7 +100,7 @@ The source for each model's implementations are located in `./src/`. Currently available models are: ``` -omp;ocl;std;std20;hip;cuda;kokkos;sycl;sycl2020;acc;raja;tbb;thrust +omp;ocl;std-data;std-indices;std-ranges;hip;cuda;kokkos;sycl;sycl2020;acc;raja;tbb;thrust ``` #### Overriding default flags From 6945cbcec71be6ad09b2ea2a319169ab3dd99cf0 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 16 Aug 2022 15:43:22 +0000 Subject: [PATCH 05/24] Fix RAJA CUDA build RAJA looses the source directory, so need to remind it where the BabelStream source is to make sure it treats the source as CUDA NB: The RAJA CUDA build might be broken at this time. 
See https://github.com/LLNL/RAJA/issues/1296 Workaround below worked for us https://github.com/LLNL/RAJA/pull/1302 --- src/raja/model.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/raja/model.cmake b/src/raja/model.cmake index 7e5f284..4da4af6 100644 --- a/src/raja/model.cmake +++ b/src/raja/model.cmake @@ -84,8 +84,8 @@ macro(setup) if (ENABLE_CUDA) # RAJA needs the codebase to be compiled with nvcc, so we tell cmake to treat sources as *.cu enable_language(CUDA) - set_source_files_properties(RAJAStream.cpp PROPERTIES LANGUAGE CUDA) - set_source_files_properties(main.cpp PROPERTIES LANGUAGE CUDA) + set_source_files_properties(src/raja/RAJAStream.cpp PROPERTIES LANGUAGE CUDA) + set_source_files_properties(src/main.cpp PROPERTIES LANGUAGE CUDA) endif () From 1b679996fcb0cdab2db00c2f9e326ea0cd4faa5a Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 16 Aug 2022 15:45:11 +0000 Subject: [PATCH 06/24] update changelog --- CHANGELOG.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 903cb02..854d1f9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,7 +2,8 @@ All notable changes to this project will be documented in this file. ## Unreleased -- None +### Changed +- RAJA CUDA CMake build issues resolved. ## [v4.0] - 2021-12-22 From 1d8e383a29f7a3fc98cdfd5b8082a3366c62675c Mon Sep 17 00:00:00 2001 From: Rob Jones <62852815+robj0nes@users.noreply.github.com> Date: Mon, 12 Sep 2022 10:58:47 +0100 Subject: [PATCH 07/24] In-package Kokkos builds Updating kokkos/model.cmake to allow for in-package builds (eg. Spack) --- src/kokkos/model.cmake | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/src/kokkos/model.cmake b/src/kokkos/model.cmake index 445991d..a95fdba 100644 --- a/src/kokkos/model.cmake +++ b/src/kokkos/model.cmake @@ -1,16 +1,17 @@ - register_flag_optional(CMAKE_CXX_COMPILER "Any CXX compiler that is supported by CMake detection and RAJA. 
See https://github.com/kokkos/kokkos#primary-tested-compilers-on-x86-are" "c++") -register_flag_required(KOKKOS_IN_TREE +register_flag_optional(KOKKOS_IN_TREE "Absolute path to the *source* distribution directory of Kokkos. Remember to append Kokkos specific flags as well, for example: - -DKOKKOS_IN_TREE=... -DKokkos_ENABLE_OPENMP=ON -DKokkos_ARCH_ZEN=ON ... + See https://github.com/kokkos/kokkos/blob/master/BUILD.md for all available options" "") - See https://github.com/kokkos/kokkos/blob/master/BUILD.md for all available options") +register_flag_optional(KOKKOS_IN_PACKAGE + "Use if Kokkos is part of a package dependency: + Path to package R-Path containing Kokkos libs" "") # compiler vendor and arch specific flags set(KOKKOS_FLAGS_CPU_INTEL -qopt-streaming-stores=always) @@ -20,13 +21,18 @@ macro(setup) set(CMAKE_CXX_STANDARD 14) cmake_policy(SET CMP0074 NEW) #see https://github.com/kokkos/kokkos/blob/master/BUILD.md - message(STATUS "Building using in-tree Kokkos source at `${KOKKOS_IN_TREE}`") if (EXISTS "${KOKKOS_IN_TREE}") + message(STATUS "Building using in-tree Kokkos source at `${KOKKOS_IN_TREE}`") add_subdirectory(${KOKKOS_IN_TREE} ${CMAKE_BINARY_DIR}/kokkos) register_link_library(Kokkos::kokkos) - else () - message(FATAL_ERROR "`${KOKKOS_IN_TREE}` does not exist") + elseif (EXISTS "${KOKKOS_IN_PACKAGE}") + message(STATUS "Building using packaged Kokkos at `${KOKKOS_IN_PACKAGE}`") + set (Kokkos_DIR "${KOKKOS_IN_PACKAGE}/lib64/cmake/Kokkos") + find_package(Kokkos REQUIRED) + register_link_library(Kokkos::kokkos) + else() + message(FATAL_ERROR "Neither `${KOKKOS_IN_TREE}`, or `${KOKKOS_IN_PACKAGE}` exists") endif () register_append_compiler_and_arch_specific_cxx_flags( @@ -36,5 +42,3 @@ macro(setup) ) endmacro() - - From 407d6701dfb01d8ac3262cac2cc4dbcf4f2b590e Mon Sep 17 00:00:00 2001 From: Rob Jones <62852815+robj0nes@users.noreply.github.com> Date: Thu, 15 Sep 2022 11:32:23 +0100 Subject: [PATCH 08/24] In-package linking to RAJA Introduced 
RAJA_IN_PACKAGE to allow for linking to an in-package dependency of RAJA (eg. Spack) --- src/raja/model.cmake | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/src/raja/model.cmake b/src/raja/model.cmake index 4da4af6..b1e7750 100644 --- a/src/raja/model.cmake +++ b/src/raja/model.cmake @@ -1,18 +1,19 @@ - register_flag_optional(CMAKE_CXX_COMPILER "Any CXX compiler that is supported by CMake detection and RAJA. See https://raja.readthedocs.io/en/main/getting_started.html#build-and-install" "c++") -register_flag_required(RAJA_IN_TREE +register_flag_optional(RAJA_IN_TREE "Absolute path to the *source* distribution directory of RAJA. Make sure to use the release version of RAJA or clone RAJA recursively with submodules. Remember to append RAJA specific flags as well, for example: - -DRAJA_IN_TREE=... -DENABLE_OPENMP=ON -DENABLE_CUDA=ON ... - See https://github.com/LLNL/RAJA/blob/08cbbafd2d21589ebf341f7275c229412d0fe903/CMakeLists.txt#L44 for all available options -") +" "") + +register_flag_optional(RAJA_IN_PACKAGE + "Use if Raja is part of a package dependency: + Path to installation" "") register_flag_optional(TARGET "Target offload device, implemented values are CPU, NVIDIA" @@ -76,8 +77,14 @@ macro(setup) register_link_library(RAJA) # RAJA's cmake screws with where the binary will end up, resetting it here: set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) + + elseif (EXISTS "${RAJA_IN_PACKAGE}") + message(STATUS "Building using packaged Raja at `${RAJA_IN_PACKAGE}`") + find_package(RAJA REQUIRED) + register_link_library(RAJA) + else () - message(FATAL_ERROR "`${RAJA_IN_TREE}` does not exist") + message(FATAL_ERROR "Neither `${RAJA_IN_TREE}` or `${RAJA_IN_PACKAGE}` exists") endif () From 9b550767978e5fe50057c6bb7f87e9bc7a973542 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 26 Sep 2022 03:41:45 -0700 Subject: [PATCH 09/24] accept NVHPC NVC++ as a CUDA compiler when it is so Signed-off-by: Jeff Hammond --- 
src/thrust/ThrustStream.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/thrust/ThrustStream.cu b/src/thrust/ThrustStream.cu index 3a57ab0..f15a392 100644 --- a/src/thrust/ThrustStream.cu +++ b/src/thrust/ThrustStream.cu @@ -145,7 +145,7 @@ T ThrustStream::dot() #if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA || \ (defined(THRUST_DEVICE_SYSTEM_HIP) && THRUST_DEVICE_SYSTEM_HIP == THRUST_DEVICE_SYSTEM) -#ifdef __NVCC__ +#if defined(__NVCC__) || defined(__NVCOMPILER_CUDA__) #define IMPL_FN__(fn) cuda ## fn #define IMPL_TYPE__(tpe) cuda ## tpe #elif defined(__HIP_PLATFORM_HCC__) From 1c46f8efd9f2bc4a31cca7c72902740d4af178e9 Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Sat, 1 Oct 2022 04:56:03 +0800 Subject: [PATCH 10/24] Bump rust-stream dependencies --- src/rust/rust-stream/Cargo.lock | 668 +++++++++++++++++++++++++++----- src/rust/rust-stream/Cargo.toml | 18 +- 2 files changed, 580 insertions(+), 106 deletions(-) diff --git a/src/rust/rust-stream/Cargo.lock b/src/rust/rust-stream/Cargo.lock index 5f225f0..723849a 100644 --- a/src/rust/rust-stream/Cargo.lock +++ b/src/rust/rust-stream/Cargo.lock @@ -4,13 +4,131 @@ version = 3 [[package]] name = "ansi_term" -version = "0.11.0" +version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ee49baf6cb617b853aa8d93bf420db2383fab46d314482ca2803b40d5fde979b" +checksum = "d52a9bb7ec0cf484c551830a7ce27bd20d67eac647e1befb56b0be4ee39a55d2" dependencies = [ "winapi 0.3.9", ] +[[package]] +name = "async-attributes" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a3203e79f4dd9bdda415ed03cf14dae5a2bf775c683a00f94e9cd1faf0f596e5" +dependencies = [ + "quote", + "syn", +] + +[[package]] +name = "async-channel" +version = "1.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e14485364214912d3b19cc3435dde4df66065127f05fa0d75c712f36f12c2f28" +dependencies = [ + "concurrent-queue", + 
"event-listener", + "futures-core", +] + +[[package]] +name = "async-executor" +version = "1.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "871f9bb5e0a22eeb7e8cf16641feb87c9dc67032ccf8ff49e772eb9941d3a965" +dependencies = [ + "async-task", + "concurrent-queue", + "fastrand", + "futures-lite", + "once_cell", + "slab", +] + +[[package]] +name = "async-global-executor" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0da5b41ee986eed3f524c380e6d64965aea573882a8907682ad100f7859305ca" +dependencies = [ + "async-channel", + "async-executor", + "async-io", + "async-lock", + "blocking", + "futures-lite", + "once_cell", +] + +[[package]] +name = "async-io" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "83e21f3a490c72b3b0cf44962180e60045de2925d8dff97918f7ee43c8f637c7" +dependencies = [ + "autocfg", + "concurrent-queue", + "futures-lite", + "libc", + "log", + "once_cell", + "parking", + "polling", + "slab", + "socket2", + "waker-fn", + "winapi 0.3.9", +] + +[[package]] +name = "async-lock" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e97a171d191782fba31bb902b14ad94e24a68145032b7eedf871ab0bc0d077b6" +dependencies = [ + "event-listener", +] + +[[package]] +name = "async-std" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62565bb4402e926b29953c785397c6dc0391b7b446e45008b0049eb43cec6f5d" +dependencies = [ + "async-attributes", + "async-channel", + "async-global-executor", + "async-io", + "async-lock", + "crossbeam-utils", + "futures-channel", + "futures-core", + "futures-io", + "futures-lite", + "gloo-timers", + "kv-log-macro", + "log", + "memchr", + "once_cell", + "pin-project-lite", + "pin-utils", + "slab", + "wasm-bindgen-futures", +] + +[[package]] +name = "async-task" +version = "4.3.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a40729d2133846d9ed0ea60a8b9541bccddab49cd30f0715a1da672fe9a2524" + +[[package]] +name = "atomic-waker" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "065374052e7df7ee4047b1160cca5e1467a12351a40b3da123c870ba0b8eda2a" + [[package]] name = "atty" version = "0.2.14" @@ -24,9 +142,9 @@ dependencies = [ [[package]] name = "autocfg" -version = "1.0.1" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cdb031dd78e28731d87d56cc8ffef4a8f36ca26c38fe2de700543e627f8a464a" +checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" [[package]] name = "bitflags" @@ -34,6 +152,38 @@ version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" +[[package]] +name = "blocking" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c6ccb65d468978a086b69884437ded69a90faab3bbe6e67f242173ea728acccc" +dependencies = [ + "async-channel", + "async-task", + "atomic-waker", + "fastrand", + "futures-lite", + "once_cell", +] + +[[package]] +name = "bumpalo" +version = "3.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1ad822118d20d2c234f427000d5acc36eabe1e29a348c89b63dd60b13f28e5d" + +[[package]] +name = "cache-padded" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1db59621ec70f09c5e9b597b220c7a2b43611f4710dc03ceb8748637775692c" + +[[package]] +name = "cc" +version = "1.0.73" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2fff2a6927b3bb87f9595d67196a70493f627687a71d87a0d692242c33f58c11" + [[package]] name = "cfg-if" version = "1.0.0" @@ -42,9 +192,9 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = 
"clap" -version = "2.33.3" +version = "2.34.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "37e58ac78573c40708d45522f0d80fa2f01cc4f9b4e2bf749807255454312002" +checksum = "a0610544180c38b88101fecf2dd634b174a62eef6946f84dfc6a7127512b381c" dependencies = [ "ansi_term", "atty", @@ -64,6 +214,15 @@ dependencies = [ "crossterm", ] +[[package]] +name = "concurrent-queue" +version = "1.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af4780a44ab5696ea9e28294517f1fffb421a83a25af521333c838635509db9c" +dependencies = [ + "cache-padded", +] + [[package]] name = "core_affinity" version = "0.5.10" @@ -78,9 +237,9 @@ dependencies = [ [[package]] name = "crossbeam" -version = "0.8.1" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4ae5588f6b3c3cb05239e90bd110f257254aecd01e4635400391aeae07497845" +checksum = "2801af0d36612ae591caa9568261fddce32ce6e08a7275ea334a06a4ad021a2c" dependencies = [ "cfg-if", "crossbeam-channel", @@ -92,9 +251,9 @@ dependencies = [ [[package]] name = "crossbeam-channel" -version = "0.5.1" +version = "0.5.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "06ed27e177f16d65f0f0c22a213e17c696ace5dd64b14258b52f9417ccb52db4" +checksum = "c2dd04ddaf88237dc3b8d8f9a3c1004b506b54b3313403944054d23c0870c521" dependencies = [ "cfg-if", "crossbeam-utils", @@ -102,9 +261,9 @@ dependencies = [ [[package]] name = "crossbeam-deque" -version = "0.8.1" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6455c0ca19f0d2fbf751b908d5c55c1f5cbc65e03c4225427254b46890bdde1e" +checksum = "715e8152b692bba2d374b53d4875445368fdf21a94751410af607a5ac677d1fc" dependencies = [ "cfg-if", "crossbeam-epoch", @@ -113,22 +272,22 @@ dependencies = [ [[package]] name = "crossbeam-epoch" -version = "0.9.5" +version = "0.9.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"4ec02e091aa634e2c3ada4a392989e7c3116673ef0ac5b72232439094d73b7fd" +checksum = "f916dfc5d356b0ed9dae65f1db9fc9770aa2851d2662b988ccf4fe3516e86348" dependencies = [ + "autocfg", "cfg-if", "crossbeam-utils", - "lazy_static", "memoffset", "scopeguard", ] [[package]] name = "crossbeam-queue" -version = "0.3.2" +version = "0.3.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b10ddc024425c88c2ad148c1b0fd53f4c6d38db9697c9f1588381212fa657c9" +checksum = "1cd42583b04998a5363558e5f9291ee5a5ff6b49944332103f251e7479a82aa7" dependencies = [ "cfg-if", "crossbeam-utils", @@ -136,12 +295,11 @@ dependencies = [ [[package]] name = "crossbeam-utils" -version = "0.8.5" +version = "0.8.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d82cfc11ce7f2c3faef78d8a684447b40d503d9681acebed6cb728d45940c4db" +checksum = "edbafec5fa1f196ca66527c1b12c2ec4745ca14b50f1ad8f9f6f720b55d11fac" dependencies = [ "cfg-if", - "lazy_static", ] [[package]] @@ -170,10 +328,157 @@ dependencies = [ ] [[package]] -name = "either" -version = "1.6.1" +name = "ctor" +version = "0.1.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457" +checksum = "cdffe87e1d521a10f9696f833fe502293ea446d7f256c06128293a4119bdf4cb" +dependencies = [ + "quote", + "syn", +] + +[[package]] +name = "either" +version = "1.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90e5c1c8368803113bf0c9584fc495a58b86dc8a29edbf8fe877d21d9507e797" + +[[package]] +name = "event-listener" +version = "2.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0206175f82b8d6bf6652ff7d71a1e27fd2e4efde587fd368662814d6ec1d9ce0" + +[[package]] +name = "fastrand" +version = "1.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7a407cfaa3385c4ae6b23e84623d48c2798d06e3e6a1878f7f59f17b3f86499" +dependencies = [ + 
"instant", +] + +[[package]] +name = "futures" +version = "0.3.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f21eda599937fba36daeb58a22e8f5cee2d14c4a17b5b7739c7c8e5e3b8230c" +dependencies = [ + "futures-channel", + "futures-core", + "futures-executor", + "futures-io", + "futures-sink", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-channel" +version = "0.3.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "30bdd20c28fadd505d0fd6712cdfcb0d4b5648baf45faef7f852afb2399bb050" +dependencies = [ + "futures-core", + "futures-sink", +] + +[[package]] +name = "futures-core" +version = "0.3.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4e5aa3de05362c3fb88de6531e6296e85cde7739cccad4b9dfeeb7f6ebce56bf" + +[[package]] +name = "futures-executor" +version = "0.3.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ff63c23854bee61b6e9cd331d523909f238fc7636290b96826e9cfa5faa00ab" +dependencies = [ + "futures-core", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-io" +version = "0.3.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbf4d2a7a308fd4578637c0b17c7e1c7ba127b8f6ba00b29f717e9655d85eb68" + +[[package]] +name = "futures-lite" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7694489acd39452c77daa48516b894c153f192c3578d5a839b62c58099fcbf48" +dependencies = [ + "fastrand", + "futures-core", + "futures-io", + "memchr", + "parking", + "pin-project-lite", + "waker-fn", +] + +[[package]] +name = "futures-macro" +version = "0.3.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42cd15d1c7456c04dbdf7e88bcd69760d74f3a798d6444e16974b505b0e62f17" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "futures-sink" +version = "0.3.24" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "21b20ba5a92e727ba30e72834706623d94ac93a725410b6a6b6fbc1b07f7ba56" + +[[package]] +name = "futures-task" +version = "0.3.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6508c467c73851293f390476d4491cf4d227dbabcd4170f3bb6044959b294f1" + +[[package]] +name = "futures-timer" +version = "3.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e64b03909df88034c26dc1547e8970b91f98bdb65165d6a4e9110d94263dbb2c" + +[[package]] +name = "futures-util" +version = "0.3.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "44fb6cb1be61cc1d2e43b262516aafcf63b241cffdb1d3fa115f91d9c7b09c90" +dependencies = [ + "futures-channel", + "futures-core", + "futures-io", + "futures-macro", + "futures-sink", + "futures-task", + "memchr", + "pin-project-lite", + "pin-utils", + "slab", +] + +[[package]] +name = "gloo-timers" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fb7d06c1c8cc2a29bee7ec961009a0b2caa0793ee4900c2ffb348734ba1c8f9" +dependencies = [ + "futures-channel", + "futures-core", + "js-sys", + "wasm-bindgen", +] [[package]] name = "heck" @@ -202,6 +507,15 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "js-sys" +version = "0.3.60" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49409df3e3bf0856b916e2ceaca09ee28e6871cf7d9ce97a692cacfdb2a25a47" +dependencies = [ + "wasm-bindgen", +] + [[package]] name = "kernel32-sys" version = "0.2.2" @@ -212,6 +526,15 @@ dependencies = [ "winapi-build", ] +[[package]] +name = "kv-log-macro" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0de8b303297635ad57c9f5059fd9cee7a47f8e8daa09df0fcd07dd39fb22977f" +dependencies = [ + "log", +] + [[package]] name = "lazy_static" version = "1.4.0" @@ -220,33 +543,41 @@ checksum = 
"e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" [[package]] name = "libc" -version = "0.2.108" +version = "0.2.134" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8521a1b57e76b1ec69af7599e75e38e7b7fad6610f037db8c79b127201b5d119" +checksum = "329c933548736bc49fd575ee68c89e8be4d260064184389a5b77517cddd99ffb" [[package]] name = "lock_api" -version = "0.4.5" +version = "0.4.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "712a4d093c9976e24e7dbca41db895dabcbac38eb5f4045393d17a95bdfb1109" +checksum = "435011366fe56583b16cf956f9df0095b405b82d76425bc8981c0e22e60ec4df" dependencies = [ + "autocfg", "scopeguard", ] [[package]] name = "log" -version = "0.4.14" +version = "0.4.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "51b9bbe6c47d51fc3e1a9b945965946b4c44142ab8792c50835a980d362c2710" +checksum = "abb12e687cfb44aa40f41fc3978ef76448f9b6038cad6aef4259d3c095a2382e" dependencies = [ "cfg-if", + "value-bag", ] [[package]] -name = "memoffset" -version = "0.6.4" +name = "memchr" +version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "59accc507f1338036a0477ef61afdae33cde60840f4dfe481319ce3ad116ddf9" +checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" + +[[package]] +name = "memoffset" +version = "0.6.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5aa361d4faea93603064a027415f07bd8e1d5c88c9fbf68bf56a285428fd79ce" dependencies = [ "autocfg", ] @@ -275,32 +606,44 @@ dependencies = [ [[package]] name = "ntapi" -version = "0.3.6" +version = "0.3.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f6bb902e437b6d86e03cce10a7e2af662292c5dfef23b65899ea3ac9354ad44" +checksum = "c28774a7fd2fbb4f0babd8237ce554b73af68021b5f695a3cebd6c59bac0980f" dependencies = [ "winapi 0.3.9", ] [[package]] name = "num-traits" -version = "0.2.14" +version = "0.2.15" 
source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a64b1ec5cda2586e284722486d802acf1f7dbdc623e2bfc57e65ca1cd099290" +checksum = "578ede34cf02f8924ab9447f50c28075b4d3e5b269972345e7e0372b38c6cdcd" dependencies = [ "autocfg", ] [[package]] name = "num_cpus" -version = "1.13.0" +version = "1.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05499f3756671c15885fee9034446956fff3f243d6077b91e5767df161f766b3" +checksum = "19e64526ebdee182341572e50e9ad03965aa510cd94427a4549448f285e957a1" dependencies = [ "hermit-abi", "libc", ] +[[package]] +name = "once_cell" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e82dad04139b71a90c080c8463fe0dc7902db5192d939bd0950f074d014339e1" + +[[package]] +name = "parking" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "427c3892f9e783d91cc128285287e70a59e206ca452770ece88a76f7a3eddd72" + [[package]] name = "parking_lot" version = "0.11.2" @@ -327,12 +670,29 @@ dependencies = [ ] [[package]] -name = "pest" -version = "2.1.3" +name = "pin-project-lite" +version = "0.2.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "10f4872ae94d7b90ae48754df22fd42ad52ce740b8f370b03da4835417403e53" +checksum = "e0a7ae3ac2f1173085d398531c705756c94a4c56843785df85a60c1a0afac116" + +[[package]] +name = "pin-utils" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" + +[[package]] +name = "polling" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "899b00b9c8ab553c743b3e11e87c5c7d423b2a2de229ba95b24a756344748011" dependencies = [ - "ucd-trie", + "autocfg", + "cfg-if", + "libc", + "log", + "wepoll-ffi", + "winapi 0.3.9", ] [[package]] @@ -361,27 +721,27 @@ dependencies = [ [[package]] name = "proc-macro2" -version = 
"1.0.32" +version = "1.0.46" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba508cc11742c0dc5c1659771673afbab7a0efab23aa17e854cbab0837ed0b43" +checksum = "94e2ef8dbfc347b10c094890f778ee2e36ca9bb4262e86dc99cd217e35f3470b" dependencies = [ - "unicode-xid", + "unicode-ident", ] [[package]] name = "quote" -version = "1.0.10" +version = "1.0.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38bc8cc6a5f2e3655e0899c1b848643b2562f853f114bfec7be120678e3ace05" +checksum = "bbe448f377a7d6961e30f5955f9b8d106c3f5e449d493ee1b125c1d43c2b5179" dependencies = [ "proc-macro2", ] [[package]] name = "rayon" -version = "1.5.1" +version = "1.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c06aca804d41dbc8ba42dfd964f0d01334eceb64314b9ecf7c5fad5188a06d90" +checksum = "bd99e5772ead8baa5215278c9b15bf92087709e9c1b2d1f97cdb5a183c933a7d" dependencies = [ "autocfg", "crossbeam-deque", @@ -391,31 +751,43 @@ dependencies = [ [[package]] name = "rayon-core" -version = "1.9.1" +version = "1.9.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d78120e2c850279833f1dd3582f730c4ab53ed95aeaaaa862a2a5c71b1656d8e" +checksum = "258bcdb5ac6dad48491bb2992db6b7cf74878b0384908af124823d118c99683f" dependencies = [ "crossbeam-channel", "crossbeam-deque", "crossbeam-utils", - "lazy_static", "num_cpus", ] [[package]] name = "redox_syscall" -version = "0.2.10" +version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8383f39639269cde97d255a32bdb68c047337295414940c68bdd30c2e13203ff" +checksum = "fb5a58c1855b4b6819d59012155603f0b22ad30cad752600aadfcb695265519a" dependencies = [ "bitflags", ] [[package]] name = "rstest" -version = "0.10.0" +version = "0.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "041bb0202c14f6a158bbbf086afb03d0c6e975c2dec7d4912f8061ed44f290af" +checksum = 
"b939295f93cb1d12bc1a83cf9ee963199b133fb8a79832dd51b68bb9f59a04dc" +dependencies = [ + "async-std", + "futures", + "futures-timer", + "rstest_macros", + "rustc_version", +] + +[[package]] +name = "rstest_macros" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f78aba848123782ba59340928ec7d876ebe745aa0365d6af8a630f19a5c16116" dependencies = [ "cfg-if", "proc-macro2", @@ -426,7 +798,7 @@ dependencies = [ [[package]] name = "rust-stream" -version = "3.4.0" +version = "4.0.0" dependencies = [ "colour", "core_affinity", @@ -443,18 +815,18 @@ dependencies = [ [[package]] name = "rustc_version" -version = "0.3.3" +version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f0dfe2087c51c460008730de8b57e6a320782fbfb312e1f4d520e6c6fae155ee" +checksum = "bfa0f585226d2e68097d4f95d113b15b83a82e819ab25717ec0590d9584ef366" dependencies = [ "semver", ] [[package]] name = "rustversion" -version = "1.0.5" +version = "1.0.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61b3909d758bb75c79f23d4736fac9433868679d3ad2ea7a61e3c25cfda9a088" +checksum = "97477e48b4cf8603ad5f7aaf897467cf42ab4218a38ef76fb14c2d6773a6d6a8" [[package]] name = "scopeguard" @@ -464,21 +836,9 @@ checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" [[package]] name = "semver" -version = "0.11.0" +version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f301af10236f6df4160f7c3f04eec6dbc70ace82d23326abad5edee88801c6b6" -dependencies = [ - "semver-parser", -] - -[[package]] -name = "semver-parser" -version = "0.10.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "00b0bef5b7f9e0df16536d3961cfb6e84331c065b4066afb39768d0e319411f7" -dependencies = [ - "pest", -] +checksum = "e25dfac463d778e353db5be2449d1cce89bd6fd23c9f1ea21310ce6e5a1b29c4" [[package]] name = "signal-hook" @@ -501,10 +861,29 @@ dependencies = [ ] 
[[package]] -name = "smallvec" -version = "1.7.0" +name = "slab" +version = "0.4.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ecab6c735a6bb4139c0caafd0cc3635748bbb3acf4550e8138122099251f309" +checksum = "4614a76b2a8be0058caa9dbbaf66d988527d86d003c11a94fbd335d7661edcef" +dependencies = [ + "autocfg", +] + +[[package]] +name = "smallvec" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2fd0db749597d91ff862fd1d55ea87f7855a744a8425a64695b6fca237d1dad1" + +[[package]] +name = "socket2" +version = "0.4.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "02e2d2db9033d13a1567121ddd7a095ee144db4e1ca1b1bda3419bc0da294ebd" +dependencies = [ + "libc", + "winapi 0.3.9", +] [[package]] name = "strsim" @@ -514,9 +893,9 @@ checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a" [[package]] name = "structopt" -version = "0.3.25" +version = "0.3.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "40b9788f4202aa75c240ecc9c15c65185e6a39ccdeb0fd5d008b98825464c87c" +checksum = "0c6b5c64445ba8094a6ab0c3cd2ad323e07171012d9c98b0b15651daf1787a10" dependencies = [ "clap", "lazy_static", @@ -538,20 +917,20 @@ dependencies = [ [[package]] name = "syn" -version = "1.0.82" +version = "1.0.101" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8daf5dd0bb60cbd4137b1b587d2fc0ae729bc07cf01cd70b36a1ed5ade3b9d59" +checksum = "e90cde112c4b9690b8cbe810cba9ddd8bc1d7472e2cae317b69e9438c1cba7d2" dependencies = [ "proc-macro2", "quote", - "unicode-xid", + "unicode-ident", ] [[package]] name = "tabular" -version = "0.1.4" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e7e35bee02dcefe64a74065b6b869d241eab1a02fea0d65e6074ce4e51894c3b" +checksum = "d9a2882c514780a1973df90de9d68adcd8871bacc9a6331c3f28e6d2ff91a3d1" dependencies = [ "unicode-width", ] @@ -566,28 +945,32 @@ 
dependencies = [ ] [[package]] -name = "ucd-trie" -version = "0.1.3" +name = "unicode-ident" +version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "56dee185309b50d1f11bfedef0fe6d036842e3fb77413abef29f8f8d1c5d4c1c" +checksum = "dcc811dc4066ac62f84f11307873c4850cb653bfa9b1719cee2bd2204a4bc5dd" [[package]] name = "unicode-segmentation" -version = "1.8.0" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8895849a949e7845e06bd6dc1aa51731a103c42707010a5b591c0038fb73385b" +checksum = "0fdbf052a0783de01e944a6ce7a8cb939e295b1e7be835a1112c3b9a7f047a5a" [[package]] name = "unicode-width" -version = "0.1.9" +version = "0.1.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3ed742d4ea2bd1176e236172c8429aaf54486e7ac098db29ffe6529e0ce50973" +checksum = "c0edd1e5b14653f783770bce4a4dabb4a5108a5370a5f5d8cfe8710c361f6c8b" [[package]] -name = "unicode-xid" -version = "0.2.2" +name = "value-bag" +version = "1.0.0-alpha.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ccb82d61f80a663efe1f787a51b16b5a51e3314d6ac365b08639f52387b33f3" +checksum = "2209b78d1249f7e6f3293657c9779fe31ced465df091bbd433a1cf88e916ec55" +dependencies = [ + "ctor", + "version_check", +] [[package]] name = "vec_map" @@ -597,9 +980,100 @@ checksum = "f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191" [[package]] name = "version_check" -version = "0.9.3" +version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5fecdca9a5291cc2b8dcf7dc02453fee791a280f3743cb0905f8822ae463b3fe" +checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" + +[[package]] +name = "waker-fn" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d5b2c62b4012a3e1eca5a7e077d13b3bf498c4073e33ccd58626607748ceeca" + +[[package]] +name = "wasm-bindgen" +version = "0.2.83" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "eaf9f5aceeec8be17c128b2e93e031fb8a4d469bb9c4ae2d7dc1888b26887268" +dependencies = [ + "cfg-if", + "wasm-bindgen-macro", +] + +[[package]] +name = "wasm-bindgen-backend" +version = "0.2.83" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c8ffb332579b0557b52d268b91feab8df3615f265d5270fec2a8c95b17c1142" +dependencies = [ + "bumpalo", + "log", + "once_cell", + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-futures" +version = "0.4.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "23639446165ca5a5de86ae1d8896b737ae80319560fbaa4c2887b7da6e7ebd7d" +dependencies = [ + "cfg-if", + "js-sys", + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.83" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "052be0f94026e6cbc75cdefc9bae13fd6052cdcaf532fa6c45e7ae33a1e6c810" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.83" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07bc0c051dc5f23e307b13285f9d75df86bfdf816c5721e573dec1f9b8aa193c" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-backend", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.83" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1c38c045535d93ec4f0b4defec448e4291638ee608530863b1e2ba115d4fff7f" + +[[package]] +name = "web-sys" +version = "0.3.60" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bcda906d8be16e728fd5adc5b729afad4e444e106ab28cd1c7256e54fa61510f" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "wepoll-ffi" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum 
= "d743fdedc5c64377b5fc2bc036b01c7fd642205a0d96356034ae3404d49eb7fb" +dependencies = [ + "cc", +] [[package]] name = "winapi" diff --git a/src/rust/rust-stream/Cargo.toml b/src/rust/rust-stream/Cargo.toml index 8ac456f..d93a84f 100644 --- a/src/rust/rust-stream/Cargo.toml +++ b/src/rust/rust-stream/Cargo.toml @@ -7,19 +7,19 @@ edition = "2018" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -num-traits = "0.2.14" -structopt = "0.3.13" -tabular = "0.1.4" -rayon = "1.5.1" -crossbeam = "0.8.1" -num_cpus = "1.13.0" -rustversion = "1.0" -libc = "0.2.97" +num-traits = "0.2.15" +structopt = "0.3.26" +tabular = "0.2.0" +rayon = "1.5.3" +crossbeam = "0.8.2" +num_cpus = "1.13.1" +rustversion = "1.0.9" +libc = "0.2.134" core_affinity = "0.5.10" colour = "0.6.0" [dev-dependencies] -rstest = "0.10.0" +rstest = "0.13.0" [build-dependencies] rustversion = "1.0" From a075455ad45f00be73b10cc54a8369397221c691 Mon Sep 17 00:00:00 2001 From: Thomas Gibson <14180421+thomasgibson@users.noreply.github.com> Date: Sat, 30 Apr 2022 21:59:45 -0500 Subject: [PATCH 11/24] Add tuned benchmark kernels Co-authored-by: Nick Curtis --- src/hip/HIPStream.cpp | 210 +++++++++++++++++++++++++++++++++--------- src/hip/HIPStream.h | 33 +++++++ src/hip/model.cmake | 15 ++- 3 files changed, 212 insertions(+), 46 deletions(-) diff --git a/src/hip/HIPStream.cpp b/src/hip/HIPStream.cpp index 6aed1ee..dcf634e 100644 --- a/src/hip/HIPStream.cpp +++ b/src/hip/HIPStream.cpp @@ -9,7 +9,32 @@ #include "hip/hip_runtime.h" #define TBSIZE 1024 -#define DOT_NUM_BLOCKS 256 + +#ifdef NONTEMPORAL +template +__device__ __forceinline__ T load(const T& ref) +{ + return __builtin_nontemporal_load(&ref); +} + +template +__device__ __forceinline__ void store(const T& value, T& ref) +{ + __builtin_nontemporal_store(value, &ref); +} +#else +template +__device__ __forceinline__ T load(const T& ref) +{ + return ref; +} + +template +__device__ __forceinline__ void 
store(const T& value, T& ref) +{ + ref = value; +} +#endif void check_error(void) { @@ -23,15 +48,27 @@ void check_error(void) template HIPStream::HIPStream(const int ARRAY_SIZE, const int device_index) + : array_size{ARRAY_SIZE}, + block_count(array_size / (TBSIZE * elements_per_lane * chunks_per_block)) { - // The array size must be divisible by TBSIZE for kernel launches - if (ARRAY_SIZE % TBSIZE != 0) + std::cerr << "Elements per lane: " << elements_per_lane << std::endl; + std::cerr << "Chunks per block: " << chunks_per_block << std::endl; + // The array size must be divisible by total number of elements + // moved per block for kernel launches + if (ARRAY_SIZE % (TBSIZE * elements_per_lane * chunks_per_block) != 0) { std::stringstream ss; - ss << "Array size must be a multiple of " << TBSIZE; + ss << "Array size must be a multiple of elements operated on per block (" + << TBSIZE * elements_per_lane * chunks_per_block + << ")."; throw std::runtime_error(ss.str()); } + std::cerr << "block count " << block_count << std::endl; + +#ifdef NONTEMPORAL + std::cerr << "Using non-temporal memory operations." 
<< std::endl; +#endif // Set device int count; @@ -49,7 +86,7 @@ HIPStream::HIPStream(const int ARRAY_SIZE, const int device_index) array_size = ARRAY_SIZE; // Allocate the host array for partial sums for dot kernels - sums = (T*)malloc(sizeof(T) * DOT_NUM_BLOCKS); + sums = (T*)malloc(block_count*sizeof(T)); // Check buffers fit on the device hipDeviceProp_t props; @@ -64,7 +101,7 @@ HIPStream::HIPStream(const int ARRAY_SIZE, const int device_index) check_error(); hipMalloc(&d_c, ARRAY_SIZE*sizeof(T)); check_error(); - hipMalloc(&d_sum, DOT_NUM_BLOCKS*sizeof(T)); + hipMalloc(&d_sum, block_count*sizeof(T)); check_error(); } @@ -115,68 +152,115 @@ void HIPStream::read_arrays(std::vector& a, std::vector& b, std::vector check_error(); } - -template -__global__ void copy_kernel(const T * a, T * c) +template +__launch_bounds__(TBSIZE) +__global__ +void copy_kernel(const T * __restrict a, T * __restrict c) { - const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; - c[i] = a[i]; + const size_t dx = (blockDim.x * gridDim.x) * elements_per_lane; + const size_t gidx = (threadIdx.x + blockIdx.x * blockDim.x) * elements_per_lane; + for (size_t i = 0; i != chunks_per_block; ++i) + { + for (size_t j = 0; j != elements_per_lane; ++j) + { + store(load(a[gidx + i * dx + j]), c[gidx + i * dx + j]); + } + } } template void HIPStream::copy() { - hipLaunchKernelGGL(HIP_KERNEL_NAME(copy_kernel), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_a, d_c); + hipLaunchKernelGGL(HIP_KERNEL_NAME(copy_kernel), + dim3(block_count), + dim3(TBSIZE), + 0, 0, d_a, d_c); check_error(); hipDeviceSynchronize(); check_error(); } -template -__global__ void mul_kernel(T * b, const T * c) +template +__launch_bounds__(TBSIZE) +__global__ +void mul_kernel(T * __restrict b, const T * __restrict c) { const T scalar = startScalar; - const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; - b[i] = scalar * c[i]; + const size_t dx = (blockDim.x * gridDim.x) * elements_per_lane; + const size_t gidx = 
(threadIdx.x + blockIdx.x * blockDim.x) * elements_per_lane; + for (size_t i = 0; i != chunks_per_block; ++i) + { + for (size_t j = 0; j != elements_per_lane; ++j) + { + store(scalar * load(c[gidx + i * dx + j]), b[gidx + i * dx + j]); + } + } } template void HIPStream::mul() { - hipLaunchKernelGGL(HIP_KERNEL_NAME(mul_kernel), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_b, d_c); + hipLaunchKernelGGL(HIP_KERNEL_NAME(mul_kernel), + dim3(block_count), + dim3(TBSIZE), + 0, 0, d_b, d_c); check_error(); hipDeviceSynchronize(); check_error(); } -template -__global__ void add_kernel(const T * a, const T * b, T * c) +template +__launch_bounds__(TBSIZE) +__global__ +void add_kernel(const T * __restrict a, const T * __restrict b, T * __restrict c) { - const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; - c[i] = a[i] + b[i]; + const size_t dx = (blockDim.x * gridDim.x) * elements_per_lane; + const size_t gidx = (threadIdx.x + blockIdx.x * blockDim.x) * elements_per_lane; + for (size_t i = 0; i != chunks_per_block; ++i) + { + for (size_t j = 0; j != elements_per_lane; ++j) + { + store(load(a[gidx + i * dx + j]) + load(b[gidx + i * dx + j]), c[gidx + i * dx + j]); + } + } } template void HIPStream::add() { - hipLaunchKernelGGL(HIP_KERNEL_NAME(add_kernel), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_a, d_b, d_c); + hipLaunchKernelGGL(HIP_KERNEL_NAME(add_kernel), + dim3(block_count), + dim3(TBSIZE), + 0, 0, d_a, d_b, d_c); check_error(); hipDeviceSynchronize(); check_error(); } -template -__global__ void triad_kernel(T * a, const T * b, const T * c) +template +__launch_bounds__(TBSIZE) +__global__ +void triad_kernel(T * __restrict a, const T * __restrict b, const T * __restrict c) { const T scalar = startScalar; - const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; - a[i] = b[i] + scalar * c[i]; + const size_t dx = (blockDim.x * gridDim.x) * elements_per_lane; + const size_t gidx = (threadIdx.x + blockIdx.x * blockDim.x) * elements_per_lane; + for (size_t 
i = 0; i != chunks_per_block; ++i) + { + for (size_t j = 0; j != elements_per_lane; ++j) + { + store(load(b[gidx + i * dx + j]) + scalar * load(c[gidx + i * dx + j]), a[gidx + i * dx + j]); + } + } } template void HIPStream::triad() { - hipLaunchKernelGGL(HIP_KERNEL_NAME(triad_kernel), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_a, d_b, d_c); + hipLaunchKernelGGL(HIP_KERNEL_NAME(triad_kernel), + dim3(block_count), + dim3(TBSIZE), + 0, 0, d_a, d_b, d_c); check_error(); hipDeviceSynchronize(); check_error(); @@ -199,42 +283,78 @@ void HIPStream::nstream() check_error(); } -template -__global__ void dot_kernel(const T * a, const T * b, T * sum, int array_size) +template +struct Reducer +{ + template + __device__ + static + void reduce(I it) noexcept + { + if (n == 1) return; + +#if defined(__HIP_PLATFORM_NVCC__) + constexpr unsigned int warpSize = 32; +#endif + constexpr bool is_same_warp{n <= warpSize * 2}; + if (static_cast(threadIdx.x) < n/2) + { + it[threadIdx.x] += it[threadIdx.x + n/2]; + } + is_same_warp ? 
__threadfence_block() : __syncthreads(); + Reducer::reduce(it); + } +}; + +template<> +struct Reducer<1u> { + template + __device__ + static + void reduce(I) noexcept + {} +}; + +template +__launch_bounds__(TBSIZE) +__global__ +__global__ void dot_kernel(const T * __restrict a, const T * __restrict b, T * __restrict sum) { __shared__ T tb_sum[TBSIZE]; + const size_t tidx = threadIdx.x; + const size_t dx = (blockDim.x * gridDim.x) * elements_per_lane; + const size_t gidx = (tidx + blockIdx.x * blockDim.x) * elements_per_lane; - int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; - const size_t local_i = hipThreadIdx_x; - - tb_sum[local_i] = 0.0; - for (; i < array_size; i += hipBlockDim_x*hipGridDim_x) - tb_sum[local_i] += a[i] * b[i]; - - for (int offset = hipBlockDim_x / 2; offset > 0; offset /= 2) + T tmp{0}; + for (size_t i = 0; i != chunks_per_block; ++i) { - __syncthreads(); - if (local_i < offset) + for (size_t j = 0; j != elements_per_lane; ++j) { - tb_sum[local_i] += tb_sum[local_i+offset]; + tmp += load(a[gidx + i * dx + j]) * load(b[gidx + i * dx + j]); } } + tb_sum[tidx] = tmp; + __syncthreads(); - if (local_i == 0) - sum[hipBlockIdx_x] = tb_sum[local_i]; + Reducer<>::reduce(tb_sum); + if (tidx) return; + store(tb_sum[0], sum[blockIdx.x]); } template T HIPStream::dot() { - hipLaunchKernelGGL(HIP_KERNEL_NAME(dot_kernel), dim3(DOT_NUM_BLOCKS), dim3(TBSIZE), 0, 0, d_a, d_b, d_sum, array_size); + hipLaunchKernelGGL(HIP_KERNEL_NAME(dot_kernel), + dim3(block_count), + dim3(TBSIZE), + 0, 0, d_a, d_b, d_sum); check_error(); - hipMemcpy(sums, d_sum, DOT_NUM_BLOCKS*sizeof(T), hipMemcpyDeviceToHost); + hipMemcpy(sums, d_sum, block_count*sizeof(T), hipMemcpyDeviceToHost); check_error(); T sum = 0.0; - for (int i = 0; i < DOT_NUM_BLOCKS; i++) + for (int i = 0; i < block_count; i++) sum += sums[i]; return sum; diff --git a/src/hip/HIPStream.h b/src/hip/HIPStream.h index 44a2893..ecdf929 100644 --- a/src/hip/HIPStream.h +++ b/src/hip/HIPStream.h @@ -18,9 +18,42 @@ 
template class HIPStream : public Stream { +#ifdef __HIP_PLATFORM_NVCC__ + #ifndef DWORDS_PER_LANE + #define DWORDS_PER_LANE 1 + #endif + #ifndef CHUNKS_PER_BLOCK + #define CHUNKS_PER_BLOCK 8 + #endif +#else + #ifndef DWORDS_PER_LANE + #define DWORDS_PER_LANE 4 + #endif + #ifndef CHUNKS_PER_BLOCK + #define CHUNKS_PER_BLOCK 1 + #endif +#endif + // Make sure that either: + // DWORDS_PER_LANE is less than sizeof(T), in which case we default to 1 element + // or + // DWORDS_PER_LANE is divisible by sizeof(T) + static_assert((DWORDS_PER_LANE * sizeof(unsigned int) < sizeof(T)) || + (DWORDS_PER_LANE * sizeof(unsigned int) % sizeof(T) == 0), + "DWORDS_PER_LANE not divisible by sizeof(element_type)"); + + static constexpr unsigned int chunks_per_block{CHUNKS_PER_BLOCK}; + static constexpr unsigned int dwords_per_lane{DWORDS_PER_LANE}; + // Take into account the datatype size + // That is, if we specify 4 DWORDS_PER_LANE, this is 2 FP64 elements + // and 4 FP32 elements + static constexpr unsigned int elements_per_lane{ + (DWORDS_PER_LANE * sizeof(unsigned int)) < sizeof(T) ? 1 : ( + DWORDS_PER_LANE * sizeof(unsigned int) / sizeof(T))}; + protected: // Size of arrays int array_size; + int block_count; // Host array for partial sums for dot kernel T *sums; diff --git a/src/hip/model.cmake b/src/hip/model.cmake index 78150c4..3ffaf7a 100644 --- a/src/hip/model.cmake +++ b/src/hip/model.cmake @@ -2,6 +2,19 @@ register_flag_required(CMAKE_CXX_COMPILER "Absolute path to the AMD HIP C++ compiler") +register_flag_optional(USE_NONTEMPORAL_MEM + "Flag indicating to use non-temporal memory accesses to bypass cache." + "OFF") + +# TODO: Better flag descriptions +register_flag_optional(DWORDS_PER_LANE "Flag indicating the number of double data types per wavefront lane." 4) +register_flag_optional(CHUNKS_PER_BLOCK "Flag indicating the chunks per block." 1) + macro(setup) - # nothing to do here as hipcc does everything correctly, what a surprise! 
+ # Ensure we set the proper preprocessor directives + if (USE_NONTEMPORAL_MEM) + add_definitions(-DNONTEMPORAL) + endif () + register_definitions(DWORDS_PER_LANE=${DWORDS_PER_LANE}) + register_definitions(CHUNKS_PER_BLOCK=${CHUNKS_PER_BLOCK}) endmacro() \ No newline at end of file From bcf8708f2c294187390e69d9b825b2e7dc709001 Mon Sep 17 00:00:00 2001 From: Thomas Gibson <14180421+thomasgibson@users.noreply.github.com> Date: Tue, 31 May 2022 11:29:42 -0500 Subject: [PATCH 12/24] Clean up kernels and drop unneeded modifications --- src/hip/HIPStream.cpp | 187 ++++++++++++------------------------------ src/hip/HIPStream.h | 7 -- src/hip/model.cmake | 12 +-- 3 files changed, 53 insertions(+), 153 deletions(-) diff --git a/src/hip/HIPStream.cpp b/src/hip/HIPStream.cpp index dcf634e..eac77b4 100644 --- a/src/hip/HIPStream.cpp +++ b/src/hip/HIPStream.cpp @@ -10,31 +10,6 @@ #define TBSIZE 1024 -#ifdef NONTEMPORAL -template -__device__ __forceinline__ T load(const T& ref) -{ - return __builtin_nontemporal_load(&ref); -} - -template -__device__ __forceinline__ void store(const T& value, T& ref) -{ - __builtin_nontemporal_store(value, &ref); -} -#else -template -__device__ __forceinline__ T load(const T& ref) -{ - return ref; -} - -template -__device__ __forceinline__ void store(const T& value, T& ref) -{ - ref = value; -} -#endif void check_error(void) { @@ -49,27 +24,23 @@ void check_error(void) template HIPStream::HIPStream(const int ARRAY_SIZE, const int device_index) : array_size{ARRAY_SIZE}, - block_count(array_size / (TBSIZE * elements_per_lane * chunks_per_block)) + block_count(array_size / (TBSIZE * elements_per_lane)) { std::cerr << "Elements per lane: " << elements_per_lane << std::endl; std::cerr << "Chunks per block: " << chunks_per_block << std::endl; // The array size must be divisible by total number of elements // moved per block for kernel launches - if (ARRAY_SIZE % (TBSIZE * elements_per_lane * chunks_per_block) != 0) + if (ARRAY_SIZE % (TBSIZE * 
elements_per_lane) != 0) { std::stringstream ss; ss << "Array size must be a multiple of elements operated on per block (" - << TBSIZE * elements_per_lane * chunks_per_block + << TBSIZE * elements_per_lane << ")."; throw std::runtime_error(ss.str()); } std::cerr << "block count " << block_count << std::endl; -#ifdef NONTEMPORAL - std::cerr << "Using non-temporal memory operations." << std::endl; -#endif - // Set device int count; hipGetDeviceCount(&count); @@ -86,7 +57,8 @@ HIPStream::HIPStream(const int ARRAY_SIZE, const int device_index) array_size = ARRAY_SIZE; // Allocate the host array for partial sums for dot kernels - sums = (T*)malloc(block_count*sizeof(T)); + hipHostMalloc(&sums, sizeof(T) * block_count, hipHostMallocNonCoherent); + check_error(); // Check buffers fit on the device hipDeviceProp_t props; @@ -101,15 +73,14 @@ HIPStream::HIPStream(const int ARRAY_SIZE, const int device_index) check_error(); hipMalloc(&d_c, ARRAY_SIZE*sizeof(T)); check_error(); - hipMalloc(&d_sum, block_count*sizeof(T)); - check_error(); } template HIPStream::~HIPStream() { - free(sums); + hipHostFree(sums); + check_error(); hipFree(d_a); check_error(); @@ -117,15 +88,13 @@ HIPStream::~HIPStream() check_error(); hipFree(d_c); check_error(); - hipFree(d_sum); - check_error(); } template __global__ void init_kernel(T * a, T * b, T * c, T initA, T initB, T initC) { - const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + const size_t i = blockDim.x * blockIdx.x + threadIdx.x; a[i] = initA; b[i] = initB; c[i] = initC; @@ -152,26 +121,20 @@ void HIPStream::read_arrays(std::vector& a, std::vector& b, std::vector check_error(); } -template +template __launch_bounds__(TBSIZE) __global__ void copy_kernel(const T * __restrict a, T * __restrict c) { - const size_t dx = (blockDim.x * gridDim.x) * elements_per_lane; const size_t gidx = (threadIdx.x + blockIdx.x * blockDim.x) * elements_per_lane; - for (size_t i = 0; i != chunks_per_block; ++i) - { - for (size_t j = 0; j != 
elements_per_lane; ++j) - { - store(load(a[gidx + i * dx + j]), c[gidx + i * dx + j]); - } - } + for (size_t j = 0; j < elements_per_lane; ++j) + c[gidx + j] = a[gidx + j]; } template void HIPStream::copy() { - hipLaunchKernelGGL(HIP_KERNEL_NAME(copy_kernel), + hipLaunchKernelGGL(HIP_KERNEL_NAME(copy_kernel), dim3(block_count), dim3(TBSIZE), 0, 0, d_a, d_c); @@ -180,27 +143,21 @@ void HIPStream::copy() check_error(); } -template +template __launch_bounds__(TBSIZE) __global__ void mul_kernel(T * __restrict b, const T * __restrict c) { const T scalar = startScalar; - const size_t dx = (blockDim.x * gridDim.x) * elements_per_lane; const size_t gidx = (threadIdx.x + blockIdx.x * blockDim.x) * elements_per_lane; - for (size_t i = 0; i != chunks_per_block; ++i) - { - for (size_t j = 0; j != elements_per_lane; ++j) - { - store(scalar * load(c[gidx + i * dx + j]), b[gidx + i * dx + j]); - } - } + for (size_t j = 0; j < elements_per_lane; ++j) + b[gidx + j] = scalar * c[gidx + j]; } template void HIPStream::mul() { - hipLaunchKernelGGL(HIP_KERNEL_NAME(mul_kernel), + hipLaunchKernelGGL(HIP_KERNEL_NAME(mul_kernel), dim3(block_count), dim3(TBSIZE), 0, 0, d_b, d_c); @@ -209,26 +166,20 @@ void HIPStream::mul() check_error(); } -template +template __launch_bounds__(TBSIZE) __global__ void add_kernel(const T * __restrict a, const T * __restrict b, T * __restrict c) { - const size_t dx = (blockDim.x * gridDim.x) * elements_per_lane; const size_t gidx = (threadIdx.x + blockIdx.x * blockDim.x) * elements_per_lane; - for (size_t i = 0; i != chunks_per_block; ++i) - { - for (size_t j = 0; j != elements_per_lane; ++j) - { - store(load(a[gidx + i * dx + j]) + load(b[gidx + i * dx + j]), c[gidx + i * dx + j]); - } - } + for (size_t j = 0; j < elements_per_lane; ++j) + c[gidx + j] = a[gidx + j] + b[gidx + j]; } template void HIPStream::add() { - hipLaunchKernelGGL(HIP_KERNEL_NAME(add_kernel), + hipLaunchKernelGGL(HIP_KERNEL_NAME(add_kernel), dim3(block_count), dim3(TBSIZE), 0, 0, d_a, d_b, 
d_c); @@ -237,27 +188,21 @@ void HIPStream::add() check_error(); } -template +template __launch_bounds__(TBSIZE) __global__ void triad_kernel(T * __restrict a, const T * __restrict b, const T * __restrict c) { const T scalar = startScalar; - const size_t dx = (blockDim.x * gridDim.x) * elements_per_lane; const size_t gidx = (threadIdx.x + blockIdx.x * blockDim.x) * elements_per_lane; - for (size_t i = 0; i != chunks_per_block; ++i) - { - for (size_t j = 0; j != elements_per_lane; ++j) - { - store(load(b[gidx + i * dx + j]) + scalar * load(c[gidx + i * dx + j]), a[gidx + i * dx + j]); - } - } + for (size_t j = 0; j < elements_per_lane; ++j) + a[gidx + j] = b[gidx + j] + scalar * c[gidx + j]; } template void HIPStream::triad() { - hipLaunchKernelGGL(HIP_KERNEL_NAME(triad_kernel), + hipLaunchKernelGGL(HIP_KERNEL_NAME(triad_kernel), dim3(block_count), dim3(TBSIZE), 0, 0, d_a, d_b, d_c); @@ -266,91 +211,63 @@ void HIPStream::triad() check_error(); } -template -__global__ void nstream_kernel(T * a, const T * b, const T * c) +template +__launch_bounds__(TBSIZE) +__global__ void nstream_kernel(T * __restrict a, const T * __restrict b, const T * __restrict c) { const T scalar = startScalar; - const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; - a[i] += b[i] + scalar * c[i]; + const size_t gidx = (threadIdx.x + blockIdx.x * blockDim.x) * elements_per_lane; + for (size_t j = 0; j < elements_per_lane; ++j) + a[gidx + j] += b[gidx + j] + scalar * c[gidx + j]; } template void HIPStream::nstream() { - hipLaunchKernelGGL(HIP_KERNEL_NAME(nstream_kernel), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_a, d_b, d_c); + hipLaunchKernelGGL(HIP_KERNEL_NAME(nstream_kernel), + dim3(block_count), + dim3(TBSIZE), + 0, 0, d_a, d_b, d_c); check_error(); hipDeviceSynchronize(); check_error(); } -template -struct Reducer -{ - template - __device__ - static - void reduce(I it) noexcept - { - if (n == 1) return; - -#if defined(__HIP_PLATFORM_NVCC__) - constexpr unsigned int warpSize = 
32; -#endif - constexpr bool is_same_warp{n <= warpSize * 2}; - if (static_cast(threadIdx.x) < n/2) - { - it[threadIdx.x] += it[threadIdx.x + n/2]; - } - is_same_warp ? __threadfence_block() : __syncthreads(); - Reducer::reduce(it); - } -}; - -template<> -struct Reducer<1u> { - template - __device__ - static - void reduce(I) noexcept - {} -}; - -template +template __launch_bounds__(TBSIZE) -__global__ -__global__ void dot_kernel(const T * __restrict a, const T * __restrict b, T * __restrict sum) +__global__ void dot_kernel(const T * __restrict a, const T * __restrict b, T * __restrict sum, int array_size) { __shared__ T tb_sum[TBSIZE]; - const size_t tidx = threadIdx.x; - const size_t dx = (blockDim.x * gridDim.x) * elements_per_lane; - const size_t gidx = (tidx + blockIdx.x * blockDim.x) * elements_per_lane; - T tmp{0}; - for (size_t i = 0; i != chunks_per_block; ++i) + const size_t local_i = threadIdx.x; + size_t i = blockDim.x * blockIdx.x + local_i; + + tb_sum[local_i] = 0.0; + for (size_t j = 0; j < elements_per_lane && i < array_size; ++j, i += blockDim.x*gridDim.x) + tb_sum[local_i] += a[i] * b[i]; + + for (size_t offset = blockDim.x / 2; offset > 0; offset /= 2) { - for (size_t j = 0; j != elements_per_lane; ++j) + __syncthreads(); + if (local_i < offset) { - tmp += load(a[gidx + i * dx + j]) * load(b[gidx + i * dx + j]); + tb_sum[local_i] += tb_sum[local_i+offset]; } } - tb_sum[tidx] = tmp; - __syncthreads(); - Reducer<>::reduce(tb_sum); - if (tidx) return; - store(tb_sum[0], sum[blockIdx.x]); + if (local_i == 0) + sum[blockIdx.x] = tb_sum[local_i]; } template T HIPStream::dot() { - hipLaunchKernelGGL(HIP_KERNEL_NAME(dot_kernel), + hipLaunchKernelGGL(HIP_KERNEL_NAME(dot_kernel), dim3(block_count), dim3(TBSIZE), - 0, 0, d_a, d_b, d_sum); + 0, 0, d_a, d_b, sums, array_size); check_error(); - - hipMemcpy(sums, d_sum, block_count*sizeof(T), hipMemcpyDeviceToHost); + hipDeviceSynchronize(); check_error(); T sum = 0.0; diff --git a/src/hip/HIPStream.h 
b/src/hip/HIPStream.h index ecdf929..7bce0b5 100644 --- a/src/hip/HIPStream.h +++ b/src/hip/HIPStream.h @@ -22,16 +22,10 @@ class HIPStream : public Stream #ifndef DWORDS_PER_LANE #define DWORDS_PER_LANE 1 #endif - #ifndef CHUNKS_PER_BLOCK - #define CHUNKS_PER_BLOCK 8 - #endif #else #ifndef DWORDS_PER_LANE #define DWORDS_PER_LANE 4 #endif - #ifndef CHUNKS_PER_BLOCK - #define CHUNKS_PER_BLOCK 1 - #endif #endif // Make sure that either: // DWORDS_PER_LANE is less than sizeof(T), in which case we default to 1 element @@ -41,7 +35,6 @@ class HIPStream : public Stream (DWORDS_PER_LANE * sizeof(unsigned int) % sizeof(T) == 0), "DWORDS_PER_LANE not divisible by sizeof(element_type)"); - static constexpr unsigned int chunks_per_block{CHUNKS_PER_BLOCK}; static constexpr unsigned int dwords_per_lane{DWORDS_PER_LANE}; // Take into account the datatype size // That is, if we specify 4 DWORDS_PER_LANE, this is 2 FP64 elements diff --git a/src/hip/model.cmake b/src/hip/model.cmake index 3ffaf7a..2f7d69e 100644 --- a/src/hip/model.cmake +++ b/src/hip/model.cmake @@ -2,19 +2,9 @@ register_flag_required(CMAKE_CXX_COMPILER "Absolute path to the AMD HIP C++ compiler") -register_flag_optional(USE_NONTEMPORAL_MEM - "Flag indicating to use non-temporal memory accesses to bypass cache." - "OFF") - -# TODO: Better flag descriptions -register_flag_optional(DWORDS_PER_LANE "Flag indicating the number of double data types per wavefront lane." 4) -register_flag_optional(CHUNKS_PER_BLOCK "Flag indicating the chunks per block." 1) +register_flag_optional(DWORDS_PER_LANE "Flag indicating the number of dwords to process per wavefront lane." 
4) macro(setup) # Ensure we set the proper preprocessor directives - if (USE_NONTEMPORAL_MEM) - add_definitions(-DNONTEMPORAL) - endif () register_definitions(DWORDS_PER_LANE=${DWORDS_PER_LANE}) - register_definitions(CHUNKS_PER_BLOCK=${CHUNKS_PER_BLOCK}) endmacro() \ No newline at end of file From f98aedf64d0f62764550d93ee0f1458be2146efd Mon Sep 17 00:00:00 2001 From: Thomas Gibson <14180421+thomasgibson@users.noreply.github.com> Date: Thu, 11 Aug 2022 10:09:57 -0500 Subject: [PATCH 13/24] Use triple-chevron syntax for hip kernel launching --- src/hip/HIPStream.cpp | 32 +++++++------------------------- 1 file changed, 7 insertions(+), 25 deletions(-) diff --git a/src/hip/HIPStream.cpp b/src/hip/HIPStream.cpp index eac77b4..ce69172 100644 --- a/src/hip/HIPStream.cpp +++ b/src/hip/HIPStream.cpp @@ -103,7 +103,7 @@ __global__ void init_kernel(T * a, T * b, T * c, T initA, T initB, T initC) template void HIPStream::init_arrays(T initA, T initB, T initC) { - hipLaunchKernelGGL(HIP_KERNEL_NAME(init_kernel), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_a, d_b, d_c, initA, initB, initC); + init_kernel<<>>(d_a, d_b, d_c, initA, initB, initC); check_error(); hipDeviceSynchronize(); check_error(); @@ -134,10 +134,7 @@ void copy_kernel(const T * __restrict a, T * __restrict c) template void HIPStream::copy() { - hipLaunchKernelGGL(HIP_KERNEL_NAME(copy_kernel), - dim3(block_count), - dim3(TBSIZE), - 0, 0, d_a, d_c); + copy_kernel<<>>(d_a, d_c); check_error(); hipDeviceSynchronize(); check_error(); @@ -157,10 +154,7 @@ void mul_kernel(T * __restrict b, const T * __restrict c) template void HIPStream::mul() { - hipLaunchKernelGGL(HIP_KERNEL_NAME(mul_kernel), - dim3(block_count), - dim3(TBSIZE), - 0, 0, d_b, d_c); + mul_kernel<<>>(d_b, d_c); check_error(); hipDeviceSynchronize(); check_error(); @@ -179,10 +173,7 @@ void add_kernel(const T * __restrict a, const T * __restrict b, T * __restrict c template void HIPStream::add() { - hipLaunchKernelGGL(HIP_KERNEL_NAME(add_kernel), - 
dim3(block_count), - dim3(TBSIZE), - 0, 0, d_a, d_b, d_c); + add_kernel<<>>(d_a, d_b, d_c); check_error(); hipDeviceSynchronize(); check_error(); @@ -202,10 +193,7 @@ void triad_kernel(T * __restrict a, const T * __restrict b, const T * __restrict template void HIPStream::triad() { - hipLaunchKernelGGL(HIP_KERNEL_NAME(triad_kernel), - dim3(block_count), - dim3(TBSIZE), - 0, 0, d_a, d_b, d_c); + triad_kernel<<>>(d_a, d_b, d_c); check_error(); hipDeviceSynchronize(); check_error(); @@ -224,10 +212,7 @@ __global__ void nstream_kernel(T * __restrict a, const T * __restrict b, const T template void HIPStream::nstream() { - hipLaunchKernelGGL(HIP_KERNEL_NAME(nstream_kernel), - dim3(block_count), - dim3(TBSIZE), - 0, 0, d_a, d_b, d_c); + nstream_kernel<<>>(d_a, d_b, d_c); check_error(); hipDeviceSynchronize(); check_error(); @@ -262,10 +247,7 @@ __global__ void dot_kernel(const T * __restrict a, const T * __restrict b, T * _ template T HIPStream::dot() { - hipLaunchKernelGGL(HIP_KERNEL_NAME(dot_kernel), - dim3(block_count), - dim3(TBSIZE), - 0, 0, d_a, d_b, sums, array_size); + dot_kernel<<>>(d_a, d_b, sums, array_size); check_error(); hipDeviceSynchronize(); check_error(); From de93c06e78a7051cfed4a44626ac6fc599f5c24d Mon Sep 17 00:00:00 2001 From: Thomas Gibson <14180421+thomasgibson@users.noreply.github.com> Date: Thu, 11 Aug 2022 10:32:20 -0500 Subject: [PATCH 14/24] Add clarifying comment and further clean-up --- src/hip/HIPStream.cpp | 8 ++++---- src/hip/HIPStream.h | 1 - src/hip/model.cmake | 1 - 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/src/hip/HIPStream.cpp b/src/hip/HIPStream.cpp index ce69172..37fce3b 100644 --- a/src/hip/HIPStream.cpp +++ b/src/hip/HIPStream.cpp @@ -27,8 +27,6 @@ HIPStream::HIPStream(const int ARRAY_SIZE, const int device_index) block_count(array_size / (TBSIZE * elements_per_lane)) { - std::cerr << "Elements per lane: " << elements_per_lane << std::endl; - std::cerr << "Chunks per block: " << chunks_per_block << std::endl; 
// The array size must be divisible by total number of elements // moved per block for kernel launches if (ARRAY_SIZE % (TBSIZE * elements_per_lane) != 0) @@ -39,7 +37,6 @@ HIPStream::HIPStream(const int ARRAY_SIZE, const int device_index) << ")."; throw std::runtime_error(ss.str()); } - std::cerr << "block count " << block_count << std::endl; // Set device int count; @@ -56,7 +53,10 @@ HIPStream::HIPStream(const int ARRAY_SIZE, const int device_index) array_size = ARRAY_SIZE; - // Allocate the host array for partial sums for dot kernels + // Allocate the host array for partial sums for dot kernels using hipHostMalloc. + // This creates an array on the host which is visible to the device. However, it requires + // synchronization (e.g. hipDeviceSynchronize) for the result to be available on the host + // after it has been passed through to a kernel. hipHostMalloc(&sums, sizeof(T) * block_count, hipHostMallocNonCoherent); check_error(); diff --git a/src/hip/HIPStream.h b/src/hip/HIPStream.h index 7bce0b5..305e937 100644 --- a/src/hip/HIPStream.h +++ b/src/hip/HIPStream.h @@ -55,7 +55,6 @@ class HIPStream : public Stream T *d_a; T *d_b; T *d_c; - T *d_sum; public: diff --git a/src/hip/model.cmake b/src/hip/model.cmake index 2f7d69e..19e6fd0 100644 --- a/src/hip/model.cmake +++ b/src/hip/model.cmake @@ -5,6 +5,5 @@ register_flag_required(CMAKE_CXX_COMPILER register_flag_optional(DWORDS_PER_LANE "Flag indicating the number of dwords to process per wavefront lane." 
4) macro(setup) - # Ensure we set the proper preprocessor directives register_definitions(DWORDS_PER_LANE=${DWORDS_PER_LANE}) endmacro() \ No newline at end of file From f44cd6fdd2bf434b91e40e0b117af0e6f05b578a Mon Sep 17 00:00:00 2001 From: Thomas Gibson <14180421+thomasgibson@users.noreply.github.com> Date: Mon, 5 Sep 2022 15:43:37 -0700 Subject: [PATCH 15/24] Roll back modifications for copy, mul, add, and triad --- src/hip/HIPStream.cpp | 50 +++++++++++++++++++++++++------------------ 1 file changed, 29 insertions(+), 21 deletions(-) diff --git a/src/hip/HIPStream.cpp b/src/hip/HIPStream.cpp index 37fce3b..cc1d21f 100644 --- a/src/hip/HIPStream.cpp +++ b/src/hip/HIPStream.cpp @@ -124,17 +124,19 @@ void HIPStream::read_arrays(std::vector& a, std::vector& b, std::vector template __launch_bounds__(TBSIZE) __global__ -void copy_kernel(const T * __restrict a, T * __restrict c) +void copy_kernel(const T * a, T * c) { - const size_t gidx = (threadIdx.x + blockIdx.x * blockDim.x) * elements_per_lane; - for (size_t j = 0; j < elements_per_lane; ++j) - c[gidx + j] = a[gidx + j]; + const size_t i = threadIdx.x + blockIdx.x * blockDim.x; + c[i] = a[i]; + // const size_t gidx = (threadIdx.x + blockIdx.x * blockDim.x) * elements_per_lane; + // for (size_t j = 0; j < elements_per_lane; ++j) + // c[gidx + j] = a[gidx + j]; } template void HIPStream::copy() { - copy_kernel<<>>(d_a, d_c); + copy_kernel<<>>(d_a, d_c); check_error(); hipDeviceSynchronize(); check_error(); @@ -143,18 +145,20 @@ void HIPStream::copy() template __launch_bounds__(TBSIZE) __global__ -void mul_kernel(T * __restrict b, const T * __restrict c) +void mul_kernel(T * b, const T * c) { const T scalar = startScalar; - const size_t gidx = (threadIdx.x + blockIdx.x * blockDim.x) * elements_per_lane; - for (size_t j = 0; j < elements_per_lane; ++j) - b[gidx + j] = scalar * c[gidx + j]; + const size_t i = threadIdx.x + blockIdx.x * blockDim.x; + b[i] = scalar * c[i]; + // const size_t gidx = (threadIdx.x + 
blockIdx.x * blockDim.x) * elements_per_lane; + // for (size_t j = 0; j < elements_per_lane; ++j) + // b[gidx + j] = scalar * c[gidx + j]; } template void HIPStream::mul() { - mul_kernel<<>>(d_b, d_c); + mul_kernel<<>>(d_b, d_c); check_error(); hipDeviceSynchronize(); check_error(); @@ -163,17 +167,19 @@ void HIPStream::mul() template __launch_bounds__(TBSIZE) __global__ -void add_kernel(const T * __restrict a, const T * __restrict b, T * __restrict c) +void add_kernel(const T * a, const T * b, T * c) { - const size_t gidx = (threadIdx.x + blockIdx.x * blockDim.x) * elements_per_lane; - for (size_t j = 0; j < elements_per_lane; ++j) - c[gidx + j] = a[gidx + j] + b[gidx + j]; + const size_t i = threadIdx.x + blockIdx.x * blockDim.x; + c[i] = a[i] + b[i]; + // const size_t gidx = (threadIdx.x + blockIdx.x * blockDim.x) * elements_per_lane; + // for (size_t j = 0; j < elements_per_lane; ++j) + // c[gidx + j] = a[gidx + j] + b[gidx + j]; } template void HIPStream::add() { - add_kernel<<>>(d_a, d_b, d_c); + add_kernel<<>>(d_a, d_b, d_c); check_error(); hipDeviceSynchronize(); check_error(); @@ -182,18 +188,20 @@ void HIPStream::add() template __launch_bounds__(TBSIZE) __global__ -void triad_kernel(T * __restrict a, const T * __restrict b, const T * __restrict c) +void triad_kernel(T * a, const T * b, const T * c) { const T scalar = startScalar; - const size_t gidx = (threadIdx.x + blockIdx.x * blockDim.x) * elements_per_lane; - for (size_t j = 0; j < elements_per_lane; ++j) - a[gidx + j] = b[gidx + j] + scalar * c[gidx + j]; + const size_t i = threadIdx.x + blockIdx.x * blockDim.x; + a[i] = b[i] + scalar * c[i]; + // const size_t gidx = (threadIdx.x + blockIdx.x * blockDim.x) * elements_per_lane; + // for (size_t j = 0; j < elements_per_lane; ++j) + // a[gidx + j] = b[gidx + j] + scalar * c[gidx + j]; } template void HIPStream::triad() { - triad_kernel<<>>(d_a, d_b, d_c); + triad_kernel<<>>(d_a, d_b, d_c); check_error(); hipDeviceSynchronize(); check_error(); @@ -220,7 
+228,7 @@ void HIPStream::nstream() template __launch_bounds__(TBSIZE) -__global__ void dot_kernel(const T * __restrict a, const T * __restrict b, T * __restrict sum, int array_size) +__global__ void dot_kernel(const T * a, const T * b, T * sum, int array_size) { __shared__ T tb_sum[TBSIZE]; From 85d80915f60272b08bd41d5baae561d392003ed3 Mon Sep 17 00:00:00 2001 From: Thomas Gibson <14180421+thomasgibson@users.noreply.github.com> Date: Thu, 8 Sep 2022 11:44:37 -0500 Subject: [PATCH 16/24] Simplify/roll back unneeded modifications --- src/hip/HIPStream.cpp | 88 ++++++++++++++++++------------------------- src/hip/HIPStream.h | 32 ++++++---------- src/hip/model.cmake | 4 +- 3 files changed, 49 insertions(+), 75 deletions(-) diff --git a/src/hip/HIPStream.cpp b/src/hip/HIPStream.cpp index cc1d21f..7fc732d 100644 --- a/src/hip/HIPStream.cpp +++ b/src/hip/HIPStream.cpp @@ -23,17 +23,23 @@ void check_error(void) template HIPStream::HIPStream(const int ARRAY_SIZE, const int device_index) - : array_size{ARRAY_SIZE}, - block_count(array_size / (TBSIZE * elements_per_lane)) { - // The array size must be divisible by total number of elements - // moved per block for kernel launches - if (ARRAY_SIZE % (TBSIZE * elements_per_lane) != 0) + // The array size must be divisible by TBSIZE for kernel launches + if (ARRAY_SIZE % TBSIZE != 0) { std::stringstream ss; - ss << "Array size must be a multiple of elements operated on per block (" - << TBSIZE * elements_per_lane + ss << "Array size must be a multiple of " << TBSIZE; + throw std::runtime_error(ss.str()); + } + + // The array size must be divisible by total number of elements + // moved per block for the dot kernel + if (ARRAY_SIZE % (TBSIZE * dot_elements_per_lane) != 0) + { + std::stringstream ss; + ss << "Array size for the dot kernel must be a multiple of elements operated on per block (" + << TBSIZE * dot_elements_per_lane << ")."; throw std::runtime_error(ss.str()); } @@ -52,12 +58,13 @@ HIPStream::HIPStream(const int 
ARRAY_SIZE, const int device_index) std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl; array_size = ARRAY_SIZE; + dot_num_blocks = array_size / (TBSIZE * dot_elements_per_lane); // Allocate the host array for partial sums for dot kernels using hipHostMalloc. // This creates an array on the host which is visible to the device. However, it requires // synchronization (e.g. hipDeviceSynchronize) for the result to be available on the host // after it has been passed through to a kernel. - hipHostMalloc(&sums, sizeof(T) * block_count, hipHostMallocNonCoherent); + hipHostMalloc(&sums, sizeof(T) * dot_num_blocks, hipHostMallocNonCoherent); check_error(); // Check buffers fit on the device @@ -121,113 +128,90 @@ void HIPStream::read_arrays(std::vector& a, std::vector& b, std::vector check_error(); } -template -__launch_bounds__(TBSIZE) -__global__ -void copy_kernel(const T * a, T * c) +template +__global__ void copy_kernel(const T * a, T * c) { const size_t i = threadIdx.x + blockIdx.x * blockDim.x; c[i] = a[i]; - // const size_t gidx = (threadIdx.x + blockIdx.x * blockDim.x) * elements_per_lane; - // for (size_t j = 0; j < elements_per_lane; ++j) - // c[gidx + j] = a[gidx + j]; } template void HIPStream::copy() { - copy_kernel<<>>(d_a, d_c); + copy_kernel<<>>(d_a, d_c); check_error(); hipDeviceSynchronize(); check_error(); } -template -__launch_bounds__(TBSIZE) -__global__ -void mul_kernel(T * b, const T * c) +template +__global__ void mul_kernel(T * b, const T * c) { const T scalar = startScalar; const size_t i = threadIdx.x + blockIdx.x * blockDim.x; b[i] = scalar * c[i]; - // const size_t gidx = (threadIdx.x + blockIdx.x * blockDim.x) * elements_per_lane; - // for (size_t j = 0; j < elements_per_lane; ++j) - // b[gidx + j] = scalar * c[gidx + j]; } template void HIPStream::mul() { - mul_kernel<<>>(d_b, d_c); + mul_kernel<<>>(d_b, d_c); check_error(); hipDeviceSynchronize(); check_error(); } -template -__launch_bounds__(TBSIZE) -__global__ -void 
add_kernel(const T * a, const T * b, T * c) +template +__global__ void add_kernel(const T * a, const T * b, T * c) { const size_t i = threadIdx.x + blockIdx.x * blockDim.x; c[i] = a[i] + b[i]; - // const size_t gidx = (threadIdx.x + blockIdx.x * blockDim.x) * elements_per_lane; - // for (size_t j = 0; j < elements_per_lane; ++j) - // c[gidx + j] = a[gidx + j] + b[gidx + j]; } template void HIPStream::add() { - add_kernel<<>>(d_a, d_b, d_c); + add_kernel<<>>(d_a, d_b, d_c); check_error(); hipDeviceSynchronize(); check_error(); } -template -__launch_bounds__(TBSIZE) -__global__ -void triad_kernel(T * a, const T * b, const T * c) +template +__global__ void triad_kernel(T * a, const T * b, const T * c) { const T scalar = startScalar; const size_t i = threadIdx.x + blockIdx.x * blockDim.x; a[i] = b[i] + scalar * c[i]; - // const size_t gidx = (threadIdx.x + blockIdx.x * blockDim.x) * elements_per_lane; - // for (size_t j = 0; j < elements_per_lane; ++j) - // a[gidx + j] = b[gidx + j] + scalar * c[gidx + j]; } template void HIPStream::triad() { - triad_kernel<<>>(d_a, d_b, d_c); + triad_kernel<<>>(d_a, d_b, d_c); check_error(); hipDeviceSynchronize(); check_error(); } -template -__launch_bounds__(TBSIZE) -__global__ void nstream_kernel(T * __restrict a, const T * __restrict b, const T * __restrict c) +template +__global__ void nstream_kernel(T * a, const T * b, const T * c) { const T scalar = startScalar; - const size_t gidx = (threadIdx.x + blockIdx.x * blockDim.x) * elements_per_lane; - for (size_t j = 0; j < elements_per_lane; ++j) - a[gidx + j] += b[gidx + j] + scalar * c[gidx + j]; + const size_t i = threadIdx.x + blockIdx.x * blockDim.x; + a[i] += b[i] + scalar * c[i]; } template void HIPStream::nstream() { - nstream_kernel<<>>(d_a, d_b, d_c); + nstream_kernel<<>>(d_a, d_b, d_c); check_error(); hipDeviceSynchronize(); check_error(); } -template -__launch_bounds__(TBSIZE) +template __global__ void dot_kernel(const T * a, const T * b, T * sum, int array_size) { 
__shared__ T tb_sum[TBSIZE]; @@ -236,7 +220,7 @@ __global__ void dot_kernel(const T * a, const T * b, T * sum, int array_size) size_t i = blockDim.x * blockIdx.x + local_i; tb_sum[local_i] = 0.0; - for (size_t j = 0; j < elements_per_lane && i < array_size; ++j, i += blockDim.x*gridDim.x) + for (; i < array_size; i += blockDim.x*gridDim.x) tb_sum[local_i] += a[i] * b[i]; for (size_t offset = blockDim.x / 2; offset > 0; offset /= 2) @@ -255,13 +239,13 @@ __global__ void dot_kernel(const T * a, const T * b, T * sum, int array_size) template T HIPStream::dot() { - dot_kernel<<>>(d_a, d_b, sums, array_size); + dot_kernel<<>>(d_a, d_b, sums, array_size); check_error(); hipDeviceSynchronize(); check_error(); T sum = 0.0; - for (int i = 0; i < block_count; i++) + for (int i = 0; i < dot_num_blocks; i++) sum += sums[i]; return sum; diff --git a/src/hip/HIPStream.h b/src/hip/HIPStream.h index 305e937..3c603e0 100644 --- a/src/hip/HIPStream.h +++ b/src/hip/HIPStream.h @@ -14,39 +14,31 @@ #include "Stream.h" #define IMPLEMENTATION_STRING "HIP" +#define DOT_READ_DWORDS_PER_LANE 4 + template class HIPStream : public Stream { -#ifdef __HIP_PLATFORM_NVCC__ - #ifndef DWORDS_PER_LANE - #define DWORDS_PER_LANE 1 - #endif -#else - #ifndef DWORDS_PER_LANE - #define DWORDS_PER_LANE 4 - #endif -#endif // Make sure that either: - // DWORDS_PER_LANE is less than sizeof(T), in which case we default to 1 element + // DOT_READ_DWORDS_PER_LANE is less than sizeof(T), in which case we default to 1 element // or - // DWORDS_PER_LANE is divisible by sizeof(T) - static_assert((DWORDS_PER_LANE * sizeof(unsigned int) < sizeof(T)) || - (DWORDS_PER_LANE * sizeof(unsigned int) % sizeof(T) == 0), - "DWORDS_PER_LANE not divisible by sizeof(element_type)"); + // DOT_READ_DWORDS_PER_LANE is divisible by sizeof(T) + static_assert((DOT_READ_DWORDS_PER_LANE * sizeof(unsigned int) < sizeof(T)) || + (DOT_READ_DWORDS_PER_LANE * sizeof(unsigned int) % sizeof(T) == 0), + "DOT_READ_DWORDS_PER_LANE not divisible by 
sizeof(element_type)"); - static constexpr unsigned int dwords_per_lane{DWORDS_PER_LANE}; // Take into account the datatype size - // That is, if we specify 4 DWORDS_PER_LANE, this is 2 FP64 elements + // That is, for 4 DOT_READ_DWORDS_PER_LANE, this is 2 FP64 elements // and 4 FP32 elements - static constexpr unsigned int elements_per_lane{ - (DWORDS_PER_LANE * sizeof(unsigned int)) < sizeof(T) ? 1 : ( - DWORDS_PER_LANE * sizeof(unsigned int) / sizeof(T))}; + static constexpr unsigned int dot_elements_per_lane{ + (DOT_READ_DWORDS_PER_LANE * sizeof(unsigned int)) < sizeof(T) ? 1 : ( + DOT_READ_DWORDS_PER_LANE * sizeof(unsigned int) / sizeof(T))}; protected: // Size of arrays int array_size; - int block_count; + int dot_num_blocks; // Host array for partial sums for dot kernel T *sums; diff --git a/src/hip/model.cmake b/src/hip/model.cmake index 19e6fd0..78150c4 100644 --- a/src/hip/model.cmake +++ b/src/hip/model.cmake @@ -2,8 +2,6 @@ register_flag_required(CMAKE_CXX_COMPILER "Absolute path to the AMD HIP C++ compiler") -register_flag_optional(DWORDS_PER_LANE "Flag indicating the number of dwords to process per wavefront lane." 4) - macro(setup) - register_definitions(DWORDS_PER_LANE=${DWORDS_PER_LANE}) + # nothing to do here as hipcc does everything correctly, what a surprise! endmacro() \ No newline at end of file From 66491909e45d61c52cde88456681e6f6dcc9c0a3 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 15 Nov 2022 06:48:39 +0200 Subject: [PATCH 17/24] BabelStream Fortran This is a new implementation of BabelStream using Fortran. The code uses a Fortran driver that is largely equivalent to the C++ one, with a few exceptions. First, it does not use a C++ class for the stream object, since that doesn't seem like a useful way to do things in Fortran. Instead, I use a module that contains the same methods, and which has alloc and dealloc that act like CTOR and DTOR. 
The current implementations are: - DO CONCURRENT - Fortran array notation - Sequential DO loops - OpenACC parallel loop - OpenACC kernels on Fortran array notation - OpenMP parallel do - OpenMP taskloop - OpenMP target teams distribute parallel do simd - OpenMP target teams loop - CUDA Fortran (handwritten CUDA Fortran kernels, except DOT) - CUDA Fortran kernels (!$cuf kernel do <<<*,*>>>) I have tested with GCC, Intel (ifort and ifx), and NVHPC compilers on AArch64, x86_64 and NVIDIA GPU targets, although not exhaustively. Cray and Fujitsu have been tested as well. The only untested compiler of significance is IBM XLF. The current build system is GNU Make, and requires the user to manually specify the compiler and implementation. CSV printing is supported. Squashed commit of the following: commit 15f13ef9d326102cc003b2fdfe1b31c4aea55373 Author: Jeff Hammond Date: Tue Nov 15 06:42:46 2022 +0200 8 cores unless user changes commit 62ca680546ff89a1987b6fb797273038f767bf7b Author: Jeff Hammond Date: Tue Nov 15 06:42:09 2022 +0200 hoist and disable orin flags commit 76495509abcdb0686f293a72f7ded7c8ed7bb882 Author: Jeff Hammond Date: Tue Nov 15 06:40:13 2022 +0200 cleanup scripts commit 5b45df87954282cbb6b0f7eb2dcb3570d08bb5c2 Author: Jeff Hammond Date: Tue Nov 15 06:39:31 2022 +0200 add autopar flag for GCC commit 87eb07e4a8c3e8d6247ab5f72e14bf90002733ce Merge: a732e7c 270644e Author: Jeff Hammond Date: Wed Nov 9 15:53:41 2022 +0200 Merge remote-tracking branch 'origin/fortran_compiler_details' into fortran-ports commit a732e7c49e12ce8aff15e9d4bcbd215fa4a05d82 Merge: cfafd99 5697d94 Author: Jeff Hammond Date: Wed Nov 9 15:53:36 2022 +0200 Merge remote-tracking branch 'origin/fortran_int32_option' into fortran-ports commit cfafd993b646d5f5a90eb6d37d347cc545ab36d4 Merge: de5ff67 26a9707 Author: Jeff Hammond Date: Wed Nov 9 15:53:25 2022 +0200 Merge remote-tracking branch 'origin/fortran_csv' into fortran-ports commit de5ff6772b2036ad259a6a9c331ff5408146b54c Merge: 3109653 
1d0755f Author: Jeff Hammond Date: Wed Nov 9 15:51:40 2022 +0200 Merge branch 'UoB-HPC:main' into fortran-ports commit 310965399a9b518122ff610b61419cdaab75ecd0 Author: Jeff Hammond Date: Mon Sep 26 03:39:01 2022 -0700 because gomp so confict commit 270644e6fb89e8f3c3bfe4d73c9896fc3094d761 Author: Jeff Hammond Date: Fri Sep 16 11:46:49 2022 +0300 add compiler info flag commit 5697d94a9ce5162de9445f5fde76f8020eae8b83 Author: Jeff Hammond Date: Sun Sep 4 13:59:57 2022 +0300 implement INT32 indexing commit 830ad58dd2c985b9a2425093c0eed9ec1c7887dd Author: Jeff Hammond Date: Sun Sep 4 13:49:17 2022 +0300 remove swear words from debugging commit 26a9707a1f09249d04206adf647587e42cf5fab5 Author: Jeff Hammond Date: Sun Sep 4 13:47:18 2022 +0300 add an option for giga/gibi-bytes commit 4f6d693c03ca1b092d3bf003cdfcc367b8ad86ac Author: Jeff Hammond Date: Sun Sep 4 13:41:32 2022 +0300 CSV output seems done Signed-off-by: Jeff Hammond commit 94e62be05c11b9ef208f7ad09402ddf26e4586ae Merge: ad52adc 772c183 Author: Jeff Hammond Date: Sun Sep 4 12:59:01 2022 +0300 Merge branch 'fortran_nan_check' into fortran_csv commit 772c183de2fb1a8ea72ae7ef3c45c17895c4fdc9 Author: Jeff Hammond Date: Sun Sep 4 10:44:26 2022 +0300 fixed NaN check commit ad52adc9ba6eb702c0fefdf1d9a8d1830b74830b Author: Jeff Hammond Date: Sun Sep 4 10:28:00 2022 +0300 CSV WIP commit 6f7cefc42ca286ae3b698d827fd7c9ee14984ecb Author: Jeff Hammond Date: Sun Sep 4 10:08:14 2022 +0300 update help output commit 208207597d150fafa059ca593ac30bc9a2e6d1a7 Author: Jeff Hammond Date: Sun Sep 4 10:02:24 2022 +0300 add option for cpu_time intrinsic timer also adjust use statements and rename macro for OpenMP timer Signed-off-by: Jeff Hammond commit 78fa2fcb1087f00efd94dd911000dc0d485da406 Author: Jeff Hammond Date: Tue Aug 30 17:19:36 2022 +0300 add check for normal (not NaN, not Inf, not denormal) the previous error check failed to detect garbage results because comparisons against NaN always return true. 
i flipped the logical comparison and added a check for IEEE normal to prevent this. it works on the case that was missed previously. Signed-off-by: Jeff Hammond commit 22fc9fe918a378f47c88dbad3ce91a4a6688789b Author: Jeff Hammond Date: Tue Aug 30 17:19:30 2022 +0300 move commit d2d8c8555d2665fc553f9263a6767843ec14def8 Author: Jeff Hammond Date: Tue Aug 30 16:29:15 2022 +0300 so far so good commit ffe181536b78ef845f861a09ca0dc72d4fffcbe8 Author: Jeff Hammond Date: Tue Aug 30 16:29:09 2022 +0300 so far so good commit aa72b46a8187792ca819f9720c032e802525413a Author: Jeff Hammond Date: Tue Aug 30 16:28:52 2022 +0300 GPU on by default commit 0fc9e4acdd0fbb5b6d9399962fc6a1daaa4a84da Author: Jeff Hammond Date: Thu Aug 25 16:38:08 2022 +0300 better commit b1cbd6f5b6a7534502d29e14d1c09fa6be378dd8 Merge: bf14601 5fe03c6 Author: Jeff Hammond Date: Thu Aug 25 16:35:22 2022 +0300 Merge branch 'fortran-ports' of https://github.com/jeffhammond/BabelStream into fortran-ports commit bf146011d6ee1ac9dd0cb6d43bb4e60b8cc37acf Author: Jeff Hammond Date: Thu Aug 25 16:35:07 2022 +0300 autodetect GPU arch in build (who needs CMake?) 
commit 5fe03c664e318a33bd0d383fddf8e76a2266a4e0 Author: Jeff Hammond Date: Thu Aug 25 15:57:41 2022 +0300 be smarter and check for compilers in path commit a187612a68447302fbd036d717df53b2780df3b4 Author: Jeff Hammond Date: Thu Aug 25 15:35:58 2022 +0300 remove samsung paths commit 82af886943a67980dda1724edae7686c6d280e1e Merge: a46bf6b 0f59b50 Author: Jeff Hammond Date: Wed Aug 24 13:22:13 2022 +0300 merge fix plus build updates commit 0f59b5014477c9a3da5eeb97328e6c55554a8c24 Author: Jeff Hammond Date: Wed Aug 24 08:43:19 2022 +0000 typo in USE_OPENMP_TIMERS commit 4a9a0019585b0f03c42f151042ad592cba03d8b3 Author: Jeff Hammond Date: Wed Aug 24 08:42:59 2022 +0000 logic fix commit 74d8123864fdb603b409112f5b9c0e92c2a93071 Author: Jeff Hammond Date: Wed Aug 24 03:05:58 2022 -0500 no-gpu option commit dc1e39ff34e384ae66f50ab787e9ca8c92701c3b Author: Jeff Hammond Date: Wed Aug 24 03:05:17 2022 -0500 fix default case commit 0b2b0e0bb754b0ac86dd16eeb30db092a3b3e658 Author: Jeff Hammond Date: Wed Aug 24 02:57:02 2022 -0500 fix tp for aarch64 commit 1e213bec76d2e7f5f161a18eb365f2948563c925 Author: Jeff Hammond Date: Wed Aug 24 07:46:41 2022 +0000 fix MARCH and build.sh elif commit a46bf6b48eb730a2fa08ccd8dddd04725fe25371 Author: Jeff Hammond Date: Tue Aug 23 16:43:22 2022 +0300 orin updates commit a9fe9c028c08b9f0d468ee56f24970817087099d Merge: 2ab14de 9f4bee4 Author: Jeff Hammond Date: Tue Aug 23 06:32:01 2022 -0700 more CPU specialization fixes commit 2ab14de1535f71fd1b548a10585b035ed88daa26 Author: Jeff Hammond Date: Tue Aug 23 06:30:37 2022 -0700 more CPU specialization fixes commit 9f4bee439c36b592321f4af38235450cfb23cdf2 Author: Jeff Hammond Date: Tue Aug 23 16:12:13 2022 +0300 build and run updates commit aeff0854478e5f16536b11034f459ea387a222a2 Author: Jeff Hammond Date: Tue Aug 23 15:56:25 2022 +0300 aesthetics commit 89b1ab01369cd71d5bbb837474799c75eabd64b5 Author: Jeff Hammond Date: Tue Aug 23 15:56:08 2022 +0300 handle march flag better commit 
a284bfa6da9bbb1aa9de5e8d40b74c316e90f3c6 Author: Jeff Hammond Date: Tue Aug 23 15:56:04 2022 +0300 handle march flag better commit c18c3945eb053581f2cdf528961f158c4aa66271 Author: Jeff Hammond Date: Tue Aug 23 15:53:11 2022 +0300 handle march flag better commit a3a8ccf453a2ff7cc99a774b5a6262648690f7c8 Author: Jeff Hammond Date: Tue Aug 23 05:29:41 2022 -0700 brewster updates commit 1364c4100f4bb6241e2db5805a64625a66c9d2fa Author: Tom Deakin Date: Sun Aug 21 17:16:20 2022 +0100 Add Fujitsu compiler flags commit b82fe2cb38cab940d0bebf613e22ea9685a21d06 Author: Jeff Hammond Date: Sun Aug 21 15:40:28 2022 +0300 FJ timer workaround commit c1b2fa81155c4d6a3717793c5670b1b0d4cf6101 Author: Jeff Hammond Date: Sun Aug 21 15:29:13 2022 +0300 intel update/fix commit 063ef879d9c3a3010a0be3b9baad7600f62e52bf Author: Jeff Hammond Date: Sun Aug 21 04:43:29 2022 -0700 NERSC AMD compiler commit 2c68292667b62f3428fc8cf4dfa874a5b44e625d Merge: 2bdbbe8 ca98948 Author: Jeff Hammond Date: Sun Aug 21 02:12:12 2022 -0700 Merge branch 'fortran-ports' of https://github.com/jeffhammond/BabelStream into fortran-ports commit 2bdbbe81d782268fd7f48889fd6eeea32d5f1f58 Author: Jeff Hammond Date: Sun Aug 21 02:11:27 2022 -0700 AMD ROCM buikd commit ca9894801fdcca705e5d06c703af3a0f4e888c01 Author: Jeff Hammond Date: Sun Aug 21 09:10:16 2022 +0000 AWS stuff commit 4c539efda9522810dadc64c65339ce22ea6822b4 Author: Jeff Hammond Date: Sun Aug 21 09:09:59 2022 +0000 merge commit c3830658f8d403f602f3270b8f34b6ebd405c3e3 Author: Jeff Hammond Date: Sun Aug 21 02:08:46 2022 -0700 NERSC stuff commit 7d7f746206e1ace8753778fcd2416d5ae30b7470 Merge: 1fefb8e d929852 Author: Jeff Hammond Date: Sat Aug 20 20:56:09 2022 -0700 Merge branch 'fortran-ports' of https://github.com/jeffhammond/BabelStream into fortran-ports commit 1fefb8e657764b43cbcaf63278e051ead53bd29a Author: Jeff Hammond Date: Sat Aug 20 20:55:16 2022 -0700 Cray temp stuff commit d92985239b31e16d478ca3a8a740baba2c35c164 Author: Jeff Hammond Date: Fri 
Aug 19 02:11:07 2022 -0700 Xeon stuff commit 3f19e451bbc856ed6aa221077e51bd0578e48426 Merge: 38f28e1 c8dd609 Author: Jeff Hammond Date: Thu Aug 18 13:56:37 2022 +0000 Merge branch 'fortran-ports' of https://github.com/jeffhammond/BabelStream into fortran-ports commit 38f28e193c76970e5b6f641b437c6faefb9c608b Author: Jeff Hammond Date: Thu Aug 18 13:54:12 2022 +0000 TARGET for cpu too commit 6be181a07a93281a51cb897edf703404ead2c83e Author: Jeff Hammond Date: Thu Aug 18 13:52:58 2022 +0000 AWS flags commit e88479e09176510f707e410a4e69ea5290b2619e Author: Jeff Hammond Date: Thu Aug 18 13:52:42 2022 +0000 ARM stuff for AWS commit 1ee26cb3675b5e2739ddc21f56a1a864ff681950 Author: Jeff Hammond Date: Thu Aug 18 13:52:24 2022 +0000 disable shared for portability commit c8dd6099d95792b17abbcb025f771c3ae0ed773e Merge: 8bda56d 1b67999 Author: Jeff Hammond Date: Thu Aug 18 15:23:16 2022 +0300 Merge branch 'UoB-HPC:main' into fortran-ports commit 8bda56dd9053fdacc77aac572401bc4c7806efa0 Author: Jeff Hammond Date: Wed Aug 17 03:07:13 2022 -0700 add Cray compiler to build system - ignore temp files generated by Cray Fortran - workaround Cray not having reduce commit 3a0fec620d7ce5317a3260826087a26e0faee36c Author: Jeff Hammond Date: Wed Aug 17 02:09:19 2022 -0700 remove LOCAL, which causes problems commit e5a70ddbd995567c28a4c74373481c01a7489c88 Author: Jeff Hammond Date: Wed Aug 10 22:26:50 2022 +0300 add a way to use managed/device for everything DC uses managed by default. no way to not use it and be strictly standard right now. managed affects performance in some cases, so we want to compare apples-to-apples. thanks to Jeff Larkin for helping with this. 
Signed-off-by: Jeff Hammond commit 8fe956ab62737aecdec1ce7785a659587d814653 Author: Jeff Hammond Date: Wed Aug 10 22:26:41 2022 +0300 only do GPU flag for IFX commit de49723a7ae864847a2136353a45a49502291373 Author: Jeff Hammond Date: Wed Aug 10 22:26:23 2022 +0300 helper scripts commit e0971aa15d6fac2bc1de6e5080b53f7288975fe9 Author: Jeff Hammond Date: Wed Aug 10 22:26:21 2022 +0300 helper scripts commit a7ba50a60d321cab8e0f63d841b893c01a7df6b6 Author: Jeff Hammond Date: Wed Aug 10 12:29:28 2022 +0300 remove all the compiled intermediates with wildcard commit 31a594e82ec7b75d626639948eba532d503c4d81 Author: Jeff Hammond Date: Fri Aug 5 03:31:32 2022 -0700 build stuff commit 2cd3acd0f3cee82b60e5b05ac8dc01da3452bd1f Author: Jeff Hammond Date: Fri Aug 5 02:09:17 2022 -0700 build all with unique names commit ac230d127e15bdc9e56450862f9627d55da37f59 Author: Jeff Hammond Date: Fri Aug 5 09:28:03 2022 +0300 fix make clean commit bd0ef7736a43e26864167eb61345704731acbefa Author: Jeff Hammond Date: Fri Aug 5 09:24:12 2022 +0300 build check update commit 662520c4e443b841a88f1a4fe833bdb77b7cfd45 Author: Jeff Hammond Date: Fri Aug 5 09:21:48 2022 +0300 CUDA kernel version commit 25c321987b349f85a13f0140ae316382aa71e601 Author: Jeff Hammond Date: Fri Aug 5 09:15:32 2022 +0300 fixed CUDA Fortran dot commit 64612d2604401c2f200a3689b4248ecf7c93adaf Author: Jeff Hammond Date: Fri Aug 5 09:10:49 2022 +0300 CUDA Fortran working except DOT commit 4d35fe51a22978cc77bdd6311b7d15654856c564 Author: Jeff Hammond Date: Fri Aug 5 08:48:17 2022 +0300 CUDA Fortran is not compiling yet commit 0967c36695518a0c7bf7ee4c62a412f51338708e Author: Jeff Hammond Date: Fri Aug 5 07:50:40 2022 +0300 workshare commit 3ed69ea9ea655c364181144f21f0bfc0d3afa13c Author: Jeff Hammond Date: Fri Aug 5 07:42:49 2022 +0300 target loop commit 30dfb574c0c4435f09fc5a6e53644f9ab7fd95f3 Author: Jeff Hammond Date: Fri Aug 5 07:31:41 2022 +0300 OpenMP target commit a5306ce5c1144f38223074b786240db07a66b6bf Author: Jeff 
Hammond Date: Fri Aug 5 07:17:58 2022 +0300 makefile errors on non support commit 854c8135f5d80d5cecce22042d761a3f75a5ee13 Author: Jeff Hammond Date: Fri Aug 5 07:15:12 2022 +0300 fix taskloop commit f2894c583346410e14461988d68012d8469e583c Author: Jeff Hammond Date: Fri Aug 5 07:11:26 2022 +0300 add taskloop part 1 commit b7c0a43e9b49eed7ee54a4b4a8470118c092a922 Author: Jeff Hammond Date: Fri Aug 5 07:07:54 2022 +0300 add OpenMP traditional commit 7dafcc385f547738b9972f98b9e93f87e22d468c Author: Jeff Hammond Date: Fri Aug 5 07:02:36 2022 +0300 add OpenACC kernels + Array implementation commit 096e7d281015b09e5a099e5a1eb8b9b3e46cea5f Author: Jeff Hammond Date: Fri Aug 5 06:53:13 2022 +0300 formatting commit 284b62b47e508799dc49c85ea3d7a8d1f34f87a9 Author: Jeff Hammond Date: Thu Aug 4 19:41:27 2022 +0300 add placeholder for CSV commit 516bdd5929a13c17348040b031931485ca32e40e Author: Jeff Hammond Date: Thu Aug 4 19:14:00 2022 +0300 add --float commit d4e0ccaf6c00e6109e6130b3fd7c604df6feaa28 Author: Jeff Hammond Date: Thu Aug 4 19:13:23 2022 +0300 default message updates commit e8452f1c2e30fb84533b75a43ac9f5f265c96f60 Author: Jeff Hammond Date: Thu Aug 4 17:58:48 2022 +0300 list devices etc commit a80e82c323a5b0d1bffc524a8219de51cbdba8d2 Author: Jeff Hammond Date: Thu Aug 4 14:07:02 2022 +0300 better build system commit c3b090cf1f28641a9e34e331ab37cb055e82eec4 Author: Jeff Hammond Date: Thu Aug 4 14:03:27 2022 +0300 refactor build system commit 096cd43b7bc49751c17d686519620a7a4b1e5677 Author: Jeff Hammond Date: Thu Aug 4 13:43:17 2022 +0300 cleanup the rest commit 1e4fb8125e0729b32e8ec6d87f30d935310f55ca Author: Jeff Hammond Date: Thu Aug 4 13:40:38 2022 +0300 add Intel build and fix syntax issuse commit db3a9307b57bbc82456f9d52a6ff20d6e37b4083 Author: Jeff Hammond Date: Thu Aug 4 13:34:43 2022 +0300 use modern character syntax commit b66bd707d64a1823a3a7bd8a2c6acf30ce9043be Author: Jeff Hammond Date: Thu Aug 4 12:10:59 2022 +0300 printing commit 
ff842f62b952b5a61decfac80fd9b51dc56546d3 Author: Jeff Hammond Date: Thu Aug 4 11:06:43 2022 +0300 build stuff commit 05791085dd4cdde8b07f5b33a78ce051f4c8dd1d Author: Jeff Hammond Date: Wed Aug 3 20:10:33 2022 +0300 add OpenACC commit bb76b757a2765640b4a7bfb8d2d4850f96c478f7 Author: Jeff Hammond Date: Wed Aug 3 20:04:12 2022 +0300 better clean commit 2f53530d0f7f3d0cb4e138e2d76c325d81bbab8d Author: Jeff Hammond Date: Wed Aug 3 20:03:04 2022 +0300 Sequential loop Stream commit f5c0eaee60b04dfeabd96750c8b34694d2757f54 Author: Jeff Hammond Date: Wed Aug 3 19:56:54 2022 +0300 add array notation commit 76f836b1836b83006285ef69b4457abea39b400d Author: Jeff Hammond Date: Wed Aug 3 10:05:46 2022 +0300 implement BabelStream in Fortran 1. only DO CONCURRENT is supported right now. 2. the structure mostly matches C++ except we do not make a stream class. 3. there is no option for float versus double right now. it will be a compile-time choice later. Signed-off-by: Jeff Hammond --- .gitignore | 6 + src/.gitignore | 4 +- src/fortran/ArrayStream.F90 | 105 ++++ src/fortran/BabelStreamTypes.F90 | 21 + src/fortran/CUDAKernelStream.F90 | 230 +++++++++ src/fortran/CUDAStream.F90 | 309 ++++++++++++ src/fortran/DoConcurrentStream.F90 | 139 ++++++ src/fortran/Makefile | 109 ++++ src/fortran/OpenACCArrayStream.F90 | 144 ++++++ src/fortran/OpenACCStream.F90 | 161 ++++++ src/fortran/OpenMPStream.F90 | 137 +++++ src/fortran/OpenMPTargetLoopStream.F90 | 162 ++++++ src/fortran/OpenMPTargetStream.F90 | 163 ++++++ src/fortran/OpenMPTaskloopStream.F90 | 169 +++++++ src/fortran/OpenMPWorkshareStream.F90 | 120 +++++ src/fortran/SequentialStream.F90 | 130 +++++ src/fortran/build.sh | 54 ++ src/fortran/main.F90 | 666 +++++++++++++++++++++++++ src/fortran/make.inc.amd | 25 + src/fortran/make.inc.arm | 39 ++ src/fortran/make.inc.cray | 18 + src/fortran/make.inc.fj | 21 + src/fortran/make.inc.gcc | 33 ++ src/fortran/make.inc.nvhpc | 70 +++ src/fortran/make.inc.oneapi | 32 ++ src/fortran/run.sh | 35 ++ 
26 files changed, 3101 insertions(+), 1 deletion(-) create mode 100644 src/fortran/ArrayStream.F90 create mode 100644 src/fortran/BabelStreamTypes.F90 create mode 100644 src/fortran/CUDAKernelStream.F90 create mode 100644 src/fortran/CUDAStream.F90 create mode 100644 src/fortran/DoConcurrentStream.F90 create mode 100644 src/fortran/Makefile create mode 100644 src/fortran/OpenACCArrayStream.F90 create mode 100644 src/fortran/OpenACCStream.F90 create mode 100644 src/fortran/OpenMPStream.F90 create mode 100644 src/fortran/OpenMPTargetLoopStream.F90 create mode 100644 src/fortran/OpenMPTargetStream.F90 create mode 100644 src/fortran/OpenMPTaskloopStream.F90 create mode 100644 src/fortran/OpenMPWorkshareStream.F90 create mode 100644 src/fortran/SequentialStream.F90 create mode 100755 src/fortran/build.sh create mode 100644 src/fortran/main.F90 create mode 100644 src/fortran/make.inc.amd create mode 100644 src/fortran/make.inc.arm create mode 100644 src/fortran/make.inc.cray create mode 100644 src/fortran/make.inc.fj create mode 100644 src/fortran/make.inc.gcc create mode 100644 src/fortran/make.inc.nvhpc create mode 100644 src/fortran/make.inc.oneapi create mode 100755 src/fortran/run.sh diff --git a/.gitignore b/.gitignore index 012d0e8..59ea5db 100644 --- a/.gitignore +++ b/.gitignore @@ -10,12 +10,18 @@ sycl-stream hip-stream tbb-stream +src/fortran/BabelStream +src/fortran/BabelStream.* + *.o *.bc *.sycl *.tar *.gz *.a +*.mod +*.cub +*.ptx KokkosCore_config.* diff --git a/src/.gitignore b/src/.gitignore index 568a953..9d8b17b 100644 --- a/src/.gitignore +++ b/src/.gitignore @@ -16,6 +16,8 @@ **/*.gz **/*.a +**/*.swp + **/KokkosCore_Config_* **/.DS_Store @@ -26,4 +28,4 @@ cmake-build-*/ CMakeFiles/ .idea/ .vscode/ -.directory \ No newline at end of file +.directory diff --git a/src/fortran/ArrayStream.F90 b/src/fortran/ArrayStream.F90 new file mode 100644 index 0000000..5a8d5bc --- /dev/null +++ b/src/fortran/ArrayStream.F90 @@ -0,0 +1,105 @@ +module ArrayStream + 
use, intrinsic :: ISO_Fortran_env + use BabelStreamTypes + + implicit none + + character(len=5), parameter :: implementation_name = "Array" + + integer(kind=StreamIntKind) :: N + + real(kind=REAL64), allocatable :: A(:), B(:), C(:) + + contains + + subroutine list_devices() + implicit none + integer :: num + write(*,'(a36,a5)') "Listing devices is not supported by ", implementation_name + end subroutine list_devices + + subroutine set_device(dev) + implicit none + integer, intent(in) :: dev + write(*,'(a32,a5)') "Device != 0 is not supported by ", implementation_name + end subroutine set_device + + subroutine alloc(array_size) + implicit none + integer(kind=StreamIntKind) :: array_size + integer :: err + N = array_size + allocate( A(1:N), B(1:N), C(1:N), stat=err) + if (err .ne. 0) then + write(*,'(a20,i3)') 'allocate returned ',err + stop 1 + endif + end subroutine alloc + + subroutine dealloc() + implicit none + integer :: err + deallocate( A, B, C, stat=err) + if (err .ne. 0) then + write(*,'(a20,i3)') 'deallocate returned ',err + stop 1 + endif + end subroutine dealloc + + subroutine init_arrays(initA, initB, initC) + implicit none + real(kind=REAL64), intent(in) :: initA, initB, initC + A = initA + B = initB + C = initC + end subroutine init_arrays + + subroutine read_arrays(h_A, h_B, h_C) + implicit none + real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:) + h_A = A + h_B = B + h_C = C + end subroutine read_arrays + + subroutine copy() + implicit none + C = A + end subroutine copy + + subroutine add() + implicit none + C = A + B + end subroutine add + + subroutine mul(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + scalar = startScalar + B = scalar * C + end subroutine mul + + subroutine triad(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + scalar = startScalar + A = B + scalar * C + end subroutine triad + + subroutine 
nstream(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + scalar = startScalar + A = A + B + scalar * C + end subroutine nstream + + function dot() result(s) + implicit none + real(kind=REAL64) :: s + s = dot_product(A,B) + end function dot + +end module ArrayStream diff --git a/src/fortran/BabelStreamTypes.F90 b/src/fortran/BabelStreamTypes.F90 new file mode 100644 index 0000000..dd01d35 --- /dev/null +++ b/src/fortran/BabelStreamTypes.F90 @@ -0,0 +1,21 @@ +module BabelStreamTypes + use, intrinsic :: ISO_Fortran_env, only: REAL64,REAL32,INT64,INT32 + + implicit none + +#ifdef USE_FLOAT + integer, parameter :: StreamRealKind = REAL32 + character(len=6) :: StreamRealName = "REAL32" +#else + integer, parameter :: StreamRealKind = REAL64 + character(len=6) :: StreamRealName = "REAL64" +#endif + +#ifdef USE_INT32 +#warning There is no checking for overflowing INT32, so be careful. + integer, parameter :: StreamIntKind = INT32 +#else + integer, parameter :: StreamIntKind = INT64 +#endif + +end module BabelStreamTypes diff --git a/src/fortran/CUDAKernelStream.F90 b/src/fortran/CUDAKernelStream.F90 new file mode 100644 index 0000000..01668ea --- /dev/null +++ b/src/fortran/CUDAKernelStream.F90 @@ -0,0 +1,230 @@ +module CUDAKernelStream + use, intrinsic :: ISO_Fortran_env + use BabelStreamTypes + + implicit none + + character(len=10), parameter :: implementation_name = "CUDAKernel" + + integer(kind=StreamIntKind) :: N + +#ifdef USE_MANAGED + real(kind=REAL64), allocatable, managed :: A(:), B(:), C(:) +#else + real(kind=REAL64), allocatable, device :: A(:), B(:), C(:) +#endif + + contains + + subroutine list_devices() + use cudafor + implicit none + integer :: num, err + err = cudaGetDeviceCount(num) + if (err.ne.0) then + write(*,'(a)') "cudaGetDeviceCount failed" + write(*,'(a)') cudaGetErrorString(err) + stop + else if (num.eq.0) then + write(*,'(a17)') "No devices found." 
+ else + write(*,'(a10,i1,a8)') "There are ",num," devices." + end if + end subroutine list_devices + + subroutine set_device(dev) + use cudafor + implicit none + integer, intent(in) :: dev + integer :: num, err + err = cudaGetDeviceCount(num) + if (err.ne.0) then + write(*,'(a)') "cudaGetDeviceCount failed" + write(*,'(a)') cudaGetErrorString(err) + stop + else if (num.eq.0) then + write(*,'(a17)') "No devices found." + stop + else if (dev.ge.num) then + write(*,'(a21)') "Invalid device index." + stop + else + err = cudaSetDevice(dev) + if (err.ne.0) then + write(*,'(a)') "cudaSetDevice failed" + write(*,'(a)') cudaGetErrorString(err) + stop + end if + end if + end subroutine set_device + + subroutine alloc(array_size) + implicit none + integer(kind=StreamIntKind) :: array_size + integer :: err + N = array_size + allocate( A(1:N), B(1:N), C(1:N), stat=err) + if (err .ne. 0) then + write(*,'(a20,i3)') 'allocate returned ',err + stop 1 + endif + end subroutine alloc + + subroutine dealloc() + implicit none + integer :: err + deallocate( A, B, C, stat=err) + if (err .ne. 
0) then + write(*,'(a20,i3)') 'deallocate returned ',err + stop 1 + endif + end subroutine dealloc + + subroutine init_arrays(initA, initB, initC) + use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString + implicit none + real(kind=REAL64), intent(in) :: initA, initB, initC + integer(kind=StreamIntKind) :: i + integer :: err + A = initA + B = initB + C = initC + err = cudaDeviceSynchronize() + if (err.ne.0) then + write(*,'(a)') "cudaDeviceSynchronize failed" + write(*,'(a)') cudaGetErrorString(err) + stop + endif + end subroutine init_arrays + + subroutine read_arrays(h_A, h_B, h_C) + use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString + implicit none + real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:) + integer(kind=StreamIntKind) :: i + integer :: err + h_A = A + h_B = B + h_C = C + err = cudaDeviceSynchronize() + if (err.ne.0) then + write(*,'(a)') "cudaDeviceSynchronize failed" + write(*,'(a)') cudaGetErrorString(err) + stop + endif + end subroutine read_arrays + + subroutine copy() + use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString + implicit none + integer(kind=StreamIntKind) :: i + integer :: err + !$cuf kernel do <<< *, * >>> + do i=1,N + C(i) = A(i) + end do + err = cudaDeviceSynchronize() + if (err.ne.0) then + write(*,'(a)') "cudaDeviceSynchronize failed" + write(*,'(a)') cudaGetErrorString(err) + stop + endif + end subroutine copy + + subroutine add() + use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString + implicit none + integer(kind=StreamIntKind) :: i + integer :: err + !$cuf kernel do <<< *, * >>> + do i=1,N + C(i) = A(i) + B(i) + end do + err = cudaDeviceSynchronize() + if (err.ne.0) then + write(*,'(a)') "cudaDeviceSynchronize failed" + write(*,'(a)') cudaGetErrorString(err) + stop + endif + end subroutine add + + subroutine mul(startScalar) + use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + 
integer(kind=StreamIntKind) :: i + integer :: err + scalar = startScalar + !$cuf kernel do <<< *, * >>> + do i=1,N + B(i) = scalar * C(i) + end do + err = cudaDeviceSynchronize() + if (err.ne.0) then + write(*,'(a)') "cudaDeviceSynchronize failed" + write(*,'(a)') cudaGetErrorString(err) + stop + endif + end subroutine mul + + subroutine triad(startScalar) + use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + integer :: err + scalar = startScalar + !$cuf kernel do <<< *, * >>> + do i=1,N + A(i) = B(i) + scalar * C(i) + end do + err = cudaDeviceSynchronize() + if (err.ne.0) then + write(*,'(a)') "cudaDeviceSynchronize failed" + write(*,'(a)') cudaGetErrorString(err) + stop + endif + end subroutine triad + + subroutine nstream(startScalar) + use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + integer :: err + scalar = startScalar + !$cuf kernel do <<< *, * >>> + do i=1,N + A(i) = A(i) + B(i) + scalar * C(i) + end do + err = cudaDeviceSynchronize() + if (err.ne.0) then + write(*,'(a)') "cudaDeviceSynchronize failed" + write(*,'(a)') cudaGetErrorString(err) + stop + endif + end subroutine nstream + + function dot() result(r) + use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString + implicit none + real(kind=REAL64) :: r + integer(kind=StreamIntKind) :: i + integer :: err + r = real(0,kind=REAL64) + !$cuf kernel do <<< *, * >>> + do i=1,N + r = r + A(i) * B(i) + end do + err = cudaDeviceSynchronize() + if (err.ne.0) then + write(*,'(a)') "cudaDeviceSynchronize failed" + write(*,'(a)') cudaGetErrorString(err) + stop + endif + end function dot + +end module CUDAKernelStream diff --git a/src/fortran/CUDAStream.F90 b/src/fortran/CUDAStream.F90 new file mode 100644 index 0000000..208f1aa --- 
/dev/null +++ b/src/fortran/CUDAStream.F90 @@ -0,0 +1,309 @@ +module CUDAFortranKernels + use, intrinsic :: ISO_Fortran_env + use BabelStreamTypes + + contains + + attributes(global) subroutine do_copy(n,A,C) + implicit none + integer(kind=StreamIntKind), intent(in), value :: n + real(kind=REAL64), intent(in) :: A(n) + real(kind=REAL64), intent(out) :: C(n) + integer(kind=StreamIntKind) :: i + i = blockDim%x * (blockIdx%x - 1) + threadIdx%x + if (i <= N) then + C(i) = A(i) + endif + end subroutine do_copy + + attributes(global) subroutine do_add(n,A,B,C) + implicit none + integer(kind=StreamIntKind), intent(in), value :: n + real(kind=REAL64), intent(in) :: A(n), B(n) + real(kind=REAL64), intent(out) :: C(n) + integer(kind=StreamIntKind) :: i + i = blockDim%x * (blockIdx%x - 1) + threadIdx%x + if (i <= N) then + C(i) = A(i) + B(i) + endif + end subroutine do_add + + attributes(global) subroutine do_mul(n,scalar,B,C) + implicit none + integer(kind=StreamIntKind), intent(in), value :: n + real(kind=REAL64), intent(in), value :: scalar + real(kind=REAL64), intent(out) :: B(n) + real(kind=REAL64), intent(in) :: C(n) + integer(kind=StreamIntKind) :: i + i = blockDim%x * (blockIdx%x - 1) + threadIdx%x + if (i <= N) then + B(i) = scalar * C(i) + endif + end subroutine do_mul + + attributes(global) subroutine do_triad(n,scalar,A,B,C) + implicit none + integer(kind=StreamIntKind), intent(in), value :: n + real(kind=REAL64), intent(in), value :: scalar + real(kind=REAL64), intent(out) :: A(n) + real(kind=REAL64), intent(in) :: B(n), C(n) + integer(kind=StreamIntKind) :: i + i = blockDim%x * (blockIdx%x - 1) + threadIdx%x + if (i <= N) then + A(i) = B(i) + scalar * C(i) + endif + end subroutine do_triad + + attributes(global) subroutine do_nstream(n,scalar,A,B,C) + implicit none + integer(kind=StreamIntKind), intent(in), value :: n + real(kind=REAL64), intent(in), value :: scalar + real(kind=REAL64), intent(inout) :: A(n) + real(kind=REAL64), intent(in) :: B(n), C(n) + 
integer(kind=StreamIntKind) :: i + i = blockDim%x * (blockIdx%x - 1) + threadIdx%x + if (i <= N) then + A(i) = A(i) + B(i) + scalar * C(i) + endif + end subroutine do_nstream + +#if 0 + attributes(global) subroutine do_dot(n,A,B,r) + implicit none + integer(kind=StreamIntKind), intent(in), value :: n + real(kind=REAL64), intent(in) :: A(n), B(n) + real(kind=REAL64), intent(out) :: r + integer(kind=StreamIntKind) :: i + r = real(0,kind=REAL64) + !$cuf kernel do <<< *, * >>> + do i=1,N + r = r + A(i) * B(i) + end do + end subroutine do_dot +#endif + +end module CUDAFortranKernels + +module CUDAStream + use, intrinsic :: ISO_Fortran_env + use BabelStreamTypes + use cudafor, only: dim3 + + implicit none + + character(len=4), parameter :: implementation_name = "CUDA" + + integer(kind=StreamIntKind) :: N + +#ifdef USE_MANAGED + real(kind=REAL64), allocatable, managed :: A(:), B(:), C(:) +#else + real(kind=REAL64), allocatable, device :: A(:), B(:), C(:) +#endif + + type(dim3) :: grid, tblock + + contains + + subroutine list_devices() + use cudafor + implicit none + integer :: num, err + err = cudaGetDeviceCount(num) + if (err.ne.0) then + write(*,'(a)') "cudaGetDeviceCount failed" + write(*,'(a)') cudaGetErrorString(err) + stop + else if (num.eq.0) then + write(*,'(a17)') "No devices found." + else + write(*,'(a10,i1,a8)') "There are ",num," devices." + end if + end subroutine list_devices + + subroutine set_device(dev) + use cudafor + implicit none + integer, intent(in) :: dev + integer :: num, err + err = cudaGetDeviceCount(num) + if (err.ne.0) then + write(*,'(a)') "cudaGetDeviceCount failed" + write(*,'(a)') cudaGetErrorString(err) + stop + else if (num.eq.0) then + write(*,'(a17)') "No devices found." + stop + else if (dev.ge.num) then + write(*,'(a21)') "Invalid device index." 
+ stop + else + err = cudaSetDevice(dev) + if (err.ne.0) then + write(*,'(a)') "cudaSetDevice failed" + write(*,'(a)') cudaGetErrorString(err) + stop + end if + end if + end subroutine set_device + + subroutine alloc(array_size) + implicit none + integer(kind=StreamIntKind) :: array_size + integer :: err + N = array_size + allocate( A(1:N), B(1:N), C(1:N), stat=err) + if (err .ne. 0) then + write(*,'(a20,i3)') 'allocate returned ',err + stop 1 + endif + ! move to separate subroutine later + tblock = dim3(128,1,1) + grid = dim3(ceiling(real(N)/tblock%x),1,1) + end subroutine alloc + + subroutine dealloc() + implicit none + integer :: err + deallocate( A, B, C, stat=err) + if (err .ne. 0) then + write(*,'(a20,i3)') 'deallocate returned ',err + stop 1 + endif + end subroutine dealloc + + subroutine init_arrays(initA, initB, initC) + use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString + implicit none + real(kind=REAL64), intent(in) :: initA, initB, initC + integer(kind=StreamIntKind) :: i + integer :: err + A = initA + B = initB + C = initC + err = cudaDeviceSynchronize() + if (err.ne.0) then + write(*,'(a)') "cudaDeviceSynchronize failed" + write(*,'(a)') cudaGetErrorString(err) + stop + endif + end subroutine init_arrays + + subroutine read_arrays(h_A, h_B, h_C) + use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString + implicit none + real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:) + integer(kind=StreamIntKind) :: i + integer :: err + h_A = A + h_B = B + h_C = C + err = cudaDeviceSynchronize() + if (err.ne.0) then + write(*,'(a)') "cudaDeviceSynchronize failed" + write(*,'(a)') cudaGetErrorString(err) + stop + endif + end subroutine read_arrays + + subroutine copy() + use CUDAFortranKernels, only: do_copy + use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString + implicit none + integer :: err + call do_copy<<>>(N, A, C) + err = cudaDeviceSynchronize() + if (err.ne.0) then + write(*,'(a)') "cudaDeviceSynchronize failed" + 
write(*,'(a)') cudaGetErrorString(err) + stop + endif + end subroutine copy + + subroutine add() + use CUDAFortranKernels, only: do_add + use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString + implicit none + integer :: err + call do_add<<>>(N, A, B, C) + err = cudaDeviceSynchronize() + if (err.ne.0) then + write(*,'(a)') "cudaDeviceSynchronize failed" + write(*,'(a)') cudaGetErrorString(err) + stop + endif + end subroutine add + + subroutine mul(startScalar) + use CUDAFortranKernels, only: do_mul + use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer :: err + scalar = startScalar + call do_mul<<>>(N, scalar, B, C) + err = cudaDeviceSynchronize() + if (err.ne.0) then + write(*,'(a)') "cudaDeviceSynchronize failed" + write(*,'(a)') cudaGetErrorString(err) + stop + endif + end subroutine mul + + subroutine triad(startScalar) + use CUDAFortranKernels, only: do_triad + use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer :: err + scalar = startScalar + call do_triad<<>>(N, scalar, A, B, C) + err = cudaDeviceSynchronize() + if (err.ne.0) then + write(*,'(a)') "cudaDeviceSynchronize failed" + write(*,'(a)') cudaGetErrorString(err) + stop + endif + end subroutine triad + + subroutine nstream(startScalar) + use CUDAFortranKernels, only: do_nstream + use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer :: err + scalar = startScalar + call do_nstream<<>>(N, scalar, A, B, C) + err = cudaDeviceSynchronize() + if (err.ne.0) then + write(*,'(a)') "cudaDeviceSynchronize failed" + write(*,'(a)') cudaGetErrorString(err) + stop + endif + end subroutine nstream + + function dot() result(r) + !use CUDAFortranKernels, only: do_dot + use cudafor, only: 
cudaDeviceSynchronize, cudaGetErrorString + implicit none + real(kind=REAL64) :: r + integer :: err + integer(kind=StreamIntKind) :: i + !call do_dot<<>>(N, B, C, r) + r = real(0,kind=REAL64) + !$cuf kernel do <<< *, * >>> + do i=1,N + r = r + A(i) * B(i) + end do + err = cudaDeviceSynchronize() + if (err.ne.0) then + write(*,'(a)') "cudaDeviceSynchronize failed" + write(*,'(a)') cudaGetErrorString(err) + stop + endif + end function dot + +end module CUDAStream diff --git a/src/fortran/DoConcurrentStream.F90 b/src/fortran/DoConcurrentStream.F90 new file mode 100644 index 0000000..781210d --- /dev/null +++ b/src/fortran/DoConcurrentStream.F90 @@ -0,0 +1,139 @@ +module DoConcurrentStream + use, intrinsic :: ISO_Fortran_env + use BabelStreamTypes + + implicit none + + character(len=12), parameter :: implementation_name = "DoConcurrent" + + integer(kind=StreamIntKind) :: N + +#ifdef USE_DEVICE + real(kind=REAL64), allocatable, device :: A(:), B(:), C(:) +#else + real(kind=REAL64), allocatable :: A(:), B(:), C(:) +#endif + + contains + + subroutine list_devices() + implicit none + integer :: num + write(*,'(a36,a12)') "Listing devices is not supported by ", implementation_name + end subroutine list_devices + + subroutine set_device(dev) + implicit none + integer, intent(in) :: dev + write(*,'(a32,a12)') "Device != 0 is not supported by ", implementation_name + end subroutine set_device + + subroutine alloc(array_size) + implicit none + integer(kind=StreamIntKind) :: array_size + integer :: err + N = array_size + allocate( A(1:N), B(1:N), C(1:N), stat=err) + if (err .ne. 0) then + write(*,'(a20,i3)') 'allocate returned ',err + stop 1 + endif + end subroutine alloc + + subroutine dealloc() + implicit none + integer :: err + deallocate( A, B, C, stat=err) + if (err .ne. 
0) then + write(*,'(a20,i3)') 'deallocate returned ',err + stop 1 + endif + end subroutine dealloc + + subroutine init_arrays(initA, initB, initC) + implicit none + real(kind=REAL64), intent(in) :: initA, initB, initC + integer(kind=StreamIntKind) :: i + do concurrent (i=1:N) + A(i) = initA + B(i) = initB + C(i) = initC + end do + end subroutine init_arrays + + subroutine read_arrays(h_A, h_B, h_C) + implicit none + real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:) + integer(kind=StreamIntKind) :: i + do concurrent (i=1:N) !shared(A,B,C) + h_A(i) = A(i) + h_B(i) = B(i) + h_C(i) = C(i) + end do + end subroutine read_arrays + + subroutine copy() + implicit none + integer(kind=StreamIntKind) :: i + do concurrent (i=1:N) !shared(A,C) + C(i) = A(i) + end do + end subroutine copy + + subroutine add() + implicit none + integer(kind=StreamIntKind) :: i + do concurrent (i=1:N) !shared(A,B,C) + C(i) = A(i) + B(i) + end do + end subroutine add + + subroutine mul(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + scalar = startScalar + do concurrent (i=1:N) !shared(B,C) + B(i) = scalar * C(i) + end do + end subroutine mul + + subroutine triad(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + scalar = startScalar + do concurrent (i=1:N) !shared(A,B,C) + A(i) = B(i) + scalar * C(i) + end do + end subroutine triad + + subroutine nstream(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + scalar = startScalar + do concurrent (i=1:N) !shared(A,B,C) + A(i) = A(i) + B(i) + scalar * C(i) + end do + end subroutine nstream + + function dot() result(s) + implicit none + real(kind=REAL64) :: s + integer(kind=StreamIntKind) :: i + ! 
reduction omitted because NVF infers it and other compilers do not support + s = real(0,kind=REAL64) +#ifdef CRAY_THREAD_DOCONCURRENT + do i=1,N +#else + do concurrent (i=1:N) !shared(A,B) +#endif + s = s + A(i) * B(i) + end do + end function dot + +end module DoConcurrentStream diff --git a/src/fortran/Makefile b/src/fortran/Makefile new file mode 100644 index 0000000..18685d4 --- /dev/null +++ b/src/fortran/Makefile @@ -0,0 +1,109 @@ +ifeq ($(COMPILER),nvhpc) + include make.inc.nvhpc +else ifeq ($(COMPILER),oneapi) + include make.inc.oneapi +else ifeq ($(COMPILER),gcc) + include make.inc.gcc +else ifeq ($(COMPILER),amd) + include make.inc.amd +else ifeq ($(COMPILER),arm) + include make.inc.arm +else ifeq ($(COMPILER),cray) + include make.inc.cray +else ifeq ($(COMPILER),fj) + include make.inc.fj +else + $(info Set COMPILER={nvhpc,oneapi,amd,arm,cray,fj,gcc}. Default is gcc.) + include make.inc.gcc + COMPILER=gcc +endif + +FCFLAGS += -DVERSION_STRING="4.0" +#FCFLAGS += -DUSE_INT32 + +ifeq ($(IMPLEMENTATION),DoConcurrent) + FCFLAGS += -DUSE_DOCONCURRENT $(DOCONCURRENT_FLAG) + IMPLEMENTATION_OBJECT = DoConcurrentStream.o + +else ifeq ($(IMPLEMENTATION),Array) + FCFLAGS += -DUSE_ARRAY $(ARRAY_FLAG) + IMPLEMENTATION_OBJECT = ArrayStream.o + +else ifeq ($(IMPLEMENTATION),OpenMP) + FCFLAGS += -DUSE_OPENMP $(OPENMP_FLAG) + IMPLEMENTATION_OBJECT = OpenMPStream.o + +else ifeq ($(IMPLEMENTATION),OpenMPWorkshare) + FCFLAGS += -DUSE_OPENMPWORKSHARE $(OPENMP_FLAG) + IMPLEMENTATION_OBJECT = OpenMPWorkshareStream.o + +else ifeq ($(IMPLEMENTATION),OpenMPTarget) + FCFLAGS += -DUSE_OPENMPTARGET $(OPENMP_FLAG) + IMPLEMENTATION_OBJECT = OpenMPTargetStream.o + +else ifeq ($(IMPLEMENTATION),OpenMPTargetLoop) + FCFLAGS += -DUSE_OPENMPTARGETLOOP $(OPENMP_FLAG) + IMPLEMENTATION_OBJECT = OpenMPTargetLoopStream.o + +else ifeq ($(IMPLEMENTATION),OpenMPTaskloop) + FCFLAGS += -DUSE_OPENMPTASKLOOP $(OPENMP_FLAG) + IMPLEMENTATION_OBJECT = OpenMPTaskloopStream.o + +else ifeq 
($(IMPLEMENTATION),OpenACC) + FCFLAGS += -DUSE_OPENACC $(OPENACC_FLAG) + IMPLEMENTATION_OBJECT = OpenACCStream.o + +else ifeq ($(IMPLEMENTATION),OpenACCArray) + FCFLAGS += -DUSE_OPENACCARRAY $(OPENACC_FLAG) + IMPLEMENTATION_OBJECT = OpenACCArrayStream.o + +else ifeq ($(IMPLEMENTATION),CUDA) + FCFLAGS += -DUSE_CUDA $(CUDA_FLAG) + IMPLEMENTATION_OBJECT = CUDAStream.o + +else ifeq ($(IMPLEMENTATION),CUDAKernel) + FCFLAGS += -DUSE_CUDAKERNEL $(CUDA_FLAG) + IMPLEMENTATION_OBJECT = CUDAKernelStream.o + +else ifeq ($(IMPLEMENTATION),Sequential) + FCFLAGS += -DUSE_SEQUENTIAL $(SEQUENTIAL_FLAG) + IMPLEMENTATION_OBJECT = SequentialStream.o + +else + $(info Set IMPLEMENTATION={DoConcurrent,Array,OpenMP,OpenMPWorkshare,OpenMPTarget,OpenMPTargetLoop,OpenMPTaskloop,OpenACC,OpenACCArray,CUDA,CUDAKernel}.) + FCFLAGS += -DUSE_SEQUENTIAL $(SEQUENTIAL_FLAG) + IMPLEMENTATION=Sequential + IMPLEMENTATION_OBJECT = SequentialStream.o + +endif + +all: BabelStream.$(COMPILER).$(IMPLEMENTATION) + +BabelStream.$(COMPILER).$(IMPLEMENTATION): main.F90 $(IMPLEMENTATION_OBJECT) + $(FC) $(FCFLAGS) $^ BabelStreamTypes.o -o $@ + +BabelStreamTypes.o BabelStreamTypes.mod: BabelStreamTypes.F90 + $(FC) $(FCFLAGS) -c $< + +%.o: %.F90 BabelStreamTypes.mod + $(FC) $(FCFLAGS) -c $< + +clean: + -rm -f main.o BabelStreamUtil.mod babelstreamutil.mod + -rm -f BabelStreamTypes.o BabelStreamTypes.mod babelstreamtypes.mod + -rm -f DoConcurrentStream.o DoConcurrentStream.mod doconcurrentstream.mod + -rm -f ArrayStream.o ArrayStream.mod arraystream.mod + -rm -f SequentialStream.o SequentialStream.mod sequentialstream.mod + -rm -f OpenMPStream.o OpenMPStream.mod openmpstream.mod + -rm -f OpenMPWorkshareStream.o OpenMPWorkshareStream.mod openmpworksharestream.mod + -rm -f OpenMPTaskloopStream.o OpenMPTaskloopStream.mod openmptaskloopstream.mod + -rm -f OpenMPTargetStream.o OpenMPTargetStream.mod openmptargetstream.mod + -rm -f OpenMPTargetLoopStream.o OpenMPTargetLoopStream.mod openmptargetloopstream.mod + -rm -f 
OpenACCStream.o OpenACCStream.mod openaccstream.mod + -rm -f OpenACCArrayStream.o OpenACCArrayStream.mod openaccarraystream.mod + -rm -f CUDAStream.o CUDAStream.mod cudastream.mod CUDAFortranKernels.mod cudafortrankernels.mod + -rm -f CUDAKernelStream.o CUDAKernelStream.mod cudakernelstream.mod + -rm -f *.modmic *.mod *.o *.cub *.ptx + +realclean: clean + -rm -f BabelStream.* diff --git a/src/fortran/OpenACCArrayStream.F90 b/src/fortran/OpenACCArrayStream.F90 new file mode 100644 index 0000000..9225fe7 --- /dev/null +++ b/src/fortran/OpenACCArrayStream.F90 @@ -0,0 +1,144 @@ +module OpenACCArrayStream + use, intrinsic :: ISO_Fortran_env + use BabelStreamTypes + + implicit none + + character(len=12), parameter :: implementation_name = "OpenACCArray" + + integer(kind=StreamIntKind) :: N + + real(kind=REAL64), allocatable :: A(:), B(:), C(:) + + contains + + subroutine list_devices() + use openacc + implicit none + integer :: num + num = acc_get_num_devices(acc_get_device_type()) + if (num.eq.0) then + write(*,'(a17)') "No devices found." + else + write(*,'(a10,i1,a8)') "There are ",num," devices." + end if + end subroutine list_devices + + subroutine set_device(dev) + use openacc + implicit none + integer, intent(in) :: dev + integer :: num + num = acc_get_num_devices(acc_get_device_type()) + if (num.eq.0) then + write(*,'(a17)') "No devices found." + stop + else if (dev.gt.num) then + write(*,'(a21)') "Invalid device index." + stop + else + call acc_set_device_num(dev, acc_get_device_type()) + end if + end subroutine set_device + + subroutine alloc(array_size) + implicit none + integer(kind=StreamIntKind) :: array_size + integer :: err + N = array_size + allocate( A(1:N), B(1:N), C(1:N), stat=err) +#ifndef USE_MANAGED + !$acc enter data create(A,B,C) +#endif + if (err .ne. 
0) then + write(*,'(a20,i3)') 'allocate returned ',err + stop 1 + endif + end subroutine alloc + + subroutine dealloc() + implicit none + integer :: err +#ifndef USE_MANAGED + !$acc exit data delete(A,B,C) +#endif + deallocate( A, B, C, stat=err) + if (err .ne. 0) then + write(*,'(a20,i3)') 'deallocate returned ',err + stop 1 + endif + end subroutine dealloc + + subroutine init_arrays(initA, initB, initC) + implicit none + real(kind=REAL64), intent(in) :: initA, initB, initC + !$acc kernels + A = initA + B = initB + C = initC + !$acc end kernels + end subroutine init_arrays + + subroutine read_arrays(h_A, h_B, h_C) + implicit none + real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:) + !$acc kernels + h_A = A + h_B = B + h_C = C + !$acc end kernels + end subroutine read_arrays + + subroutine copy() + implicit none + !$acc kernels + C = A + !$acc end kernels + end subroutine copy + + subroutine add() + implicit none + !$acc kernels + C = A + B + !$acc end kernels + end subroutine add + + subroutine mul(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + scalar = startScalar + !$acc kernels + B = scalar * C + !$acc end kernels + end subroutine mul + + subroutine triad(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + scalar = startScalar + !$acc kernels + A = B + scalar * C + !$acc end kernels + end subroutine triad + + subroutine nstream(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + scalar = startScalar + !$acc kernels + A = A + B + scalar * C + !$acc end kernels + end subroutine nstream + + function dot() result(s) + implicit none + real(kind=REAL64) :: s + !$acc kernels + s = dot_product(A,B) + !$acc end kernels + end function dot + +end module OpenACCArrayStream diff --git a/src/fortran/OpenACCStream.F90 b/src/fortran/OpenACCStream.F90 new file mode 100644 index 0000000..7326f38 --- 
/dev/null +++ b/src/fortran/OpenACCStream.F90 @@ -0,0 +1,161 @@ +module OpenACCStream + use, intrinsic :: ISO_Fortran_env + use BabelStreamTypes + + implicit none + + character(len=7), parameter :: implementation_name = "OpenACC" + + integer(kind=StreamIntKind) :: N + + real(kind=REAL64), allocatable :: A(:), B(:), C(:) + + contains + + subroutine list_devices() + use openacc + implicit none + integer :: num + num = acc_get_num_devices(acc_get_device_type()) + if (num.eq.0) then + write(*,'(a17)') "No devices found." + else + write(*,'(a10,i1,a8)') "There are ",num," devices." + end if + end subroutine list_devices + + subroutine set_device(dev) + use openacc + implicit none + integer, intent(in) :: dev + integer :: num + num = acc_get_num_devices(acc_get_device_type()) + if (num.eq.0) then + write(*,'(a17)') "No devices found." + stop + else if (dev.gt.num) then + write(*,'(a21)') "Invalid device index." + stop + else + call acc_set_device_num(dev, acc_get_device_type()) + end if + end subroutine set_device + + subroutine alloc(array_size) + implicit none + integer(kind=StreamIntKind) :: array_size + integer :: err + N = array_size + allocate( A(1:N), B(1:N), C(1:N), stat=err) +#ifndef USE_MANAGED + !$acc enter data create(A,B,C) +#endif + if (err .ne. 0) then + write(*,'(a20,i3)') 'allocate returned ',err + stop 1 + endif + end subroutine alloc + + subroutine dealloc() + implicit none + integer :: err +#ifndef USE_MANAGED + !$acc exit data delete(A,B,C) +#endif + deallocate( A, B, C, stat=err) + if (err .ne. 
0) then + write(*,'(a20,i3)') 'deallocate returned ',err + stop 1 + endif + end subroutine dealloc + + subroutine init_arrays(initA, initB, initC) + implicit none + real(kind=REAL64), intent(in) :: initA, initB, initC + integer(kind=StreamIntKind) :: i + !$acc parallel loop + do i=1,N + A(i) = initA + B(i) = initB + C(i) = initC + end do + end subroutine init_arrays + + subroutine read_arrays(h_A, h_B, h_C) + implicit none + real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:) + integer(kind=StreamIntKind) :: i + !$acc parallel loop + do i=1,N + h_A(i) = A(i) + h_B(i) = B(i) + h_C(i) = C(i) + end do + end subroutine read_arrays + + subroutine copy() + implicit none + integer(kind=StreamIntKind) :: i + !$acc parallel loop + do i=1,N + C(i) = A(i) + end do + end subroutine copy + + subroutine add() + implicit none + integer(kind=StreamIntKind) :: i + !$acc parallel loop + do i=1,N + C(i) = A(i) + B(i) + end do + end subroutine add + + subroutine mul(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + scalar = startScalar + !$acc parallel loop + do i=1,N + B(i) = scalar * C(i) + end do + end subroutine mul + + subroutine triad(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + scalar = startScalar + !$acc parallel loop + do i=1,N + A(i) = B(i) + scalar * C(i) + end do + end subroutine triad + + subroutine nstream(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + scalar = startScalar + !$acc parallel loop + do i=1,N + A(i) = A(i) + B(i) + scalar * C(i) + end do + end subroutine nstream + + function dot() result(s) + implicit none + real(kind=REAL64) :: s + integer(kind=StreamIntKind) :: i + s = real(0,kind=REAL64) + !$acc parallel loop reduction(+:s) + do i=1,N + s = s + A(i) * B(i) + end do 
+ end function dot + +end module OpenACCStream diff --git a/src/fortran/OpenMPStream.F90 b/src/fortran/OpenMPStream.F90 new file mode 100644 index 0000000..7316d5b --- /dev/null +++ b/src/fortran/OpenMPStream.F90 @@ -0,0 +1,137 @@ +module OpenMPStream + use, intrinsic :: ISO_Fortran_env + use BabelStreamTypes + + implicit none + + character(len=6), parameter :: implementation_name = "OpenMP" + + integer(kind=StreamIntKind) :: N + + real(kind=REAL64), allocatable :: A(:), B(:), C(:) + + contains + + subroutine list_devices() + implicit none + write(*,'(a36,a12)') "Listing devices is not supported by ", implementation_name + end subroutine list_devices + + subroutine set_device(dev) + implicit none + integer, intent(in) :: dev + write(*,'(a32,a12)') "Device != 0 is not supported by ", implementation_name + end subroutine set_device + + subroutine alloc(array_size) + implicit none + integer(kind=StreamIntKind) :: array_size + integer :: err + N = array_size + allocate( A(1:N), B(1:N), C(1:N), stat=err) + if (err .ne. 0) then + write(*,'(a20,i3)') 'allocate returned ',err + stop 1 + endif + end subroutine alloc + + subroutine dealloc() + implicit none + integer :: err + deallocate( A, B, C, stat=err) + if (err .ne. 
0) then + write(*,'(a20,i3)') 'deallocate returned ',err + stop 1 + endif + end subroutine dealloc + + subroutine init_arrays(initA, initB, initC) + implicit none + real(kind=REAL64), intent(in) :: initA, initB, initC + integer(kind=StreamIntKind) :: i + !$omp parallel do simd + do i=1,N + A(i) = initA + B(i) = initB + C(i) = initC + end do + end subroutine init_arrays + + subroutine read_arrays(h_A, h_B, h_C) + implicit none + real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:) + integer(kind=StreamIntKind) :: i + !$omp parallel do simd + do i=1,N + h_A(i) = A(i) + h_B(i) = B(i) + h_C(i) = C(i) + end do + end subroutine read_arrays + + subroutine copy() + implicit none + integer(kind=StreamIntKind) :: i + !$omp parallel do simd + do i=1,N + C(i) = A(i) + end do + end subroutine copy + + subroutine add() + implicit none + integer(kind=StreamIntKind) :: i + !$omp parallel do simd + do i=1,N + C(i) = A(i) + B(i) + end do + end subroutine add + + subroutine mul(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + scalar = startScalar + !$omp parallel do simd + do i=1,N + B(i) = scalar * C(i) + end do + end subroutine mul + + subroutine triad(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + scalar = startScalar + !$omp parallel do simd + do i=1,N + A(i) = B(i) + scalar * C(i) + end do + end subroutine triad + + subroutine nstream(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + scalar = startScalar + !$omp parallel do simd + do i=1,N + A(i) = A(i) + B(i) + scalar * C(i) + end do + end subroutine nstream + + function dot() result(s) + implicit none + real(kind=REAL64) :: s + integer(kind=StreamIntKind) :: i + s = real(0,kind=REAL64) + !$omp parallel do simd reduction(+:s) + do i=1,N + s = s 
+ A(i) * B(i) + end do + end function dot + +end module OpenMPStream diff --git a/src/fortran/OpenMPTargetLoopStream.F90 b/src/fortran/OpenMPTargetLoopStream.F90 new file mode 100644 index 0000000..9684ced --- /dev/null +++ b/src/fortran/OpenMPTargetLoopStream.F90 @@ -0,0 +1,162 @@ +module OpenMPTargetLoopStream + use, intrinsic :: ISO_Fortran_env + use BabelStreamTypes + + implicit none + + character(len=16), parameter :: implementation_name = "OpenMPTargetLoop" + + integer(kind=StreamIntKind) :: N + + real(kind=REAL64), allocatable :: A(:), B(:), C(:) + + contains + + subroutine list_devices() + use omp_lib + implicit none + integer :: num + num = omp_get_num_devices() + if (num.eq.0) then + write(*,'(a17)') "No devices found." + else + write(*,'(a10,i1,a8)') "There are ",num," devices." + end if + end subroutine list_devices + + subroutine set_device(dev) + use omp_lib + implicit none + integer, intent(in) :: dev + integer :: num + num = omp_get_num_devices() + if (num.eq.0) then + write(*,'(a17)') "No devices found." + stop + else if (dev.gt.num) then + write(*,'(a21)') "Invalid device index." + stop + else + call omp_set_default_device(dev) + end if + end subroutine set_device + + subroutine alloc(array_size) + implicit none + integer(kind=StreamIntKind) :: array_size + integer :: err + N = array_size + allocate( A(1:N), B(1:N), C(1:N), stat=err) +#ifndef USE_MANAGED + !$omp target enter data map(alloc: A,B,C) +#endif + if (err .ne. 0) then + write(*,'(a20,i3)') 'allocate returned ',err + stop 1 + endif + end subroutine alloc + + subroutine dealloc() + implicit none + integer :: err +#ifndef USE_MANAGED + !$omp target exit data map(delete: A,B,C) +#endif + deallocate( A, B, C, stat=err) + if (err .ne. 
0) then + write(*,'(a20,i3)') 'deallocate returned ',err + stop 1 + endif + end subroutine dealloc + + subroutine init_arrays(initA, initB, initC) + implicit none + real(kind=REAL64), intent(in) :: initA, initB, initC + integer(kind=StreamIntKind) :: i + !$omp target teams loop + do i=1,N + A(i) = initA + B(i) = initB + C(i) = initC + end do + end subroutine init_arrays + + subroutine read_arrays(h_A, h_B, h_C) + implicit none + real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:) + integer(kind=StreamIntKind) :: i + ! this might need to use a copy API instead... + !$omp target teams loop + do i=1,N + h_A(i) = A(i) + h_B(i) = B(i) + h_C(i) = C(i) + end do + end subroutine read_arrays + + subroutine copy() + implicit none + integer(kind=StreamIntKind) :: i + !$omp target teams loop + do i=1,N + C(i) = A(i) + end do + end subroutine copy + + subroutine add() + implicit none + integer(kind=StreamIntKind) :: i + !$omp target teams loop + do i=1,N + C(i) = A(i) + B(i) + end do + end subroutine add + + subroutine mul(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + scalar = startScalar + !$omp target teams loop + do i=1,N + B(i) = scalar * C(i) + end do + end subroutine mul + + subroutine triad(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + scalar = startScalar + !$omp target teams loop + do i=1,N + A(i) = B(i) + scalar * C(i) + end do + end subroutine triad + + subroutine nstream(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + scalar = startScalar + !$omp target teams loop + do i=1,N + A(i) = A(i) + B(i) + scalar * C(i) + end do + end subroutine nstream + + function dot() result(s) + implicit none + real(kind=REAL64) :: s + integer(kind=StreamIntKind) :: i + s = real(0,kind=REAL64) + 
!$omp target teams loop reduction(+:s) + do i=1,N + s = s + A(i) * B(i) + end do + end function dot + +end module OpenMPTargetLoopStream diff --git a/src/fortran/OpenMPTargetStream.F90 b/src/fortran/OpenMPTargetStream.F90 new file mode 100644 index 0000000..0206d78 --- /dev/null +++ b/src/fortran/OpenMPTargetStream.F90 @@ -0,0 +1,163 @@ +module OpenMPTargetStream + use, intrinsic :: ISO_Fortran_env + use BabelStreamTypes + + implicit none + + character(len=12), parameter :: implementation_name = "OpenMPTarget" + + integer(kind=StreamIntKind) :: N + + real(kind=REAL64), allocatable :: A(:), B(:), C(:) + + contains + + subroutine list_devices() + use omp_lib + implicit none + integer :: num + num = omp_get_num_devices() + if (num.eq.0) then + write(*,'(a17)') "No devices found." + else + write(*,'(a10,i1,a8)') "There are ",num," devices." + end if + end subroutine list_devices + + subroutine set_device(dev) + use omp_lib + implicit none + integer, intent(in) :: dev + integer :: num + num = omp_get_num_devices() + if (num.eq.0) then + write(*,'(a17)') "No devices found." + stop + else if (dev.gt.num) then + write(*,'(a21)') "Invalid device index." + stop + else + call omp_set_default_device(dev) + end if + end subroutine set_device + + subroutine alloc(array_size) + implicit none + integer(kind=StreamIntKind) :: array_size + integer :: err + N = array_size + allocate( A(1:N), B(1:N), C(1:N), stat=err) +#ifndef USE_MANAGED + !$omp target enter data map(alloc: A,B,C) +#endif + if (err .ne. 0) then + write(*,'(a20,i3)') 'allocate returned ',err + stop 1 + endif + end subroutine alloc + + subroutine dealloc() + implicit none + integer :: err +#ifndef USE_MANAGED + !$omp target exit data map(delete: A,B,C) +#endif + deallocate( A, B, C, stat=err) + if (err .ne. 
0) then + write(*,'(a20,i3)') 'deallocate returned ',err + stop 1 + endif + end subroutine dealloc + + subroutine init_arrays(initA, initB, initC) + implicit none + real(kind=REAL64), intent(in) :: initA, initB, initC + integer(kind=StreamIntKind) :: i + !$omp target teams distribute parallel do simd + do i=1,N + A(i) = initA + B(i) = initB + C(i) = initC + end do + end subroutine init_arrays + + subroutine read_arrays(h_A, h_B, h_C) + implicit none + real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:) + integer(kind=StreamIntKind) :: i + ! this might need to use a copy API instead... + !$omp target teams distribute parallel do simd + do i=1,N + h_A(i) = A(i) + h_B(i) = B(i) + h_C(i) = C(i) + end do + end subroutine read_arrays + + subroutine copy() + implicit none + integer(kind=StreamIntKind) :: i + !$omp target teams distribute parallel do simd + do i=1,N + C(i) = A(i) + end do + !$omp barrier + end subroutine copy + + subroutine add() + implicit none + integer(kind=StreamIntKind) :: i + !$omp target teams distribute parallel do simd + do i=1,N + C(i) = A(i) + B(i) + end do + end subroutine add + + subroutine mul(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + scalar = startScalar + !$omp target teams distribute parallel do simd + do i=1,N + B(i) = scalar * C(i) + end do + end subroutine mul + + subroutine triad(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + scalar = startScalar + !$omp target teams distribute parallel do simd + do i=1,N + A(i) = B(i) + scalar * C(i) + end do + end subroutine triad + + subroutine nstream(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + scalar = startScalar + !$omp target teams distribute parallel do simd + do i=1,N + A(i) = A(i) + B(i) + 
scalar * C(i) + end do + end subroutine nstream + + function dot() result(s) + implicit none + real(kind=REAL64) :: s + integer(kind=StreamIntKind) :: i + s = real(0,kind=REAL64) + !$omp target teams distribute parallel do simd reduction(+:s) + do i=1,N + s = s + A(i) * B(i) + end do + end function dot + +end module OpenMPTargetStream diff --git a/src/fortran/OpenMPTaskloopStream.F90 b/src/fortran/OpenMPTaskloopStream.F90 new file mode 100644 index 0000000..579a761 --- /dev/null +++ b/src/fortran/OpenMPTaskloopStream.F90 @@ -0,0 +1,169 @@ +module OpenMPTaskloopStream + use, intrinsic :: ISO_Fortran_env + use BabelStreamTypes + + implicit none + + character(len=14), parameter :: implementation_name = "OpenMPTaskloop" + + integer(kind=StreamIntKind) :: N + + real(kind=REAL64), allocatable :: A(:), B(:), C(:) + + contains + + subroutine list_devices() + implicit none + write(*,'(a36,a12)') "Listing devices is not supported by ", implementation_name + end subroutine list_devices + + subroutine set_device(dev) + implicit none + integer, intent(in) :: dev + write(*,'(a32,a12)') "Device != 0 is not supported by ", implementation_name + end subroutine set_device + + subroutine alloc(array_size) + implicit none + integer(kind=StreamIntKind) :: array_size + integer :: err + N = array_size + allocate( A(1:N), B(1:N), C(1:N), stat=err) + if (err .ne. 0) then + write(*,'(a20,i3)') 'allocate returned ',err + stop 1 + endif + end subroutine alloc + + subroutine dealloc() + implicit none + integer :: err + deallocate( A, B, C, stat=err) + if (err .ne. 
0) then + write(*,'(a20,i3)') 'deallocate returned ',err + stop 1 + endif + end subroutine dealloc + + subroutine init_arrays(initA, initB, initC) + implicit none + real(kind=REAL64), intent(in) :: initA, initB, initC + integer(kind=StreamIntKind) :: i + !$omp parallel + !$omp master + !$omp taskloop + do i=1,N + A(i) = initA + B(i) = initB + C(i) = initC + end do + !$omp end master + !$omp end parallel + end subroutine init_arrays + + subroutine read_arrays(h_A, h_B, h_C) + implicit none + real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:) + integer(kind=StreamIntKind) :: i + !$omp parallel + !$omp master + !$omp taskloop + do i=1,N + h_A(i) = A(i) + h_B(i) = B(i) + h_C(i) = C(i) + end do + !$omp end master + !$omp end parallel + end subroutine read_arrays + + subroutine copy() + implicit none + integer(kind=StreamIntKind) :: i + !$omp parallel + !$omp master + !$omp taskloop + do i=1,N + C(i) = A(i) + end do + !$omp end master + !$omp end parallel + end subroutine copy + + subroutine add() + implicit none + integer(kind=StreamIntKind) :: i + !$omp parallel + !$omp master + !$omp taskloop + do i=1,N + C(i) = A(i) + B(i) + end do + !$omp end master + !$omp end parallel + end subroutine add + + subroutine mul(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + scalar = startScalar + !$omp parallel + !$omp master + !$omp taskloop + do i=1,N + B(i) = scalar * C(i) + end do + !$omp end master + !$omp end parallel + end subroutine mul + + subroutine triad(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + scalar = startScalar + !$omp parallel + !$omp master + !$omp taskloop + do i=1,N + A(i) = B(i) + scalar * C(i) + end do + !$omp end master + !$omp end parallel + end subroutine triad + + subroutine nstream(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar 
+ real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + scalar = startScalar + !$omp parallel + !$omp master + !$omp taskloop + do i=1,N + A(i) = A(i) + B(i) + scalar * C(i) + end do + !$omp end master + !$omp end parallel + end subroutine nstream + + function dot() result(s) + implicit none + real(kind=REAL64) :: s + integer(kind=StreamIntKind) :: i + s = real(0,kind=REAL64) + !$omp parallel + !$omp master + !$omp taskloop reduction(+:s) + do i=1,N + s = s + A(i) * B(i) + end do + !$omp end master + !$omp end parallel + end function dot + +end module OpenMPTaskloopStream diff --git a/src/fortran/OpenMPWorkshareStream.F90 b/src/fortran/OpenMPWorkshareStream.F90 new file mode 100644 index 0000000..fd50f86 --- /dev/null +++ b/src/fortran/OpenMPWorkshareStream.F90 @@ -0,0 +1,120 @@ +module OpenMPWorkshareStream + use, intrinsic :: ISO_Fortran_env + use BabelStreamTypes + + implicit none + + character(len=15), parameter :: implementation_name = "OpenMPWorkshare" + + integer(kind=StreamIntKind) :: N + + real(kind=REAL64), allocatable :: A(:), B(:), C(:) + + contains + + subroutine list_devices() + implicit none + write(*,'(a36,a12)') "Listing devices is not supported by ", implementation_name + end subroutine list_devices + + subroutine set_device(dev) + implicit none + integer, intent(in) :: dev + write(*,'(a32,a12)') "Device != 0 is not supported by ", implementation_name + end subroutine set_device + + subroutine alloc(array_size) + implicit none + integer(kind=StreamIntKind) :: array_size + integer :: err + N = array_size + allocate( A(1:N), B(1:N), C(1:N), stat=err) + if (err .ne. 0) then + write(*,'(a20,i3)') 'allocate returned ',err + stop 1 + endif + end subroutine alloc + + subroutine dealloc() + implicit none + integer :: err + deallocate( A, B, C, stat=err) + if (err .ne. 
0) then + write(*,'(a20,i3)') 'deallocate returned ',err + stop 1 + endif + end subroutine dealloc + + subroutine init_arrays(initA, initB, initC) + implicit none + real(kind=REAL64), intent(in) :: initA, initB, initC + !$omp parallel workshare + A = initA + B = initB + C = initC + !$omp end parallel workshare + end subroutine init_arrays + + subroutine read_arrays(h_A, h_B, h_C) + implicit none + real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:) + !$omp parallel workshare + h_A = A + h_B = B + h_C = C + !$omp end parallel workshare + end subroutine read_arrays + + subroutine copy() + implicit none + !$omp parallel workshare + C = A + !$omp end parallel workshare + end subroutine copy + + subroutine add() + implicit none + !$omp parallel workshare + C = A + B + !$omp end parallel workshare + end subroutine add + + subroutine mul(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + scalar = startScalar + !$omp parallel workshare + B = scalar * C + !$omp end parallel workshare + end subroutine mul + + subroutine triad(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + scalar = startScalar + !$omp parallel workshare + A = B + scalar * C + !$omp end parallel workshare + end subroutine triad + + subroutine nstream(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + scalar = startScalar + !$omp parallel workshare + A = A + B + scalar * C + !$omp end parallel workshare + end subroutine nstream + + function dot() result(s) + implicit none + real(kind=REAL64) :: s + !$omp parallel workshare + s = dot_product(A,B) + !$omp end parallel workshare + end function dot + +end module OpenMPWorkshareStream diff --git a/src/fortran/SequentialStream.F90 b/src/fortran/SequentialStream.F90 new file mode 100644 index 0000000..a8f6917 --- /dev/null +++ b/src/fortran/SequentialStream.F90 @@ -0,0 +1,130 @@ +module 
SequentialStream + use, intrinsic :: ISO_Fortran_env + use BabelStreamTypes + + implicit none + + character(len=10), parameter :: implementation_name = "Sequential" + + integer(kind=StreamIntKind) :: N + + real(kind=REAL64), allocatable :: A(:), B(:), C(:) + + contains + + subroutine list_devices() + implicit none + integer :: num + write(*,'(a36,a10)') "Listing devices is not supported by ", implementation_name + end subroutine list_devices + + subroutine set_device(dev) + implicit none + integer, intent(in) :: dev + write(*,'(a32,a10)') "Device != 0 is not supported by ", implementation_name + end subroutine set_device + + subroutine alloc(array_size) + implicit none + integer(kind=StreamIntKind) :: array_size + integer :: err + N = array_size + allocate( A(1:N), B(1:N), C(1:N), stat=err) + if (err .ne. 0) then + write(*,'(a20,i3)') 'allocate returned ',err + stop 1 + endif + end subroutine alloc + + subroutine dealloc() + implicit none + integer :: err + deallocate( A, B, C, stat=err) + if (err .ne. 
0) then + write(*,'(a20,i3)') 'deallocate returned ',err + stop 1 + endif + end subroutine dealloc + + subroutine init_arrays(initA, initB, initC) + implicit none + real(kind=REAL64), intent(in) :: initA, initB, initC + integer(kind=StreamIntKind) :: i + do i=1,N + A(i) = initA + B(i) = initB + C(i) = initC + end do + end subroutine init_arrays + + subroutine read_arrays(h_A, h_B, h_C) + implicit none + real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:) + integer(kind=StreamIntKind) :: i + do i=1,N + h_A(i) = A(i) + h_B(i) = B(i) + h_C(i) = C(i) + end do + end subroutine read_arrays + + subroutine copy() + implicit none + integer(kind=StreamIntKind) :: i + do i=1,N + C(i) = A(i) + end do + end subroutine copy + + subroutine add() + implicit none + integer(kind=StreamIntKind) :: i + do i=1,N + C(i) = A(i) + B(i) + end do + end subroutine add + + subroutine mul(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + scalar = startScalar + do i=1,N + B(i) = scalar * C(i) + end do + end subroutine mul + + subroutine triad(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + scalar = startScalar + do i=1,N + A(i) = B(i) + scalar * C(i) + end do + end subroutine triad + + subroutine nstream(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + scalar = startScalar + do i=1,N + A(i) = A(i) + B(i) + scalar * C(i) + end do + end subroutine nstream + + function dot() result(s) + implicit none + real(kind=REAL64) :: s + integer(kind=StreamIntKind) :: i + s = real(0,kind=REAL64) + do i=1,N + s = s + A(i) * B(i) + end do + end function dot + +end module SequentialStream diff --git a/src/fortran/build.sh b/src/fortran/build.sh new file mode 100755 index 0000000..9343354 --- /dev/null +++ b/src/fortran/build.sh 
@@ -0,0 +1,54 @@ +#!/bin/bash + +# uncomment to disable GPU targets +#HAS_GPU=0 + +# Orin +#if [ "x${compiler}" == "xgcc" ] ; then +# export MCPU=cortex-a78ae +#fi +#if [ "x${compiler}" == "xarm" ] ; then +# export MCPU=cortex-a78 +#fi + +COMPILERS="gcc" +if [ $(which nvfortran) ] ; then + COMPILERS="${COMPILERS} nvhpc" +fi +if [ $(which crayftn) ] ; then + COMPILERS="${COMPILERS} cray" +fi +if [ $(uname -m) == "aarch64" ] ; then + if [ $(which armflang) ] ; then + COMPILERS="${COMPILERS} arm" + fi + if [ $(which frt) ] ; then + COMPILERS="${COMPILERS} fj" + fi +elif [ $(uname -m) == "x86_64" ] ; then + if [ $(which lscpu >& /dev/null && lscpu | grep GenuineIntel | awk '{print $3}') == "GenuineIntel" ] ; then + COMPILERS="${COMPILERS} oneapi" + if [ -f /opt/intel/oneapi/setvars.sh ] ; then + . /opt/intel/oneapi/setvars.sh >& /dev/null + fi + else + # ^ this detection can be improved + COMPILERS="${COMPILERS} amd" + fi +fi + +for compiler in ${COMPILERS} ; do + TARGETS="DoConcurrent Array OpenMP OpenMPTaskloop OpenMPWorkshare" + if [ "${HAS_GPU}" != "0" ] ; then + TARGETS="${TARGETS} OpenMPTarget OpenMPTargetLoop" + if [ "x${compiler}" == "xnvhpc" ] ; then + TARGETS="${TARGETS} CUDA CUDAKernel" + fi + fi + if [ "x${compiler}" == "xnvhpc" ] || [ "x${compiler}" == "xgcc" ] || [ "x${compiler}" == "xcray" ] ; then + TARGETS="${TARGETS} OpenACC OpenACCArray" + fi + for implementation in ${TARGETS} ; do + make COMPILER=${compiler} IMPLEMENTATION=${implementation} + done +done diff --git a/src/fortran/main.F90 b/src/fortran/main.F90 new file mode 100644 index 0000000..d86e8d4 --- /dev/null +++ b/src/fortran/main.F90 @@ -0,0 +1,666 @@ +module BabelStreamUtil + use, intrinsic :: ISO_Fortran_env, only: REAL64,INT64 + use BabelStreamTypes + + implicit none + + integer(kind=StreamIntKind) :: array_size = 33554432 + integer(kind=StreamIntKind) :: num_times = 100 + logical :: mibibytes = .false. + logical :: use_gigs = .false. + logical :: csv = .false. 
+ character(len=1), parameter :: csv_sep = "," + + ! 1 = All + ! 2 = Triad + ! 3 = Nstream + integer :: selection = 1 + + real(kind=REAL64), parameter :: startA = real(0.1d0,kind=REAL64) + real(kind=REAL64), parameter :: startB = real(0.2d0,kind=REAL64) + real(kind=REAL64), parameter :: startC = real(0.0d0,kind=REAL64) + real(kind=REAL64), parameter :: startScalar = real(0.4d0,kind=REAL64) + + contains + + function get_wtime() result(t) +#if defined(USE_OMP_GET_WTIME) + use omp_lib + implicit none + real(kind=REAL64) :: t + t = omp_get_wtime() +#elif defined(USE_CPU_TIME) + implicit none + real(kind=REAL64) :: t + real :: r + call cpu_time(r) + t = r +#else + implicit none + real(kind=REAL64) :: t + integer(kind=INT64) :: c, r + call system_clock(count = c, count_rate = r) + t = real(c,REAL64) / real(r,REAL64) +#endif + end function get_wtime + + subroutine parseArguments() + use, intrinsic :: ISO_Fortran_env, only: compiler_version, compiler_options +#if defined(USE_DOCONCURRENT) + use DoConcurrentStream, only: list_devices, set_device +#elif defined(USE_ARRAY) + use ArrayStream, only: list_devices, set_device +#elif defined(USE_OPENMP) + use OpenMPStream, only: list_devices, set_device +#elif defined(USE_OPENMPWORKSHARE) + use OpenMPWorkshareStream, only: list_devices, set_device +#elif defined(USE_OPENMPTARGET) + use OpenMPTargetStream, only: list_devices, set_device +#elif defined(USE_OPENMPTARGETLOOP) + use OpenMPTargetLoopStream, only: list_devices, set_device +#elif defined(USE_OPENMPTASKLOOP) + use OpenMPTaskloopStream, only: list_devices, set_device +#elif defined(USE_OPENACC) + use OpenACCStream, only: list_devices, set_device +#elif defined(USE_OPENACCARRAY) + use OpenACCArrayStream, only: list_devices, set_device +#elif defined(USE_CUDA) + use CUDAStream, only: list_devices, set_device +#elif defined(USE_CUDAKERNEL) + use CUDAKernelStream, only: list_devices, set_device +#elif defined(USE_SEQUENTIAL) + use SequentialStream, only: list_devices, 
set_device +#endif + implicit none + integer :: i, argc + integer :: arglen,err,pos(2) + character(len=64) :: argtmp + argc = command_argument_count() + do i=1,argc + call get_command_argument(i,argtmp,arglen,err) + if (err.eq.0) then + ! + ! list devices + ! + pos(1) = index(argtmp,"--list") + if (pos(1).eq.1) then + call list_devices() + stop + endif + ! + ! set device number + ! + pos(1) = index(argtmp,"--device") + if (pos(1).eq.1) then + if (i+1.gt.argc) then + print*,'You failed to provide a value for ',argtmp + stop + else + call get_command_argument(i+1,argtmp,arglen,err) + block + integer :: dev + read(argtmp,'(i15)') dev + call set_device(dev) + end block + endif + cycle + endif + ! + ! array size + ! + pos(1) = index(argtmp,"--arraysize") + pos(2) = index(argtmp,"-s") + if (any(pos(:).eq.1) ) then + if (i+1.gt.argc) then + print*,'You failed to provide a value for ',argtmp + else + call get_command_argument(i+1,argtmp,arglen,err) + block + integer(kind=INT64) :: big_size + read(argtmp,'(i15)') big_size + if (big_size .gt. HUGE(array_size)) then + print*,'Array size does not fit into integer:' + print*,big_size,'>',HUGE(array_size) + print*,'Stop using USE_INT32' + stop + else + array_size = INT(big_size,kind=StreamIntKind) + endif + end block + endif + cycle + endif + ! + ! number of iterations + ! + pos(1) = index(argtmp,"--numtimes") + pos(2) = index(argtmp,"-n") + if (any(pos(:).eq.1) ) then + if (i+1.gt.argc) then + print*,'You failed to provide a value for ',argtmp + else + call get_command_argument(i+1,argtmp,arglen,err) + read(argtmp,'(i15)') num_times + if (num_times.lt.2) then + write(*,'(a)') "Number of times must be 2 or more" + stop + end if + endif + cycle + endif + ! + ! precision + ! + pos(1) = index(argtmp,"--float") + if (pos(1).eq.1) then + write(*,'(a46,a39)') "Sorry, you have to recompile with -DUSE_FLOAT ", & + "to run BabelStream in single precision." + stop + endif + ! + ! selection (All, Triad, Nstream) + ! 
+ pos(1) = index(argtmp,"--triad-only") + if (pos(1).eq.1) then + selection = 2 + cycle + endif + pos(1) = index(argtmp,"--nstream-only") + if (pos(1).eq.1) then + selection = 3 + cycle + endif + ! + ! CSV + ! + pos(1) = index(argtmp,"--csv") + if (pos(1).eq.1) then + csv = .true. + !write(*,'(a39)') "Sorry, CSV support isn't available yet." + !stop + endif + ! + ! units + ! + pos(1) = index(argtmp,"--mibibytes") + if (pos(1).eq.1) then + mibibytes = .true. + cycle + endif + ! + ! giga/gibi instead of mega/mebi + ! + pos(1) = index(argtmp,"--gigs") + if (pos(1).eq.1) then + use_gigs = .true. + cycle + endif + ! + ! + ! + pos(1) = index(argtmp,"--compiler-info") + if (pos(1).eq.1) then + write(*,'(a)') 'Compiler version: ',compiler_version() + write(*,'(a)') 'Compiler options: ',compiler_options() + stop + endif + ! + ! help + ! + pos(1) = index(argtmp,"--help") + pos(2) = index(argtmp,"-h") + if (any(pos(:).eq.1) ) then + call get_command_argument(0,argtmp,arglen,err) + write(*,'(a7,a,a10)') "Usage: ", trim(argtmp), " [OPTIONS]" + write(*,'(a)') "Options:" + write(*,'(a)') " -h --help Print the message" + write(*,'(a)') " --list List available devices" + write(*,'(a)') " --device INDEX Select device at INDEX" + write(*,'(a)') " -s --arraysize SIZE Use SIZE elements in the array" + write(*,'(a)') " -n --numtimes NUM Run the test NUM times (NUM >= 2)" + !write(*,'(a)') " --float Use floats (rather than doubles)" + write(*,'(a)') " --triad-only Only run triad" + write(*,'(a)') " --nstream-only Only run nstream" + write(*,'(a)') " --csv Output as csv table" + write(*,'(a)') " --mibibytes Use MiB=2^20 for bandwidth calculation (default MB=10^6)" + write(*,'(a)') " --gigs Use GiB=2^30 or GB=10^9 instead of MiB/MB" + write(*,'(a)') " --compiler-info Print information about compiler and flags, then exit." 
+ stop + endif + end if + end do + end subroutine parseArguments + + subroutine run_all(timings, summ) +#if defined(USE_DOCONCURRENT) + use DoConcurrentStream +#elif defined(USE_ARRAY) + use ArrayStream +#elif defined(USE_OPENMP) + use OpenMPStream +#elif defined(USE_OPENMPWORKSHARE) + use OpenMPWorkshareStream +#elif defined(USE_OPENMPTARGET) + use OpenMPTargetStream +#elif defined(USE_OPENMPTARGETLOOP) + use OpenMPTargetLoopStream +#elif defined(USE_OPENMPTASKLOOP) + use OpenMPTaskloopStream +#elif defined(USE_OPENACC) + use OpenACCStream +#elif defined(USE_OPENACCARRAY) + use OpenACCArrayStream +#elif defined(USE_CUDA) + use CUDAStream +#elif defined(USE_CUDAKERNEL) + use CUDAKernelStream +#elif defined(USE_SEQUENTIAL) + use SequentialStream +#endif + implicit none + real(kind=REAL64), intent(inout) :: timings(:,:) + real(kind=REAL64), intent(out) :: summ + real(kind=REAL64) :: t1, t2 + integer(kind=StreamIntKind) :: i + + do i=1,num_times + + t1 = get_wtime() + call copy() + t2 = get_wtime() + timings(1,i) = t2-t1 + + t1 = get_wtime() + call mul(startScalar) + t2 = get_wtime() + timings(2,i) = t2-t1 + + t1 = get_wtime() + call add() + t2 = get_wtime() + timings(3,i) = t2-t1 + + t1 = get_wtime() + call triad(startScalar) + t2 = get_wtime() + timings(4,i) = t2-t1 + + t1 = get_wtime() + summ = dot() + t2 = get_wtime() + timings(5,i) = t2-t1 + + end do + + end subroutine run_all + + subroutine run_triad(timings) +#if defined(USE_DOCONCURRENT) + use DoConcurrentStream +#elif defined(USE_ARRAY) + use ArrayStream +#elif defined(USE_OPENMP) + use OpenMPStream +#elif defined(USE_OPENMPWORKSHARE) + use OpenMPWorkshareStream +#elif defined(USE_OPENMPTARGET) + use OpenMPTargetStream +#elif defined(USE_OPENMPTARGETLOOP) + use OpenMPTargetLoopStream +#elif defined(USE_OPENMPTASKLOOP) + use OpenMPTaskloopStream +#elif defined(USE_OPENACC) + use OpenACCStream +#elif defined(USE_OPENACCARRAY) + use OpenACCArrayStream +#elif defined(USE_CUDA) + use CUDAStream +#elif 
defined(USE_CUDAKERNEL) + use CUDAKernelStream +#elif defined(USE_SEQUENTIAL) + use SequentialStream +#endif + implicit none + real(kind=REAL64), intent(inout) :: timings(:,:) + real(kind=REAL64) :: t1, t2 + integer(kind=StreamIntKind) :: i + + do i=1,num_times + + t1 = get_wtime() + call triad(startScalar) + t2 = get_wtime() + timings(1,i) = t2-t1 + + end do + + end subroutine run_triad + + subroutine run_nstream(timings) +#if defined(USE_DOCONCURRENT) + use DoConcurrentStream +#elif defined(USE_ARRAY) + use ArrayStream +#elif defined(USE_OPENMP) + use OpenMPStream +#elif defined(USE_OPENMPWORKSHARE) + use OpenMPWorkshareStream +#elif defined(USE_OPENMPTARGET) + use OpenMPTargetStream +#elif defined(USE_OPENMPTARGETLOOP) + use OpenMPTargetLoopStream +#elif defined(USE_OPENMPTASKLOOP) + use OpenMPTaskloopStream +#elif defined(USE_OPENACC) + use OpenACCStream +#elif defined(USE_OPENACCARRAY) + use OpenACCArrayStream +#elif defined(USE_CUDA) + use CUDAStream +#elif defined(USE_CUDAKERNEL) + use CUDAKernelStream +#elif defined(USE_SEQUENTIAL) + use SequentialStream +#endif + implicit none + real(kind=REAL64), intent(inout) :: timings(:,:) + real(kind=REAL64) :: t1, t2 + integer(kind=StreamIntKind) :: i + + do i=1,num_times + + t1 = get_wtime() + call nstream(startScalar) + t2 = get_wtime() + timings(1,i) = t2-t1 + + end do + + end subroutine run_nstream + + subroutine check_solution(A, B, C, summ) + use, intrinsic :: IEEE_Arithmetic, only: IEEE_Is_Normal + implicit none + real(kind=REAL64), intent(in) :: A(:), B(:), C(:) + real(kind=REAL64), intent(in) :: summ + + integer(kind=StreamIntKind) :: i + real(kind=REAL64) :: goldA, goldB, goldC, goldSum + real(kind=REAL64) :: scalar + + ! 
always use double because of accumulation error + real(kind=REAL64) :: errA, errB, errC, errSum, epsi + logical :: cleanA, cleanB, cleanC, cleanSum + + goldA = startA + goldB = startB + goldC = startC + goldSum = 0.0d0 + + scalar = startScalar + + do i=1,num_times + + if (selection.eq.1) then + goldC = goldA + goldB = scalar * goldC + goldC = goldA + goldB + goldA = goldB + scalar * goldC + else if (selection.eq.2) then + goldA = goldB + scalar * goldC + else if (selection.eq.3) then + goldA = goldA + goldB + scalar * goldC; + endif + + end do + + goldSum = goldA * goldB * array_size + + cleanA = ALL(IEEE_Is_Normal(A)) + cleanB = ALL(IEEE_Is_Normal(B)) + cleanC = ALL(IEEE_Is_Normal(C)) + cleanSum = IEEE_Is_Normal(summ) + + if (.not. cleanA) then + write(*,'(a51)') "Validation failed on A. Contains NaN/Inf/Subnormal." + end if + if (.not. cleanB) then + write(*,'(a51)') "Validation failed on B. Contains NaN/Inf/Subnormal." + end if + if (.not. cleanC) then + write(*,'(a51)') "Validation failed on C. Contains NaN/Inf/Subnormal." + end if + if (.not. cleanSum) then + write(*,'(a54,e20.12)') "Validation failed on Sum. Contains NaN/Inf/Subnormal: ",summ + end if + + errA = SUM( ABS( A - goldA ) ) / array_size + errB = SUM( ABS( B - goldB ) ) / array_size + errC = SUM( ABS( C - goldC ) ) / array_size + errSum = ABS( (summ - goldSum) / goldSum) + + epsi = epsilon(real(0,kind=StreamRealKind)) * 100.0d0 + + if (errA .gt. epsi) then + write(*,'(a38,e20.12)') "Validation failed on A. Average error ", errA + end if + if (errB .gt. epsi) then + write(*,'(a38,e20.12)') "Validation failed on B. Average error ", errB + end if + if (errC .gt. epsi) then + write(*,'(a38,e20.12)') "Validation failed on C. Average error ", errC + end if + + if (selection.eq.1) then + if (errSum .gt. 1.0e-8) then + write(*,'(a38,e20.12)') "Validation failed on Sum. 
Error ", errSum + write(*,'(a8,e20.12,a15,e20.12)') "Sum was ",summ, " but should be ", goldSum + end if + endif + + end subroutine check_solution + +end module BabelStreamUtil + +program BabelStream + use BabelStreamUtil +#if defined(USE_DOCONCURRENT) + use DoConcurrentStream +#elif defined(USE_ARRAY) + use ArrayStream +#elif defined(USE_OPENMP) + use OpenMPStream +#elif defined(USE_OPENMPWORKSHARE) + use OpenMPWorkshareStream +#elif defined(USE_OPENMPTARGET) + use OpenMPTargetStream +#elif defined(USE_OPENMPTARGETLOOP) + use OpenMPTargetLoopStream +#elif defined(USE_OPENMPTASKLOOP) + use OpenMPTaskloopStream +#elif defined(USE_OPENACC) + use OpenACCStream +#elif defined(USE_OPENACCARRAY) + use OpenACCArrayStream +#elif defined(USE_CUDA) + use CUDAStream +#elif defined(USE_CUDAKERNEL) + use CUDAKernelStream +#elif defined(USE_SEQUENTIAL) + use SequentialStream +#endif + implicit none + integer :: element_size, err + real(kind=REAL64) :: scaling + character(len=3) :: label + real(kind=REAL64), allocatable :: timings(:,:) + real(kind=REAL64), allocatable :: h_A(:), h_B(:), h_C(:) + real(kind=REAL64) :: summ + + call parseArguments() + + element_size = storage_size(real(0,kind=StreamRealKind)) / 8 + + if (mibibytes) then + if (use_gigs) then + scaling = 2.0d0**(-30) + label = "GiB" + else + scaling = 2.0d0**(-20) + label = "MiB" + endif + else + if (use_gigs) then + scaling = 1.0d-9 + label = "GB" + else + scaling = 1.0d-6 + label = "MB" + endif + endif + + if (.not.csv) then + + write(*,'(a)') "BabelStream Fortran" + write(*,'(a9,f4.1)') "Version: ", VERSION_STRING + write(*,'(a16,a)') "Implementation: ", implementation_name + + block + character(len=32) :: printout + write(printout,'(i9,1x,a5)') num_times,'times' + write(*,'(a16,a)') 'Running kernels ',ADJUSTL(printout) + end block + write(*,'(a11,a6)') 'Precision: ',ADJUSTL(StreamRealName) + + write(*,'(a12,f9.1,a3)') 'Array size: ',1.0d0 * element_size * (array_size * scaling), label + write(*,'(a12,f9.1,a3)') 
'Total size: ',3.0d0 * element_size * (array_size * scaling), label + + endif ! csv + + allocate( timings(5,num_times) ) + + call alloc(array_size) + + call init_arrays(startA, startB, startC) + summ = 0.0d0 + + timings = -1.0d0 + if (selection.eq.1) then + call run_all(timings, summ) + else if (selection.eq.2) then + call run_triad(timings) + else if (selection.eq.3) then + call run_nstream(timings) + endif + + allocate( h_A(1:array_size), h_B(1:array_size), h_C(1:array_size), stat=err) + if (err .ne. 0) then + write(*,'(a20,i3)') 'allocate returned ',err + stop 1 + endif + + call read_arrays(h_A, h_B, h_C) + call check_solution(h_A, h_B, h_C, summ) + + block + character(len=20) :: printout(8) + real(kind=REAL64) :: tmin,tmax,tavg,nbytes + + if (csv) then + write(*,'(a,a1)',advance='no') 'function', csv_sep + write(*,'(a,a1)',advance='no') 'num_times', csv_sep + write(*,'(a,a1)',advance='no') 'n_elements',csv_sep + write(*,'(a,a1)',advance='no') 'sizeof', csv_sep + if (mibibytes) then + write(*,'(a,a1)',advance='no') 'max_mibytes_per_sec',csv_sep + else + write(*,'(a,a1)',advance='no') 'max_mbytes_per_sec', csv_sep + endif + write(*,'(a,a1)',advance='no') 'min_runtime',csv_sep + write(*,'(a,a1)',advance='no') 'max_runtime',csv_sep + write(*,'(a,a1)',advance='yes') 'avg_runtime' + else + write(printout(1),'(a8)') 'Function' + write(printout(2),'(a3,a8)') TRIM(label),'ytes/sec' + write(printout(3),'(a9)') 'Min (sec)' + write(printout(4),'(a3)') 'Max' + write(printout(5),'(a7)') 'Average' + write(*,'(5a12)') ADJUSTL(printout(1:5)) + endif ! 
csv + + if (selection.eq.1) then + block + integer, parameter :: sizes(5) = [2,2,3,3,2] + character(len=5), parameter :: labels(5) = ["Copy ", "Mul ", "Add ", "Triad", "Dot "] + integer :: i + do i=1,5 + tmin = MINVAL(timings(i,2:num_times)) + tmax = MAXVAL(timings(i,2:num_times)) + tavg = SUM(timings(i,2:num_times)) / (num_times-1) + nbytes = element_size * REAL(array_size,kind=REAL64) * sizes(i) + write(printout(1),'(a)') labels(i) + if (csv) then + write(printout(2),'(i20)') num_times + write(printout(3),'(i20)') array_size + write(printout(4),'(i20)') element_size + write(printout(5),'(i20)') INT(scaling*nbytes/tmin) + write(printout(6),'(f20.8)') tmin + write(printout(7),'(f20.8)') tmax + write(printout(8),'(f20.8)') tavg + write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(1))),csv_sep + write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(2))),csv_sep + write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(3))),csv_sep + write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(4))),csv_sep + write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(5))),csv_sep + write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(6))),csv_sep + write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(7))),csv_sep + write(*,'(a,a1)',advance='yes') TRIM(ADJUSTL(printout(8))) + else + write(printout(2),'(f12.3)') scaling*nbytes/tmin + write(printout(3),'(f12.5)') tmin + write(printout(4),'(f12.5)') tmax + write(printout(5),'(f12.5)') tavg + write(*,'(5a12)') ADJUSTL(printout(1:5)) + endif + enddo + end block + else if ((selection.eq.2).or.(selection.eq.3)) then + tmin = MINVAL(timings(1,2:num_times)) + tmax = MAXVAL(timings(1,2:num_times)) + tavg = SUM(timings(1,2:num_times)) / (num_times-1) + if (selection.eq.2) then + nbytes = element_size * REAL(array_size,kind=REAL64) * 3 + write(printout(1),'(a12)') "Triad" + else if (selection.eq.3) then + nbytes = element_size * REAL(array_size,kind=REAL64) * 4 + write(printout(1),'(a12)') "Nstream" + endif + if (csv) then + 
write(printout(2),'(i20)') num_times + write(printout(3),'(i20)') array_size + write(printout(4),'(i20)') element_size + write(printout(5),'(i20)') INT(scaling*nbytes/tmin) + write(printout(6),'(f20.8)') tmin + write(printout(7),'(f20.8)') tmax + write(printout(8),'(f20.8)') tavg + write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(1))),csv_sep + write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(2))),csv_sep + write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(3))),csv_sep + write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(4))),csv_sep + write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(5))),csv_sep + write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(6))),csv_sep + write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(7))),csv_sep + write(*,'(a,a1)',advance='yes') TRIM(ADJUSTL(printout(8))) + else + write(printout(2),'(f12.3)') scaling*nbytes/tmin + write(printout(3),'(f12.5)') tmin + write(printout(4),'(f12.5)') tmax + write(printout(5),'(f12.5)') tavg + write(*,'(5a12)') ADJUSTL(printout(1:5)) + endif + endif + end block + + call dealloc() + +end program BabelStream diff --git a/src/fortran/make.inc.amd b/src/fortran/make.inc.amd new file mode 100644 index 0000000..a863de8 --- /dev/null +++ b/src/fortran/make.inc.amd @@ -0,0 +1,25 @@ +FC := /opt/rocm/llvm/bin/flang +FC := /global/u1/j/jhammond/AMD/aocc-compiler-3.2.0/bin/flang +FCFLAGS := -std=f2018 -O3 +FCFLAGS += -Wall -Wno-unused-variable + +ifdef MARCH +FCFLAGS += -march=$(MARCH) +else +FCFLAGS += -march=native +endif + +DOCONCURRENT_FLAG = -fopenmp # libomp.so required +ARRAY_FLAG = -fopenmp # libomp.so required +OPENMP_FLAG = -fopenmp +#OPENMP_FLAG += -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx908 +OPENACC_FLAG = -fopenacc +CUDA_FLAG = +SEQUENTIAL_FLAG = + +ifeq ($(IMPLEMENTATION),CUDA) + $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.) 
+endif +ifeq ($(IMPLEMENTATION),CUDAKernels) + $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.) +endif diff --git a/src/fortran/make.inc.arm b/src/fortran/make.inc.arm new file mode 100644 index 0000000..a3e2a67 --- /dev/null +++ b/src/fortran/make.inc.arm @@ -0,0 +1,39 @@ +FC = armflang +FCFLAGS = -std=f2018 -O3 +FCFLAGS += -Wall -Wno-unused-variable + +# MARCH=neoverse-v1,neoverse-n1,icelake-server,znver3,cortex-a78 +ARCH=$(shell uname -m) +ifeq ($(ARCH),aarch64) + ifdef MCPU + FCFLAGS += -mcpu=$(MCPU) + else + FCFLAGS += -mcpu=native + endif +else + ifdef MARCH + FCFLAGS += -march=$(MARCH) + else + FCFLAGS += -march=native + endif +endif + +DOCONCURRENT_FLAG = -fopenmp +ARRAY_FLAG = -fopenmp +OPENMP_FLAG = -fopenmp +OPENACC_FLAG = -fopenacc +CUDA_FLAG = +SEQUENTIAL_FLAG = + +ifeq ($(IMPLEMENTATION),OpenACC) + $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.) +endif +ifeq ($(IMPLEMENTATION),OpenACCArray) + $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.) +endif +ifeq ($(IMPLEMENTATION),CUDA) + $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.) +endif +ifeq ($(IMPLEMENTATION),CUDAKernels) + $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.) +endif diff --git a/src/fortran/make.inc.cray b/src/fortran/make.inc.cray new file mode 100644 index 0000000..dae4e75 --- /dev/null +++ b/src/fortran/make.inc.cray @@ -0,0 +1,18 @@ +FC := ftn +FCFLAGS = -e F -O3 + +DOCONCURRENT_FLAG = -h thread_do_concurrent -DCRAY_THREAD_DOCONCURRENT +ARRAY_FLAG = -h autothread +OPENMP_FLAG = -h omp +OPENACC_FLAG = -h acc +# CPU only +OPENACC_FLAG += -h omp +CUDA_FLAG = +SEQUENTIAL_FLAG = + +ifeq ($(IMPLEMENTATION),CUDA) + $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.) +endif +ifeq ($(IMPLEMENTATION),CUDAKernels) + $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.) 
+endif diff --git a/src/fortran/make.inc.fj b/src/fortran/make.inc.fj new file mode 100644 index 0000000..b4761e5 --- /dev/null +++ b/src/fortran/make.inc.fj @@ -0,0 +1,21 @@ +FC := frt +FCFLAGS = -X08 -Kfast -KA64FX -KSVE -KARMV8_3_A -Kzfill=100 -Kprefetch_sequential=soft -Kprefetch_line=8 -Kprefetch_line_L2=16 -Koptmsg=2 -Keval -DUSE_OMP_GET_WTIME=1 # FJ Fortran system_clock is low resolution + +DOCONCURRENT_FLAG = -Kparallel,reduction -DNOTSHARED +ARRAY_FLAG = -Kparallel,reduction +OPENMP_FLAG = -fopenmp +OPENACC_FLAG = +# CPU only +OPENACC_FLAG += +CUDA_FLAG = +SEQUENTIAL_FLAG = + +ifeq ($(IMPLEMENTATION),OPENACC) + $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.) +endif +ifeq ($(IMPLEMENTATION),CUDA) + $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.) +endif +ifeq ($(IMPLEMENTATION),CUDAKernels) + $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.) +endif diff --git a/src/fortran/make.inc.gcc b/src/fortran/make.inc.gcc new file mode 100644 index 0000000..f59c8bb --- /dev/null +++ b/src/fortran/make.inc.gcc @@ -0,0 +1,33 @@ +FC = gfortran +FCFLAGS = -std=f2018 -O3 +FCFLAGS += -Wall -Wno-unused-dummy-argument -Wno-unused-variable + +# MARCH=neoverse-v1,neoverse-n1,icelake-server,znver3,cortex-a78ae +ARCH=$(shell uname -m) +ifeq ($(ARCH),aarch64) + ifdef MCPU + FCFLAGS += -mcpu=$(MCPU) + else + FCFLAGS += -mcpu=native + endif +else + ifdef MARCH + FCFLAGS += -march=$(MARCH) + else + FCFLAGS += -march=native + endif +endif + +DOCONCURRENT_FLAG = -ftree-parallelize-loops=4 +ARRAY_FLAG = +OPENMP_FLAG = -fopenmp +OPENACC_FLAG = -fopenacc +CUDA_FLAG = +SEQUENTIAL_FLAG = + +ifeq ($(IMPLEMENTATION),CUDA) + $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.) +endif +ifeq ($(IMPLEMENTATION),CUDAKernels) + $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.) 
+endif diff --git a/src/fortran/make.inc.nvhpc b/src/fortran/make.inc.nvhpc new file mode 100644 index 0000000..dd4c442 --- /dev/null +++ b/src/fortran/make.inc.nvhpc @@ -0,0 +1,70 @@ +FC := nvfortran +#FCFLAGS := -O3 -Minform=inform -Minfo=all +FCFLAGS := -O3 -Minform=warn + +#TARGET=gpu +TARGET=multicore + +NVARCH=$(shell which nvidia-smi > /dev/null && nvidia-smi -q | grep "Product Architecture") +ifeq ($(findstring Ampere,$(NVARCH)),Ampere) + $(info Ampere detected) + GPU = cc80 +endif +ifeq ($(findstring Turing,$(NVARCH)),Turing) + $(info Turing detected) + GPU = cc75 +endif +ifeq ($(findstring Volta,$(NVARCH)),Volta) + $(info Volta detected) + GPU = cc70 +endif +ifeq ($(findstring Pascal,$(NVARCH)),Pascal) + $(info Pascal detected) + GPU = cc60,cc61 +endif +ifeq ($(shell which jetson_clocks > /dev/null && echo 1),1) + $(info Jetson AGX Orin detected) + GPU = ccn87,cc86 + # figure out Xavier later + #GPU = cc72 +endif +ifeq ($(GPU),) + $(error Your GPU architecture could not be detected. Set it manually.) +endif +GPUFLAG = -gpu=$(GPU) + +# MARCH=neoverse-v1,neoverse-n1,zen3 +ARCH=$(shell uname -m) +ifdef MARCH + ifeq ($(ARCH),aarch64) + ifeq ($(MARCH),neoverse-n1) + FCFLAGS += -tp=$(MARCH) + else + ifeq ($(MARCH),neoverse-v1) + FCFLAGS += -tp=$(MARCH) + else + FCFLAGS += -tp=native + endif + endif + else + FCFLAGS += -tp=$(MARCH) + endif +else + FCFLAGS += -tp=native +endif + +# this is to allow apples-to-apples comparison with DC in non-DC GPU impls +# set exactly one of these! 
+#MANAGED = -DUSE_MANAGED -gpu=managed +#DEVICE = -DUSE_DEVICE -cuda -gpu=nomanaged + +DOCONCURRENT_FLAG = $(GPUFLAG) -stdpar=$(TARGET) $(DEVICE) +ARRAY_FLAG = $(GPUFLAG) -stdpar=$(TARGET) $(MANAGED) +OPENMP_FLAG = $(GPUFLAG) -mp=$(TARGET) $(MANAGED) +OPENACC_FLAG = $(GPUFLAG) -acc=$(TARGET) $(MANAGED) +CUDA_FLAG = $(GPUFLAG) -cuda -acc=gpu $(MANAGED) +SEQUENTIAL_FLAG = + +ifeq ($(IMPLEMENTATION),OpenMPTaskloop) + $(error IMPLEMENTATION=OpenMPTaskloop is not supported by this compiler.) +endif diff --git a/src/fortran/make.inc.oneapi b/src/fortran/make.inc.oneapi new file mode 100644 index 0000000..b7e003c --- /dev/null +++ b/src/fortran/make.inc.oneapi @@ -0,0 +1,32 @@ +FC := ifx +FCFLAGS = -std18 +FCFLAGS += -Ofast -xHOST +FCFLAGS += -qopt-zmm-usage=low + +ifeq ($(FC),ifort) + FCFLAGS += -qopt-streaming-stores=always + PARALLEL = -parallel +endif + +DOCONCURRENT_FLAG = -qopenmp $(PARALLEL) +ARRAY_FLAG = -qopenmp $(PARALLEL) +OPENMP_FLAG = -qopenmp +ifeq ($(FC),ifx) + OPENMP_FLAG += -fopenmp-targets=spir64 -DUSE_FLOAT=1 +endif +OPENACC_FLAG = +CUDA_FLAG = +SEQUENTIAL_FLAG = + +ifeq ($(IMPLEMENTATION),OpenACC) + $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.) +endif +ifeq ($(IMPLEMENTATION),OpenACCArray) + $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.) +endif +ifeq ($(IMPLEMENTATION),CUDA) + $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.) +endif +ifeq ($(IMPLEMENTATION),CUDAKernels) + $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.) 
+endif diff --git a/src/fortran/run.sh b/src/fortran/run.sh new file mode 100755 index 0000000..2b41bab --- /dev/null +++ b/src/fortran/run.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +cat ./run.sh + +if [ `uname -s` == Darwin ] ; then + NUM_HWTHREADS=`sysctl -n hw.ncpu` + MEMORY_BYTES=`sysctl -n hw.memsize` +else + NUM_HWTHREADS=`nproc` + MEMORY_KILOS=`grep MemTotal /proc/meminfo | awk '{print $2}'` +fi + +M=128 + +export OMP_NUM_THREADS=8 +export OMP_PROC_BIND=close +export OMP_PLACES=threads + +export ACC_NUM_CORES=${OMP_NUM_THREADS} + +AFFCONTROL="numactl -N 0 -m 0 -C `seq -s "," 0 $((${OMP_NUM_THREADS}-1))`" + +for compiler in gcc nvhpc cray oneapi arm amd fj ; do + #if [ "x$compiler" == "xgcc" ] ; then + # export LD_PRELOAD=/usr/lib/gcc/aarch64-linux-gnu/11/libgomp.so + #fi + for implementation in OpenMP OpenMPTaskloop OpenMPWorkshare DoConcurrent Array OpenACC OpenACCArray CUDA CUDAKernel ; do + if [ -f BabelStream.${compiler}.${implementation} ] ; then + echo "BabelStream.${compiler}.${implementation}" + ldd BabelStream.${compiler}.${implementation} + time $AFFCONTROL \ + ./BabelStream.${compiler}.${implementation} -s $((1024*1024*${M})) + fi + done +done From 7e94495da6240bdda21b805002705d360da55c99 Mon Sep 17 00:00:00 2001 From: Kaan Olgu Date: Fri, 27 Jan 2023 14:28:13 +0000 Subject: [PATCH 18/24] Added ICPX support for the OneAPI2023 and later versions since DPCPP is deprecated --- src/sycl2020/model.cmake | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/sycl2020/model.cmake b/src/sycl2020/model.cmake index e7b5a1c..c60b9c9 100644 --- a/src/sycl2020/model.cmake +++ b/src/sycl2020/model.cmake @@ -7,6 +7,7 @@ register_flag_required(SYCL_COMPILER "Compile using the specified SYCL compiler implementation Supported values are ONEAPI-DPCPP - dpc++ that is part of an oneAPI Base Toolkit distribution (https://software.intel.com/content/www/us/en/develop/tools/oneapi/base-toolkit.html) + ONEAPI-ICPX - icpx as a standalone compiler DPCPP - dpc++ as a 
standalone compiler (https://github.com/intel/llvm) HIPSYCL - hipSYCL compiler (https://github.com/illuhad/hipSYCL) COMPUTECPP - ComputeCpp compiler (https://developer.codeplay.com/products/computecpp/ce/home)") @@ -14,6 +15,7 @@ register_flag_required(SYCL_COMPILER register_flag_optional(SYCL_COMPILER_DIR "Absolute path to the selected SYCL compiler directory, most are packaged differently so set the path according to `SYCL_COMPILER`: ONEAPI-DPCPP - not required but `dpcpp` must be on PATH, load oneAPI as per documentation (i.e `source /opt/intel/oneapi/setvars.sh` first) + ONEAPI-ICPX - `icpx` must be used for OneAPI 2023 and later on releases (i.e `source /opt/intel/oneapi/setvars.sh` first) HIPSYCL|DPCPP|COMPUTECPP - set to the root of the binary distribution that contains at least `bin/`, `include/`, and `lib/`" "") @@ -65,6 +67,12 @@ macro(setup) elseif (${SYCL_COMPILER} STREQUAL "ONEAPI-DPCPP") set(CMAKE_CXX_COMPILER dpcpp) register_definitions(CL_TARGET_OPENCL_VERSION=220) + elseif (${SYCL_COMPILER} STREQUAL "ONEAPI-ICPX") + set(CMAKE_CXX_COMPILER icpx) + include_directories(${SYCL_COMPILER_DIR}/include/sycl) + register_definitions(CL_TARGET_OPENCL_VERSION=220) + register_append_cxx_flags(ANY -fsycl) + register_append_link_flags(-fsycl) else () message(FATAL_ERROR "SYCL_COMPILER=${SYCL_COMPILER} is unsupported") endif () From 6a1122e5a34c0ff858f9666333939ecbca779c9d Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Mon, 30 Jan 2023 12:14:17 +0000 Subject: [PATCH 19/24] Update CHANGELOG.md --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 40e2c6d..54725bb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ All notable changes to this project will be documented in this file. - RAJA CUDA CMake build issues resolved. - Fix CUDA memory limit check. - Use long double for `check_solution` in case of large problem size. 
+- OneAPI DPCPP compiler is deprecated in favour of ICPX, so added new build option to SYCL 2020 version. ## [v4.0] - 2021-12-22 From 696ff6a8179b4fbb070070709c595f6e1a2f02a4 Mon Sep 17 00:00:00 2001 From: Thomas Gibson Date: Mon, 13 Mar 2023 10:47:37 -0500 Subject: [PATCH 20/24] Round up dot_num_blocks and remove extra check --- src/hip/HIPStream.cpp | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/src/hip/HIPStream.cpp b/src/hip/HIPStream.cpp index 7fc732d..0db8485 100644 --- a/src/hip/HIPStream.cpp +++ b/src/hip/HIPStream.cpp @@ -33,17 +33,6 @@ HIPStream::HIPStream(const int ARRAY_SIZE, const int device_index) throw std::runtime_error(ss.str()); } - // The array size must be divisible by total number of elements - // moved per block for the dot kernel - if (ARRAY_SIZE % (TBSIZE * dot_elements_per_lane) != 0) - { - std::stringstream ss; - ss << "Array size for the dot kernel must be a multiple of elements operated on per block (" - << TBSIZE * dot_elements_per_lane - << ")."; - throw std::runtime_error(ss.str()); - } - // Set device int count; hipGetDeviceCount(&count); @@ -58,7 +47,8 @@ HIPStream::HIPStream(const int ARRAY_SIZE, const int device_index) std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl; array_size = ARRAY_SIZE; - dot_num_blocks = array_size / (TBSIZE * dot_elements_per_lane); + // Round dot_num_blocks up to next multiple of (TBSIZE * dot_elements_per_lane) + dot_num_blocks = (array_size + (TBSIZE * dot_elements_per_lane - 1)) / (TBSIZE * dot_elements_per_lane); // Allocate the host array for partial sums for dot kernels using hipHostMalloc. // This creates an array on the host which is visible to the device. 
However, it requires From 8b862f09b3a9818bd3ae14353b0c036b714ebdba Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 2 May 2023 15:18:42 +0100 Subject: [PATCH 21/24] Update CHANGELOG.md --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 54725bb..4efcbcc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ All notable changes to this project will be documented in this file. - Fix CUDA memory limit check. - Use long double for `check_solution` in case of large problem size. - OneAPI DPCPP compiler is deprecated in favour of ICPX, so added new build option to SYCL 2020 version. +- Updates to the HIP kernels and API usage. ## [v4.0] - 2021-12-22 From 893af9f5d024a881ab5876bcdf567cd51a0ae478 Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Sat, 10 Jun 2023 21:08:29 +0100 Subject: [PATCH 22/24] Fix compatibility with Kokkos 4+ --- src/kokkos/KokkosStream.cpp | 2 +- src/kokkos/KokkosStream.hpp | 3 --- src/kokkos/model.cmake | 12 ++++++------ 3 files changed, 7 insertions(+), 10 deletions(-) diff --git a/src/kokkos/KokkosStream.cpp b/src/kokkos/KokkosStream.cpp index 00efe92..9cf32eb 100644 --- a/src/kokkos/KokkosStream.cpp +++ b/src/kokkos/KokkosStream.cpp @@ -1,4 +1,4 @@ -// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// Copyright (c) 2015-23 Tom Deakin, Simon McIntosh-Smith, Wei-Chen (Tom) Lin // University of Bristol HPC // // For full license terms please see the LICENSE file distributed with this diff --git a/src/kokkos/KokkosStream.hpp b/src/kokkos/KokkosStream.hpp index 3aa7cf5..a410a86 100644 --- a/src/kokkos/KokkosStream.hpp +++ b/src/kokkos/KokkosStream.hpp @@ -10,9 +10,6 @@ #include #include -#include -#include - #include "Stream.h" #define IMPLEMENTATION_STRING "Kokkos" diff --git a/src/kokkos/model.cmake b/src/kokkos/model.cmake index a95fdba..927bc68 100644 --- a/src/kokkos/model.cmake +++ b/src/kokkos/model.cmake @@ -10,29 +10,29 @@ register_flag_optional(KOKKOS_IN_TREE See 
https://github.com/kokkos/kokkos/blob/master/BUILD.md for all available options" "") register_flag_optional(KOKKOS_IN_PACKAGE - "Use if Kokkos is part of a package dependency: - Path to package R-Path containing Kokkos libs" "") + "Absolute path to package R-Path containing Kokkos libs. + Use this instead of KOKKOS_IN_TREE if Kokkos is from a package manager like Spack." "") # compiler vendor and arch specific flags set(KOKKOS_FLAGS_CPU_INTEL -qopt-streaming-stores=always) macro(setup) - set(CMAKE_CXX_STANDARD 14) + set(CMAKE_CXX_STANDARD 17) # Kokkos 4+ requires CXX >= 17 cmake_policy(SET CMP0074 NEW) #see https://github.com/kokkos/kokkos/blob/master/BUILD.md if (EXISTS "${KOKKOS_IN_TREE}") - message(STATUS "Building using in-tree Kokkos source at `${KOKKOS_IN_TREE}`") + message(STATUS "Build using in-tree Kokkos source at `${KOKKOS_IN_TREE}`") add_subdirectory(${KOKKOS_IN_TREE} ${CMAKE_BINARY_DIR}/kokkos) register_link_library(Kokkos::kokkos) elseif (EXISTS "${KOKKOS_IN_PACKAGE}") - message(STATUS "Building using packaged Kokkos at `${KOKKOS_IN_PACKAGE}`") + message(STATUS "Build using packaged Kokkos at `${KOKKOS_IN_PACKAGE}`") set (Kokkos_DIR "${KOKKOS_IN_PACKAGE}/lib64/cmake/Kokkos") find_package(Kokkos REQUIRED) register_link_library(Kokkos::kokkos) else() - message(FATAL_ERROR "Neither `${KOKKOS_IN_TREE}`, or `${KOKKOS_IN_PACKAGE}` exists") + message(FATAL_ERROR "Neither `KOKKOS_IN_TREE`, or `KOKKOS_IN_PACKAGE` was set!") endif () register_append_compiler_and_arch_specific_cxx_flags( From 092ee677647a35e3967849635ebf9fb8f888d3b4 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Mon, 12 Jun 2023 15:49:59 +0100 Subject: [PATCH 23/24] Change CUDA DOT thread-blocks to 1024 This improves the performance on Ampere (A100) GPUs. Fixes #137. 
--- src/cuda/CUDAStream.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cuda/CUDAStream.h b/src/cuda/CUDAStream.h index 83b8c66..bb3f866 100644 --- a/src/cuda/CUDAStream.h +++ b/src/cuda/CUDAStream.h @@ -22,7 +22,7 @@ #endif #define TBSIZE 1024 -#define DOT_NUM_BLOCKS 256 +#define DOT_NUM_BLOCKS 1024 template class CUDAStream : public Stream From 7643de8d0964bbd084d660bbe8d6a8e900fd5726 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Mon, 12 Jun 2023 16:38:31 +0100 Subject: [PATCH 24/24] update changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4efcbcc..3b8aa1b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ All notable changes to this project will be documented in this file. - Use long double for `check_solution` in case of large problem size. - OneAPI DPCPP compiler is deprecated in favour of ICPX, so added new build option to SYCL 2020 version. - Updates to the HIP kernels and API usage. +- Number of thread-blocks in CUDA dot kernel implementation changed to 1024. ## [v4.0] - 2021-12-22