Merge pull request #169 from UoB-HPC/develop

Merge develop for v5.0
This commit is contained in:
Tom Deakin 2023-10-12 11:11:33 +01:00 committed by GitHub
commit f3801aeac2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
115 changed files with 7984 additions and 1855 deletions

View File

@ -12,12 +12,12 @@ on:
jobs: jobs:
test-rust: test-rust:
runs-on: ubuntu-18.04 runs-on: ubuntu-22.04
defaults: defaults:
run: run:
working-directory: ./src/rust/rust-stream working-directory: ./src/rust/rust-stream
steps: steps:
- uses: actions/checkout@v2 - uses: actions/checkout@v4
- name: Setup project - name: Setup project
run: rustup install nightly run: rustup install nightly
- name: Compile project - name: Compile project
@ -28,12 +28,12 @@ jobs:
run: ./target/release/rust-stream --arraysize 2048 run: ./target/release/rust-stream --arraysize 2048
test-java: test-java:
runs-on: ubuntu-18.04 runs-on: ubuntu-22.04
defaults: defaults:
run: run:
working-directory: ./src/java/java-stream working-directory: ./src/java/java-stream
steps: steps:
- uses: actions/checkout@v2 - uses: actions/checkout@v4
- name: Test build project - name: Test build project
run: ./mvnw clean package run: ./mvnw clean package
- name: Test run - name: Test run
@ -41,12 +41,12 @@ jobs:
run: java -jar target/java-stream.jar --arraysize 2048 run: java -jar target/java-stream.jar --arraysize 2048
test-julia: test-julia:
runs-on: ubuntu-18.04 runs-on: ubuntu-22.04
defaults: defaults:
run: run:
working-directory: ./src/julia/JuliaStream.jl working-directory: ./src/julia/JuliaStream.jl
steps: steps:
- uses: actions/checkout@v2 - uses: actions/checkout@v4
- name: Setup project - name: Setup project
run: julia --project -e 'import Pkg; Pkg.instantiate()' run: julia --project -e 'import Pkg; Pkg.instantiate()'
- name: Test run PlainStream.jl - name: Test run PlainStream.jl
@ -70,14 +70,22 @@ jobs:
test-cpp: test-cpp:
runs-on: ubuntu-18.04 runs-on: ubuntu-22.04
steps: steps:
- uses: actions/checkout@v2 - name: Maximize build space
uses: easimon/maximize-build-space@v8
with:
root-reserve-mb: 8192
swap-size-mb: 512
remove-android: 'true'
remove-codeql: 'true'
- uses: actions/checkout@v4
- name: Cache compiler - name: Cache compiler
if: ${{ !env.ACT }} if: ${{ !env.ACT }}
id: prepare-compilers id: prepare-compilers
uses: actions/cache@v2 uses: actions/cache@v3
with: with:
path: ./compilers path: ./compilers
key: ${{ runner.os }}-${{ hashFiles('./src/ci-prepare-bionic.sh') }} key: ${{ runner.os }}-${{ hashFiles('./src/ci-prepare-bionic.sh') }}
@ -90,9 +98,9 @@ jobs:
run: source ./src/ci-prepare-bionic.sh ./compilers VARS false || true run: source ./src/ci-prepare-bionic.sh ./compilers VARS false || true
# Enable tmate debugging of manually-triggered workflows if the input option was provided # Enable tmate debugging of manually-triggered workflows if the input option was provided
- name: Setup tmate session # - name: Setup tmate session
uses: mxschmitt/action-tmate@v3 # uses: mxschmitt/action-tmate@v3
if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.debug_enabled }} # if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.debug_enabled }}
- name: Test compile gcc @ CMake 3.13 - name: Test compile gcc @ CMake 3.13
if: ${{ ! cancelled() }} if: ${{ ! cancelled() }}
@ -168,3 +176,64 @@ jobs:
- name: Test compile hipsycl @ CMake 3.18 - name: Test compile hipsycl @ CMake 3.18
if: ${{ ! cancelled() }} if: ${{ ! cancelled() }}
run: ./src/ci-test-compile.sh ./build hipsycl all ${{ env.CMAKE_3_18_BIN }} run: ./src/ci-test-compile.sh ./build hipsycl all ${{ env.CMAKE_3_18_BIN }}
- name: Test compile gcc @ CMake 3.20
if: ${{ ! cancelled() }}
run: ./src/ci-test-compile.sh ./build gcc all ${{ env.CMAKE_3_20_BIN }}
- name: Test compile clang @ CMake 3.20
if: ${{ ! cancelled() }}
run: ./src/ci-test-compile.sh ./build clang all ${{ env.CMAKE_3_20_BIN }}
- name: Test compile nvhpc @ CMake 3.20
if: ${{ ! cancelled() }}
run: ./src/ci-test-compile.sh ./build nvhpc all ${{ env.CMAKE_3_20_BIN }}
- name: Test compile aocc @ CMake 3.20
if: ${{ ! cancelled() }}
run: ./src/ci-test-compile.sh ./build aocc all ${{ env.CMAKE_3_20_BIN }}
- name: Test compile aomp @ CMake 3.20
if: ${{ ! cancelled() }}
run: ./src/ci-test-compile.sh ./build aomp all ${{ env.CMAKE_3_20_BIN }}
- name: Test compile hip @ CMake 3.20
if: ${{ ! cancelled() }}
run: ./src/ci-test-compile.sh ./build hip all ${{ env.CMAKE_3_20_BIN }}
- name: Test compile dpcpp @ CMake 3.20
if: ${{ ! cancelled() }}
run: ./src/ci-test-compile.sh ./build dpcpp all ${{ env.CMAKE_3_20_BIN }}
- name: Test compile hipsycl @ CMake 3.20
if: ${{ ! cancelled() }}
run: ./src/ci-test-compile.sh ./build hipsycl all ${{ env.CMAKE_3_20_BIN }}
- name: Test compile gcc @ CMake 3.24
if: ${{ ! cancelled() }}
run: ./src/ci-test-compile.sh ./build gcc all ${{ env.CMAKE_3_24_BIN }}
- name: Test compile clang @ CMake 3.24
if: ${{ ! cancelled() }}
run: ./src/ci-test-compile.sh ./build clang all ${{ env.CMAKE_3_24_BIN }}
- name: Test compile nvhpc @ CMake 3.24
if: ${{ ! cancelled() }}
run: ./src/ci-test-compile.sh ./build nvhpc all ${{ env.CMAKE_3_24_BIN }}
- name: Test compile aocc @ CMake 3.24
if: ${{ ! cancelled() }}
run: ./src/ci-test-compile.sh ./build aocc all ${{ env.CMAKE_3_24_BIN }}
- name: Test compile aomp @ CMake 3.24
if: ${{ ! cancelled() }}
run: ./src/ci-test-compile.sh ./build aomp all ${{ env.CMAKE_3_24_BIN }}
- name: Test compile hip @ CMake 3.24
if: ${{ ! cancelled() }}
run: ./src/ci-test-compile.sh ./build hip all ${{ env.CMAKE_3_24_BIN }}
- name: Test compile dpcpp @ CMake 3.24
if: ${{ ! cancelled() }}
run: ./src/ci-test-compile.sh ./build dpcpp all ${{ env.CMAKE_3_24_BIN }}
- name: Test compile hipsycl @ CMake 3.24
if: ${{ ! cancelled() }}
run: ./src/ci-test-compile.sh ./build hipsycl all ${{ env.CMAKE_3_24_BIN }}
test-futhark:
runs-on: ubuntu-22.04
steps:
- uses: actions/checkout@v4
- name: Prepare Futhark compiler
uses: diku-dk/install-futhark@HEAD
with:
version: 'latest'
- run: cmake -Bbuild -H. -DMODEL=futhark -DFUTHARK_BACKEND=multicore
- run: cmake --build build

6
.gitignore vendored
View File

@ -10,12 +10,18 @@ sycl-stream
hip-stream hip-stream
tbb-stream tbb-stream
src/fortran/BabelStream
src/fortran/BabelStream.*
*.o *.o
*.bc *.bc
*.sycl *.sycl
*.tar *.tar
*.gz *.gz
*.a *.a
*.mod
*.cub
*.ptx
KokkosCore_config.* KokkosCore_config.*

View File

@ -2,8 +2,32 @@
All notable changes to this project will be documented in this file. All notable changes to this project will be documented in this file.
## Unreleased ## Unreleased
### Added
- Ability to build Kokkos and RAJA versions against existing packages.
- Thrust managed memory.
- HIP managed memory.
- New implementation using SYCL2020 USM (sycl2020-acc) and renamed original `sycl2020` to `sycl2020-acc`.
- New implementation in Fortran
- New implementation in [Futhark](https://futhark-lang.org/)
- Data initialisation and read-back timing for all models, including Java, Scala, Julia, and Rust
- Add support for the latest Aparapi (3.0.0) and TornadoVM (0.15.x) for Java
- JuliaStream.jl published to registry (pending #113)
### Changed ### Changed
- Fix std-data/std-indices compatibility with oneDPL, NVHPC, and AdaptiveCpp (a.k.a. hipSYCL).
- RAJA CUDA CMake build issues resolved. - RAJA CUDA CMake build issues resolved.
- Kokkos build updates (CXX version upgraded to C++17).
- Fix CUDA memory limit check.
- Fix CUDA CMake options for `-DMEM` and `-DCMAKE_CUDA_FLAGS`.
- Use long double for `check_solution` in case of large problem size.
- OneAPI DPCPP compiler is deprecated in favour of ICPX, so added new build option to SYCL 2020 version.
- Updates to the HIP kernels and API usage.
- Number of thread-blocks in CUDA dot kernel implementation changed to 1024.
- Fix compatibility of `sycl2020` (now `sycl2020-acc`) with AdaptiveCpp.
- Bumped Julia compat to 1.9
- Bumped Scala to 3.3.1
- Bumped Rust to 1.74.0-nightly (13e6f24b9 2023-09-23)
- Upgrade CI to Ubuntu 22.04
## [v4.0] - 2021-12-22 ## [v4.0] - 2021-12-22

View File

@ -1,6 +1,10 @@
cmake_minimum_required(VERSION 3.13 FATAL_ERROR) cmake_minimum_required(VERSION 3.13 FATAL_ERROR)
project(BabelStream VERSION 4.0 LANGUAGES CXX) if (CMAKE_VERSION VERSION_GREATER_EQUAL "3.24.0")
cmake_policy(SET CMP0135 NEW)
endif ()
project(BabelStream VERSION 5.0 LANGUAGES CXX C)
# uncomment for debugging build issues: # uncomment for debugging build issues:
#set(CMAKE_VERBOSE_MAKEFILE ON) #set(CMAKE_VERBOSE_MAKEFILE ON)
@ -27,8 +31,6 @@ endmacro()
# the final executable name # the final executable name
set(EXE_NAME babelstream) set(EXE_NAME babelstream)
# select default build type
set(CMAKE_BUILD_TYPE "Release")
# for chrono and some basic CXX features, models can overwrite this if required # for chrono and some basic CXX features, models can overwrite this if required
set(CMAKE_CXX_STANDARD 11) set(CMAKE_CXX_STANDARD 11)
@ -71,6 +73,75 @@ hint_flag(CXX_EXTRA_LINKER_FLAGS "
# Honor user's CXX_EXTRA_LINK_FLAGS # Honor user's CXX_EXTRA_LINK_FLAGS
set(CXX_EXTRA_LINK_FLAGS ${CXX_EXTRA_FLAGS} ${CXX_EXTRA_LINK_FLAGS}) set(CXX_EXTRA_LINK_FLAGS ${CXX_EXTRA_FLAGS} ${CXX_EXTRA_LINK_FLAGS})
option(USE_TBB "Enable the oneTBB library for *supported* models. Enabling this on models that
don't explicitly link against TBB is a no-op, see description of your selected
model on how this is used." OFF)
option(FETCH_TBB "Fetch (download) the oneTBB library for *supported* models. This uses CMake's
FetchContent feature. Specify version by setting FETCH_TBB_VERSION" OFF)
set(FETCH_TBB_VERSION "v2021.10.0" CACHE STRING "Specify version of oneTBB to use if FETCH_TBB is ON")
if (FETCH_TBB)
FetchContent_Declare(
TBB
GIT_REPOSITORY https://github.com/oneapi-src/oneTBB.git
GIT_TAG "${FETCH_TBB_VERSION}"
)
# Don't fail builds on waring (TBB has -Wall while not being free of warnings from unused symbols...)
set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)
set(TBB_STRICT OFF)
# Not using FetchContent_MakeAvailable (CMake>= 3.14) because we need EXCLUDE_FROM_ALL
FetchContent_GetProperties(TBB)
if (NOT TBB_POPULATED)
FetchContent_Populate(TBB)
add_subdirectory(${tbb_SOURCE_DIR} ${tbb_BINARY_DIR} EXCLUDE_FROM_ALL)
endif ()
endif ()
option(USE_ONEDPL "Enable the oneDPL library for *supported* models. Enabling this on models that
don't explicitly link against DPL is a no-op, see description of your selected
model on how this is used." OFF)
option(FETCH_ONEDPL "Fetch (download) the oneDPL library for *supported* models. This uses CMake's
FetchContent feature. Specify version by setting FETCH_ONEDPL_VERSION" OFF)
set(FETCH_ONEDPL_VERSION "oneDPL-2022.2.0-rc1" CACHE STRING "Specify version of oneTBB to use if FETCH_ONEDPL is ON")
if (FETCH_ONEDPL)
FetchContent_Declare(
oneDPL
GIT_REPOSITORY https://github.com/oneapi-src/oneDPL.git
GIT_TAG "${FETCH_ONEDPL_VERSION}"
)
string(TOLOWER ${USE_ONEDPL} ONEDPL_BACKEND)
# XXX oneDPL looks for omp instead of openmp, which mismatches(!) with ONEDPL_PAR_BACKEND if using find_package
if (ONEDPL_BACKEND STREQUAL "openmp")
set(ONEDPL_BACKEND omp)
endif ()
# Not using FetchContent_MakeAvailable (CMake>= 3.14) because we need EXCLUDE_FROM_ALL
FetchContent_GetProperties(oneDPL)
if (NOT oneDPL_POPULATED)
FetchContent_Populate(oneDPL)
if (USE_TBB)
macro(find_package NAME)
if ("${NAME}" STREQUAL "TBB")
message(STATUS "Discarding oneDPL's call to find_package(${NAME} ${ARGN})")
else ()
_find_package(${NAME} ${ARGN})
endif ()
endmacro()
endif ()
add_subdirectory(${onedpl_SOURCE_DIR} ${onedpl_BINARY_DIR} EXCLUDE_FROM_ALL)
# Fixup oneDPL's omission on setting DPCPP definitions.
# We do this after the creation of the oneDPL target.
if (ONEDPL_BACKEND MATCHES "^(dpcpp|dpcpp_only)$")
target_compile_definitions(oneDPL INTERFACE ONEDPL_USE_DPCPP_BACKEND=1)
endif ()
endif ()
endif ()
# include our macros # include our macros
include(cmake/register_models.cmake) include(cmake/register_models.cmake)
@ -84,12 +155,14 @@ register_model(hip HIP HIPStream.cpp)
register_model(cuda CUDA CUDAStream.cu) register_model(cuda CUDA CUDAStream.cu)
register_model(kokkos KOKKOS KokkosStream.cpp) register_model(kokkos KOKKOS KokkosStream.cpp)
register_model(sycl SYCL SYCLStream.cpp) register_model(sycl SYCL SYCLStream.cpp)
register_model(sycl2020 SYCL2020 SYCLStream2020.cpp) register_model(sycl2020-acc SYCL2020 SYCLStream2020.cpp)
register_model(sycl2020-usm SYCL2020 SYCLStream2020.cpp)
register_model(acc ACC ACCStream.cpp) register_model(acc ACC ACCStream.cpp)
# defining RAJA collides with the RAJA namespace so USE_RAJA # defining RAJA collides with the RAJA namespace so USE_RAJA
register_model(raja USE_RAJA RAJAStream.cpp) register_model(raja USE_RAJA RAJAStream.cpp)
register_model(tbb TBB TBBStream.cpp) register_model(tbb TBB TBBStream.cpp)
register_model(thrust THRUST ThrustStream.cu) # Thrust uses cu, even for rocThrust register_model(thrust THRUST ThrustStream.cu) # Thrust uses cu, even for rocThrust
register_model(futhark FUTHARK FutharkStream.cpp)
set(USAGE ON CACHE BOOL "Whether to print all custom flags for the selected model") set(USAGE ON CACHE BOOL "Whether to print all custom flags for the selected model")
@ -101,6 +174,12 @@ else ()
message(STATUS "Selected model : ${MODEL}") message(STATUS "Selected model : ${MODEL}")
endif () endif ()
if (MODEL STREQUAL "sycl2020")
message(FATAL_ERROR "
Model sycl2020 has been renamed to sycl2020-acc, and a new sycl2020-usm model is now available.
Please use sycl2020-acc for SYCL2020 style accessors and sycl2020-usm for USM")
endif ()
# load the $MODEL.cmake file and setup the correct IMPL_* based on $MODEL # load the $MODEL.cmake file and setup the correct IMPL_* based on $MODEL
load_model(${MODEL}) load_model(${MODEL})
@ -151,6 +230,7 @@ include_directories(src)
add_executable(${EXE_NAME} ${IMPL_SOURCES} src/main.cpp) add_executable(${EXE_NAME} ${IMPL_SOURCES} src/main.cpp)
target_link_libraries(${EXE_NAME} PUBLIC ${LINK_LIBRARIES}) target_link_libraries(${EXE_NAME} PUBLIC ${LINK_LIBRARIES})
target_compile_definitions(${EXE_NAME} PUBLIC ${IMPL_DEFINITIONS}) target_compile_definitions(${EXE_NAME} PUBLIC ${IMPL_DEFINITIONS})
target_include_directories(${EXE_NAME} PUBLIC ${IMPL_DIRECTORIES})
if (CXX_EXTRA_LIBRARIES) if (CXX_EXTRA_LIBRARIES)
target_link_libraries(${EXE_NAME} PUBLIC ${CXX_EXTRA_LIBRARIES}) target_link_libraries(${EXE_NAME} PUBLIC ${CXX_EXTRA_LIBRARIES})

View File

@ -38,9 +38,10 @@ BabelStream is currently implemented in the following parallel programming model
- C++ Parallel STL - C++ Parallel STL
- Kokkos - Kokkos
- RAJA - RAJA
- SYCL and SYCL 2020 - SYCL and SYCL2020 (USM and accessors)
- TBB - TBB
- Thrust (via CUDA or HIP) - Thrust (via CUDA or HIP)
- Futhark
This project also contains implementations in alternative languages with different build systems: This project also contains implementations in alternative languages with different build systems:
* Julia - [JuliaStream.jl](./src/julia/JuliaStream.jl) * Julia - [JuliaStream.jl](./src/julia/JuliaStream.jl)
@ -101,7 +102,7 @@ The source for each model's implementations are located in `./src/<model>`.
Currently available models are: Currently available models are:
``` ```
omp;ocl;std;std20;hip;cuda;kokkos;sycl;sycl2020;acc;raja;tbb;thrust omp;ocl;std-data;std-indices;std-ranges;hip;cuda;kokkos;sycl;sycl2020;acc;raja;tbb;thrust;futhark
``` ```
#### Overriding default flags #### Overriding default flags
@ -165,7 +166,7 @@ The `MODEL` variant selects one implementation of BabelStream to build.
Currently available models are: Currently available models are:
``` ```
omp;ocl;std;std20;hip;cuda;kokkos;sycl;sycl2020;acc;raja;tbb;thrust omp;ocl;std-data;std-indices;std-ranges;hip;cuda;kokkos;sycl;sycl2020-acc;sycl2020-usm;acc;raja;tbb;thrust
``` ```
### GNU Make ### GNU Make

2
src/.gitignore vendored
View File

@ -16,6 +16,8 @@
**/*.gz **/*.gz
**/*.a **/*.a
**/*.swp
**/KokkosCore_Config_* **/KokkosCore_Config_*
**/.DS_Store **/.DS_Store

View File

@ -149,7 +149,7 @@ void ACCStream<T>::nstream()
template <class T> template <class T>
T ACCStream<T>::dot() T ACCStream<T>::dot()
{ {
T sum = 0.0; T sum{};
int array_size = this->array_size; int array_size = this->array_size;
T * restrict a = this->a; T * restrict a = this->a;

View File

@ -83,6 +83,8 @@ get() {
if [ ! -f "$name" ] || [ "$FORCE_DOWNLOAD" = true ]; then if [ ! -f "$name" ] || [ "$FORCE_DOWNLOAD" = true ]; then
echo "$name not found, downloading..." echo "$name not found, downloading..."
wget -q --show-progress --progress=bar:force:noscroll "$pkg_url" -O "$name" wget -q --show-progress --progress=bar:force:noscroll "$pkg_url" -O "$name"
else
echo "$name found, skipping download..."
fi fi
fi fi
} }
@ -92,13 +94,15 @@ get_and_untar() {
local pkg_url="$2" local pkg_url="$2"
if [ "$SETUP" = true ]; then if [ "$SETUP" = true ]; then
if [ ! -f "$name" ] || [ "$FORCE_DOWNLOAD" = true ]; then if [ ! -f "$name" ] || [ "$FORCE_DOWNLOAD" = true ]; then
echo "$name not found, downloading..." echo "$name not found, downloading ($pkg_url)..."
wget -q --show-progress --progress=bar:force:noscroll "$pkg_url" -O "$name" wget -q --show-progress --progress=bar:force:noscroll "$pkg_url" -O "$name"
fi fi
echo "Preparing to extract $name ..." echo "Preparing to extract $name ..."
tar -xf "$name" tar -xf "$name"
echo "$name extracted, deleting archive ..." echo "$name extracted, deleting archive ..."
rm -f "$name" # delete for space rm -f "$name" # delete for space
else
echo "Skipping setup for $name ($pkg_url)..."
fi fi
} }
@ -119,10 +123,10 @@ verify_dir_exists() {
setup_aocc() { setup_aocc() {
echo "Preparing AOCC" echo "Preparing AOCC"
local aocc_ver="2.3.0" local aocc_ver="4.0.0"
local tarball="aocc-$aocc_ver.tar.xz" local tarball="aocc-$aocc_ver.tar.xz"
# XXX it's actually XZ compressed, so it should be tar.xz # XXX it's actually XZ compressed, so it should be tar.xz
local AOCC_URL="http://developer.amd.com/wordpress/media/files/aocc-compiler-2.3.0.tar" local AOCC_URL="https://download.amd.com/developer/eula/aocc-compiler/aocc-compiler-${aocc_ver}.tar"
# local AOCC_URL="http://localhost:8000/aocc-compiler-2.3.0.tar" # local AOCC_URL="http://localhost:8000/aocc-compiler-2.3.0.tar"
get_and_untar "$tarball" "$AOCC_URL" get_and_untar "$tarball" "$AOCC_URL"
@ -134,20 +138,26 @@ setup_aocc() {
setup_nvhpc() { setup_nvhpc() {
echo "Preparing Nvidia HPC SDK" echo "Preparing Nvidia HPC SDK"
local tarball="nvhpc.tar.gz" local nvhpc_ver="23.1" # TODO FIXME > 23.1 has a bug with -A
# local url="http://localhost:8000/nvhpc_2021_219_Linux_x86_64_cuda_11.4.tar.gz" local nvhpc_release="2023_231"
local url="https://developer.download.nvidia.com/hpc-sdk/21.9/nvhpc_2021_219_Linux_x86_64_cuda_11.4.tar.gz" local cuda_ver="12.0"
local tarball="nvhpc_$nvhpc_ver.tar.gz"
local url="https://developer.download.nvidia.com/hpc-sdk/$nvhpc_ver/nvhpc_${nvhpc_release}_Linux_x86_64_cuda_$cuda_ver.tar.gz"
get_and_untar "$tarball" "$url" get_and_untar "$tarball" "$url"
local sdk_dir="$PWD/nvhpc_2021_219_Linux_x86_64_cuda_11.4/install_components/Linux_x86_64/21.9" local sdk_dir="$PWD/nvhpc_${nvhpc_release}_Linux_x86_64_cuda_$cuda_ver/install_components/Linux_x86_64/$nvhpc_ver"
local bin_dir="$sdk_dir/compilers/bin" local bin_dir="$sdk_dir/compilers/bin"
"$bin_dir/makelocalrc" "$bin_dir" -x "$bin_dir/makelocalrc" -d "$bin_dir" -x -gpp g++-12 -gcc gcc-12 -g77 gfortran-12
export_var NVHPC_SDK_DIR "$sdk_dir" export_var NVHPC_SDK_DIR "$sdk_dir"
export_var NVHPC_CUDA_DIR "$sdk_dir/cuda/11.4" export_var NVHPC_CUDA_DIR "$sdk_dir/cuda/$cuda_ver"
export_var NVHPC_NVCXX "$bin_dir/nvc++" export_var NVHPC_NVCXX "$bin_dir/nvc++"
export_var NVHPC_NVCC "$sdk_dir/cuda/11.4/bin/nvcc" export_var NVHPC_NVCC "$bin_dir/nvcc"
export_var NVHPC_CUDA_VER "$cuda_ver"
# export_var NVHPC_NVCC "$sdk_dir/cuda/$cuda_ver/bin/nvcc"
echo "Installed CUDA versions:" echo "Installed CUDA versions:"
ls "$sdk_dir/cuda" ls "$sdk_dir/cuda"
@ -160,7 +170,8 @@ setup_nvhpc() {
setup_aomp() { setup_aomp() {
echo "Preparing AOMP" echo "Preparing AOMP"
local AOMP_URL="https://github.com/ROCm-Developer-Tools/aomp/releases/download/rel_11.12-0/aomp_Ubuntu1804_11.12-0_amd64.deb" local aomp_ver="18.0-0"
local AOMP_URL="https://github.com/ROCm-Developer-Tools/aomp/releases/download/rel_${aomp_ver}/aomp_Ubuntu2204_${aomp_ver}_amd64.deb"
# local AOMP_URL="http://0.0.0.0:8000/aomp_Ubuntu1804_11.12-0_amd64.deb" # local AOMP_URL="http://0.0.0.0:8000/aomp_Ubuntu1804_11.12-0_amd64.deb"
get_and_install_deb "aomp" "aomp" "$AOMP_URL" get_and_install_deb "aomp" "aomp" "$AOMP_URL"
@ -183,9 +194,10 @@ setup_oclcpu() {
setup_kokkos() { setup_kokkos() {
echo "Preparing Kokkos" echo "Preparing Kokkos"
local kokkos_ver="3.3.01" local kokkos_ver="4.1.00"
local tarball="kokkos-$kokkos_ver.tar.gz" local tarball="kokkos-$kokkos_ver.tar.gz"
local url="https://github.com/kokkos/kokkos/archive/$kokkos_ver.tar.gz" local url="https://github.com/kokkos/kokkos/archive/$kokkos_ver.tar.gz"
# local url="http://localhost:8000/$kokkos_ver.tar.gz" # local url="http://localhost:8000/$kokkos_ver.tar.gz"
@ -197,10 +209,10 @@ setup_kokkos() {
setup_raja() { setup_raja() {
echo "Preparing RAJA" echo "Preparing RAJA"
local raja_ver="0.13.0" local raja_ver="2023.06.1"
local tarball="raja-$raja_ver.tar.gz" local tarball="raja-$raja_ver.tar.gz"
local url="https://github.com/LLNL/RAJA/releases/download/v0.13.0/RAJA-v$raja_ver.tar.gz" local url="https://github.com/LLNL/RAJA/releases/download/v$raja_ver/RAJA-v$raja_ver.tar.gz"
# local url="http://localhost:8000/RAJA-v$raja_ver.tar.gz" # local url="http://localhost:8000/RAJA-v$raja_ver.tar.gz"
get_and_untar "$tarball" "$url" get_and_untar "$tarball" "$url"
@ -211,7 +223,7 @@ setup_raja() {
setup_tbb() { setup_tbb() {
echo "Preparing TBB" echo "Preparing TBB"
local tbb_ver="2021.2.0" local tbb_ver="2021.9.0"
local tarball="oneapi-tbb-$tbb_ver-lin.tgz" local tarball="oneapi-tbb-$tbb_ver-lin.tgz"
local url="https://github.com/oneapi-src/oneTBB/releases/download/v$tbb_ver/oneapi-tbb-$tbb_ver-lin.tgz" local url="https://github.com/oneapi-src/oneTBB/releases/download/v$tbb_ver/oneapi-tbb-$tbb_ver-lin.tgz"
@ -225,9 +237,9 @@ setup_tbb() {
setup_clang_gcc() { setup_clang_gcc() {
sudo apt-get install -y -qq gcc-10-offload-nvptx gcc-10-offload-amdgcn libtbb2 libtbb-dev g++-10 clang libomp-dev sudo apt-get install -y -qq gcc-12-offload-nvptx gcc-12-offload-amdgcn libtbb2 libtbb-dev g++-12 clang libomp-dev libc6
export_var GCC_CXX "$(which g++-10)" export_var GCC_CXX "$(which g++-12)"
verify_bin_exists "$GCC_CXX" verify_bin_exists "$GCC_CXX"
"$GCC_CXX" --version "$GCC_CXX" --version
@ -248,7 +260,11 @@ setup_clang_gcc() {
} }
setup_rocm() { setup_rocm() {
sudo apt-get install -y -qq rocm-dev rocthrust-dev if [ "$SETUP" = true ]; then
sudo apt-get install -y rocm-dev rocthrust-dev
else
echo "Skipping apt setup for ROCm"
fi
export_var ROCM_PATH "/opt/rocm" export_var ROCM_PATH "/opt/rocm"
export_var PATH "$ROCM_PATH/bin:$PATH" # ROCm needs this for many of their libraries' CMake build to work export_var PATH "$ROCM_PATH/bin:$PATH" # ROCm needs this for many of their libraries' CMake build to work
export_var HIP_CXX "$ROCM_PATH/bin/hipcc" export_var HIP_CXX "$ROCM_PATH/bin/hipcc"
@ -259,7 +275,7 @@ setup_rocm() {
setup_dpcpp() { setup_dpcpp() {
local nightly="20210106" local nightly="20230615"
local tarball="dpcpp-$nightly.tar.gz" local tarball="dpcpp-$nightly.tar.gz"
local url="https://github.com/intel/llvm/releases/download/sycl-nightly/$nightly/dpcpp-compiler.tar.gz" local url="https://github.com/intel/llvm/releases/download/sycl-nightly/$nightly/dpcpp-compiler.tar.gz"
@ -276,22 +292,22 @@ setup_dpcpp() {
setup_hipsycl() { setup_hipsycl() {
sudo apt-get install -y -qq libboost-fiber-dev libboost-context-dev sudo apt-get install -y -qq libboost-fiber-dev libboost-context-dev
local hipsycl_ver="0.9.0" local hipsycl_ver="0.9.1"
local tarball="v$hipsycl_ver.tar.gz" local tarball="v$hipsycl_ver.tar.gz"
local install_dir="$PWD/hipsycl_dist_$hipsycl_ver" local install_dir="$PWD/hipsycl_dist_$hipsycl_ver"
local url="https://github.com/illuhad/hipSYCL/archive/v$hipsycl_ver.tar.gz" local url="https://github.com/AdaptiveCpp/AdaptiveCpp/archive/v$hipsycl_ver.tar.gz"
# local url="http://localhost:8000/hipSYCL-$hipsycl_ver.tar.gz" # local url="http://localhost:8000/AdaptiveCpp-$hipsycl_ver.tar.gz"
get_and_untar "$tarball" "$url" get_and_untar "$tarball" "$url"
if [ "$SETUP" = true ]; then if [ "$SETUP" = true ]; then
local src="$PWD/hipSYCL-$hipsycl_ver" local src="$PWD/AdaptiveCpp-$hipsycl_ver"
rm -rf "$src/build" rm -rf "$src/build"
rm -rf "$install_dir" rm -rf "$install_dir"
cmake "-B$src/build" "-H$src" \ cmake "-B$src/build" "-H$src" \
-DCMAKE_C_COMPILER="$(which gcc-10)" \ -DCMAKE_C_COMPILER="$(which gcc-12)" \
-DCMAKE_CXX_COMPILER="$(which g++-10)" \ -DCMAKE_CXX_COMPILER="$(which g++-12)" \
-DCMAKE_INSTALL_PREFIX="$install_dir" \ -DCMAKE_INSTALL_PREFIX="$install_dir" \
-DWITH_ROCM_BACKEND=OFF \ -DWITH_ROCM_BACKEND=OFF \
-DWITH_CUDA_BACKEND=OFF \ -DWITH_CUDA_BACKEND=OFF \
@ -306,25 +322,20 @@ setup_hipsycl() {
check_size check_size
} }
setup_computecpp() {
echo "TODO ComputeCpp requires registration+login to download"
}
if [ "${GITHUB_ACTIONS:-false}" = true ]; then if [ "${GITHUB_ACTIONS:-false}" = true ]; then
echo "Running in GitHub Actions, defaulting to special export" echo "Running in GitHub Actions, defaulting to special export"
TERM=xterm TERM=xterm
export TERM=xterm export TERM=xterm
# drop the lock in case we got one from a failed run # drop the lock in case we got one from a failed run
rm /var/lib/dpkg/lock-frontend || true rm -rf /var/lib/dpkg/lock-frontend || true
rm /var/cache/apt/archives/lock || true rm -rf /var/cache/apt/archives/lock || true
wget -q -O - "https://repo.radeon.com/rocm/rocm.gpg.key" | sudo apt-key add -
echo "deb http://archive.ubuntu.com/ubuntu focal main universe" | sudo tee -a /etc/apt/sources.list
echo 'deb [arch=amd64] https://repo.radeon.com/rocm/apt/4.5 ubuntu main' | sudo tee /etc/apt/sources.list.d/rocm.list
mkdir --parents --mode=0755 /etc/apt/keyrings
wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | gpg --dearmor | sudo tee /etc/apt/keyrings/rocm.gpg > /dev/null
echo 'deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/5.7 jammy main' | sudo tee /etc/apt/sources.list.d/rocm.list
echo -e 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600' | sudo tee /etc/apt/preferences.d/rocm-pin-600
sudo apt-get update -qq sudo apt-get update -qq
sudo apt-get install -y -qq cmake sudo apt-get install cmake gcc g++ libelf-dev libdrm-amdgpu1 libnuma-dev
if [ "$SETUP" = true ]; then if [ "$SETUP" = true ]; then
echo "Deleting extra packages for space in 2 seconds..." echo "Deleting extra packages for space in 2 seconds..."
@ -334,6 +345,7 @@ if [ "${GITHUB_ACTIONS:-false}" = true ]; then
sudo apt-get autoremove -y sudo apt-get autoremove -y
check_size check_size
fi fi
sudo apt-get upgrade -qq
else else
echo "Running locally, defaulting to standard export" echo "Running locally, defaulting to standard export"
fi fi
@ -362,6 +374,18 @@ setup_cmake() {
verify_bin_exists "$CMAKE_3_18_BIN" verify_bin_exists "$CMAKE_3_18_BIN"
"$CMAKE_3_18_BIN" --version "$CMAKE_3_18_BIN" --version
get "cmake-3.20.sh" "$cmake_release/v3.20.4/cmake-3.20.4-linux-x86_64.sh"
chmod +x "./cmake-3.20.sh" && "./cmake-3.20.sh" --skip-license --include-subdir
export_var CMAKE_3_20_BIN "$PWD/cmake-3.20.4-linux-x86_64/bin/cmake"
verify_bin_exists "$CMAKE_3_20_BIN"
"$CMAKE_3_20_BIN" --version
get "cmake-3.24.sh" "$cmake_release/v3.24.4/cmake-3.24.4-linux-x86_64.sh"
chmod +x "./cmake-3.24.sh" && "./cmake-3.24.sh" --skip-license --include-subdir
export_var CMAKE_3_24_BIN "$PWD/cmake-3.24.4-linux-x86_64/bin/cmake"
verify_bin_exists "$CMAKE_3_24_BIN"
"$CMAKE_3_24_BIN" --version
check_size check_size
} }
@ -379,6 +403,10 @@ if [ "$PARALLEL" = true ]; then
setup_tbb & setup_tbb &
wait wait
else else
# these need apt
setup_clang_gcc
setup_rocm
setup_hipsycl
setup_cmake setup_cmake
setup_aocc setup_aocc
setup_oclcpu setup_oclcpu
@ -388,10 +416,6 @@ else
setup_kokkos setup_kokkos
setup_raja setup_raja
setup_tbb setup_tbb
# these need apt
setup_clang_gcc
setup_rocm
setup_hipsycl
fi fi
echo "Done!" echo "Done!"

View File

@ -120,9 +120,20 @@ run_build() {
# CLANG_OMP_OFFLOAD_NVIDIA=false # CLANG_OMP_OFFLOAD_NVIDIA=false
### ###
NV_ARCH_CC="70"
AMD_ARCH="gfx_903" AMD_ARCH="gfx_903"
NV_ARCH="sm_70" NV_ARCH="sm_${NV_ARCH_CC}"
NV_ARCH_CCXY="cuda11.4,cc80" NV_ARCH_CCXY="cuda${NVHPC_CUDA_VER:?},cc80"
check_cmake_ver(){
local current=$("$CMAKE_BIN" --version | head -n 1 | cut -d ' ' -f3)
local required=$1
if [ "$(printf '%s\n' "$required" "$current" | sort -V | head -n1)" = "$required" ]; then
return 0
else
return 1
fi
}
build_gcc() { build_gcc() {
local name="gcc_build" local name="gcc_build"
@ -135,49 +146,61 @@ build_gcc() {
"./$BUILD_DIR/omp_$name/omp-stream" -s 1048576 -n 10 "./$BUILD_DIR/omp_$name/omp-stream" -s 1048576 -n 10
fi fi
# some distributions like Ubuntu bionic implements std par with TBB, so conditionally link it here for use_onedpl in OFF OPENMP TBB; do
run_build $name "${GCC_CXX:?}" std-data "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}" case "$use_onedpl" in
run_build $name "${GCC_CXX:?}" std-indices "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}" OFF) dpl_conditional_flags="-DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}" ;;
run_build $name "${GCC_CXX:?}" std-ranges "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}" *) dpl_conditional_flags="-DFETCH_ONEDPL=ON -DFETCH_TBB=ON -DUSE_TBB=ON -DCXX_EXTRA_FLAGS=-D_GLIBCXX_USE_TBB_PAR_BACKEND=0" ;;
esac
# some distributions like Ubuntu bionic implements std par with TBB, so conditionally link it here
run_build $name "${GCC_CXX:?}" std-data "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl"
run_build $name "${GCC_CXX:?}" std-indices "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl"
run_build $name "${GCC_CXX:?}" std-ranges "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl"
done
run_build $name "${GCC_CXX:?}" tbb "$cxx -DONE_TBB_DIR=$TBB_LIB" run_build $name "${GCC_CXX:?}" tbb "$cxx -DONE_TBB_DIR=$TBB_LIB"
run_build $name "${GCC_CXX:?}" tbb "$cxx" # build TBB again with the system TBB run_build $name "${GCC_CXX:?}" tbb "$cxx" # build TBB again with the system TBB
run_build $name "${GCC_CXX:?}" tbb "$cxx -DUSE_VECTOR=ON" # build with vectors
if [ "${GCC_OMP_OFFLOAD_AMD:-false}" != "false" ]; then if [ "${GCC_OMP_OFFLOAD_AMD:-false}" != "false" ]; then
run_build "amd_$name" "${GCC_CXX:?}" acc "$cxx -DCXX_EXTRA_FLAGS=-foffload=amdgcn-amdhsa" run_build "amd_$name" "${GCC_CXX:?}" acc "$cxx -DCXX_EXTRA_FLAGS=-foffload=amdgcn-amdhsa;-fno-stack-protector;-fcf-protection=none"
run_build "amd_$name" "${GCC_CXX:?}" omp "$cxx -DOFFLOAD=AMD:$AMD_ARCH" run_build "amd_$name" "${GCC_CXX:?}" omp "$cxx -DOFFLOAD=AMD:$AMD_ARCH"
fi fi
if [ "${GCC_OMP_OFFLOAD_NVIDIA:-false}" != "false" ]; then if [ "${GCC_OMP_OFFLOAD_NVIDIA:-false}" != "false" ]; then
run_build "nvidia_$name" "${GCC_CXX:?}" acc "$cxx -DCXX_EXTRA_FLAGS=-foffload=nvptx-none" run_build "nvidia_$name" "${GCC_CXX:?}" acc "$cxx -DCXX_EXTRA_FLAGS=-foffload=nvptx-none;-fno-stack-protector;-fcf-protection=none"
run_build "nvidia_$name" "${GCC_CXX:?}" omp "$cxx -DOFFLOAD=NVIDIA:$NV_ARCH" run_build "nvidia_$name" "${GCC_CXX:?}" omp "$cxx -DOFFLOAD=NVIDIA:$NV_ARCH"
fi fi
run_build $name "${GCC_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH" run_build $name "${GCC_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH"
run_build $name "${GCC_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DMEM=MANAGED" run_build $name "${GCC_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DMEM=MANAGED"
run_build $name "${GCC_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DMEM=PAGEFAULT" run_build $name "${GCC_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DMEM=PAGEFAULT"
# run_build $name "${CC_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_CUDA=ON" if check_cmake_ver "3.16.0"; then
run_build "cuda_$name" "${GCC_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON" # run_build $name "${CC_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_CUDA=ON"
run_build "cuda_$name" "${GCC_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON"
else
echo "Skipping Kokkos models due to CMake version requirement"
fi
run_build $name "${GCC_CXX:?}" ocl "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}" run_build $name "${GCC_CXX:?}" ocl "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}"
run_build $name "${GCC_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?}" if check_cmake_ver "3.20.0"; then
run_build $name "${GCC_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?} -DENABLE_OPENMP=ON"
else
echo "Skipping RAJA models due to CMake version requirement"
fi
# FIXME fails due to https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100102 if check_cmake_ver "3.20.0"; then
# FIXME we also got https://github.com/NVIDIA/nccl/issues/494 run_build "cuda_$name" "${GCC_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?} \
-DENABLE_CUDA=ON \
-DTARGET=NVIDIA \
-DCUDA_TOOLKIT_ROOT_DIR=${NVHPC_CUDA_DIR:?} \
-DCUDA_ARCH=$NV_ARCH"
else
echo "Skipping RAJA models due to CMake version requirement"
fi
# run_build "cuda_$name" "${GCC_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?} \ if check_cmake_ver "3.18.0"; then # CMake >= 3.15 only due to Nvidia's Thrust CMake requirements
# -DENABLE_CUDA=ON \ run_build $name "${GCC_CXX:?}" thrust "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH_CC -DSDK_DIR=$NVHPC_CUDA_DIR/lib64/cmake -DTHRUST_IMPL=CUDA -DBACKEND=CUDA"
# -DTARGET=NVIDIA \ # run_build $name "${GCC_CXX:?}" thrust "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/lib64/cmake -DTHRUST_IMPL=CUDA -DBACKEND=OMP" # FIXME
# -DCUDA_TOOLKIT_ROOT_DIR=${NVHPC_CUDA_DIR:?} \ run_build $name "${GCC_CXX:?}" thrust "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH_CC -DSDK_DIR=$NVHPC_CUDA_DIR/lib64/cmake -DTHRUST_IMPL=CUDA -DBACKEND=CPP"
# -DCUDA_ARCH=$NV_ARCH"
# CMake >= 3.15 only due to Nvidia's Thrust CMake requirements
local current=$("$CMAKE_BIN" --version | head -n 1 | cut -d ' ' -f3)
local required="3.15.0"
if [ "$(printf '%s\n' "$required" "$current" | sort -V | head -n1)" = "$required" ]; then
run_build $name "${GCC_CXX:?}" thrust "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/include -DTHRUST_IMPL=CUDA -DBACKEND=CUDA"
run_build $name "${GCC_CXX:?}" thrust "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/include -DTHRUST_IMPL=CUDA -DBACKEND=OMP"
run_build $name "${GCC_CXX:?}" thrust "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/include -DTHRUST_IMPL=CUDA -DBACKEND=CPP"
# FIXME CUDA Thrust + TBB throws the following error: # FIXME CUDA Thrust + TBB throws the following error:
# /usr/lib/gcc/x86_64-linux-gnu/9/include/avx512fintrin.h(9146): error: identifier "__builtin_ia32_rndscaless_round" is undefined # /usr/lib/gcc/x86_64-linux-gnu/9/include/avx512fintrin.h(9146): error: identifier "__builtin_ia32_rndscaless_round" is undefined
@ -187,9 +210,9 @@ build_gcc() {
# /usr/lib/gcc/x86_64-linux-gnu/9/include/avx512dqintrin.h(1365): error: identifier "__builtin_ia32_fpclassss" is undefined # /usr/lib/gcc/x86_64-linux-gnu/9/include/avx512dqintrin.h(1365): error: identifier "__builtin_ia32_fpclassss" is undefined
# /usr/lib/gcc/x86_64-linux-gnu/9/include/avx512dqintrin.h(1372): error: identifier "__builtin_ia32_fpclasssd" is undefined # /usr/lib/gcc/x86_64-linux-gnu/9/include/avx512dqintrin.h(1372): error: identifier "__builtin_ia32_fpclasssd" is undefined
# run_build $name "${GCC_CXX:?}" THRUST "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/include -DTHRUST_IMPL=CUDA -DBACKEND=TBB" # run_build $name "${GCC_CXX:?}" THRUST "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/lib64/cmake -DTHRUST_IMPL=CUDA -DBACKEND=TBB"
else else
echo "CMake version ${current} < ${required}, skipping Thrust models" echo "Skipping Thrust models due to CMake version requirement"
fi fi
} }
@ -207,28 +230,39 @@ build_clang() {
run_build "nvidia_$name" "${GCC_CXX:?}" omp "$cxx -DOFFLOAD=NVIDIA:$NV_ARCH" run_build "nvidia_$name" "${GCC_CXX:?}" omp "$cxx -DOFFLOAD=NVIDIA:$NV_ARCH"
fi fi
if check_cmake_ver "3.20.0"; then
run_build $name "${CLANG_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?} -DENABLE_OPENMP=ON"
else
echo "Skipping RAJA models due to CMake version requirement"
fi
run_build $name "${CLANG_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH" run_build $name "${CLANG_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH"
run_build $name "${CLANG_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DMEM=MANAGED" run_build $name "${CLANG_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DMEM=MANAGED"
run_build $name "${CLANG_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DMEM=PAGEFAULT" run_build $name "${CLANG_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DMEM=PAGEFAULT"
run_build $name "${CLANG_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON" if check_cmake_ver "3.16.0"; then
run_build $name "${CLANG_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON"
else
echo "Skipping Kokkos models due to CMake version requirement"
fi
run_build $name "${CLANG_CXX:?}" ocl "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}" run_build $name "${CLANG_CXX:?}" ocl "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}"
run_build $name "${CLANG_CXX:?}" std-data "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}"
run_build $name "${CLANG_CXX:?}" std-indices "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" for use_onedpl in OFF OPENMP TBB; do
# run_build $name "${LANG_CXX:?}" std20 "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" # not yet supported case "$use_onedpl" in
run_build $name "${CLANG_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?}" OFF) dpl_conditional_flags="-DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" ;;
run_build $name "${CLANG_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH" *) dpl_conditional_flags="-DFETCH_ONEDPL=ON -DFETCH_TBB=ON -DUSE_TBB=ON -DCXX_EXTRA_FLAGS=-D_GLIBCXX_USE_TBB_PAR_BACKEND=0" ;;
run_build $name "${CLANG_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DMEM=MANAGED" esac
run_build $name "${CLANG_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DMEM=PAGEFAULT" run_build $name "${CLANG_CXX:?}" std-data "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl"
run_build $name "${CLANG_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON" run_build $name "${CLANG_CXX:?}" std-indices "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl"
run_build $name "${CLANG_CXX:?}" ocl "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}" # run_build $name "${CLANG_CXX:?}" std-ranges "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl" # not yet supported
run_build $name "${CLANG_CXX:?}" std-data "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" done
run_build $name "${CLANG_CXX:?}" std-indices "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}"
# run_build $name "${LANG_CXX:?}" std20 "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" # not yet supported
run_build $name "${CLANG_CXX:?}" tbb "$cxx -DONE_TBB_DIR=$TBB_LIB" run_build $name "${CLANG_CXX:?}" tbb "$cxx -DONE_TBB_DIR=$TBB_LIB"
run_build $name "${CLANG_CXX:?}" tbb "$cxx" # build TBB again with the system TBB run_build $name "${CLANG_CXX:?}" tbb "$cxx" # build TBB again with the system TBB
run_build $name "${CLANG_CXX:?}" tbb "$cxx -DUSE_VECTOR=ON" # build with vectors
run_build $name "${CLANG_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?}" if check_cmake_ver "3.20.0"; then
run_build $name "${CLANG_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?} -DENABLE_OPENMP=ON"
else
echo "Skipping RAJA models due to CMake version requirement"
fi
# no clang /w RAJA+cuda because it needs nvcc which needs gcc # no clang /w RAJA+cuda because it needs nvcc which needs gcc
} }
@ -237,6 +271,7 @@ build_nvhpc() {
local cxx="-DCMAKE_CXX_COMPILER=${NVHPC_NVCXX:?}" local cxx="-DCMAKE_CXX_COMPILER=${NVHPC_NVCXX:?}"
run_build $name "${NVHPC_NVCXX:?}" std-data "$cxx -DNVHPC_OFFLOAD=$NV_ARCH_CCXY" run_build $name "${NVHPC_NVCXX:?}" std-data "$cxx -DNVHPC_OFFLOAD=$NV_ARCH_CCXY"
run_build $name "${NVHPC_NVCXX:?}" std-indices "$cxx -DNVHPC_OFFLOAD=$NV_ARCH_CCXY" run_build $name "${NVHPC_NVCXX:?}" std-indices "$cxx -DNVHPC_OFFLOAD=$NV_ARCH_CCXY"
run_build $name "${NVHPC_NVCXX:?}" acc "$cxx -DTARGET_DEVICE=gpu -DTARGET_PROCESSOR=px -DCUDA_ARCH=$NV_ARCH_CCXY" run_build $name "${NVHPC_NVCXX:?}" acc "$cxx -DTARGET_DEVICE=gpu -DTARGET_PROCESSOR=px -DCUDA_ARCH=$NV_ARCH_CCXY"
run_build $name "${NVHPC_NVCXX:?}" acc "$cxx -DTARGET_DEVICE=multicore -DTARGET_PROCESSOR=zen" run_build $name "${NVHPC_NVCXX:?}" acc "$cxx -DTARGET_DEVICE=multicore -DTARGET_PROCESSOR=zen"
} }
@ -254,6 +289,8 @@ build_hip() {
local name="hip_build" local name="hip_build"
run_build $name "${HIP_CXX:?}" hip "-DCMAKE_CXX_COMPILER=${HIP_CXX:?}" run_build $name "${HIP_CXX:?}" hip "-DCMAKE_CXX_COMPILER=${HIP_CXX:?}"
run_build $name "${HIP_CXX:?}" hip "-DCMAKE_CXX_COMPILER=${HIP_CXX:?} -DMEM=MANAGED"
run_build $name "${HIP_CXX:?}" hip "-DCMAKE_CXX_COMPILER=${HIP_CXX:?} -DMEM=PAGEFAULT"
run_build $name "${GCC_CXX:?}" thrust "-DCMAKE_CXX_COMPILER=${HIP_CXX:?} -DSDK_DIR=$ROCM_PATH -DTHRUST_IMPL=ROCM" run_build $name "${GCC_CXX:?}" thrust "-DCMAKE_CXX_COMPILER=${HIP_CXX:?} -DSDK_DIR=$ROCM_PATH -DTHRUST_IMPL=ROCM"
} }
@ -275,15 +312,18 @@ build_icpc() {
local cxx="-DCMAKE_CXX_COMPILER=${ICPC_CXX:?}" local cxx="-DCMAKE_CXX_COMPILER=${ICPC_CXX:?}"
run_build $name "${ICPC_CXX:?}" omp "$cxx" run_build $name "${ICPC_CXX:?}" omp "$cxx"
run_build $name "${ICPC_CXX:?}" ocl "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}" run_build $name "${ICPC_CXX:?}" ocl "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}"
run_build $name "${ICPC_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?}" if check_cmake_ver "3.20.0"; then
run_build $name "${ICPC_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON" run_build $name "${ICPC_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?} -DENABLE_OPENMP=ON"
} else
echo "Skipping RAJA models due to CMake version requirement"
fi
if check_cmake_ver "3.16.0"; then
run_build $name "${ICPC_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON"
else
echo "Skipping Kokkos models due to CMake version requirement"
fi
build_computecpp() {
run_build computecpp_build "compute++" sycl "-DCMAKE_CXX_COMPILER=${GCC_CXX:?} \
-DSYCL_COMPILER=COMPUTECPP \
-DSYCL_COMPILER_DIR=${COMPUTECPP_DIR:?} \
-DOpenCL_LIBRARY=${OCL_LIB:?}"
} }
build_dpcpp() { build_dpcpp() {

View File

@ -42,41 +42,57 @@ CUDAStream<T>::CUDAStream(const int ARRAY_SIZE, const int device_index)
// Print out device information // Print out device information
std::cout << "Using CUDA device " << getDeviceName(device_index) << std::endl; std::cout << "Using CUDA device " << getDeviceName(device_index) << std::endl;
std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl; std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl;
#if defined(MANAGED)
std::cout << "Memory: MANAGED" << std::endl;
#elif defined(PAGEFAULT)
std::cout << "Memory: PAGEFAULT" << std::endl;
#else
std::cout << "Memory: DEFAULT" << std::endl;
#endif
array_size = ARRAY_SIZE; array_size = ARRAY_SIZE;
// Query device for sensible dot kernel block count
cudaDeviceProp props;
cudaGetDeviceProperties(&props, device_index);
check_error();
dot_num_blocks = props.multiProcessorCount * 4;
// Allocate the host array for partial sums for dot kernels // Allocate the host array for partial sums for dot kernels
sums = (T*)malloc(sizeof(T) * DOT_NUM_BLOCKS); sums = (T*)malloc(sizeof(T) * dot_num_blocks);
size_t array_bytes = sizeof(T);
array_bytes *= ARRAY_SIZE;
size_t total_bytes = array_bytes * 4;
std::cout << "Reduction kernel config: " << dot_num_blocks << " groups of (fixed) size " << TBSIZE << std::endl;
// Check buffers fit on the device // Check buffers fit on the device
cudaDeviceProp props; if (props.totalGlobalMem < total_bytes)
cudaGetDeviceProperties(&props, 0);
if (props.totalGlobalMem < 3*ARRAY_SIZE*sizeof(T))
throw std::runtime_error("Device does not have enough memory for all 3 buffers"); throw std::runtime_error("Device does not have enough memory for all 3 buffers");
// Create device buffers // Create device buffers
#if defined(MANAGED) #if defined(MANAGED)
cudaMallocManaged(&d_a, ARRAY_SIZE*sizeof(T)); cudaMallocManaged(&d_a, array_bytes);
check_error(); check_error();
cudaMallocManaged(&d_b, ARRAY_SIZE*sizeof(T)); cudaMallocManaged(&d_b, array_bytes);
check_error(); check_error();
cudaMallocManaged(&d_c, ARRAY_SIZE*sizeof(T)); cudaMallocManaged(&d_c, array_bytes);
check_error(); check_error();
cudaMallocManaged(&d_sum, DOT_NUM_BLOCKS*sizeof(T)); cudaMallocManaged(&d_sum, dot_num_blocks*sizeof(T));
check_error(); check_error();
#elif defined(PAGEFAULT) #elif defined(PAGEFAULT)
d_a = (T*)malloc(sizeof(T)*ARRAY_SIZE); d_a = (T*)malloc(array_bytes);
d_b = (T*)malloc(sizeof(T)*ARRAY_SIZE); d_b = (T*)malloc(array_bytes);
d_c = (T*)malloc(sizeof(T)*ARRAY_SIZE); d_c = (T*)malloc(array_bytes);
d_sum = (T*)malloc(sizeof(T)*DOT_NUM_BLOCKS); d_sum = (T*)malloc(sizeof(T)*dot_num_blocks);
#else #else
cudaMalloc(&d_a, ARRAY_SIZE*sizeof(T)); cudaMalloc(&d_a, array_bytes);
check_error(); check_error();
cudaMalloc(&d_b, ARRAY_SIZE*sizeof(T)); cudaMalloc(&d_b, array_bytes);
check_error(); check_error();
cudaMalloc(&d_c, ARRAY_SIZE*sizeof(T)); cudaMalloc(&d_c, array_bytes);
check_error(); check_error();
cudaMalloc(&d_sum, DOT_NUM_BLOCKS*sizeof(T)); cudaMalloc(&d_sum, dot_num_blocks*sizeof(T));
check_error(); check_error();
#endif #endif
} }
@ -237,7 +253,7 @@ __global__ void dot_kernel(const T * a, const T * b, T * sum, int array_size)
int i = blockDim.x * blockIdx.x + threadIdx.x; int i = blockDim.x * blockIdx.x + threadIdx.x;
const size_t local_i = threadIdx.x; const size_t local_i = threadIdx.x;
tb_sum[local_i] = 0.0; tb_sum[local_i] = {};
for (; i < array_size; i += blockDim.x*gridDim.x) for (; i < array_size; i += blockDim.x*gridDim.x)
tb_sum[local_i] += a[i] * b[i]; tb_sum[local_i] += a[i] * b[i];
@ -257,19 +273,19 @@ __global__ void dot_kernel(const T * a, const T * b, T * sum, int array_size)
template <class T> template <class T>
T CUDAStream<T>::dot() T CUDAStream<T>::dot()
{ {
dot_kernel<<<DOT_NUM_BLOCKS, TBSIZE>>>(d_a, d_b, d_sum, array_size); dot_kernel<<<dot_num_blocks, TBSIZE>>>(d_a, d_b, d_sum, array_size);
check_error(); check_error();
#if defined(MANAGED) || defined(PAGEFAULT) #if defined(MANAGED) || defined(PAGEFAULT)
cudaDeviceSynchronize(); cudaDeviceSynchronize();
check_error(); check_error();
#else #else
cudaMemcpy(sums, d_sum, DOT_NUM_BLOCKS*sizeof(T), cudaMemcpyDeviceToHost); cudaMemcpy(sums, d_sum, dot_num_blocks*sizeof(T), cudaMemcpyDeviceToHost);
check_error(); check_error();
#endif #endif
T sum = 0.0; T sum = 0.0;
for (int i = 0; i < DOT_NUM_BLOCKS; i++) for (int i = 0; i < dot_num_blocks; i++)
{ {
#if defined(MANAGED) || defined(PAGEFAULT) #if defined(MANAGED) || defined(PAGEFAULT)
sum += d_sum[i]; sum += d_sum[i];

View File

@ -13,16 +13,9 @@
#include "Stream.h" #include "Stream.h"
#if defined(PAGEFAULT) #define IMPLEMENTATION_STRING "CUDA"
#define IMPLEMENTATION_STRING "CUDA - Page Fault"
#elif defined(MANAGED)
#define IMPLEMENTATION_STRING "CUDA - Managed Memory"
#else
#define IMPLEMENTATION_STRING "CUDA"
#endif
#define TBSIZE 1024 #define TBSIZE 1024
#define DOT_NUM_BLOCKS 256
template <class T> template <class T>
class CUDAStream : public Stream<T> class CUDAStream : public Stream<T>
@ -40,6 +33,8 @@ class CUDAStream : public Stream<T>
T *d_c; T *d_c;
T *d_sum; T *d_sum;
// Number of blocks for dot kernel
int dot_num_blocks;
public: public:

View File

@ -29,10 +29,11 @@ macro(setup)
endif() endif()
enable_language(CUDA) enable_language(CUDA)
register_definitions(MEM=${MEM}) register_definitions(${MEM})
# add -forward-unknown-to-host-compiler for compatibility reasons # add -forward-unknown-to-host-compiler for compatibility reasons
set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "-forward-unknown-to-host-compiler -arch=${CUDA_ARCH}" ${CUDA_EXTRA_FLAGS}) set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "-forward-unknown-to-host-compiler" "-arch=${CUDA_ARCH}" ${CUDA_EXTRA_FLAGS})
string(REPLACE ";" " " CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS}")
# CMake defaults to -O2 for CUDA at Release, let's wipe that and use the global RELEASE_FLAG # CMake defaults to -O2 for CUDA at Release, let's wipe that and use the global RELEASE_FLAG
# appended later # appended later

76
src/dpl_shim.h Normal file
View File

@ -0,0 +1,76 @@
#pragma once

#include <cstdlib>
#include <cstddef>

// Allocation alignment for the raw-pointer paths below. 2MB matches the
// x86-64 huge-page size so large arrays can be backed by huge pages.
#ifndef ALIGNMENT
#define ALIGNMENT (2*1024*1024) // 2MB
#endif

#ifdef USE_ONEDPL

// oneDPL C++17 PSTL

#include <oneapi/dpl/execution>
#include <oneapi/dpl/algorithm>
#include <oneapi/dpl/numeric>

#if ONEDPL_USE_DPCPP_BACKEND

#include <CL/sycl.hpp>

// Device execution policy; allocations below are USM tied to its queue.
const static auto exe_policy = oneapi::dpl::execution::device_policy<>{
    oneapi::dpl::execution::make_device_policy(cl::sycl::default_selector{})
};

template<typename T>
T *alloc_raw(size_t size) { return sycl::malloc_shared<T>(size, exe_policy.queue()); }

template<typename T>
void dealloc_raw(T *ptr) { sycl::free(ptr, exe_policy.queue()); }

#else

// auto exe_policy = dpl::execution::seq;
// auto exe_policy = dpl::execution::par;
static constexpr auto exe_policy = dpl::execution::par_unseq;
#define USE_STD_PTR_ALLOC_DEALLOC

#endif

#else

// Normal C++17 PSTL
#include <algorithm>
#include <execution>
#include <numeric>

// auto exe_policy = std::execution::seq;
// auto exe_policy = std::execution::par;
static constexpr auto exe_policy = std::execution::par_unseq;
#define USE_STD_PTR_ALLOC_DEALLOC

#endif

#ifdef USE_STD_PTR_ALLOC_DEALLOC

#if defined(__HIPSYCL__) || defined(__OPENSYCL__)

#include <CL/sycl.hpp>

// TODO We temporarily use malloc_shared/free here for hipSYCL stdpar because there's a linking issue if we let it hijack new/delete
// for this to work, we compile with --hipsycl-stdpar-system-usm so that hijacking is disabled
static cl::sycl::queue queue{cl::sycl::default_selector_v};
template <typename T> T *alloc_raw(size_t size) { return cl::sycl::malloc_shared<T>(size, queue); }
template <typename T> void dealloc_raw(T *ptr) { cl::sycl::free(ptr, queue); }

#else

// Host allocation aligned to ALIGNMENT.
// C11/C++17 require aligned_alloc's size to be an integral multiple of the
// alignment; round the byte count up to the next multiple so the call stays
// within the contract (glibc tolerates other sizes, other libcs may not).
template<typename T>
T *alloc_raw(size_t size) {
  const size_t bytes = sizeof(T) * size;
  const size_t rounded = (bytes + ALIGNMENT - 1) / ALIGNMENT * ALIGNMENT;
  return static_cast<T *>(aligned_alloc(ALIGNMENT, rounded));
}

template<typename T>
void dealloc_raw(T *ptr) { free(ptr); }

#endif

#endif

105
src/fortran/ArrayStream.F90 Normal file
View File

@ -0,0 +1,105 @@
module ArrayStream
    use, intrinsic :: ISO_Fortran_env
    use BabelStreamTypes

    implicit none

    ! Serial whole-array implementation of BabelStream: every kernel is a
    ! single Fortran array assignment, left to the compiler to optimize.
    character(len=5), parameter :: implementation_name = "Array"

    ! Number of elements in each of the three stream arrays.
    integer(kind=StreamIntKind) :: N

    real(kind=REAL64), allocatable :: A(:), B(:), C(:)

    contains

    ! Device enumeration is meaningless for a host-only implementation;
    ! print a notice and return. (Removed an unused local `num`.)
    subroutine list_devices()
        implicit none
        write(*,'(a36,a5)') "Listing devices is not supported by ", implementation_name
    end subroutine list_devices

    ! Device selection is likewise unsupported; warn and return.
    subroutine set_device(dev)
        implicit none
        integer, intent(in) :: dev
        write(*,'(a32,a5)') "Device != 0 is not supported by ", implementation_name
    end subroutine set_device

    ! Allocate the three arrays with array_size elements each; abort on failure.
    subroutine alloc(array_size)
        implicit none
        integer(kind=StreamIntKind) :: array_size
        integer :: err
        N = array_size
        allocate( A(1:N), B(1:N), C(1:N), stat=err)
        if (err .ne. 0) then
            write(*,'(a20,i3)') 'allocate returned ',err
            stop 1
        endif
    end subroutine alloc

    ! Release the arrays; abort on failure.
    subroutine dealloc()
        implicit none
        integer :: err
        deallocate( A, B, C, stat=err)
        if (err .ne. 0) then
            write(*,'(a20,i3)') 'deallocate returned ',err
            stop 1
        endif
    end subroutine dealloc

    ! Fill each array with its initial scalar value.
    subroutine init_arrays(initA, initB, initC)
        implicit none
        real(kind=REAL64), intent(in) :: initA, initB, initC
        A = initA
        B = initB
        C = initC
    end subroutine init_arrays

    ! Copy the arrays back to caller-provided host buffers (for validation).
    subroutine read_arrays(h_A, h_B, h_C)
        implicit none
        real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
        h_A = A
        h_B = B
        h_C = C
    end subroutine read_arrays

    ! STREAM copy kernel: C = A
    subroutine copy()
        implicit none
        C = A
    end subroutine copy

    ! STREAM add kernel: C = A + B
    subroutine add()
        implicit none
        C = A + B
    end subroutine add

    ! STREAM mul (scale) kernel: B = scalar * C
    subroutine mul(startScalar)
        implicit none
        real(kind=REAL64), intent(in) :: startScalar
        real(kind=REAL64) :: scalar
        scalar = startScalar
        B = scalar * C
    end subroutine mul

    ! STREAM triad kernel: A = B + scalar * C
    subroutine triad(startScalar)
        implicit none
        real(kind=REAL64), intent(in) :: startScalar
        real(kind=REAL64) :: scalar
        scalar = startScalar
        A = B + scalar * C
    end subroutine triad

    ! nstream kernel: A = A + B + scalar * C
    subroutine nstream(startScalar)
        implicit none
        real(kind=REAL64), intent(in) :: startScalar
        real(kind=REAL64) :: scalar
        scalar = startScalar
        A = A + B + scalar * C
    end subroutine nstream

    ! Dot-product kernel: returns sum(A(i)*B(i)) via the intrinsic.
    function dot() result(s)
        implicit none
        real(kind=REAL64) :: s
        s = dot_product(A,B)
    end function dot

end module ArrayStream

View File

@ -0,0 +1,21 @@
! Central kind-parameter definitions shared by all Fortran BabelStream
! implementations; selected at compile time via preprocessor flags.
module BabelStreamTypes
use, intrinsic :: ISO_Fortran_env, only: REAL64,REAL32,INT64,INT32

implicit none

! Real kind for the stream arrays: single precision with -DUSE_FLOAT,
! double precision otherwise. StreamRealName is the printable label.
#ifdef USE_FLOAT
integer, parameter :: StreamRealKind = REAL32
character(len=6) :: StreamRealName = "REAL32"
#else
integer, parameter :: StreamRealKind = REAL64
character(len=6) :: StreamRealName = "REAL64"
#endif

! Integer kind used for array sizes and loop indices: 32-bit with
! -DUSE_INT32 (no overflow checking), 64-bit otherwise.
#ifdef USE_INT32
#warning There is no checking for overflowing INT32, so be careful.
integer, parameter :: StreamIntKind = INT32
#else
integer, parameter :: StreamIntKind = INT64
#endif

end module BabelStreamTypes

View File

@ -0,0 +1,230 @@
module CUDAKernelStream
    use, intrinsic :: ISO_Fortran_env
    use BabelStreamTypes

    implicit none

    ! CUDA Fortran implementation of BabelStream in which each kernel is a
    ! plain loop annotated with a !$cuf directive; the compiler generates
    ! the device kernel and chooses the launch configuration (<<< *, * >>>).
    character(len=10), parameter :: implementation_name = "CUDAKernel"

    ! Number of elements in each of the three stream arrays.
    integer(kind=StreamIntKind) :: N

#ifdef USE_MANAGED
    ! Managed (unified) memory: directly accessible from host and device.
    real(kind=REAL64), allocatable, managed :: A(:), B(:), C(:)
#else
    ! Device-resident arrays; host assignments imply host<->device copies.
    real(kind=REAL64), allocatable, device :: A(:), B(:), C(:)
#endif

    contains

    ! Print the number of visible CUDA devices, or an error and stop.
    subroutine list_devices()
        use cudafor
        implicit none
        integer :: num, err
        err = cudaGetDeviceCount(num)
        if (err.ne.0) then
            write(*,'(a)') "cudaGetDeviceCount failed"
            write(*,'(a)') cudaGetErrorString(err)
            stop
        else if (num.eq.0) then
            write(*,'(a17)') "No devices found."
        else
            ! i0/a edit descriptors: the previous fixed widths (i1,a8)
            ! printed '*' for counts >= 10 and truncated " devices.".
            write(*,'(a,i0,a)') "There are ",num," devices."
        end if
    end subroutine list_devices

    ! Make device `dev` current after validating the index; stop on error.
    subroutine set_device(dev)
        use cudafor
        implicit none
        integer, intent(in) :: dev
        integer :: num, err
        err = cudaGetDeviceCount(num)
        if (err.ne.0) then
            write(*,'(a)') "cudaGetDeviceCount failed"
            write(*,'(a)') cudaGetErrorString(err)
            stop
        else if (num.eq.0) then
            write(*,'(a17)') "No devices found."
            stop
        else if (dev.ge.num) then
            write(*,'(a21)') "Invalid device index."
            stop
        else
            err = cudaSetDevice(dev)
            if (err.ne.0) then
                write(*,'(a)') "cudaSetDevice failed"
                write(*,'(a)') cudaGetErrorString(err)
                stop
            end if
        end if
    end subroutine set_device

    ! Allocate the three arrays with array_size elements each; abort on failure.
    subroutine alloc(array_size)
        implicit none
        integer(kind=StreamIntKind) :: array_size
        integer :: err
        N = array_size
        allocate( A(1:N), B(1:N), C(1:N), stat=err)
        if (err .ne. 0) then
            write(*,'(a20,i3)') 'allocate returned ',err
            stop 1
        endif
    end subroutine alloc

    ! Release the arrays; abort on failure.
    subroutine dealloc()
        implicit none
        integer :: err
        deallocate( A, B, C, stat=err)
        if (err .ne. 0) then
            write(*,'(a20,i3)') 'deallocate returned ',err
            stop 1
        endif
    end subroutine dealloc

    ! Fill each array with its initial value (implicit host-to-device
    ! transfer), then synchronize so timing starts from a quiet device.
    ! (Removed an unused loop-index local.)
    subroutine init_arrays(initA, initB, initC)
        use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
        implicit none
        real(kind=REAL64), intent(in) :: initA, initB, initC
        integer :: err
        A = initA
        B = initB
        C = initC
        err = cudaDeviceSynchronize()
        if (err.ne.0) then
            write(*,'(a)') "cudaDeviceSynchronize failed"
            write(*,'(a)') cudaGetErrorString(err)
            stop
        endif
    end subroutine init_arrays

    ! Copy the arrays back to host buffers (implicit device-to-host
    ! transfer) for validation. (Removed an unused loop-index local.)
    subroutine read_arrays(h_A, h_B, h_C)
        use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
        implicit none
        real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
        integer :: err
        h_A = A
        h_B = B
        h_C = C
        err = cudaDeviceSynchronize()
        if (err.ne.0) then
            write(*,'(a)') "cudaDeviceSynchronize failed"
            write(*,'(a)') cudaGetErrorString(err)
            stop
        endif
    end subroutine read_arrays

    ! STREAM copy kernel: C = A
    subroutine copy()
        use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
        implicit none
        integer(kind=StreamIntKind) :: i
        integer :: err
        !$cuf kernel do <<< *, * >>>
        do i=1,N
            C(i) = A(i)
        end do
        err = cudaDeviceSynchronize()
        if (err.ne.0) then
            write(*,'(a)') "cudaDeviceSynchronize failed"
            write(*,'(a)') cudaGetErrorString(err)
            stop
        endif
    end subroutine copy

    ! STREAM add kernel: C = A + B
    subroutine add()
        use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
        implicit none
        integer(kind=StreamIntKind) :: i
        integer :: err
        !$cuf kernel do <<< *, * >>>
        do i=1,N
            C(i) = A(i) + B(i)
        end do
        err = cudaDeviceSynchronize()
        if (err.ne.0) then
            write(*,'(a)') "cudaDeviceSynchronize failed"
            write(*,'(a)') cudaGetErrorString(err)
            stop
        endif
    end subroutine add

    ! STREAM mul (scale) kernel: B = scalar * C
    subroutine mul(startScalar)
        use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
        implicit none
        real(kind=REAL64), intent(in) :: startScalar
        real(kind=REAL64) :: scalar
        integer(kind=StreamIntKind) :: i
        integer :: err
        scalar = startScalar
        !$cuf kernel do <<< *, * >>>
        do i=1,N
            B(i) = scalar * C(i)
        end do
        err = cudaDeviceSynchronize()
        if (err.ne.0) then
            write(*,'(a)') "cudaDeviceSynchronize failed"
            write(*,'(a)') cudaGetErrorString(err)
            stop
        endif
    end subroutine mul

    ! STREAM triad kernel: A = B + scalar * C
    subroutine triad(startScalar)
        use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
        implicit none
        real(kind=REAL64), intent(in) :: startScalar
        real(kind=REAL64) :: scalar
        integer(kind=StreamIntKind) :: i
        integer :: err
        scalar = startScalar
        !$cuf kernel do <<< *, * >>>
        do i=1,N
            A(i) = B(i) + scalar * C(i)
        end do
        err = cudaDeviceSynchronize()
        if (err.ne.0) then
            write(*,'(a)') "cudaDeviceSynchronize failed"
            write(*,'(a)') cudaGetErrorString(err)
            stop
        endif
    end subroutine triad

    ! nstream kernel: A = A + B + scalar * C
    subroutine nstream(startScalar)
        use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
        implicit none
        real(kind=REAL64), intent(in) :: startScalar
        real(kind=REAL64) :: scalar
        integer(kind=StreamIntKind) :: i
        integer :: err
        scalar = startScalar
        !$cuf kernel do <<< *, * >>>
        do i=1,N
            A(i) = A(i) + B(i) + scalar * C(i)
        end do
        err = cudaDeviceSynchronize()
        if (err.ne.0) then
            write(*,'(a)') "cudaDeviceSynchronize failed"
            write(*,'(a)') cudaGetErrorString(err)
            stop
        endif
    end subroutine nstream

    ! Dot-product kernel: the !$cuf directive turns the accumulation into
    ! a device-side reduction into r.
    function dot() result(r)
        use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
        implicit none
        real(kind=REAL64) :: r
        integer(kind=StreamIntKind) :: i
        integer :: err
        r = real(0,kind=REAL64)
        !$cuf kernel do <<< *, * >>>
        do i=1,N
            r = r + A(i) * B(i)
        end do
        err = cudaDeviceSynchronize()
        if (err.ne.0) then
            write(*,'(a)') "cudaDeviceSynchronize failed"
            write(*,'(a)') cudaGetErrorString(err)
            stop
        endif
    end function dot

end module CUDAKernelStream

309
src/fortran/CUDAStream.F90 Normal file
View File

@ -0,0 +1,309 @@
! Hand-written CUDA Fortran device kernels for the explicit-kernel CUDA
! implementation. Each kernel uses one thread per element: the global
! 1-based index is derived from the block/thread coordinates and guarded
! against the tail where the grid overshoots n.
module CUDAFortranKernels
use, intrinsic :: ISO_Fortran_env
use BabelStreamTypes

contains

! STREAM copy: C(i) = A(i)
attributes(global) subroutine do_copy(n,A,C)
implicit none
integer(kind=StreamIntKind), intent(in), value :: n
real(kind=REAL64), intent(in) :: A(n)
real(kind=REAL64), intent(out) :: C(n)
integer(kind=StreamIntKind) :: i
! blockIdx/threadIdx are 1-based in CUDA Fortran.
i = blockDim%x * (blockIdx%x - 1) + threadIdx%x
if (i <= N) then
C(i) = A(i)
endif
end subroutine do_copy

! STREAM add: C(i) = A(i) + B(i)
attributes(global) subroutine do_add(n,A,B,C)
implicit none
integer(kind=StreamIntKind), intent(in), value :: n
real(kind=REAL64), intent(in) :: A(n), B(n)
real(kind=REAL64), intent(out) :: C(n)
integer(kind=StreamIntKind) :: i
i = blockDim%x * (blockIdx%x - 1) + threadIdx%x
if (i <= N) then
C(i) = A(i) + B(i)
endif
end subroutine do_add

! STREAM mul (scale): B(i) = scalar * C(i)
attributes(global) subroutine do_mul(n,scalar,B,C)
implicit none
integer(kind=StreamIntKind), intent(in), value :: n
real(kind=REAL64), intent(in), value :: scalar
real(kind=REAL64), intent(out) :: B(n)
real(kind=REAL64), intent(in) :: C(n)
integer(kind=StreamIntKind) :: i
i = blockDim%x * (blockIdx%x - 1) + threadIdx%x
if (i <= N) then
B(i) = scalar * C(i)
endif
end subroutine do_mul

! STREAM triad: A(i) = B(i) + scalar * C(i)
attributes(global) subroutine do_triad(n,scalar,A,B,C)
implicit none
integer(kind=StreamIntKind), intent(in), value :: n
real(kind=REAL64), intent(in), value :: scalar
real(kind=REAL64), intent(out) :: A(n)
real(kind=REAL64), intent(in) :: B(n), C(n)
integer(kind=StreamIntKind) :: i
i = blockDim%x * (blockIdx%x - 1) + threadIdx%x
if (i <= N) then
A(i) = B(i) + scalar * C(i)
endif
end subroutine do_triad

! nstream: A(i) = A(i) + B(i) + scalar * C(i)
attributes(global) subroutine do_nstream(n,scalar,A,B,C)
implicit none
integer(kind=StreamIntKind), intent(in), value :: n
real(kind=REAL64), intent(in), value :: scalar
real(kind=REAL64), intent(inout) :: A(n)
real(kind=REAL64), intent(in) :: B(n), C(n)
integer(kind=StreamIntKind) :: i
i = blockDim%x * (blockIdx%x - 1) + threadIdx%x
if (i <= N) then
A(i) = A(i) + B(i) + scalar * C(i)
endif
end subroutine do_nstream

! Disabled dot kernel: as written, all threads would accumulate into the
! single scalar r without synchronization, so it is compiled out; the
! host-side code performs the dot reduction instead.
#if 0
attributes(global) subroutine do_dot(n,A,B,r)
implicit none
integer(kind=StreamIntKind), intent(in), value :: n
real(kind=REAL64), intent(in) :: A(n), B(n)
real(kind=REAL64), intent(out) :: r
integer(kind=StreamIntKind) :: i
r = real(0,kind=REAL64)
!$cuf kernel do <<< *, * >>>
do i=1,N
r = r + A(i) * B(i)
end do
end subroutine do_dot
#endif

end module CUDAFortranKernels
! CUDA Fortran implementation of BabelStream. The three stream arrays live
! in device memory (or managed memory when USE_MANAGED is defined) and every
! kernel from CUDAFortranKernels is launched over a 1-D grid configured in
! alloc(). Each operation synchronizes and checks for errors afterwards.
module CUDAStream
use, intrinsic :: ISO_Fortran_env
use BabelStreamTypes
use cudafor, only: dim3
implicit none
character(len=4), parameter :: implementation_name = "CUDA"
! Number of elements in each stream array.
integer(kind=StreamIntKind) :: N
#ifdef USE_MANAGED
real(kind=REAL64), allocatable, managed :: A(:), B(:), C(:)
#else
real(kind=REAL64), allocatable, device :: A(:), B(:), C(:)
#endif
! Kernel launch configuration shared by all operations; set in alloc().
type(dim3) :: grid, tblock
contains
! Print how many CUDA devices are visible; stops on a CUDA API error.
subroutine list_devices()
use cudafor
implicit none
integer :: num, err
err = cudaGetDeviceCount(num)
if (err.ne.0) then
write(*,'(a)') "cudaGetDeviceCount failed"
write(*,'(a)') cudaGetErrorString(err)
stop
else if (num.eq.0) then
write(*,'(a17)') "No devices found."
else
! i0 (not i1) so counts of 10 or more devices are printed in full.
write(*,'(a,i0,a)') "There are ",num," devices."
end if
end subroutine list_devices
! Select CUDA device dev (0-based); stops on invalid index or API error.
subroutine set_device(dev)
use cudafor
implicit none
integer, intent(in) :: dev
integer :: num, err
err = cudaGetDeviceCount(num)
if (err.ne.0) then
write(*,'(a)') "cudaGetDeviceCount failed"
write(*,'(a)') cudaGetErrorString(err)
stop
else if (num.eq.0) then
write(*,'(a17)') "No devices found."
stop
else if (dev.ge.num) then
write(*,'(a21)') "Invalid device index."
stop
else
err = cudaSetDevice(dev)
if (err.ne.0) then
write(*,'(a)') "cudaSetDevice failed"
write(*,'(a)') cudaGetErrorString(err)
stop
end if
end if
end subroutine set_device
! Allocate the device arrays and derive the launch configuration.
subroutine alloc(array_size)
implicit none
integer(kind=StreamIntKind) :: array_size
integer :: err
N = array_size
allocate( A(1:N), B(1:N), C(1:N), stat=err)
if (err .ne. 0) then
write(*,'(a20,i3)') 'allocate returned ',err
stop 1
endif
! move to separate subroutine later
tblock = dim3(128,1,1)
! Integer ceiling division. The previous ceiling(real(N)/tblock%x) used
! single-precision real(), which rounds for large N and could compute a
! grid one block too small, leaving tail elements unprocessed.
grid = dim3(int((N + tblock%x - 1) / tblock%x),1,1)
end subroutine alloc
! Release the device arrays.
subroutine dealloc()
implicit none
integer :: err
deallocate( A, B, C, stat=err)
if (err .ne. 0) then
write(*,'(a20,i3)') 'deallocate returned ',err
stop 1
endif
end subroutine dealloc
! Fill A, B and C with their initial values (device-side array assignment).
subroutine init_arrays(initA, initB, initC)
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
implicit none
real(kind=REAL64), intent(in) :: initA, initB, initC
integer :: err
A = initA
B = initB
C = initC
err = cudaDeviceSynchronize()
if (err.ne.0) then
write(*,'(a)') "cudaDeviceSynchronize failed"
write(*,'(a)') cudaGetErrorString(err)
stop
endif
end subroutine init_arrays
! Copy the device arrays back to the supplied host arrays.
subroutine read_arrays(h_A, h_B, h_C)
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
implicit none
real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
integer :: err
h_A = A
h_B = B
h_C = C
err = cudaDeviceSynchronize()
if (err.ne.0) then
write(*,'(a)') "cudaDeviceSynchronize failed"
write(*,'(a)') cudaGetErrorString(err)
stop
endif
end subroutine read_arrays
! STREAM copy: C = A.
subroutine copy()
use CUDAFortranKernels, only: do_copy
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
implicit none
integer :: err
call do_copy<<<grid, tblock>>>(N, A, C)
err = cudaDeviceSynchronize()
if (err.ne.0) then
write(*,'(a)') "cudaDeviceSynchronize failed"
write(*,'(a)') cudaGetErrorString(err)
stop
endif
end subroutine copy
! STREAM add: C = A + B.
subroutine add()
use CUDAFortranKernels, only: do_add
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
implicit none
integer :: err
call do_add<<<grid, tblock>>>(N, A, B, C)
err = cudaDeviceSynchronize()
if (err.ne.0) then
write(*,'(a)') "cudaDeviceSynchronize failed"
write(*,'(a)') cudaGetErrorString(err)
stop
endif
end subroutine add
! STREAM mul: B = scalar * C.
subroutine mul(startScalar)
use CUDAFortranKernels, only: do_mul
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer :: err
scalar = startScalar
call do_mul<<<grid, tblock>>>(N, scalar, B, C)
err = cudaDeviceSynchronize()
if (err.ne.0) then
write(*,'(a)') "cudaDeviceSynchronize failed"
write(*,'(a)') cudaGetErrorString(err)
stop
endif
end subroutine mul
! STREAM triad: A = B + scalar * C.
subroutine triad(startScalar)
use CUDAFortranKernels, only: do_triad
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer :: err
scalar = startScalar
call do_triad<<<grid, tblock>>>(N, scalar, A, B, C)
err = cudaDeviceSynchronize()
if (err.ne.0) then
write(*,'(a)') "cudaDeviceSynchronize failed"
write(*,'(a)') cudaGetErrorString(err)
stop
endif
end subroutine triad
! STREAM nstream: A = A + B + scalar * C.
subroutine nstream(startScalar)
use CUDAFortranKernels, only: do_nstream
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer :: err
scalar = startScalar
call do_nstream<<<grid, tblock>>>(N, scalar, A, B, C)
err = cudaDeviceSynchronize()
if (err.ne.0) then
write(*,'(a)') "cudaDeviceSynchronize failed"
write(*,'(a)') cudaGetErrorString(err)
stop
endif
end subroutine nstream
! Dot product of A and B, computed with a CUF kernel directive so the
! compiler generates the device reduction.
function dot() result(r)
!use CUDAFortranKernels, only: do_dot
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
implicit none
real(kind=REAL64) :: r
integer :: err
integer(kind=StreamIntKind) :: i
!call do_dot<<<grid, tblock>>>(N, B, C, r)
r = real(0,kind=REAL64)
!$cuf kernel do <<< *, * >>>
do i=1,N
r = r + A(i) * B(i)
end do
err = cudaDeviceSynchronize()
if (err.ne.0) then
write(*,'(a)') "cudaDeviceSynchronize failed"
write(*,'(a)') cudaGetErrorString(err)
stop
endif
end function dot
end module CUDAStream

View File

@ -0,0 +1,139 @@
! Fortran "do concurrent" implementation of BabelStream. Arrays may be
! placed in device memory when USE_DEVICE is defined (CUDA Fortran
! attribute); otherwise they are ordinary host allocatables.
module DoConcurrentStream
use, intrinsic :: ISO_Fortran_env
use BabelStreamTypes
implicit none
character(len=12), parameter :: implementation_name = "DoConcurrent"
! Number of elements in each stream array.
integer(kind=StreamIntKind) :: N
#ifdef USE_DEVICE
real(kind=REAL64), allocatable, device :: A(:), B(:), C(:)
#else
real(kind=REAL64), allocatable :: A(:), B(:), C(:)
#endif
contains
! Device enumeration is not meaningful for this backend.
subroutine list_devices()
implicit none
write(*,'(a36,a12)') "Listing devices is not supported by ", implementation_name
end subroutine list_devices
! Device selection is not meaningful for this backend.
subroutine set_device(dev)
implicit none
integer, intent(in) :: dev
write(*,'(a32,a12)') "Device != 0 is not supported by ", implementation_name
end subroutine set_device
! Allocate the three stream arrays of the requested size.
subroutine alloc(array_size)
implicit none
integer(kind=StreamIntKind) :: array_size
integer :: err
N = array_size
allocate( A(1:N), B(1:N), C(1:N), stat=err)
if (err .ne. 0) then
write(*,'(a20,i3)') 'allocate returned ',err
stop 1
endif
end subroutine alloc
! Release the stream arrays.
subroutine dealloc()
implicit none
integer :: err
deallocate( A, B, C, stat=err)
if (err .ne. 0) then
write(*,'(a20,i3)') 'deallocate returned ',err
stop 1
endif
end subroutine dealloc
! Fill A, B and C with their initial values.
subroutine init_arrays(initA, initB, initC)
implicit none
real(kind=REAL64), intent(in) :: initA, initB, initC
integer(kind=StreamIntKind) :: i
do concurrent (i=1:N)
A(i) = initA
B(i) = initB
C(i) = initC
end do
end subroutine init_arrays
! Copy the stream arrays into the supplied host arrays.
subroutine read_arrays(h_A, h_B, h_C)
implicit none
real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
integer(kind=StreamIntKind) :: i
do concurrent (i=1:N) !shared(A,B,C)
h_A(i) = A(i)
h_B(i) = B(i)
h_C(i) = C(i)
end do
end subroutine read_arrays
! STREAM copy: C = A.
subroutine copy()
implicit none
integer(kind=StreamIntKind) :: i
do concurrent (i=1:N) !shared(A,C)
C(i) = A(i)
end do
end subroutine copy
! STREAM add: C = A + B.
subroutine add()
implicit none
integer(kind=StreamIntKind) :: i
do concurrent (i=1:N) !shared(A,B,C)
C(i) = A(i) + B(i)
end do
end subroutine add
! STREAM mul: B = scalar * C.
subroutine mul(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
scalar = startScalar
do concurrent (i=1:N) !shared(B,C)
B(i) = scalar * C(i)
end do
end subroutine mul
! STREAM triad: A = B + scalar * C.
subroutine triad(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
scalar = startScalar
do concurrent (i=1:N) !shared(A,B,C)
A(i) = B(i) + scalar * C(i)
end do
end subroutine triad
! STREAM nstream: A = A + B + scalar * C.
subroutine nstream(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
scalar = startScalar
do concurrent (i=1:N) !shared(A,B,C)
A(i) = A(i) + B(i) + scalar * C(i)
end do
end subroutine nstream
! Dot product of A and B.
function dot() result(s)
implicit none
real(kind=REAL64) :: s
integer(kind=StreamIntKind) :: i
! reduction omitted because NVF infers it and other compilers do not support
s = real(0,kind=REAL64)
#ifdef CRAY_THREAD_DOCONCURRENT
do i=1,N
#else
do concurrent (i=1:N) !shared(A,B)
#endif
s = s + A(i) * B(i)
end do
end function dot
end module DoConcurrentStream

109
src/fortran/Makefile Normal file
View File

@ -0,0 +1,109 @@
# Select a compiler-specific include; default is gcc.
ifeq ($(COMPILER),nvhpc)
include make.inc.nvhpc
else ifeq ($(COMPILER),oneapi)
include make.inc.oneapi
else ifeq ($(COMPILER),gcc)
include make.inc.gcc
else ifeq ($(COMPILER),amd)
include make.inc.amd
else ifeq ($(COMPILER),arm)
include make.inc.arm
else ifeq ($(COMPILER),cray)
include make.inc.cray
else ifeq ($(COMPILER),fj)
include make.inc.fj
else
$(info Set COMPILER={nvhpc,oneapi,amd,arm,cray,fj,gcc}. Default is gcc.)
include make.inc.gcc
COMPILER=gcc
endif

FCFLAGS += -DVERSION_STRING="5.0"
#FCFLAGS += -DUSE_INT32

# Map the chosen IMPLEMENTATION to its preprocessor macro, backend-specific
# flags and object file; default is Sequential.
ifeq ($(IMPLEMENTATION),DoConcurrent)
FCFLAGS += -DUSE_DOCONCURRENT $(DOCONCURRENT_FLAG)
IMPLEMENTATION_OBJECT = DoConcurrentStream.o
else ifeq ($(IMPLEMENTATION),Array)
FCFLAGS += -DUSE_ARRAY $(ARRAY_FLAG)
IMPLEMENTATION_OBJECT = ArrayStream.o
else ifeq ($(IMPLEMENTATION),OpenMP)
FCFLAGS += -DUSE_OPENMP $(OPENMP_FLAG)
IMPLEMENTATION_OBJECT = OpenMPStream.o
else ifeq ($(IMPLEMENTATION),OpenMPWorkshare)
FCFLAGS += -DUSE_OPENMPWORKSHARE $(OPENMP_FLAG)
IMPLEMENTATION_OBJECT = OpenMPWorkshareStream.o
else ifeq ($(IMPLEMENTATION),OpenMPTarget)
FCFLAGS += -DUSE_OPENMPTARGET $(OPENMP_FLAG)
IMPLEMENTATION_OBJECT = OpenMPTargetStream.o
else ifeq ($(IMPLEMENTATION),OpenMPTargetLoop)
FCFLAGS += -DUSE_OPENMPTARGETLOOP $(OPENMP_FLAG)
IMPLEMENTATION_OBJECT = OpenMPTargetLoopStream.o
else ifeq ($(IMPLEMENTATION),OpenMPTaskloop)
FCFLAGS += -DUSE_OPENMPTASKLOOP $(OPENMP_FLAG)
IMPLEMENTATION_OBJECT = OpenMPTaskloopStream.o
else ifeq ($(IMPLEMENTATION),OpenACC)
FCFLAGS += -DUSE_OPENACC $(OPENACC_FLAG)
IMPLEMENTATION_OBJECT = OpenACCStream.o
else ifeq ($(IMPLEMENTATION),OpenACCArray)
FCFLAGS += -DUSE_OPENACCARRAY $(OPENACC_FLAG)
IMPLEMENTATION_OBJECT = OpenACCArrayStream.o
else ifeq ($(IMPLEMENTATION),CUDA)
FCFLAGS += -DUSE_CUDA $(CUDA_FLAG)
IMPLEMENTATION_OBJECT = CUDAStream.o
else ifeq ($(IMPLEMENTATION),CUDAKernel)
FCFLAGS += -DUSE_CUDAKERNEL $(CUDA_FLAG)
IMPLEMENTATION_OBJECT = CUDAKernelStream.o
else ifeq ($(IMPLEMENTATION),Sequential)
FCFLAGS += -DUSE_SEQUENTIAL $(SEQUENTIAL_FLAG)
IMPLEMENTATION_OBJECT = SequentialStream.o
else
$(info Set IMPLEMENTATION={DoConcurrent,Array,OpenMP,OpenMPWorkshare,OpenMPTarget,OpenMPTargetLoop,OpenMPTaskloop,OpenACC,OpenACCArray,CUDA,CUDAKernel}.)
FCFLAGS += -DUSE_SEQUENTIAL $(SEQUENTIAL_FLAG)
IMPLEMENTATION=Sequential
IMPLEMENTATION_OBJECT = SequentialStream.o
endif

# Keep phony targets working even if files with these names appear.
.PHONY: all clean realclean

all: BabelStream.$(COMPILER).$(IMPLEMENTATION)

# BabelStreamTypes.o is now a real prerequisite of the link step (it was
# previously only named in the recipe), so $^ carries every input exactly
# once and the binary is relinked when the types module changes.
BabelStream.$(COMPILER).$(IMPLEMENTATION): main.F90 $(IMPLEMENTATION_OBJECT) BabelStreamTypes.o
	$(FC) $(FCFLAGS) $^ -o $@

BabelStreamTypes.o BabelStreamTypes.mod: BabelStreamTypes.F90
	$(FC) $(FCFLAGS) -c $<

# Every backend object needs the types module to be compiled first.
%.o: %.F90 BabelStreamTypes.mod
	$(FC) $(FCFLAGS) -c $<

clean:
	-rm -f main.o BabelStreamUtil.mod babelstreamutil.mod
	-rm -f BabelStreamTypes.o BabelStreamTypes.mod babelstreamtypes.mod
	-rm -f DoConcurrentStream.o DoConcurrentStream.mod doconcurrentstream.mod
	-rm -f ArrayStream.o ArrayStream.mod arraystream.mod
	-rm -f SequentialStream.o SequentialStream.mod sequentialstream.mod
	-rm -f OpenMPStream.o OpenMPStream.mod openmpstream.mod
	-rm -f OpenMPWorkshareStream.o OpenMPWorkshareStream.mod openmpworksharestream.mod
	-rm -f OpenMPTaskloopStream.o OpenMPTaskloopStream.mod openmptaskloopstream.mod
	-rm -f OpenMPTargetStream.o OpenMPTargetStream.mod openmptargetstream.mod
	-rm -f OpenMPTargetLoopStream.o OpenMPTargetLoopStream.mod openmptargetloopstream.mod
	-rm -f OpenACCStream.o OpenACCStream.mod openaccstream.mod
	-rm -f OpenACCArrayStream.o OpenACCArrayStream.mod openaccarraystream.mod
	-rm -f CUDAStream.o CUDAStream.mod cudastream.mod CUDAFortranKernels.mod cudafortrankernels.mod
	-rm -f CUDAKernelStream.o CUDAKernelStream.mod cudakernelstream.mod
	-rm -f *.modmic *.mod *.o *.cub *.ptx

realclean: clean
	-rm -f BabelStream.*

View File

@ -0,0 +1,144 @@
! OpenACC implementation of BabelStream using whole-array operations inside
! "acc kernels" regions (the loop-based variant lives in OpenACCStream).
module OpenACCArrayStream
use, intrinsic :: ISO_Fortran_env
use BabelStreamTypes
implicit none
character(len=12), parameter :: implementation_name = "OpenACCArray"
! Number of elements in each stream array.
integer(kind=StreamIntKind) :: N
real(kind=REAL64), allocatable :: A(:), B(:), C(:)
contains
! Print how many devices of the current OpenACC device type are available.
subroutine list_devices()
use openacc
implicit none
integer :: num
num = acc_get_num_devices(acc_get_device_type())
if (num.eq.0) then
write(*,'(a17)') "No devices found."
else
! NOTE(review): i1 truncates counts of 10 or more devices - confirm intended.
write(*,'(a10,i1,a8)') "There are ",num," devices."
end if
end subroutine list_devices
! Select OpenACC device dev; stops on invalid index.
subroutine set_device(dev)
use openacc
implicit none
integer, intent(in) :: dev
integer :: num
num = acc_get_num_devices(acc_get_device_type())
if (num.eq.0) then
write(*,'(a17)') "No devices found."
stop
! NOTE(review): the CUDA backend rejects dev >= num; this accepts dev == num.
! Confirm which device numbering the OpenACC runtime uses here.
else if (dev.gt.num) then
write(*,'(a21)') "Invalid device index."
stop
else
call acc_set_device_num(dev, acc_get_device_type())
end if
end subroutine set_device
! Allocate the arrays and map them to the device (unless USE_MANAGED).
subroutine alloc(array_size)
implicit none
integer(kind=StreamIntKind) :: array_size
integer :: err
N = array_size
allocate( A(1:N), B(1:N), C(1:N), stat=err)
! NOTE(review): the stat check runs after "enter data create"; if the host
! allocation failed, unallocated arrays are mapped first - confirm ordering.
#ifndef USE_MANAGED
!$acc enter data create(A,B,C)
#endif
if (err .ne. 0) then
write(*,'(a20,i3)') 'allocate returned ',err
stop 1
endif
end subroutine alloc
! Unmap the arrays from the device and release the host memory.
subroutine dealloc()
implicit none
integer :: err
#ifndef USE_MANAGED
!$acc exit data delete(A,B,C)
#endif
deallocate( A, B, C, stat=err)
if (err .ne. 0) then
write(*,'(a20,i3)') 'deallocate returned ',err
stop 1
endif
end subroutine dealloc
! Fill A, B and C with their initial values on the device.
subroutine init_arrays(initA, initB, initC)
implicit none
real(kind=REAL64), intent(in) :: initA, initB, initC
!$acc kernels
A = initA
B = initB
C = initC
!$acc end kernels
end subroutine init_arrays
! Copy the device arrays into the supplied host arrays.
subroutine read_arrays(h_A, h_B, h_C)
implicit none
real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
!$acc kernels
h_A = A
h_B = B
h_C = C
!$acc end kernels
end subroutine read_arrays
! STREAM copy: C = A.
subroutine copy()
implicit none
!$acc kernels
C = A
!$acc end kernels
end subroutine copy
! STREAM add: C = A + B.
subroutine add()
implicit none
!$acc kernels
C = A + B
!$acc end kernels
end subroutine add
! STREAM mul: B = scalar * C.
subroutine mul(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
scalar = startScalar
!$acc kernels
B = scalar * C
!$acc end kernels
end subroutine mul
! STREAM triad: A = B + scalar * C.
subroutine triad(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
scalar = startScalar
!$acc kernels
A = B + scalar * C
!$acc end kernels
end subroutine triad
! STREAM nstream: A = A + B + scalar * C.
subroutine nstream(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
scalar = startScalar
!$acc kernels
A = A + B + scalar * C
!$acc end kernels
end subroutine nstream
! Dot product of A and B via the dot_product intrinsic inside a kernels region.
function dot() result(s)
implicit none
real(kind=REAL64) :: s
!$acc kernels
s = dot_product(A,B)
!$acc end kernels
end function dot
end module OpenACCArrayStream

View File

@ -0,0 +1,161 @@
! OpenACC implementation of BabelStream using explicit "parallel loop"
! constructs over arrays kept resident on the device via enter/exit data
! (or managed memory when USE_MANAGED is defined).
module OpenACCStream
use, intrinsic :: ISO_Fortran_env
use BabelStreamTypes
implicit none
character(len=7), parameter :: implementation_name = "OpenACC"
! Number of elements in each stream array.
integer(kind=StreamIntKind) :: N
real(kind=REAL64), allocatable :: A(:), B(:), C(:)
contains
! Print how many devices of the current OpenACC device type are available.
subroutine list_devices()
use openacc
implicit none
integer :: num
num = acc_get_num_devices(acc_get_device_type())
if (num.eq.0) then
write(*,'(a17)') "No devices found."
else
! i0 (not i1) so counts of 10 or more devices are printed in full.
write(*,'(a,i0,a)') "There are ",num," devices."
end if
end subroutine list_devices
! Select OpenACC device dev; stops on invalid index.
subroutine set_device(dev)
use openacc
implicit none
integer, intent(in) :: dev
integer :: num
num = acc_get_num_devices(acc_get_device_type())
if (num.eq.0) then
write(*,'(a17)') "No devices found."
stop
! NOTE(review): accepts dev == num while the CUDA backend rejects dev >= num;
! OpenACC device numbering is runtime-defined, so the bound is kept as-is.
else if (dev.gt.num) then
write(*,'(a21)') "Invalid device index."
stop
else
call acc_set_device_num(dev, acc_get_device_type())
end if
end subroutine set_device
! Allocate the arrays, then map them to the device (unless USE_MANAGED).
! The stat check now runs BEFORE the data mapping so a failed host
! allocation can no longer hand unallocated arrays to "enter data create".
subroutine alloc(array_size)
implicit none
integer(kind=StreamIntKind) :: array_size
integer :: err
N = array_size
allocate( A(1:N), B(1:N), C(1:N), stat=err)
if (err .ne. 0) then
write(*,'(a20,i3)') 'allocate returned ',err
stop 1
endif
#ifndef USE_MANAGED
!$acc enter data create(A,B,C)
#endif
end subroutine alloc
! Unmap the arrays from the device and release the host memory.
subroutine dealloc()
implicit none
integer :: err
#ifndef USE_MANAGED
!$acc exit data delete(A,B,C)
#endif
deallocate( A, B, C, stat=err)
if (err .ne. 0) then
write(*,'(a20,i3)') 'deallocate returned ',err
stop 1
endif
end subroutine dealloc
! Fill A, B and C with their initial values on the device.
subroutine init_arrays(initA, initB, initC)
implicit none
real(kind=REAL64), intent(in) :: initA, initB, initC
integer(kind=StreamIntKind) :: i
!$acc parallel loop
do i=1,N
A(i) = initA
B(i) = initB
C(i) = initC
end do
end subroutine init_arrays
! Copy the device arrays into the supplied host arrays.
subroutine read_arrays(h_A, h_B, h_C)
implicit none
real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
integer(kind=StreamIntKind) :: i
!$acc parallel loop
do i=1,N
h_A(i) = A(i)
h_B(i) = B(i)
h_C(i) = C(i)
end do
end subroutine read_arrays
! STREAM copy: C = A.
subroutine copy()
implicit none
integer(kind=StreamIntKind) :: i
!$acc parallel loop
do i=1,N
C(i) = A(i)
end do
end subroutine copy
! STREAM add: C = A + B.
subroutine add()
implicit none
integer(kind=StreamIntKind) :: i
!$acc parallel loop
do i=1,N
C(i) = A(i) + B(i)
end do
end subroutine add
! STREAM mul: B = scalar * C.
subroutine mul(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
scalar = startScalar
!$acc parallel loop
do i=1,N
B(i) = scalar * C(i)
end do
end subroutine mul
! STREAM triad: A = B + scalar * C.
subroutine triad(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
scalar = startScalar
!$acc parallel loop
do i=1,N
A(i) = B(i) + scalar * C(i)
end do
end subroutine triad
! STREAM nstream: A = A + B + scalar * C.
subroutine nstream(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
scalar = startScalar
!$acc parallel loop
do i=1,N
A(i) = A(i) + B(i) + scalar * C(i)
end do
end subroutine nstream
! Dot product of A and B with an explicit OpenACC reduction.
function dot() result(s)
implicit none
real(kind=REAL64) :: s
integer(kind=StreamIntKind) :: i
s = real(0,kind=REAL64)
!$acc parallel loop reduction(+:s)
do i=1,N
s = s + A(i) * B(i)
end do
end function dot
end module OpenACCStream

View File

@ -0,0 +1,137 @@
! Host OpenMP implementation of BabelStream. Every kernel is a single
! "parallel do simd" loop over the module-level arrays A, B and C.
module OpenMPStream
use, intrinsic :: ISO_Fortran_env
use BabelStreamTypes
implicit none
character(len=6), parameter :: implementation_name = "OpenMP"
! Number of elements in each stream array.
integer(kind=StreamIntKind) :: N
real(kind=REAL64), allocatable :: A(:), B(:), C(:)
contains
! Device enumeration is not meaningful for the host backend.
subroutine list_devices()
implicit none
write(*,'(a36,a12)') "Listing devices is not supported by ", implementation_name
end subroutine list_devices
! Device selection is not meaningful for the host backend.
subroutine set_device(dev)
implicit none
integer, intent(in) :: dev
write(*,'(a32,a12)') "Device != 0 is not supported by ", implementation_name
end subroutine set_device
! Allocate the three stream arrays of the requested size.
subroutine alloc(array_size)
implicit none
integer(kind=StreamIntKind) :: array_size
integer :: ierr
N = array_size
allocate( A(1:N), B(1:N), C(1:N), stat=ierr)
if (ierr .ne. 0) then
write(*,'(a20,i3)') 'allocate returned ',ierr
stop 1
endif
end subroutine alloc
! Release the stream arrays.
subroutine dealloc()
implicit none
integer :: ierr
deallocate( A, B, C, stat=ierr)
if (ierr .ne. 0) then
write(*,'(a20,i3)') 'deallocate returned ',ierr
stop 1
endif
end subroutine dealloc
! Fill A, B and C with their initial values.
subroutine init_arrays(initA, initB, initC)
implicit none
real(kind=REAL64), intent(in) :: initA, initB, initC
integer(kind=StreamIntKind) :: idx
!$omp parallel do simd
do idx=1,N
A(idx) = initA
B(idx) = initB
C(idx) = initC
end do
end subroutine init_arrays
! Copy the stream arrays into the supplied host buffers.
subroutine read_arrays(h_A, h_B, h_C)
implicit none
real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
integer(kind=StreamIntKind) :: idx
!$omp parallel do simd
do idx=1,N
h_A(idx) = A(idx)
h_B(idx) = B(idx)
h_C(idx) = C(idx)
end do
end subroutine read_arrays
! STREAM copy: C = A.
subroutine copy()
implicit none
integer(kind=StreamIntKind) :: idx
!$omp parallel do simd
do idx=1,N
C(idx) = A(idx)
end do
end subroutine copy
! STREAM add: C = A + B.
subroutine add()
implicit none
integer(kind=StreamIntKind) :: idx
!$omp parallel do simd
do idx=1,N
C(idx) = A(idx) + B(idx)
end do
end subroutine add
! STREAM mul: B = scalar * C.
subroutine mul(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: alpha
integer(kind=StreamIntKind) :: idx
alpha = startScalar
!$omp parallel do simd
do idx=1,N
B(idx) = alpha * C(idx)
end do
end subroutine mul
! STREAM triad: A = B + scalar * C.
subroutine triad(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: alpha
integer(kind=StreamIntKind) :: idx
alpha = startScalar
!$omp parallel do simd
do idx=1,N
A(idx) = B(idx) + alpha * C(idx)
end do
end subroutine triad
! STREAM nstream: A = A + B + scalar * C.
subroutine nstream(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: alpha
integer(kind=StreamIntKind) :: idx
alpha = startScalar
!$omp parallel do simd
do idx=1,N
A(idx) = A(idx) + B(idx) + alpha * C(idx)
end do
end subroutine nstream
! Dot product of A and B with an OpenMP sum reduction.
function dot() result(total)
implicit none
real(kind=REAL64) :: total
integer(kind=StreamIntKind) :: idx
total = real(0,kind=REAL64)
!$omp parallel do simd reduction(+:total)
do idx=1,N
total = total + A(idx) * B(idx)
end do
end function dot
end module OpenMPStream

View File

@ -0,0 +1,162 @@
! OpenMP target-offload implementation of BabelStream using the descriptive
! "target teams loop" construct (the prescriptive "distribute parallel do"
! variant lives in OpenMPTargetStream).
module OpenMPTargetLoopStream
use, intrinsic :: ISO_Fortran_env
use BabelStreamTypes
implicit none
character(len=16), parameter :: implementation_name = "OpenMPTargetLoop"
! Number of elements in each stream array.
integer(kind=StreamIntKind) :: N
real(kind=REAL64), allocatable :: A(:), B(:), C(:)
contains
! Print how many OpenMP offload devices are available.
subroutine list_devices()
use omp_lib
implicit none
integer :: num
num = omp_get_num_devices()
if (num.eq.0) then
write(*,'(a17)') "No devices found."
else
! NOTE(review): i1 truncates counts of 10 or more devices - confirm intended.
write(*,'(a10,i1,a8)') "There are ",num," devices."
end if
end subroutine list_devices
! Select OpenMP default device dev; stops on invalid index.
subroutine set_device(dev)
use omp_lib
implicit none
integer, intent(in) :: dev
integer :: num
num = omp_get_num_devices()
if (num.eq.0) then
write(*,'(a17)') "No devices found."
stop
! NOTE(review): OpenMP device numbers are 0-based, so dev == num slips
! through this check - confirm whether the bound should be dev.ge.num.
else if (dev.gt.num) then
write(*,'(a21)') "Invalid device index."
stop
else
call omp_set_default_device(dev)
end if
end subroutine set_device
! Allocate the arrays and map them to the device (unless USE_MANAGED).
subroutine alloc(array_size)
implicit none
integer(kind=StreamIntKind) :: array_size
integer :: err
N = array_size
allocate( A(1:N), B(1:N), C(1:N), stat=err)
! NOTE(review): the stat check runs after "target enter data"; if the host
! allocation failed, unallocated arrays are mapped first - confirm ordering.
#ifndef USE_MANAGED
!$omp target enter data map(alloc: A,B,C)
#endif
if (err .ne. 0) then
write(*,'(a20,i3)') 'allocate returned ',err
stop 1
endif
end subroutine alloc
! Unmap the arrays from the device and release the host memory.
subroutine dealloc()
implicit none
integer :: err
#ifndef USE_MANAGED
!$omp target exit data map(delete: A,B,C)
#endif
deallocate( A, B, C, stat=err)
if (err .ne. 0) then
write(*,'(a20,i3)') 'deallocate returned ',err
stop 1
endif
end subroutine dealloc
! Fill A, B and C with their initial values on the device.
subroutine init_arrays(initA, initB, initC)
implicit none
real(kind=REAL64), intent(in) :: initA, initB, initC
integer(kind=StreamIntKind) :: i
!$omp target teams loop
do i=1,N
A(i) = initA
B(i) = initB
C(i) = initC
end do
end subroutine init_arrays
! Copy the device arrays into the supplied host arrays.
subroutine read_arrays(h_A, h_B, h_C)
implicit none
real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
integer(kind=StreamIntKind) :: i
! this might need to use a copy API instead...
!$omp target teams loop
do i=1,N
h_A(i) = A(i)
h_B(i) = B(i)
h_C(i) = C(i)
end do
end subroutine read_arrays
! STREAM copy: C = A.
subroutine copy()
implicit none
integer(kind=StreamIntKind) :: i
!$omp target teams loop
do i=1,N
C(i) = A(i)
end do
end subroutine copy
! STREAM add: C = A + B.
subroutine add()
implicit none
integer(kind=StreamIntKind) :: i
!$omp target teams loop
do i=1,N
C(i) = A(i) + B(i)
end do
end subroutine add
! STREAM mul: B = scalar * C.
subroutine mul(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
scalar = startScalar
!$omp target teams loop
do i=1,N
B(i) = scalar * C(i)
end do
end subroutine mul
! STREAM triad: A = B + scalar * C.
subroutine triad(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
scalar = startScalar
!$omp target teams loop
do i=1,N
A(i) = B(i) + scalar * C(i)
end do
end subroutine triad
! STREAM nstream: A = A + B + scalar * C.
subroutine nstream(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
scalar = startScalar
!$omp target teams loop
do i=1,N
A(i) = A(i) + B(i) + scalar * C(i)
end do
end subroutine nstream
! Dot product of A and B with a device-side sum reduction.
function dot() result(s)
implicit none
real(kind=REAL64) :: s
integer(kind=StreamIntKind) :: i
s = real(0,kind=REAL64)
!$omp target teams loop reduction(+:s)
do i=1,N
s = s + A(i) * B(i)
end do
end function dot
end module OpenMPTargetLoopStream

View File

@ -0,0 +1,163 @@
! OpenMP target-offload implementation of BabelStream using the prescriptive
! "target teams distribute parallel do simd" construct (the "teams loop"
! variant lives in OpenMPTargetLoopStream).
module OpenMPTargetStream
use, intrinsic :: ISO_Fortran_env
use BabelStreamTypes
implicit none
character(len=12), parameter :: implementation_name = "OpenMPTarget"
! Number of elements in each stream array.
integer(kind=StreamIntKind) :: N
real(kind=REAL64), allocatable :: A(:), B(:), C(:)
contains
! Print how many OpenMP offload devices are available.
subroutine list_devices()
use omp_lib
implicit none
integer :: num
num = omp_get_num_devices()
if (num.eq.0) then
write(*,'(a17)') "No devices found."
else
! i0 (not i1) so counts of 10 or more devices are printed in full.
write(*,'(a,i0,a)') "There are ",num," devices."
end if
end subroutine list_devices
! Select OpenMP default device dev; stops on invalid index.
subroutine set_device(dev)
use omp_lib
implicit none
integer, intent(in) :: dev
integer :: num
num = omp_get_num_devices()
if (num.eq.0) then
write(*,'(a17)') "No devices found."
stop
! NOTE(review): OpenMP device numbers are 0-based, so dev == num slips
! through this check - confirm whether the bound should be dev.ge.num.
else if (dev.gt.num) then
write(*,'(a21)') "Invalid device index."
stop
else
call omp_set_default_device(dev)
end if
end subroutine set_device
! Allocate the arrays, then map them to the device (unless USE_MANAGED).
! The stat check now runs BEFORE the data mapping so a failed host
! allocation can no longer hand unallocated arrays to "target enter data".
subroutine alloc(array_size)
implicit none
integer(kind=StreamIntKind) :: array_size
integer :: err
N = array_size
allocate( A(1:N), B(1:N), C(1:N), stat=err)
if (err .ne. 0) then
write(*,'(a20,i3)') 'allocate returned ',err
stop 1
endif
#ifndef USE_MANAGED
!$omp target enter data map(alloc: A,B,C)
#endif
end subroutine alloc
! Unmap the arrays from the device and release the host memory.
subroutine dealloc()
implicit none
integer :: err
#ifndef USE_MANAGED
!$omp target exit data map(delete: A,B,C)
#endif
deallocate( A, B, C, stat=err)
if (err .ne. 0) then
write(*,'(a20,i3)') 'deallocate returned ',err
stop 1
endif
end subroutine dealloc
! Fill A, B and C with their initial values on the device.
subroutine init_arrays(initA, initB, initC)
implicit none
real(kind=REAL64), intent(in) :: initA, initB, initC
integer(kind=StreamIntKind) :: i
!$omp target teams distribute parallel do simd
do i=1,N
A(i) = initA
B(i) = initB
C(i) = initC
end do
end subroutine init_arrays
! Copy the device arrays into the supplied host arrays.
subroutine read_arrays(h_A, h_B, h_C)
implicit none
real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
integer(kind=StreamIntKind) :: i
! this might need to use a copy API instead...
!$omp target teams distribute parallel do simd
do i=1,N
h_A(i) = A(i)
h_B(i) = B(i)
h_C(i) = C(i)
end do
end subroutine read_arrays
! STREAM copy: C = A. (A stray "!$omp barrier" after the target construct
! was removed: it appeared in no other kernel and a barrier outside a
! parallel region has no effect.)
subroutine copy()
implicit none
integer(kind=StreamIntKind) :: i
!$omp target teams distribute parallel do simd
do i=1,N
C(i) = A(i)
end do
end subroutine copy
! STREAM add: C = A + B.
subroutine add()
implicit none
integer(kind=StreamIntKind) :: i
!$omp target teams distribute parallel do simd
do i=1,N
C(i) = A(i) + B(i)
end do
end subroutine add
! STREAM mul: B = scalar * C.
subroutine mul(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
scalar = startScalar
!$omp target teams distribute parallel do simd
do i=1,N
B(i) = scalar * C(i)
end do
end subroutine mul
! STREAM triad: A = B + scalar * C.
subroutine triad(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
scalar = startScalar
!$omp target teams distribute parallel do simd
do i=1,N
A(i) = B(i) + scalar * C(i)
end do
end subroutine triad
! STREAM nstream: A = A + B + scalar * C.
subroutine nstream(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
scalar = startScalar
!$omp target teams distribute parallel do simd
do i=1,N
A(i) = A(i) + B(i) + scalar * C(i)
end do
end subroutine nstream
! Dot product of A and B with a device-side sum reduction.
function dot() result(s)
implicit none
real(kind=REAL64) :: s
integer(kind=StreamIntKind) :: i
s = real(0,kind=REAL64)
!$omp target teams distribute parallel do simd reduction(+:s)
do i=1,N
s = s + A(i) * B(i)
end do
end function dot
end module OpenMPTargetStream

View File

@ -0,0 +1,169 @@
! Host OpenMP taskloop implementation of BabelStream. Each kernel opens a
! parallel region, lets the master thread generate taskloop tasks, and the
! team executes them.
module OpenMPTaskloopStream
use, intrinsic :: ISO_Fortran_env
use BabelStreamTypes
implicit none
character(len=14), parameter :: implementation_name = "OpenMPTaskloop"
! Number of elements in each stream array.
integer(kind=StreamIntKind) :: N
real(kind=REAL64), allocatable :: A(:), B(:), C(:)
contains
! Device enumeration is not meaningful for the host backend.
subroutine list_devices()
implicit none
write(*,'(a36,a12)') "Listing devices is not supported by ", implementation_name
end subroutine list_devices
! Device selection is not meaningful for the host backend.
subroutine set_device(dev)
implicit none
integer, intent(in) :: dev
write(*,'(a32,a12)') "Device != 0 is not supported by ", implementation_name
end subroutine set_device
! Allocate the three stream arrays of the requested size.
subroutine alloc(array_size)
implicit none
integer(kind=StreamIntKind) :: array_size
integer :: err
N = array_size
allocate( A(1:N), B(1:N), C(1:N), stat=err)
if (err .ne. 0) then
write(*,'(a20,i3)') 'allocate returned ',err
stop 1
endif
end subroutine alloc
! Release the stream arrays.
subroutine dealloc()
implicit none
integer :: err
deallocate( A, B, C, stat=err)
if (err .ne. 0) then
write(*,'(a20,i3)') 'deallocate returned ',err
stop 1
endif
end subroutine dealloc
! Fill A, B and C with their initial values.
subroutine init_arrays(initA, initB, initC)
implicit none
real(kind=REAL64), intent(in) :: initA, initB, initC
integer(kind=StreamIntKind) :: i
!$omp parallel
!$omp master
!$omp taskloop
do i=1,N
A(i) = initA
B(i) = initB
C(i) = initC
end do
!$omp end master
!$omp end parallel
end subroutine init_arrays
! Copy the stream arrays into the supplied host buffers.
subroutine read_arrays(h_A, h_B, h_C)
implicit none
real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
integer(kind=StreamIntKind) :: i
!$omp parallel
!$omp master
!$omp taskloop
do i=1,N
h_A(i) = A(i)
h_B(i) = B(i)
h_C(i) = C(i)
end do
!$omp end master
!$omp end parallel
end subroutine read_arrays
! STREAM copy: C = A.
subroutine copy()
implicit none
integer(kind=StreamIntKind) :: i
!$omp parallel
!$omp master
!$omp taskloop
do i=1,N
C(i) = A(i)
end do
!$omp end master
!$omp end parallel
end subroutine copy
! STREAM add: C = A + B.
subroutine add()
implicit none
integer(kind=StreamIntKind) :: i
!$omp parallel
!$omp master
!$omp taskloop
do i=1,N
C(i) = A(i) + B(i)
end do
!$omp end master
!$omp end parallel
end subroutine add
! STREAM mul: B = scalar * C.
subroutine mul(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
scalar = startScalar
!$omp parallel
!$omp master
!$omp taskloop
do i=1,N
B(i) = scalar * C(i)
end do
!$omp end master
!$omp end parallel
end subroutine mul
! STREAM triad: A = B + scalar * C.
subroutine triad(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
scalar = startScalar
!$omp parallel
!$omp master
!$omp taskloop
do i=1,N
A(i) = B(i) + scalar * C(i)
end do
!$omp end master
!$omp end parallel
end subroutine triad
! STREAM nstream: A = A + B + scalar * C.
subroutine nstream(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
scalar = startScalar
!$omp parallel
!$omp master
!$omp taskloop
do i=1,N
A(i) = A(i) + B(i) + scalar * C(i)
end do
!$omp end master
!$omp end parallel
end subroutine nstream
! Dot product of A and B.
! NOTE(review): "taskloop reduction" requires OpenMP 5.0 support - confirm
! all targeted compilers accept it.
function dot() result(s)
implicit none
real(kind=REAL64) :: s
integer(kind=StreamIntKind) :: i
s = real(0,kind=REAL64)
!$omp parallel
!$omp master
!$omp taskloop reduction(+:s)
do i=1,N
s = s + A(i) * B(i)
end do
!$omp end master
!$omp end parallel
end function dot
end module OpenMPTaskloopStream

View File

@ -0,0 +1,120 @@
! BabelStream kernels implemented with OpenMP "parallel workshare":
! every kernel is a whole-array statement that the runtime divides
! among the threads of the enclosing parallel region.
module OpenMPWorkshareStream
  use, intrinsic :: ISO_Fortran_env
  use BabelStreamTypes
  implicit none
  character(len=15), parameter :: implementation_name = "OpenMPWorkshare"
  ! Number of elements in each of the three stream arrays.
  integer(kind=StreamIntKind) :: N
  real(kind=REAL64), allocatable :: A(:), B(:), C(:)
  contains
    ! Device enumeration is meaningless for a host-only model.
    subroutine list_devices()
      implicit none
      ! Unsized 'a' edit descriptor: the name is 15 characters, so the
      ! original 'a12' truncated it to "OpenMPWorksh".
      write(*,'(a36,a)') "Listing devices is not supported by ", implementation_name
    end subroutine list_devices
    subroutine set_device(dev)
      implicit none
      integer, intent(in) :: dev
      ! Same truncation fix as list_devices ('a' instead of 'a12').
      write(*,'(a32,a)') "Device != 0 is not supported by ", implementation_name
    end subroutine set_device
    ! Allocate A, B, C with array_size elements each; aborts on failure.
    subroutine alloc(array_size)
      implicit none
      integer(kind=StreamIntKind) :: array_size
      integer :: err
      N = array_size
      allocate( A(1:N), B(1:N), C(1:N), stat=err)
      if (err .ne. 0) then
        write(*,'(a20,i3)') 'allocate returned ',err
        stop 1
      endif
    end subroutine alloc
    subroutine dealloc()
      implicit none
      integer :: err
      deallocate( A, B, C, stat=err)
      if (err .ne. 0) then
        write(*,'(a20,i3)') 'deallocate returned ',err
        stop 1
      endif
    end subroutine dealloc
    ! Fill A/B/C with the given starting values.
    subroutine init_arrays(initA, initB, initC)
      implicit none
      real(kind=REAL64), intent(in) :: initA, initB, initC
      !$omp parallel workshare
      A = initA
      B = initB
      C = initC
      !$omp end parallel workshare
    end subroutine init_arrays
    ! Copy A/B/C back into the caller-supplied host buffers.
    subroutine read_arrays(h_A, h_B, h_C)
      implicit none
      real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
      !$omp parallel workshare
      h_A = A
      h_B = B
      h_C = C
      !$omp end parallel workshare
    end subroutine read_arrays
    ! C <- A
    subroutine copy()
      implicit none
      !$omp parallel workshare
      C = A
      !$omp end parallel workshare
    end subroutine copy
    ! C <- A + B
    subroutine add()
      implicit none
      !$omp parallel workshare
      C = A + B
      !$omp end parallel workshare
    end subroutine add
    ! B <- scalar * C
    subroutine mul(startScalar)
      implicit none
      real(kind=REAL64), intent(in) :: startScalar
      real(kind=REAL64) :: scalar
      scalar = startScalar
      !$omp parallel workshare
      B = scalar * C
      !$omp end parallel workshare
    end subroutine mul
    ! A <- B + scalar * C
    subroutine triad(startScalar)
      implicit none
      real(kind=REAL64), intent(in) :: startScalar
      real(kind=REAL64) :: scalar
      scalar = startScalar
      !$omp parallel workshare
      A = B + scalar * C
      !$omp end parallel workshare
    end subroutine triad
    ! A <- A + B + scalar * C
    subroutine nstream(startScalar)
      implicit none
      real(kind=REAL64), intent(in) :: startScalar
      real(kind=REAL64) :: scalar
      scalar = startScalar
      !$omp parallel workshare
      A = A + B + scalar * C
      !$omp end parallel workshare
    end subroutine nstream
    ! Returns sum(A*B) via the intrinsic dot_product inside workshare.
    function dot() result(s)
      implicit none
      real(kind=REAL64) :: s
      !$omp parallel workshare
      s = dot_product(A,B)
      !$omp end parallel workshare
    end function dot
end module OpenMPWorkshareStream

View File

@ -0,0 +1,130 @@
! Reference single-threaded implementation of the BabelStream kernels.
! Used as the baseline all parallel implementations are compared against.
module SequentialStream
  use, intrinsic :: ISO_Fortran_env
  use BabelStreamTypes
  implicit none
  character(len=10), parameter :: implementation_name = "Sequential"
  ! Number of elements in each of the three stream arrays.
  integer(kind=StreamIntKind) :: N
  real(kind=REAL64), allocatable :: A(:), B(:), C(:)
  contains
    ! Device enumeration is meaningless for a host-only model.
    subroutine list_devices()
      implicit none
      ! (removed an unused local: integer :: num)
      write(*,'(a36,a10)') "Listing devices is not supported by ", implementation_name
    end subroutine list_devices
    subroutine set_device(dev)
      implicit none
      integer, intent(in) :: dev
      write(*,'(a32,a10)') "Device != 0 is not supported by ", implementation_name
    end subroutine set_device
    ! Allocate A, B, C with array_size elements each; aborts on failure.
    subroutine alloc(array_size)
      implicit none
      integer(kind=StreamIntKind) :: array_size
      integer :: err
      N = array_size
      allocate( A(1:N), B(1:N), C(1:N), stat=err)
      if (err .ne. 0) then
        write(*,'(a20,i3)') 'allocate returned ',err
        stop 1
      endif
    end subroutine alloc
    subroutine dealloc()
      implicit none
      integer :: err
      deallocate( A, B, C, stat=err)
      if (err .ne. 0) then
        write(*,'(a20,i3)') 'deallocate returned ',err
        stop 1
      endif
    end subroutine dealloc
    ! Fill A/B/C with the given starting values.
    subroutine init_arrays(initA, initB, initC)
      implicit none
      real(kind=REAL64), intent(in) :: initA, initB, initC
      integer(kind=StreamIntKind) :: i
      do i=1,N
        A(i) = initA
        B(i) = initB
        C(i) = initC
      end do
    end subroutine init_arrays
    ! Copy A/B/C back into the caller-supplied host buffers.
    subroutine read_arrays(h_A, h_B, h_C)
      implicit none
      real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
      integer(kind=StreamIntKind) :: i
      do i=1,N
        h_A(i) = A(i)
        h_B(i) = B(i)
        h_C(i) = C(i)
      end do
    end subroutine read_arrays
    ! C <- A
    subroutine copy()
      implicit none
      integer(kind=StreamIntKind) :: i
      do i=1,N
        C(i) = A(i)
      end do
    end subroutine copy
    ! C <- A + B
    subroutine add()
      implicit none
      integer(kind=StreamIntKind) :: i
      do i=1,N
        C(i) = A(i) + B(i)
      end do
    end subroutine add
    ! B <- scalar * C
    subroutine mul(startScalar)
      implicit none
      real(kind=REAL64), intent(in) :: startScalar
      real(kind=REAL64) :: scalar
      integer(kind=StreamIntKind) :: i
      scalar = startScalar
      do i=1,N
        B(i) = scalar * C(i)
      end do
    end subroutine mul
    ! A <- B + scalar * C
    subroutine triad(startScalar)
      implicit none
      real(kind=REAL64), intent(in) :: startScalar
      real(kind=REAL64) :: scalar
      integer(kind=StreamIntKind) :: i
      scalar = startScalar
      do i=1,N
        A(i) = B(i) + scalar * C(i)
      end do
    end subroutine triad
    ! A <- A + B + scalar * C
    subroutine nstream(startScalar)
      implicit none
      real(kind=REAL64), intent(in) :: startScalar
      real(kind=REAL64) :: scalar
      integer(kind=StreamIntKind) :: i
      scalar = startScalar
      do i=1,N
        A(i) = A(i) + B(i) + scalar * C(i)
      end do
    end subroutine nstream
    ! Returns sum(A*B) accumulated serially in double precision.
    function dot() result(s)
      implicit none
      real(kind=REAL64) :: s
      integer(kind=StreamIntKind) :: i
      s = real(0,kind=REAL64)
      do i=1,N
        s = s + A(i) * B(i)
      end do
    end function dot
end module SequentialStream

54
src/fortran/build.sh Executable file
View File

@ -0,0 +1,54 @@
#!/bin/bash
# Build every BabelStream Fortran implementation with every Fortran
# compiler detected on this machine.

# uncomment to disable GPU targets
#HAS_GPU=0

# Orin
#if [ "x${compiler}" == "xgcc" ] ; then
#    export MCPU=cortex-a78ae
#fi
#if [ "x${compiler}" == "xarm" ] ; then
#    export MCPU=cortex-a78
#fi

COMPILERS="gcc"
# Quote every command substitution below: an unquoted empty result makes
# `[ ... ]` either degenerate or fail with "unary operator expected".
if [ -n "$(which nvfortran)" ] ; then
    COMPILERS="${COMPILERS} nvhpc"
fi
if [ -n "$(which crayftn)" ] ; then
    COMPILERS="${COMPILERS} cray"
fi
if [ "$(uname -m)" == "aarch64" ] ; then
    if [ -n "$(which armflang)" ] ; then
        COMPILERS="${COMPILERS} arm"
    fi
    if [ -n "$(which frt)" ] ; then
        COMPILERS="${COMPILERS} fj"
    fi
elif [ "$(uname -m)" == "x86_64" ] ; then
    # Quoted so the comparison is false (not a syntax error) when lscpu
    # is missing or the CPU is not Intel.
    if [ "$(which lscpu >& /dev/null && lscpu | grep GenuineIntel | awk '{print $3}')" == "GenuineIntel" ] ; then
        COMPILERS="${COMPILERS} oneapi"
        if [ -f /opt/intel/oneapi/setvars.sh ] ; then
            . /opt/intel/oneapi/setvars.sh >& /dev/null
        fi
    else
        # ^ this detection can be improved
        COMPILERS="${COMPILERS} amd"
    fi
fi

for compiler in ${COMPILERS} ; do
    TARGETS="DoConcurrent Array OpenMP OpenMPTaskloop OpenMPWorkshare"
    if [ "${HAS_GPU}" != "0" ] ; then
        TARGETS="${TARGETS} OpenMPTarget OpenMPTargetLoop"
        if [ "x${compiler}" == "xnvhpc" ] ; then
            TARGETS="${TARGETS} CUDA CUDAKernel"
        fi
    fi
    if [ "x${compiler}" == "xnvhpc" ] || [ "x${compiler}" == "xgcc" ] || [ "x${compiler}" == "xcray" ] ; then
        TARGETS="${TARGETS} OpenACC OpenACCArray"
    fi
    for implementation in ${TARGETS} ; do
        make COMPILER=${compiler} IMPLEMENTATION=${implementation}
    done
done

683
src/fortran/main.F90 Normal file
View File

@ -0,0 +1,683 @@
module BabelStreamUtil
use, intrinsic :: ISO_Fortran_env, only: REAL64,INT64
use BabelStreamTypes
implicit none
integer(kind=StreamIntKind) :: array_size = 33554432
integer(kind=StreamIntKind) :: num_times = 100
logical :: mibibytes = .false.
logical :: use_gigs = .false.
logical :: csv = .false.
character(len=1), parameter :: csv_sep = ","
! 1 = All
! 2 = Triad
! 3 = Nstream
integer :: selection = 1
real(kind=REAL64), parameter :: startA = real(0.1d0,kind=REAL64)
real(kind=REAL64), parameter :: startB = real(0.2d0,kind=REAL64)
real(kind=REAL64), parameter :: startC = real(0.0d0,kind=REAL64)
real(kind=REAL64), parameter :: startScalar = real(0.4d0,kind=REAL64)
contains
! Return a wall-clock timestamp in seconds.  The clock source is chosen
! at compile time: omp_get_wtime, cpu_time, or (default) system_clock.
function get_wtime() result(t)
#if defined(USE_OMP_GET_WTIME)
use omp_lib
implicit none
real(kind=REAL64) :: t
t = omp_get_wtime()
#elif defined(USE_CPU_TIME)
! NOTE(review): cpu_time measures CPU time, not wall time — with
! multiple threads this overstates elapsed time.  Opt-in only.
implicit none
real(kind=REAL64) :: t
real :: r
call cpu_time(r)
t = r
#else
implicit none
real(kind=REAL64) :: t
integer(kind=INT64) :: c, r
! INT64 count avoids early wrap-around of the tick counter.
call system_clock(count = c, count_rate = r)
t = real(c,REAL64) / real(r,REAL64)
#endif
end function get_wtime
! Parse the command line and set the module-level options (array_size,
! num_times, selection, csv, mibibytes, use_gigs).  Options that take a
! value read it from the following argument.  --list, --device, --float,
! --compiler-info and --help terminate the program.
subroutine parseArguments()
use, intrinsic :: ISO_Fortran_env, only: compiler_version, compiler_options
! Only the implementation selected at compile time provides
! list_devices/set_device.
#if defined(USE_DOCONCURRENT)
use DoConcurrentStream, only: list_devices, set_device
#elif defined(USE_ARRAY)
use ArrayStream, only: list_devices, set_device
#elif defined(USE_OPENMP)
use OpenMPStream, only: list_devices, set_device
#elif defined(USE_OPENMPWORKSHARE)
use OpenMPWorkshareStream, only: list_devices, set_device
#elif defined(USE_OPENMPTARGET)
use OpenMPTargetStream, only: list_devices, set_device
#elif defined(USE_OPENMPTARGETLOOP)
use OpenMPTargetLoopStream, only: list_devices, set_device
#elif defined(USE_OPENMPTASKLOOP)
use OpenMPTaskloopStream, only: list_devices, set_device
#elif defined(USE_OPENACC)
use OpenACCStream, only: list_devices, set_device
#elif defined(USE_OPENACCARRAY)
use OpenACCArrayStream, only: list_devices, set_device
#elif defined(USE_CUDA)
use CUDAStream, only: list_devices, set_device
#elif defined(USE_CUDAKERNEL)
use CUDAKernelStream, only: list_devices, set_device
#elif defined(USE_SEQUENTIAL)
use SequentialStream, only: list_devices, set_device
#endif
implicit none
integer :: i, argc
integer :: arglen,err,pos(2)
! NOTE(review): options longer than 64 characters are truncated here.
character(len=64) :: argtmp
argc = command_argument_count()
do i=1,argc
call get_command_argument(i,argtmp,arglen,err)
if (err.eq.0) then
!
! list devices
!
! index(...)==1 means "argument starts with this flag".
pos(1) = index(argtmp,"--list")
if (pos(1).eq.1) then
call list_devices()
stop
endif
!
! set device number
!
pos(1) = index(argtmp,"--device")
if (pos(1).eq.1) then
if (i+1.gt.argc) then
print*,'You failed to provide a value for ',argtmp
stop
else
call get_command_argument(i+1,argtmp,arglen,err)
block
integer :: dev
read(argtmp,'(i15)') dev
call set_device(dev)
end block
endif
cycle
endif
!
! array size
!
pos(1) = index(argtmp,"--arraysize")
pos(2) = index(argtmp,"-s")
if (any(pos(:).eq.1) ) then
if (i+1.gt.argc) then
print*,'You failed to provide a value for ',argtmp
else
call get_command_argument(i+1,argtmp,arglen,err)
block
! Read into INT64 first so a size that overflows the configured
! StreamIntKind is detected instead of silently wrapping.
integer(kind=INT64) :: big_size
read(argtmp,'(i15)') big_size
if (big_size .gt. HUGE(array_size)) then
print*,'Array size does not fit into integer:'
print*,big_size,'>',HUGE(array_size)
print*,'Stop using USE_INT32'
stop
else
array_size = INT(big_size,kind=StreamIntKind)
endif
end block
endif
cycle
endif
!
! number of iterations
!
pos(1) = index(argtmp,"--numtimes")
pos(2) = index(argtmp,"-n")
if (any(pos(:).eq.1) ) then
if (i+1.gt.argc) then
print*,'You failed to provide a value for ',argtmp
else
call get_command_argument(i+1,argtmp,arglen,err)
read(argtmp,'(i15)') num_times
! The first iteration is discarded in the statistics, so at least
! two runs are required.
if (num_times.lt.2) then
write(*,'(a)') "Number of times must be 2 or more"
stop
end if
endif
cycle
endif
!
! precision
!
pos(1) = index(argtmp,"--float")
if (pos(1).eq.1) then
write(*,'(a46,a39)') "Sorry, you have to recompile with -DUSE_FLOAT ", &
"to run BabelStream in single precision."
stop
endif
!
! selection (All, Triad, Nstream)
!
pos(1) = index(argtmp,"--triad-only")
if (pos(1).eq.1) then
selection = 2
cycle
endif
pos(1) = index(argtmp,"--nstream-only")
if (pos(1).eq.1) then
selection = 3
cycle
endif
!
! CSV
!
! NOTE(review): unlike the other flags this branch does not 'cycle';
! harmless today because the remaining tests cannot also match.
pos(1) = index(argtmp,"--csv")
if (pos(1).eq.1) then
csv = .true.
!write(*,'(a39)') "Sorry, CSV support isn't available yet."
!stop
endif
!
! units
!
pos(1) = index(argtmp,"--mibibytes")
if (pos(1).eq.1) then
mibibytes = .true.
cycle
endif
!
! giga/gibi instead of mega/mebi
!
pos(1) = index(argtmp,"--gigs")
if (pos(1).eq.1) then
use_gigs = .true.
cycle
endif
!
! print compiler identification and exit
!
pos(1) = index(argtmp,"--compiler-info")
if (pos(1).eq.1) then
! NOTE(review): format '(a)' with two items prints label and value
! on separate records (format reversion) — confirm that is intended.
write(*,'(a)') 'Compiler version: ',compiler_version()
write(*,'(a)') 'Compiler options: ',compiler_options()
stop
endif
!
! help
!
pos(1) = index(argtmp,"--help")
pos(2) = index(argtmp,"-h")
if (any(pos(:).eq.1) ) then
call get_command_argument(0,argtmp,arglen,err)
write(*,'(a7,a,a10)') "Usage: ", trim(argtmp), " [OPTIONS]"
write(*,'(a)') "Options:"
write(*,'(a)') " -h --help Print the message"
write(*,'(a)') " --list List available devices"
write(*,'(a)') " --device INDEX Select device at INDEX"
write(*,'(a)') " -s --arraysize SIZE Use SIZE elements in the array"
write(*,'(a)') " -n --numtimes NUM Run the test NUM times (NUM >= 2)"
!write(*,'(a)') " --float Use floats (rather than doubles)"
write(*,'(a)') " --triad-only Only run triad"
write(*,'(a)') " --nstream-only Only run nstream"
write(*,'(a)') " --csv Output as csv table"
write(*,'(a)') " --mibibytes Use MiB=2^20 for bandwidth calculation (default MB=10^6)"
write(*,'(a)') " --gigs Use GiB=2^30 or GB=10^9 instead of MiB/MB"
write(*,'(a)') " --compiler-info Print information about compiler and flags, then exit."
stop
endif
end if
end do
end subroutine parseArguments
! Run the full five-kernel benchmark num_times times.
! timings(k,i) receives the wall time of kernel k (1=copy, 2=mul,
! 3=add, 4=triad, 5=dot) in iteration i; summ receives the last dot
! result for validation.
subroutine run_all(timings, summ)
#if defined(USE_DOCONCURRENT)
use DoConcurrentStream
#elif defined(USE_ARRAY)
use ArrayStream
#elif defined(USE_OPENMP)
use OpenMPStream
#elif defined(USE_OPENMPWORKSHARE)
use OpenMPWorkshareStream
#elif defined(USE_OPENMPTARGET)
use OpenMPTargetStream
#elif defined(USE_OPENMPTARGETLOOP)
use OpenMPTargetLoopStream
#elif defined(USE_OPENMPTASKLOOP)
use OpenMPTaskloopStream
#elif defined(USE_OPENACC)
use OpenACCStream
#elif defined(USE_OPENACCARRAY)
use OpenACCArrayStream
#elif defined(USE_CUDA)
use CUDAStream
#elif defined(USE_CUDAKERNEL)
use CUDAKernelStream
#elif defined(USE_SEQUENTIAL)
use SequentialStream
#endif
implicit none
real(kind=REAL64), intent(inout) :: timings(:,:)
real(kind=REAL64), intent(out) :: summ
real(kind=REAL64) :: t1, t2
integer(kind=StreamIntKind) :: i
do i=1,num_times
t1 = get_wtime()
call copy()
t2 = get_wtime()
timings(1,i) = t2-t1
t1 = get_wtime()
call mul(startScalar)
t2 = get_wtime()
timings(2,i) = t2-t1
t1 = get_wtime()
call add()
t2 = get_wtime()
timings(3,i) = t2-t1
t1 = get_wtime()
call triad(startScalar)
t2 = get_wtime()
timings(4,i) = t2-t1
t1 = get_wtime()
summ = dot()
t2 = get_wtime()
timings(5,i) = t2-t1
end do
end subroutine run_all
! Run only the triad kernel num_times times; timings(1,i) receives the
! wall time of iteration i.
subroutine run_triad(timings)
#if defined(USE_DOCONCURRENT)
use DoConcurrentStream
#elif defined(USE_ARRAY)
use ArrayStream
#elif defined(USE_OPENMP)
use OpenMPStream
#elif defined(USE_OPENMPWORKSHARE)
use OpenMPWorkshareStream
#elif defined(USE_OPENMPTARGET)
use OpenMPTargetStream
#elif defined(USE_OPENMPTARGETLOOP)
use OpenMPTargetLoopStream
#elif defined(USE_OPENMPTASKLOOP)
use OpenMPTaskloopStream
#elif defined(USE_OPENACC)
use OpenACCStream
#elif defined(USE_OPENACCARRAY)
use OpenACCArrayStream
#elif defined(USE_CUDA)
use CUDAStream
#elif defined(USE_CUDAKERNEL)
use CUDAKernelStream
#elif defined(USE_SEQUENTIAL)
use SequentialStream
#endif
implicit none
real(kind=REAL64), intent(inout) :: timings(:,:)
real(kind=REAL64) :: t1, t2
integer(kind=StreamIntKind) :: i
do i=1,num_times
t1 = get_wtime()
call triad(startScalar)
t2 = get_wtime()
timings(1,i) = t2-t1
end do
end subroutine run_triad
! Run only the nstream kernel num_times times; timings(1,i) receives
! the wall time of iteration i.
subroutine run_nstream(timings)
#if defined(USE_DOCONCURRENT)
use DoConcurrentStream
#elif defined(USE_ARRAY)
use ArrayStream
#elif defined(USE_OPENMP)
use OpenMPStream
#elif defined(USE_OPENMPWORKSHARE)
use OpenMPWorkshareStream
#elif defined(USE_OPENMPTARGET)
use OpenMPTargetStream
#elif defined(USE_OPENMPTARGETLOOP)
use OpenMPTargetLoopStream
#elif defined(USE_OPENMPTASKLOOP)
use OpenMPTaskloopStream
#elif defined(USE_OPENACC)
use OpenACCStream
#elif defined(USE_OPENACCARRAY)
use OpenACCArrayStream
#elif defined(USE_CUDA)
use CUDAStream
#elif defined(USE_CUDAKERNEL)
use CUDAKernelStream
#elif defined(USE_SEQUENTIAL)
use SequentialStream
#endif
implicit none
real(kind=REAL64), intent(inout) :: timings(:,:)
real(kind=REAL64) :: t1, t2
integer(kind=StreamIntKind) :: i
do i=1,num_times
t1 = get_wtime()
call nstream(startScalar)
t2 = get_wtime()
timings(1,i) = t2-t1
end do
end subroutine run_nstream
! Validate the final contents of A, B, C and the dot-product result by
! replaying the kernel sequence on scalars ("gold" model) and comparing
! average per-element error against 100*epsilon of the stream type.
subroutine check_solution(A, B, C, summ)
  use, intrinsic :: IEEE_Arithmetic, only: IEEE_Is_Normal
  implicit none
  real(kind=REAL64), intent(in) :: A(:), B(:), C(:)
  real(kind=REAL64), intent(in) :: summ
  integer(kind=StreamIntKind) :: i
  real(kind=REAL64) :: goldA, goldB, goldC, goldSum
  real(kind=REAL64) :: scalar
  ! always use double because of accumulation error
  real(kind=REAL64) :: errA, errB, errC, errSum, epsi
  logical :: cleanA, cleanB, cleanC, cleanSum
  goldA = startA
  goldB = startB
  goldC = startC
  goldSum = 0.0d0
  scalar = startScalar
  ! Replay the kernel sequence that the selected benchmark ran.
  do i=1,num_times
    if (selection.eq.1) then
      goldC = goldA
      goldB = scalar * goldC
      goldC = goldA + goldB
      goldA = goldB + scalar * goldC
    else if (selection.eq.2) then
      goldA = goldB + scalar * goldC
    else if (selection.eq.3) then
      goldA = goldA + goldB + scalar * goldC
    endif
  end do
  goldSum = goldA * goldB * array_size
  ! Reject NaN/Inf/subnormal values before measuring the error.
  cleanA = ALL(IEEE_Is_Normal(A))
  cleanB = ALL(IEEE_Is_Normal(B))
  cleanC = ALL(IEEE_Is_Normal(C))
  cleanSum = IEEE_Is_Normal(summ)
  if (.not. cleanA) then
    write(*,'(a51)') "Validation failed on A. Contains NaN/Inf/Subnormal."
  end if
  if (.not. cleanB) then
    write(*,'(a51)') "Validation failed on B. Contains NaN/Inf/Subnormal."
  end if
  if (.not. cleanC) then
    write(*,'(a51)') "Validation failed on C. Contains NaN/Inf/Subnormal."
  end if
  if (.not. cleanSum) then
    write(*,'(a54,e20.12)') "Validation failed on Sum. Contains NaN/Inf/Subnormal: ",summ
  end if
  errA = SUM( ABS( A - goldA ) ) / array_size
  errB = SUM( ABS( B - goldB ) ) / array_size
  errC = SUM( ABS( C - goldC ) ) / array_size
  errSum = ABS( (summ - goldSum) / goldSum)
  epsi = epsilon(real(0,kind=StreamRealKind)) * 100.0d0
  if (errA .gt. epsi) then
    write(*,'(a38,e20.12)') "Validation failed on A. Average error ", errA
  end if
  if (errB .gt. epsi) then
    write(*,'(a38,e20.12)') "Validation failed on B. Average error ", errB
  end if
  if (errC .gt. epsi) then
    write(*,'(a38,e20.12)') "Validation failed on C. Average error ", errC
  end if
  if (selection.eq.1) then
    if (errSum .gt. 1.0e-8) then
      write(*,'(a38,e20.12)') "Validation failed on Sum. Error ", errSum
      ! Report the expected value: the original printed errSum after
      ! "but should be", which is the relative error, not the gold sum.
      write(*,'(a8,e20.12,a15,e20.12)') "Sum was ",summ, " but should be ", goldSum
    end if
  endif
end subroutine check_solution
end module BabelStreamUtil
! Driver: parse options, run the selected kernels, validate the result
! and print bandwidth statistics (plain table or CSV).
program BabelStream
use BabelStreamUtil
#if defined(USE_DOCONCURRENT)
use DoConcurrentStream
#elif defined(USE_ARRAY)
use ArrayStream
#elif defined(USE_OPENMP)
use OpenMPStream
#elif defined(USE_OPENMPWORKSHARE)
use OpenMPWorkshareStream
#elif defined(USE_OPENMPTARGET)
use OpenMPTargetStream
#elif defined(USE_OPENMPTARGETLOOP)
use OpenMPTargetLoopStream
#elif defined(USE_OPENMPTASKLOOP)
use OpenMPTaskloopStream
#elif defined(USE_OPENACC)
use OpenACCStream
#elif defined(USE_OPENACCARRAY)
use OpenACCArrayStream
#elif defined(USE_CUDA)
use CUDAStream
#elif defined(USE_CUDAKERNEL)
use CUDAKernelStream
#elif defined(USE_SEQUENTIAL)
use SequentialStream
#endif
implicit none
integer :: element_size, err
real(kind=REAL64) :: scaling
character(len=3) :: label
real(kind=REAL64), allocatable :: timings(:,:)
real(kind=REAL64), allocatable :: h_A(:), h_B(:), h_C(:)
real(kind=REAL64) :: summ
real(kind=REAL64) :: init_tic, init_toc, read_tic, read_toc
call parseArguments()
! Bytes per element of the configured stream type.
element_size = storage_size(real(0,kind=StreamRealKind)) / 8
! Select the unit scale: MB/GB (decimal) or MiB/GiB (binary).
if (mibibytes) then
if (use_gigs) then
scaling = 2.0d0**(-30)
label = "GiB"
else
scaling = 2.0d0**(-20)
label = "MiB"
endif
else
if (use_gigs) then
scaling = 1.0d-9
label = "GB"
else
scaling = 1.0d-6
label = "MB"
endif
endif
if (.not.csv) then
write(*,'(a)') "BabelStream Fortran"
write(*,'(a9,f4.1)') "Version: ", VERSION_STRING
write(*,'(a16,a)') "Implementation: ", implementation_name
block
character(len=32) :: printout
write(printout,'(i9,1x,a5)') num_times,'times'
write(*,'(a16,a)') 'Running kernels ',ADJUSTL(printout)
end block
write(*,'(a11,a6)') 'Precision: ',ADJUSTL(StreamRealName)
write(*,'(a12,f9.1,a3)') 'Array size: ',1.0d0 * element_size * (array_size * scaling), label
write(*,'(a12,f9.1,a3)') 'Total size: ',3.0d0 * element_size * (array_size * scaling), label
endif ! csv
! Row k of timings holds kernel k; only row 1 is used for single-kernel
! selections (triad-only / nstream-only).
allocate( timings(5,num_times) )
call alloc(array_size)
init_tic = get_wtime()
call init_arrays(startA, startB, startC)
init_toc = get_wtime()
summ = 0.0d0
! TRIM(label)//'ytes/sec)' prints e.g. "MBytes/sec)".
if (.not.csv) then
write(*,'(a6,f9.1,a4,f9.1,a3,a9)') 'Init: ',init_toc-init_tic, 's (=', &
(3.0d0 * element_size * array_size * scaling) / (init_toc-init_tic), TRIM(label), 'ytes/sec)'
end if
timings = -1.0d0
if (selection.eq.1) then
call run_all(timings, summ)
else if (selection.eq.2) then
call run_triad(timings)
else if (selection.eq.3) then
call run_nstream(timings)
endif
allocate( h_A(1:array_size), h_B(1:array_size), h_C(1:array_size), stat=err)
if (err .ne. 0) then
write(*,'(a20,i3)') 'allocate returned ',err
stop 1
endif
read_tic = get_wtime()
call read_arrays(h_A, h_B, h_C)
read_toc = get_wtime()
if (.not.csv) then
write(*,'(a6,f9.1,a4,f9.1,a3,a9)') 'Read: ',read_toc-read_tic, 's (=', &
(3.0d0 * element_size * array_size * scaling) / (read_toc-read_tic), TRIM(label), 'ytes/sec)'
end if
call check_solution(h_A, h_B, h_C, summ)
! Statistics: the first iteration is discarded as warm-up, hence the
! 2:num_times slices and the (num_times-1) divisor.
block
character(len=20) :: printout(8)
real(kind=REAL64) :: tmin,tmax,tavg,nbytes
if (csv) then
write(*,'(a,a1)',advance='no') 'function', csv_sep
write(*,'(a,a1)',advance='no') 'num_times', csv_sep
write(*,'(a,a1)',advance='no') 'n_elements',csv_sep
write(*,'(a,a1)',advance='no') 'sizeof', csv_sep
if (mibibytes) then
write(*,'(a,a1)',advance='no') 'max_mibytes_per_sec',csv_sep
else
write(*,'(a,a1)',advance='no') 'max_mbytes_per_sec', csv_sep
endif
write(*,'(a,a1)',advance='no') 'min_runtime',csv_sep
write(*,'(a,a1)',advance='no') 'max_runtime',csv_sep
write(*,'(a,a1)',advance='yes') 'avg_runtime'
else
write(printout(1),'(a8)') 'Function'
write(printout(2),'(a3,a8)') TRIM(label),'ytes/sec'
write(printout(3),'(a9)') 'Min (sec)'
write(printout(4),'(a3)') 'Max'
write(printout(5),'(a7)') 'Average'
write(*,'(5a12)') ADJUSTL(printout(1:5))
endif ! csv
if (selection.eq.1) then
block
! sizes(k): number of array transfers per element for kernel k,
! used to compute the bytes moved.
integer, parameter :: sizes(5) = [2,2,3,3,2]
character(len=5), parameter :: labels(5) = ["Copy ", "Mul  ", "Add  ", "Triad", "Dot  "]
integer :: i
do i=1,5
tmin = MINVAL(timings(i,2:num_times))
tmax = MAXVAL(timings(i,2:num_times))
tavg = SUM(timings(i,2:num_times)) / (num_times-1)
nbytes = element_size * REAL(array_size,kind=REAL64) * sizes(i)
write(printout(1),'(a)') labels(i)
if (csv) then
write(printout(2),'(i20)') num_times
write(printout(3),'(i20)') array_size
write(printout(4),'(i20)') element_size
write(printout(5),'(i20)') INT(scaling*nbytes/tmin)
write(printout(6),'(f20.8)') tmin
write(printout(7),'(f20.8)') tmax
write(printout(8),'(f20.8)') tavg
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(1))),csv_sep
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(2))),csv_sep
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(3))),csv_sep
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(4))),csv_sep
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(5))),csv_sep
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(6))),csv_sep
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(7))),csv_sep
write(*,'(a,a1)',advance='yes') TRIM(ADJUSTL(printout(8)))
else
write(printout(2),'(f12.3)') scaling*nbytes/tmin
write(printout(3),'(f12.5)') tmin
write(printout(4),'(f12.5)') tmax
write(printout(5),'(f12.5)') tavg
write(*,'(5a12)') ADJUSTL(printout(1:5))
endif
enddo
end block
else if ((selection.eq.2).or.(selection.eq.3)) then
tmin = MINVAL(timings(1,2:num_times))
tmax = MAXVAL(timings(1,2:num_times))
tavg = SUM(timings(1,2:num_times)) / (num_times-1)
! Triad moves 3 arrays per element, nstream moves 4.
if (selection.eq.2) then
nbytes = element_size * REAL(array_size,kind=REAL64) * 3
write(printout(1),'(a12)') "Triad"
else if (selection.eq.3) then
nbytes = element_size * REAL(array_size,kind=REAL64) * 4
write(printout(1),'(a12)') "Nstream"
endif
if (csv) then
write(printout(2),'(i20)') num_times
write(printout(3),'(i20)') array_size
write(printout(4),'(i20)') element_size
write(printout(5),'(i20)') INT(scaling*nbytes/tmin)
write(printout(6),'(f20.8)') tmin
write(printout(7),'(f20.8)') tmax
write(printout(8),'(f20.8)') tavg
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(1))),csv_sep
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(2))),csv_sep
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(3))),csv_sep
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(4))),csv_sep
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(5))),csv_sep
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(6))),csv_sep
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(7))),csv_sep
write(*,'(a,a1)',advance='yes') TRIM(ADJUSTL(printout(8)))
else
write(printout(2),'(f12.3)') scaling*nbytes/tmin
write(printout(3),'(f12.5)') tmin
write(printout(4),'(f12.5)') tmax
write(printout(5),'(f12.5)') tavg
write(*,'(5a12)') ADJUSTL(printout(1:5))
endif
endif
end block
call dealloc()
end program BabelStream

25
src/fortran/make.inc.amd Normal file
View File

@ -0,0 +1,25 @@
# Toolchain definitions for AMD flang (ROCm / AOCC).
FC := /opt/rocm/llvm/bin/flang
# NOTE(review): the original unconditionally overrode FC with a
# user-specific AOCC install path that exists on only one machine.
# Kept as an opt-in example instead of silently clobbering FC.
#FC := /global/u1/j/jhammond/AMD/aocc-compiler-3.2.0/bin/flang
FCFLAGS := -std=f2018 -O3
FCFLAGS += -Wall -Wno-unused-variable
ifdef MARCH
FCFLAGS += -march=$(MARCH)
else
FCFLAGS += -march=native
endif
DOCONCURRENT_FLAG = -fopenmp # libomp.so required
ARRAY_FLAG = -fopenmp # libomp.so required
OPENMP_FLAG = -fopenmp
#OPENMP_FLAG += -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx908
OPENACC_FLAG = -fopenacc
CUDA_FLAG =
SEQUENTIAL_FLAG =
ifeq ($(IMPLEMENTATION),CUDA)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif
ifeq ($(IMPLEMENTATION),CUDAKernels)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif

39
src/fortran/make.inc.arm Normal file
View File

@ -0,0 +1,39 @@
# Toolchain definitions for Arm Fortran (armflang).
FC = armflang
FCFLAGS = -std=f2018 -O3
FCFLAGS += -Wall -Wno-unused-variable
# MARCH=neoverse-v1,neoverse-n1,icelake-server,znver3,cortex-a78
# On aarch64 prefer -mcpu (MCPU overridable); elsewhere fall back to
# -march (MARCH overridable).
ARCH=$(shell uname -m)
ifeq ($(ARCH),aarch64)
ifdef MCPU
FCFLAGS += -mcpu=$(MCPU)
else
FCFLAGS += -mcpu=native
endif
else
ifdef MARCH
FCFLAGS += -march=$(MARCH)
else
FCFLAGS += -march=native
endif
endif
DOCONCURRENT_FLAG = -fopenmp
ARRAY_FLAG = -fopenmp
OPENMP_FLAG = -fopenmp
OPENACC_FLAG = -fopenacc
CUDA_FLAG =
SEQUENTIAL_FLAG =
# armflang has no OpenACC or CUDA Fortran support.
ifeq ($(IMPLEMENTATION),OpenACC)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif
ifeq ($(IMPLEMENTATION),OpenACCArray)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif
ifeq ($(IMPLEMENTATION),CUDA)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif
ifeq ($(IMPLEMENTATION),CUDAKernels)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif

18
src/fortran/make.inc.cray Normal file
View File

@ -0,0 +1,18 @@
# Toolchain definitions for the Cray Fortran compiler (ftn wrapper).
FC := ftn
FCFLAGS = -e F -O3
DOCONCURRENT_FLAG = -h thread_do_concurrent -DCRAY_THREAD_DOCONCURRENT
ARRAY_FLAG = -h autothread
OPENMP_FLAG = -h omp
OPENACC_FLAG = -h acc
# CPU only
OPENACC_FLAG += -h omp
CUDA_FLAG =
SEQUENTIAL_FLAG =
# CUDA Fortran is NVIDIA-specific.
ifeq ($(IMPLEMENTATION),CUDA)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif
ifeq ($(IMPLEMENTATION),CUDAKernels)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif

21
src/fortran/make.inc.fj Normal file
View File

@ -0,0 +1,21 @@
# Toolchain definitions for the Fujitsu Fortran compiler (A64FX).
FC := frt
FCFLAGS = -X08 -Kfast -KA64FX -KSVE -KARMV8_3_A -Kzfill=100 -Kprefetch_sequential=soft -Kprefetch_line=8 -Kprefetch_line_L2=16 -Koptmsg=2 -Keval -DUSE_OMP_GET_WTIME=1 # FJ Fortran system_clock is low resolution
DOCONCURRENT_FLAG = -Kparallel,reduction -DNOTSHARED
ARRAY_FLAG = -Kparallel,reduction
OPENMP_FLAG = -fopenmp
OPENACC_FLAG =
# CPU only
OPENACC_FLAG +=
CUDA_FLAG =
SEQUENTIAL_FLAG =
# Spelled "OpenACC" to match the build scripts: the original guard
# compared against "OPENACC" and could never fire.  The OpenACCArray
# guard is added for consistency with the other make.inc files.
ifeq ($(IMPLEMENTATION),OpenACC)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif
ifeq ($(IMPLEMENTATION),OpenACCArray)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif
ifeq ($(IMPLEMENTATION),CUDA)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif
ifeq ($(IMPLEMENTATION),CUDAKernels)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif

33
src/fortran/make.inc.gcc Normal file
View File

@ -0,0 +1,33 @@
# Toolchain definitions for GNU Fortran.
FC = gfortran
FCFLAGS = -std=f2018 -O3
FCFLAGS += -Wall -Wno-unused-dummy-argument -Wno-unused-variable
# MARCH=neoverse-v1,neoverse-n1,icelake-server,znver3,cortex-a78ae
# On aarch64 prefer -mcpu (MCPU overridable); elsewhere fall back to
# -march (MARCH overridable).
ARCH=$(shell uname -m)
ifeq ($(ARCH),aarch64)
ifdef MCPU
FCFLAGS += -mcpu=$(MCPU)
else
FCFLAGS += -mcpu=native
endif
else
ifdef MARCH
FCFLAGS += -march=$(MARCH)
else
FCFLAGS += -march=native
endif
endif
# NOTE(review): auto-parallelization is pinned to 4 threads here;
# confirm this matches the intended benchmark configuration.
DOCONCURRENT_FLAG = -ftree-parallelize-loops=4
ARRAY_FLAG =
OPENMP_FLAG = -fopenmp
OPENACC_FLAG = -fopenacc
CUDA_FLAG =
SEQUENTIAL_FLAG =
# CUDA Fortran is NVIDIA-specific.
ifeq ($(IMPLEMENTATION),CUDA)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif
ifeq ($(IMPLEMENTATION),CUDAKernels)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif

View File

@ -0,0 +1,70 @@
# Toolchain definitions for NVIDIA HPC SDK (nvfortran).
FC := nvfortran
#FCFLAGS := -O3 -Minform=inform -Minfo=all
FCFLAGS := -O3 -Minform=warn
#TARGET=gpu
TARGET=multicore
# Detect the GPU generation from nvidia-smi and map it to a -gpu=ccXX
# compute-capability flag.
NVARCH=$(shell which nvidia-smi > /dev/null && nvidia-smi -q | grep "Product Architecture")
ifeq ($(findstring Ampere,$(NVARCH)),Ampere)
$(info Ampere detected)
GPU = cc80
endif
ifeq ($(findstring Turing,$(NVARCH)),Turing)
$(info Turing detected)
GPU = cc75
endif
ifeq ($(findstring Volta,$(NVARCH)),Volta)
$(info Volta detected)
GPU = cc70
endif
ifeq ($(findstring Pascal,$(NVARCH)),Pascal)
$(info Pascal detected)
GPU = cc60,cc61
endif
ifeq ($(shell which jetson_clocks > /dev/null && echo 1),1)
$(info Jetson AGX Orin detected)
# Orin is compute capability 8.7: "ccn87" is not a valid -gpu value and
# made nvfortran fail on Jetson.
GPU = cc87,cc86
# figure out Xavier later
#GPU = cc72
endif
ifeq ($(GPU),)
$(error Your GPU architecture could not be detected. Set it manually.)
endif
GPUFLAG = -gpu=$(GPU)
# MARCH=neoverse-v1,neoverse-n1,zen3
# nvfortran only accepts specific -tp values on aarch64; anything else
# falls back to -tp=native.
ARCH=$(shell uname -m)
ifdef MARCH
ifeq ($(ARCH),aarch64)
ifeq ($(MARCH),neoverse-n1)
FCFLAGS += -tp=$(MARCH)
else
ifeq ($(MARCH),neoverse-v1)
FCFLAGS += -tp=$(MARCH)
else
FCFLAGS += -tp=native
endif
endif
else
FCFLAGS += -tp=$(MARCH)
endif
else
FCFLAGS += -tp=native
endif
# this is to allow apples-to-apples comparison with DC in non-DC GPU impls
# set exactly one of these!
#MANAGED = -DUSE_MANAGED -gpu=managed
#DEVICE = -DUSE_DEVICE -cuda -gpu=nomanaged
DOCONCURRENT_FLAG = $(GPUFLAG) -stdpar=$(TARGET) $(DEVICE)
ARRAY_FLAG = $(GPUFLAG) -stdpar=$(TARGET) $(MANAGED)
OPENMP_FLAG = $(GPUFLAG) -mp=$(TARGET) $(MANAGED)
OPENACC_FLAG = $(GPUFLAG) -acc=$(TARGET) $(MANAGED)
CUDA_FLAG = $(GPUFLAG) -cuda -acc=gpu $(MANAGED)
SEQUENTIAL_FLAG =
ifeq ($(IMPLEMENTATION),OpenMPTaskloop)
$(error IMPLEMENTATION=OpenMPTaskloop is not supported by this compiler.)
endif

View File

@ -0,0 +1,32 @@
# Toolchain definitions for Intel oneAPI Fortran (ifx; set FC=ifort for
# the classic compiler).
FC := ifx
FCFLAGS = -std18
FCFLAGS += -Ofast -xHOST
FCFLAGS += -qopt-zmm-usage=low
# Classic ifort only: streaming stores and auto-parallelization.
ifeq ($(FC),ifort)
FCFLAGS += -qopt-streaming-stores=always
PARALLEL = -parallel
endif
DOCONCURRENT_FLAG = -qopenmp $(PARALLEL)
ARRAY_FLAG = -qopenmp $(PARALLEL)
OPENMP_FLAG = -qopenmp
ifeq ($(FC),ifx)
# NOTE(review): this forces single precision whenever OpenMP offload is
# used with ifx — presumably a device-capability workaround; confirm.
OPENMP_FLAG += -fopenmp-targets=spir64 -DUSE_FLOAT=1
endif
OPENACC_FLAG =
CUDA_FLAG =
SEQUENTIAL_FLAG =
ifeq ($(IMPLEMENTATION),OpenACC)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif
ifeq ($(IMPLEMENTATION),OpenACCArray)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif
ifeq ($(IMPLEMENTATION),CUDA)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif
ifeq ($(IMPLEMENTATION),CUDAKernels)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif

35
src/fortran/run.sh Executable file
View File

@ -0,0 +1,35 @@
#!/bin/bash
# Run every BabelStream.<compiler>.<implementation> binary found in the
# current directory with a fixed array size and thread pinning.
cat ./run.sh
if [ `uname -s` == Darwin ] ; then
NUM_HWTHREADS=`sysctl -n hw.ncpu`
MEMORY_BYTES=`sysctl -n hw.memsize`
else
NUM_HWTHREADS=`nproc`
MEMORY_KILOS=`grep MemTotal /proc/meminfo | awk '{print $2}'`
fi
# NOTE(review): NUM_HWTHREADS and the MEMORY_* variables are gathered
# but never used below — presumably informational; confirm.
# M: array size in Mi elements passed via -s.
M=128
export OMP_NUM_THREADS=8
export OMP_PROC_BIND=close
export OMP_PLACES=threads
export ACC_NUM_CORES=${OMP_NUM_THREADS}
# Pin to NUMA node 0 and the first OMP_NUM_THREADS cores.
AFFCONTROL="numactl -N 0 -m 0 -C `seq -s "," 0 $((${OMP_NUM_THREADS}-1))`"
for compiler in gcc nvhpc cray oneapi arm amd fj ; do
#if [ "x$compiler" == "xgcc" ] ; then
# export LD_PRELOAD=/usr/lib/gcc/aarch64-linux-gnu/11/libgomp.so
#fi
for implementation in OpenMP OpenMPTaskloop OpenMPWorkshare DoConcurrent Array OpenACC OpenACCArray CUDA CUDAKernel ; do
if [ -f BabelStream.${compiler}.${implementation} ] ; then
echo "BabelStream.${compiler}.${implementation}"
ldd BabelStream.${compiler}.${implementation}
time $AFFCONTROL \
./BabelStream.${compiler}.${implementation} -s $((1024*1024*${M}))
fi
done
done

View File

@ -0,0 +1,212 @@
// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith,
// University of Bristol HPC
// Copyright (c) 2022 Troels Henriksen
// University of Copenhagen
//
// For full license terms please see the LICENSE file distributed with this
// source code
#include <cstdlib> // For aligned_alloc
#include <string>
#include <vector>

#include "FutharkStream.h"
// Construct a Futhark context for the requested device and record the array
// size. Device arrays themselves are allocated later, in init_arrays().
template <class T>
FutharkStream<T>::FutharkStream(const int ARRAY_SIZE, int device)
{
  this->array_size = ARRAY_SIZE;
  this->cfg = futhark_context_config_new();
  // Futhark selects devices by a "#<index>" string.
  this->device = "#" + std::to_string(device);
#if defined(FUTHARK_BACKEND_cuda) || defined(FUTHARK_BACKEND_opencl)
  // Only the GPU backends support device selection.
  futhark_context_config_set_device(cfg, this->device.c_str());
#endif
  this->ctx = futhark_context_new(cfg);
  // Arrays start null so the destructor can tell whether to free them.
  this->a = NULL;
  this->b = NULL;
  this->c = NULL;
}
// Release the f32 device arrays (if they were ever allocated) and then tear
// down the Futhark context and its configuration, in that order.
template <>
FutharkStream<float>::~FutharkStream()
{
  if (this->a) {
    futhark_free_f32_1d(this->ctx, (futhark_f32_1d*)this->a);
  }
  if (this->b) {
    futhark_free_f32_1d(this->ctx, (futhark_f32_1d*)this->b);
  }
  if (this->c) {
    futhark_free_f32_1d(this->ctx, (futhark_f32_1d*)this->c);
  }
  futhark_context_free(this->ctx);
  futhark_context_config_free(this->cfg);
}
// Release the f64 device arrays (if they were ever allocated) and then tear
// down the Futhark context and its configuration, in that order.
template <>
FutharkStream<double>::~FutharkStream()
{
  if (this->a) {
    futhark_free_f64_1d(this->ctx, (futhark_f64_1d*)this->a);
  }
  if (this->b) {
    futhark_free_f64_1d(this->ctx, (futhark_f64_1d*)this->b);
  }
  if (this->c) {
    futhark_free_f64_1d(this->ctx, (futhark_f64_1d*)this->c);
  }
  futhark_context_free(this->ctx);
  futhark_context_config_free(this->cfg);
}
// Fill host staging buffers with the initial values and upload them as the
// three f32 device arrays. The sync ensures the uploads complete before the
// (about to be freed) host buffers disappear.
template <>
void FutharkStream<float>::init_arrays(float initA, float initB, float initC) {
  const int n = this->array_size;
  // std::vector instead of raw new[]/delete[]: no leak on early exit, and
  // the fill constructor replaces the explicit initialisation loop.
  std::vector<float> a(n, initA);
  std::vector<float> b(n, initB);
  std::vector<float> c(n, initC);
  this->a = futhark_new_f32_1d(this->ctx, a.data(), n);
  this->b = futhark_new_f32_1d(this->ctx, b.data(), n);
  this->c = futhark_new_f32_1d(this->ctx, c.data(), n);
  futhark_context_sync(this->ctx);
}
// Fill host staging buffers with the initial values and upload them as the
// three f64 device arrays. The sync ensures the uploads complete before the
// (about to be freed) host buffers disappear.
template <>
void FutharkStream<double>::init_arrays(double initA, double initB, double initC) {
  const int n = this->array_size;
  // std::vector instead of raw new[]/delete[]: no leak on early exit, and
  // the fill constructor replaces the explicit initialisation loop.
  std::vector<double> a(n, initA);
  std::vector<double> b(n, initB);
  std::vector<double> c(n, initC);
  this->a = futhark_new_f64_1d(this->ctx, a.data(), n);
  this->b = futhark_new_f64_1d(this->ctx, b.data(), n);
  this->c = futhark_new_f64_1d(this->ctx, c.data(), n);
  futhark_context_sync(this->ctx);
}
// Copy the three f32 device arrays back into the host vectors so the driver
// can verify results. The trailing sync matches the pattern used throughout
// this file to make sure the transfers have completed before the host reads.
template <>
void FutharkStream<float>::read_arrays(std::vector<float>& h_a, std::vector<float>& h_b, std::vector<float>& h_c) {
  futhark_values_f32_1d(this->ctx, (futhark_f32_1d*)this->a, h_a.data());
  futhark_values_f32_1d(this->ctx, (futhark_f32_1d*)this->b, h_b.data());
  futhark_values_f32_1d(this->ctx, (futhark_f32_1d*)this->c, h_c.data());
  futhark_context_sync(this->ctx);
}
// Copy the three f64 device arrays back into the host vectors so the driver
// can verify results. The trailing sync matches the pattern used throughout
// this file to make sure the transfers have completed before the host reads.
template <>
void FutharkStream<double>::read_arrays(std::vector<double>& h_a, std::vector<double>& h_b, std::vector<double>& h_c) {
  futhark_values_f64_1d(this->ctx, (futhark_f64_1d*)this->a, h_a.data());
  futhark_values_f64_1d(this->ctx, (futhark_f64_1d*)this->b, h_b.data());
  futhark_values_f64_1d(this->ctx, (futhark_f64_1d*)this->c, h_c.data());
  futhark_context_sync(this->ctx);
}
// BabelStream copy kernel: c = a. The entry point allocates a fresh output
// array, so the old c is freed first to avoid leaking it.
template <>
void FutharkStream<float>::copy() {
  futhark_free_f32_1d(this->ctx, (futhark_f32_1d*)this->c);
  futhark_entry_f32_copy(this->ctx, (futhark_f32_1d**)&this->c, (futhark_f32_1d*)this->a);
  futhark_context_sync(this->ctx);
}
// BabelStream copy kernel: c = a. The entry point allocates a fresh output
// array, so the old c is freed first to avoid leaking it.
template <>
void FutharkStream<double>::copy() {
  futhark_free_f64_1d(this->ctx, (futhark_f64_1d*)this->c);
  futhark_entry_f64_copy(this->ctx, (futhark_f64_1d**)&this->c, (futhark_f64_1d*)this->a);
  futhark_context_sync(this->ctx);
}
// BabelStream mul kernel: b = scalar * c (scalar is baked into the Futhark
// entry point). The old b is freed before the entry allocates its replacement.
template <>
void FutharkStream<float>::mul() {
  futhark_free_f32_1d(this->ctx, (futhark_f32_1d*)this->b);
  futhark_entry_f32_mul(this->ctx, (futhark_f32_1d**)&this->b, (futhark_f32_1d*)this->c);
  futhark_context_sync(this->ctx);
}
// BabelStream mul kernel: b = scalar * c (scalar is baked into the Futhark
// entry point). The old b is freed before the entry allocates its replacement.
template <>
void FutharkStream<double>::mul() {
  futhark_free_f64_1d(this->ctx, (futhark_f64_1d*)this->b);
  futhark_entry_f64_mul(this->ctx, (futhark_f64_1d**)&this->b, (futhark_f64_1d*)this->c);
  futhark_context_sync(this->ctx);
}
// BabelStream add kernel: c = a + b. The old c is freed before the entry
// allocates its replacement.
template <>
void FutharkStream<float>::add() {
  futhark_free_f32_1d(this->ctx, (futhark_f32_1d*)this->c);
  futhark_entry_f32_add(this->ctx, (futhark_f32_1d**)&this->c, (futhark_f32_1d*)this->a, (futhark_f32_1d*)this->b);
  futhark_context_sync(this->ctx);
}
// BabelStream add kernel: c = a + b. The old c is freed before the entry
// allocates its replacement.
template <>
void FutharkStream<double>::add() {
  futhark_free_f64_1d(this->ctx, (futhark_f64_1d*)this->c);
  futhark_entry_f64_add(this->ctx, (futhark_f64_1d**)&this->c, (futhark_f64_1d*)this->a, (futhark_f64_1d*)this->b);
  futhark_context_sync(this->ctx);
}
// BabelStream triad kernel: a = b + scalar * c.
// Fixed: the previous code freed c and computed into c from (a, b), which
// disagrees with both the f64 specialization below and the STREAM triad
// definition, so result verification of 'a' would fail.
template <>
void FutharkStream<float>::triad() {
  futhark_free_f32_1d(this->ctx, (futhark_f32_1d*)this->a);
  futhark_entry_f32_triad(this->ctx, (futhark_f32_1d**)&this->a, (futhark_f32_1d*)this->b, (futhark_f32_1d*)this->c);
  futhark_context_sync(this->ctx);
}
// BabelStream triad kernel: a = b + scalar * c (scalar baked into the entry
// point). The old a is freed before the entry allocates its replacement.
template <>
void FutharkStream<double>::triad() {
  futhark_free_f64_1d(this->ctx, (futhark_f64_1d*)this->a);
  futhark_entry_f64_triad(this->ctx, (futhark_f64_1d**)&this->a, (futhark_f64_1d*)this->b, (futhark_f64_1d*)this->c);
  futhark_context_sync(this->ctx);
}
// BabelStream nstream kernel: a += b + scalar * c.
// Fixed: the previous code invoked the *triad* entry with (a, b) and stored
// the result in c. babelstream.fut defines a dedicated f32_nstream entry
// (nstream scalar a b c = a + b + scalar*c) which was otherwise unused.
// The 'a' argument is unique (consumed) in the Futhark source, so the old
// handle is taken over by the entry point and must not be freed or reused
// here; the fresh result simply replaces it.
template <>
void FutharkStream<float>::nstream() {
  futhark_f32_1d* d;
  futhark_entry_f32_nstream(this->ctx, &d, (futhark_f32_1d*)this->a, (futhark_f32_1d*)this->b, (futhark_f32_1d*)this->c);
  this->a = d;
  futhark_context_sync(this->ctx);
}
// BabelStream nstream kernel: a += b + scalar * c.
// Fixed: the previous code invoked the *triad* entry with (a, b) and stored
// the result in c. babelstream.fut defines a dedicated f64_nstream entry
// (nstream scalar a b c = a + b + scalar*c) which was otherwise unused.
// The 'a' argument is unique (consumed) in the Futhark source, so the old
// handle is taken over by the entry point and must not be freed or reused
// here; the fresh result simply replaces it.
template <>
void FutharkStream<double>::nstream() {
  futhark_f64_1d* d;
  futhark_entry_f64_nstream(this->ctx, &d, (futhark_f64_1d*)this->a, (futhark_f64_1d*)this->b, (futhark_f64_1d*)this->c);
  this->a = d;
  futhark_context_sync(this->ctx);
}
// BabelStream dot kernel: returns sum(a[i] * b[i]). The scalar result is
// written by the entry point; the sync before returning follows the pattern
// used by every other kernel in this file.
template <>
float FutharkStream<float>::dot() {
  float res;
  futhark_entry_f32_dot(this->ctx, &res, (futhark_f32_1d*)this->a, (futhark_f32_1d*)this->b);
  futhark_context_sync(this->ctx);
  return res;
}
// BabelStream dot kernel: returns sum(a[i] * b[i]). The scalar result is
// written by the entry point; the sync before returning follows the pattern
// used by every other kernel in this file.
template <>
double FutharkStream<double>::dot() {
  double res;
  futhark_entry_f64_dot(this->ctx, &res, (futhark_f64_1d*)this->a, (futhark_f64_1d*)this->b);
  futhark_context_sync(this->ctx);
  return res;
}
void listDevices(void)
{
std::cout << "Device selection not supported." << std::endl;
}
// Explicitly instantiate the two element types the BabelStream driver uses;
// all member definitions above are full specializations for these types.
template class FutharkStream<float>;
template class FutharkStream<double>;

View File

@ -0,0 +1,60 @@
// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith,
// University of Bristol HPC
// Copyright (c) 2022 Troels Henriksen
// University of Copenhagen
//
// For full license terms please see the LICENSE file distributed with this
// source code
#pragma once

#include <iostream>
#include <stdexcept>

#include "Stream.h"
#include "babelstream.h"

// Report which Futhark backend this binary was compiled against. The
// FUTHARK_BACKEND_* macro is defined by the Futhark-generated babelstream.h.
#if defined(FUTHARK_BACKEND_c)
#define IMPLEMENTATION_STRING "Futhark (sequential)"
#elif defined(FUTHARK_BACKEND_multicore)
#define IMPLEMENTATION_STRING "Futhark (parallel CPU)"
#elif defined(FUTHARK_BACKEND_opencl)
// Fixed typo in the user-visible string: "OpencL" -> "OpenCL".
#define IMPLEMENTATION_STRING "Futhark (OpenCL)"
#elif defined(FUTHARK_BACKEND_cuda)
#define IMPLEMENTATION_STRING "Futhark (CUDA)"
#else
#define IMPLEMENTATION_STRING "Futhark (unknown backend)"
#endif
// Futhark implementation of the BabelStream Stream<T> interface.
// The device arrays are stored as type-erased void* because the concrete
// handle type (futhark_f32_1d vs futhark_f64_1d) depends on T; the .cpp file
// provides full specializations for float and double that cast accordingly.
template <class T>
class FutharkStream : public Stream<T>
{
  protected:
    // Number of elements in each of the three arrays.
    int array_size;
    // Device selector string ("#<index>"), used by the GPU backends only.
    std::string device;
    // Futhark runtime context and its configuration (owned by this object).
    struct futhark_context_config *cfg;
    struct futhark_context *ctx;
    // Device-side arrays; NULL until init_arrays() uploads them.
    void* a;
    void* b;
    void* c;

  public:
    FutharkStream(const int, int);
    ~FutharkStream();

    // BabelStream kernels; see the .cpp specializations for semantics.
    virtual void copy() override;
    virtual void add() override;
    virtual void mul() override;
    virtual void triad() override;
    virtual void nstream() override;
    virtual T dot() override;

    virtual void init_arrays(T initA, T initB, T initC) override;
    virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
};

View File

@ -0,0 +1,62 @@
-- Interface for the BabelStream kernels over some scalar type 't'.
-- Scalar-taking kernels (mul, triad, nstream) receive the scalar as their
-- first argument; the C++ entry points bake it in via partial application.
module type kernels = {
  type t
  -- copy: c = a
  val copy [n] : [n]t -> *[n]t
  -- mul: b = scalar * c
  val mul [n] : t -> [n]t -> [n]t
  -- add: c = a + b
  val add [n] : [n]t -> [n]t -> [n]t
  -- triad: a = b + scalar * c
  val triad [n] : t -> [n]t -> [n]t -> [n]t
  -- dot: sum of elementwise products of a and b
  val dot [n] : [n]t -> [n]t -> t
  -- Uniqueness allows nstream to mutate the 'a' array.
  val nstream [n] : t -> *[n]t -> [n]t -> [n]t -> [n]t
}
-- Implementation of the kernels for any 'real' scalar module P
-- (instantiated below with f32 and f64).
module kernels (P: real) : kernels with t = P.t = {
  type t = P.t
  -- The right-hand 'copy' is the built-in that duplicates an array.
  def copy = copy
  def mul scalar c = map (P.*scalar) c
  def add = map2 (P.+)
  def triad scalar b c = map2 (P.+) b (map (P.* scalar) c)
  def dot a b = reduce (P.+) (P.i32 0) (map2 (P.*) a b)
  -- nstream: a + b + scalar * c (result replaces the consumed 'a').
  def nstream scalar a b c = map2 (P.+) a (map2 (P.+) b (map (P.*scalar) c))
}
-- Single-precision entry points exposed to the C API. The BabelStream
-- start scalar (0.4) is partially applied here, so the C++ side never
-- passes a scalar argument.
module f32_kernels = kernels f32
def f32_start_scalar : f32 = 0.4
entry f32_copy = f32_kernels.copy
entry f32_mul = f32_kernels.mul f32_start_scalar
entry f32_add = f32_kernels.add
entry f32_triad = f32_kernels.triad f32_start_scalar
entry f32_nstream = f32_kernels.nstream f32_start_scalar
entry f32_dot = f32_kernels.dot
-- Double-precision entry points exposed to the C API. The BabelStream
-- start scalar (0.4) is partially applied here, so the C++ side never
-- passes a scalar argument.
module f64_kernels = kernels f64
def f64_start_scalar : f64 = 0.4
entry f64_copy = f64_kernels.copy
entry f64_mul = f64_kernels.mul f64_start_scalar
entry f64_add = f64_kernels.add
entry f64_triad = f64_kernels.triad f64_start_scalar
entry f64_nstream = f64_kernels.nstream f64_start_scalar
entry f64_dot = f64_kernels.dot
-- ==
-- entry: f32_copy f32_mul
-- random input { [33554432]f32 }
-- ==
-- entry: f32_add f32_dot f32_triad
-- random input { [33554432]f32 [33554432]f32 }
-- ==
-- entry: f32_nstream
-- random input { [33554432]f32 [33554432]f32 [33554432]f32 }
-- ==
-- entry: f64_copy f64_mul
-- random input { [33554432]f64 }
-- ==
-- entry: f64_add f64_dot f64_triad
-- random input { [33554432]f64 [33554432]f64 }
-- ==
-- entry: f64_nstream
-- random input { [33554432]f64 [33554432]f64 [33554432]f64 }

55
src/futhark/model.cmake Normal file
View File

@ -0,0 +1,55 @@
# Use
#
# cmake -Bbuild -H. -DMODEL=futhark -DFUTHARK_BACKEND=foo -DFUTHARK_COMPILER=foo/bar/bin/futhark
#
# to use the Futhark backend, where 'foo' must be one of 'multicore',
# 'c', 'opencl', or 'cuda'. Defaults to 'multicore'.
#
# Use -DFUTHARK_COMPILER to set the path to the Futhark compiler
# binary. Defaults to 'futhark' on the PATH.
# Which Futhark code generator to use; determines the runtime libraries
# linked in setup() below.
register_flag_optional(FUTHARK_BACKEND
        "Use a specific Futhark backend, possible options are:
        - c
        - multicore
        - opencl
        - cuda"
        "multicore")

# Path to the futhark executable used to generate babelstream.c/.h.
register_flag_optional(FUTHARK_COMPILER
        "Absolute path to the Futhark compiler, defaults to the futhark compiler on PATH"
        "futhark")
macro(setup)
    # Generate babelstream.c/babelstream.h from the Futhark source at build
    # time, using the configured backend and compiler.
    add_custom_command(
            OUTPUT
            ${CMAKE_CURRENT_BINARY_DIR}/babelstream.c
            ${CMAKE_CURRENT_BINARY_DIR}/babelstream.h
            COMMAND ${FUTHARK_COMPILER} ${FUTHARK_BACKEND}
            --library src/futhark/babelstream.fut
            -o ${CMAKE_CURRENT_BINARY_DIR}/babelstream
            DEPENDS src/futhark/babelstream.fut
            WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
            VERBATIM
    )
    # Link whatever runtime the chosen backend's generated C code requires.
    if (${FUTHARK_BACKEND} STREQUAL "c")
        # Nothing to do.
    elseif (${FUTHARK_BACKEND} STREQUAL "multicore")
        set(THREADS_PREFER_PTHREAD_FLAG ON)
        find_package(Threads REQUIRED)
        register_link_library(Threads::Threads)
    elseif (${FUTHARK_BACKEND} STREQUAL "opencl")
        find_package(OpenCL REQUIRED)
        register_link_library(OpenCL::OpenCL)
    elseif (${FUTHARK_BACKEND} STREQUAL "cuda")
        # NOTE(review): find_package(CUDA) is deprecated in modern CMake in
        # favour of FindCUDAToolkit -- consider migrating.
        find_package(CUDA REQUIRED)
        register_link_library("nvrtc" "cuda" "cudart")
    else ()
        message(FATAL_ERROR "Unsupported Futhark backend: ${FUTHARK_BACKEND}")
    endif()
endmacro()
macro(setup_target)
    # Compile the generated C file into the benchmark executable and make the
    # generated babelstream.h visible to FutharkStream.cpp.
    target_sources(${EXE_NAME} PUBLIC "${CMAKE_CURRENT_BINARY_DIR}/babelstream.c")
    include_directories("${CMAKE_CURRENT_BINARY_DIR}")
endmacro()

View File

@ -9,7 +9,7 @@
#include "hip/hip_runtime.h" #include "hip/hip_runtime.h"
#define TBSIZE 1024 #define TBSIZE 1024
#define DOT_NUM_BLOCKS 256
void check_error(void) void check_error(void)
{ {
@ -45,34 +45,63 @@ HIPStream<T>::HIPStream(const int ARRAY_SIZE, const int device_index)
// Print out device information // Print out device information
std::cout << "Using HIP device " << getDeviceName(device_index) << std::endl; std::cout << "Using HIP device " << getDeviceName(device_index) << std::endl;
std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl; std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl;
#if defined(MANAGED)
std::cout << "Memory: MANAGED" << std::endl;
#elif defined(PAGEFAULT)
std::cout << "Memory: PAGEFAULT" << std::endl;
#else
std::cout << "Memory: DEFAULT" << std::endl;
#endif
array_size = ARRAY_SIZE; array_size = ARRAY_SIZE;
// Round dot_num_blocks up to next multiple of (TBSIZE * dot_elements_per_lane)
dot_num_blocks = (array_size + (TBSIZE * dot_elements_per_lane - 1)) / (TBSIZE * dot_elements_per_lane);
// Allocate the host array for partial sums for dot kernels size_t array_bytes = sizeof(T);
sums = (T*)malloc(sizeof(T) * DOT_NUM_BLOCKS); array_bytes *= ARRAY_SIZE;
size_t total_bytes = array_bytes * 3;
// Allocate the host array for partial sums for dot kernels using hipHostMalloc.
// This creates an array on the host which is visible to the device. However, it requires
// synchronization (e.g. hipDeviceSynchronize) for the result to be available on the host
// after it has been passed through to a kernel.
hipHostMalloc(&sums, sizeof(T) * dot_num_blocks, hipHostMallocNonCoherent);
check_error();
// Check buffers fit on the device // Check buffers fit on the device
hipDeviceProp_t props; hipDeviceProp_t props;
hipGetDeviceProperties(&props, 0); hipGetDeviceProperties(&props, 0);
if (props.totalGlobalMem < 3*ARRAY_SIZE*sizeof(T)) if (props.totalGlobalMem < std::size_t{3}*ARRAY_SIZE*sizeof(T))
throw std::runtime_error("Device does not have enough memory for all 3 buffers"); throw std::runtime_error("Device does not have enough memory for all 3 buffers");
// Create device buffers // Create device buffers
hipMalloc(&d_a, ARRAY_SIZE*sizeof(T)); #if defined(MANAGED)
hipMallocManaged(&d_a, array_bytes);
check_error(); check_error();
hipMalloc(&d_b, ARRAY_SIZE*sizeof(T)); hipMallocManaged(&d_b, array_bytes);
check_error(); check_error();
hipMalloc(&d_c, ARRAY_SIZE*sizeof(T)); hipMallocManaged(&d_c, array_bytes);
check_error(); check_error();
hipMalloc(&d_sum, DOT_NUM_BLOCKS*sizeof(T)); #elif defined(PAGEFAULT)
d_a = (T*)malloc(array_bytes);
d_b = (T*)malloc(array_bytes);
d_c = (T*)malloc(array_bytes);
#else
hipMalloc(&d_a, array_bytes);
check_error(); check_error();
hipMalloc(&d_b, array_bytes);
check_error();
hipMalloc(&d_c, array_bytes);
check_error();
#endif
} }
template <class T> template <class T>
HIPStream<T>::~HIPStream() HIPStream<T>::~HIPStream()
{ {
free(sums); hipHostFree(sums);
check_error();
hipFree(d_a); hipFree(d_a);
check_error(); check_error();
@ -80,15 +109,13 @@ HIPStream<T>::~HIPStream()
check_error(); check_error();
hipFree(d_c); hipFree(d_c);
check_error(); check_error();
hipFree(d_sum);
check_error();
} }
template <typename T> template <typename T>
__global__ void init_kernel(T * a, T * b, T * c, T initA, T initB, T initC) __global__ void init_kernel(T * a, T * b, T * c, T initA, T initB, T initC)
{ {
const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; const size_t i = blockDim.x * blockIdx.x + threadIdx.x;
a[i] = initA; a[i] = initA;
b[i] = initB; b[i] = initB;
c[i] = initC; c[i] = initC;
@ -97,7 +124,7 @@ __global__ void init_kernel(T * a, T * b, T * c, T initA, T initB, T initC)
template <class T> template <class T>
void HIPStream<T>::init_arrays(T initA, T initB, T initC) void HIPStream<T>::init_arrays(T initA, T initB, T initC)
{ {
hipLaunchKernelGGL(HIP_KERNEL_NAME(init_kernel<T>), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_a, d_b, d_c, initA, initB, initC); init_kernel<T><<<dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0>>>(d_a, d_b, d_c, initA, initB, initC);
check_error(); check_error();
hipDeviceSynchronize(); hipDeviceSynchronize();
check_error(); check_error();
@ -106,27 +133,37 @@ void HIPStream<T>::init_arrays(T initA, T initB, T initC)
template <class T> template <class T>
void HIPStream<T>::read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) void HIPStream<T>::read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c)
{ {
// Copy device memory to host // Copy device memory to host
#if defined(PAGEFAULT) || defined(MANAGED)
hipDeviceSynchronize();
for (int i = 0; i < array_size; i++)
{
a[i] = d_a[i];
b[i] = d_b[i];
c[i] = d_c[i];
}
#else
hipMemcpy(a.data(), d_a, a.size()*sizeof(T), hipMemcpyDeviceToHost); hipMemcpy(a.data(), d_a, a.size()*sizeof(T), hipMemcpyDeviceToHost);
check_error(); check_error();
hipMemcpy(b.data(), d_b, b.size()*sizeof(T), hipMemcpyDeviceToHost); hipMemcpy(b.data(), d_b, b.size()*sizeof(T), hipMemcpyDeviceToHost);
check_error(); check_error();
hipMemcpy(c.data(), d_c, c.size()*sizeof(T), hipMemcpyDeviceToHost); hipMemcpy(c.data(), d_c, c.size()*sizeof(T), hipMemcpyDeviceToHost);
check_error(); check_error();
#endif
} }
template <typename T> template <typename T>
__global__ void copy_kernel(const T * a, T * c) __global__ void copy_kernel(const T * a, T * c)
{ {
const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; const size_t i = threadIdx.x + blockIdx.x * blockDim.x;
c[i] = a[i]; c[i] = a[i];
} }
template <class T> template <class T>
void HIPStream<T>::copy() void HIPStream<T>::copy()
{ {
hipLaunchKernelGGL(HIP_KERNEL_NAME(copy_kernel<T>), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_a, d_c); copy_kernel<T><<<dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0>>>(d_a, d_c);
check_error(); check_error();
hipDeviceSynchronize(); hipDeviceSynchronize();
check_error(); check_error();
@ -136,14 +173,14 @@ template <typename T>
__global__ void mul_kernel(T * b, const T * c) __global__ void mul_kernel(T * b, const T * c)
{ {
const T scalar = startScalar; const T scalar = startScalar;
const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; const size_t i = threadIdx.x + blockIdx.x * blockDim.x;
b[i] = scalar * c[i]; b[i] = scalar * c[i];
} }
template <class T> template <class T>
void HIPStream<T>::mul() void HIPStream<T>::mul()
{ {
hipLaunchKernelGGL(HIP_KERNEL_NAME(mul_kernel<T>), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_b, d_c); mul_kernel<T><<<dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0>>>(d_b, d_c);
check_error(); check_error();
hipDeviceSynchronize(); hipDeviceSynchronize();
check_error(); check_error();
@ -152,14 +189,14 @@ void HIPStream<T>::mul()
template <typename T> template <typename T>
__global__ void add_kernel(const T * a, const T * b, T * c) __global__ void add_kernel(const T * a, const T * b, T * c)
{ {
const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; const size_t i = threadIdx.x + blockIdx.x * blockDim.x;
c[i] = a[i] + b[i]; c[i] = a[i] + b[i];
} }
template <class T> template <class T>
void HIPStream<T>::add() void HIPStream<T>::add()
{ {
hipLaunchKernelGGL(HIP_KERNEL_NAME(add_kernel<T>), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_a, d_b, d_c); add_kernel<T><<<dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0>>>(d_a, d_b, d_c);
check_error(); check_error();
hipDeviceSynchronize(); hipDeviceSynchronize();
check_error(); check_error();
@ -169,14 +206,14 @@ template <typename T>
__global__ void triad_kernel(T * a, const T * b, const T * c) __global__ void triad_kernel(T * a, const T * b, const T * c)
{ {
const T scalar = startScalar; const T scalar = startScalar;
const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; const size_t i = threadIdx.x + blockIdx.x * blockDim.x;
a[i] = b[i] + scalar * c[i]; a[i] = b[i] + scalar * c[i];
} }
template <class T> template <class T>
void HIPStream<T>::triad() void HIPStream<T>::triad()
{ {
hipLaunchKernelGGL(HIP_KERNEL_NAME(triad_kernel<T>), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_a, d_b, d_c); triad_kernel<T><<<dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0>>>(d_a, d_b, d_c);
check_error(); check_error();
hipDeviceSynchronize(); hipDeviceSynchronize();
check_error(); check_error();
@ -186,32 +223,32 @@ template <typename T>
__global__ void nstream_kernel(T * a, const T * b, const T * c) __global__ void nstream_kernel(T * a, const T * b, const T * c)
{ {
const T scalar = startScalar; const T scalar = startScalar;
const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; const size_t i = threadIdx.x + blockIdx.x * blockDim.x;
a[i] += b[i] + scalar * c[i]; a[i] += b[i] + scalar * c[i];
} }
template <class T> template <class T>
void HIPStream<T>::nstream() void HIPStream<T>::nstream()
{ {
hipLaunchKernelGGL(HIP_KERNEL_NAME(nstream_kernel<T>), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_a, d_b, d_c); nstream_kernel<T><<<dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0>>>(d_a, d_b, d_c);
check_error(); check_error();
hipDeviceSynchronize(); hipDeviceSynchronize();
check_error(); check_error();
} }
template <class T> template <typename T>
__global__ void dot_kernel(const T * a, const T * b, T * sum, int array_size) __global__ void dot_kernel(const T * a, const T * b, T * sum, int array_size)
{ {
__shared__ T tb_sum[TBSIZE]; __shared__ T tb_sum[TBSIZE];
int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; const size_t local_i = threadIdx.x;
const size_t local_i = hipThreadIdx_x; size_t i = blockDim.x * blockIdx.x + local_i;
tb_sum[local_i] = 0.0; tb_sum[local_i] = {};
for (; i < array_size; i += hipBlockDim_x*hipGridDim_x) for (; i < array_size; i += blockDim.x*gridDim.x)
tb_sum[local_i] += a[i] * b[i]; tb_sum[local_i] += a[i] * b[i];
for (int offset = hipBlockDim_x / 2; offset > 0; offset /= 2) for (size_t offset = blockDim.x / 2; offset > 0; offset /= 2)
{ {
__syncthreads(); __syncthreads();
if (local_i < offset) if (local_i < offset)
@ -221,20 +258,19 @@ __global__ void dot_kernel(const T * a, const T * b, T * sum, int array_size)
} }
if (local_i == 0) if (local_i == 0)
sum[hipBlockIdx_x] = tb_sum[local_i]; sum[blockIdx.x] = tb_sum[local_i];
} }
template <class T> template <class T>
T HIPStream<T>::dot() T HIPStream<T>::dot()
{ {
hipLaunchKernelGGL(HIP_KERNEL_NAME(dot_kernel<T>), dim3(DOT_NUM_BLOCKS), dim3(TBSIZE), 0, 0, d_a, d_b, d_sum, array_size); dot_kernel<T><<<dim3(dot_num_blocks), dim3(TBSIZE), 0, 0>>>(d_a, d_b, sums, array_size);
check_error();
hipDeviceSynchronize();
check_error(); check_error();
hipMemcpy(sums, d_sum, DOT_NUM_BLOCKS*sizeof(T), hipMemcpyDeviceToHost); T sum{};
check_error(); for (int i = 0; i < dot_num_blocks; i++)
T sum = 0.0;
for (int i = 0; i < DOT_NUM_BLOCKS; i++)
sum += sums[i]; sum += sums[i];
return sum; return sum;

View File

@ -14,13 +14,31 @@
#include "Stream.h" #include "Stream.h"
#define IMPLEMENTATION_STRING "HIP" #define IMPLEMENTATION_STRING "HIP"
#define DOT_READ_DWORDS_PER_LANE 4
template <class T> template <class T>
class HIPStream : public Stream<T> class HIPStream : public Stream<T>
{ {
// Make sure that either:
// DOT_READ_DWORDS_PER_LANE is less than sizeof(T), in which case we default to 1 element
// or
// DOT_READ_DWORDS_PER_LANE is divisible by sizeof(T)
static_assert((DOT_READ_DWORDS_PER_LANE * sizeof(unsigned int) < sizeof(T)) ||
(DOT_READ_DWORDS_PER_LANE * sizeof(unsigned int) % sizeof(T) == 0),
"DOT_READ_DWORDS_PER_LANE not divisible by sizeof(element_type)");
// Take into account the datatype size
// That is, for 4 DOT_READ_DWORDS_PER_LANE, this is 2 FP64 elements
// and 4 FP32 elements
static constexpr unsigned int dot_elements_per_lane{
(DOT_READ_DWORDS_PER_LANE * sizeof(unsigned int)) < sizeof(T) ? 1 : (
DOT_READ_DWORDS_PER_LANE * sizeof(unsigned int) / sizeof(T))};
protected: protected:
// Size of arrays // Size of arrays
int array_size; int array_size;
int dot_num_blocks;
// Host array for partial sums for dot kernel // Host array for partial sums for dot kernel
T *sums; T *sums;
@ -29,7 +47,6 @@ class HIPStream : public Stream<T>
T *d_a; T *d_a;
T *d_b; T *d_b;
T *d_c; T *d_c;
T *d_sum;
public: public:

View File

@ -2,6 +2,13 @@
register_flag_required(CMAKE_CXX_COMPILER register_flag_required(CMAKE_CXX_COMPILER
"Absolute path to the AMD HIP C++ compiler") "Absolute path to the AMD HIP C++ compiler")
register_flag_optional(MEM "Device memory mode:
DEFAULT - allocate host and device memory pointers.
MANAGED - use HIP Managed Memory.
PAGEFAULT - shared memory, only host pointers allocated."
"DEFAULT")
macro(setup) macro(setup)
# nothing to do here as hipcc does everything correctly, what a surprise! # nothing to do here as hipcc does everything correctly, what a surprise!
register_definitions(${MEM})
endmacro() endmacro()

View File

@ -7,12 +7,12 @@
<artifactId>java-stream</artifactId> <artifactId>java-stream</artifactId>
<groupId>javastream</groupId> <groupId>javastream</groupId>
<version>4.0</version> <version>5.0</version>
<properties> <properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding> <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
<junit.version>5.7.2</junit.version> <junit.version>5.9.2</junit.version>
</properties> </properties>
<repositories> <repositories>
@ -27,19 +27,19 @@
<dependency> <dependency>
<groupId>com.beust</groupId> <groupId>com.beust</groupId>
<artifactId>jcommander</artifactId> <artifactId>jcommander</artifactId>
<version>1.81</version> <version>1.82</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>tornado</groupId> <groupId>tornado</groupId>
<artifactId>tornado-api</artifactId> <artifactId>tornado-api</artifactId>
<version>0.9</version> <version>0.15.1</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>com.aparapi</groupId> <groupId>com.aparapi</groupId>
<artifactId>aparapi</artifactId> <artifactId>aparapi</artifactId>
<version>2.0.0</version> <version>3.0.0</version>
<exclusions> <exclusions>
<!-- don't pull in the entire Scala ecosystem! --> <!-- don't pull in the entire Scala ecosystem! -->
<exclusion> <exclusion>

View File

@ -56,7 +56,7 @@ public abstract class JavaStream<T> {
protected abstract T dot(); protected abstract T dot();
protected abstract Data<T> data(); protected abstract Data<T> readArrays();
public static class EnumeratedStream<T> extends JavaStream<T> { public static class EnumeratedStream<T> extends JavaStream<T> {
@ -113,8 +113,8 @@ public abstract class JavaStream<T> {
} }
@Override @Override
public Data<T> data() { public Data<T> readArrays() {
return actual.data(); return actual.readArrays();
} }
} }
@ -140,6 +140,14 @@ public abstract class JavaStream<T> {
return Duration.ofNanos(end - start); return Duration.ofNanos(end - start);
} }
final Duration runInitArrays() {
return timed(this::initArrays);
}
final SimpleImmutableEntry<Duration, Data<T>> runReadArrays() {
return timed(this::readArrays);
}
final SimpleImmutableEntry<Timings<Duration>, T> runAll(int times) { final SimpleImmutableEntry<Timings<Duration>, T> runAll(int times) {
Timings<Duration> timings = new Timings<>(); Timings<Duration> timings = new Timings<>();
T lastSum = null; T lastSum = null;

View File

@ -128,6 +128,40 @@ public class Main {
} }
} }
@SuppressWarnings("unchecked")
static void showInit(
int totalBytes, double megaScale, Options opt, Duration init, Duration read) {
List<Entry<String, Double>> setup =
Arrays.asList(
new SimpleImmutableEntry<>("Init", durationToSeconds(init)),
new SimpleImmutableEntry<>("Read", durationToSeconds(read)));
if (opt.csv) {
tabulateCsv(
true,
setup.stream()
.map(
x ->
Arrays.asList(
new SimpleImmutableEntry<>("function", x.getKey()),
new SimpleImmutableEntry<>("n_elements", opt.arraysize + ""),
new SimpleImmutableEntry<>("sizeof", totalBytes + ""),
new SimpleImmutableEntry<>(
"max_m" + (opt.mibibytes ? "i" : "") + "bytes_per_sec",
((megaScale * (double) totalBytes / x.getValue())) + ""),
new SimpleImmutableEntry<>("runtime", x.getValue() + "")))
.toArray(List[]::new));
} else {
for (Entry<String, Double> e : setup) {
System.out.printf(
"%s: %.5f s (%.5f M%sBytes/sec)%n",
e.getKey(),
e.getValue(),
megaScale * (double) totalBytes / e.getValue(),
opt.mibibytes ? "i" : "");
}
}
}
static <T extends Number> boolean run( static <T extends Number> boolean run(
String name, Config<T> config, Function<Config<T>, JavaStream<T>> mkStream) { String name, Config<T> config, Function<Config<T>, JavaStream<T>> mkStream) {
@ -183,35 +217,46 @@ public class Main {
JavaStream<T> stream = mkStream.apply(config); JavaStream<T> stream = mkStream.apply(config);
stream.initArrays(); Duration init = stream.runInitArrays();
final boolean ok; final boolean ok;
switch (config.benchmark) { switch (config.benchmark) {
case ALL: case ALL:
Entry<Timings<Duration>, T> results = stream.runAll(opt.numtimes); {
ok = checkSolutions(stream.data(), config, Optional.of(results.getValue())); Entry<Timings<Duration>, T> results = stream.runAll(opt.numtimes);
Timings<Duration> timings = results.getKey(); SimpleImmutableEntry<Duration, Data<T>> read = stream.runReadArrays();
tabulateCsv( showInit(totalBytes, megaScale, opt, init, read.getKey());
opt.csv, ok = checkSolutions(read.getValue(), config, Optional.of(results.getValue()));
mkCsvRow(timings.copy, "Copy", 2 * arrayBytes, megaScale, opt), Timings<Duration> timings = results.getKey();
mkCsvRow(timings.mul, "Mul", 2 * arrayBytes, megaScale, opt), tabulateCsv(
mkCsvRow(timings.add, "Add", 3 * arrayBytes, megaScale, opt), opt.csv,
mkCsvRow(timings.triad, "Triad", 3 * arrayBytes, megaScale, opt), mkCsvRow(timings.copy, "Copy", 2 * arrayBytes, megaScale, opt),
mkCsvRow(timings.dot, "Dot", 2 * arrayBytes, megaScale, opt)); mkCsvRow(timings.mul, "Mul", 2 * arrayBytes, megaScale, opt),
break; mkCsvRow(timings.add, "Add", 3 * arrayBytes, megaScale, opt),
mkCsvRow(timings.triad, "Triad", 3 * arrayBytes, megaScale, opt),
mkCsvRow(timings.dot, "Dot", 2 * arrayBytes, megaScale, opt));
break;
}
case NSTREAM: case NSTREAM:
List<Duration> nstreamResults = stream.runNStream(opt.numtimes); {
ok = checkSolutions(stream.data(), config, Optional.empty()); List<Duration> nstreamResults = stream.runNStream(opt.numtimes);
tabulateCsv(opt.csv, mkCsvRow(nstreamResults, "Nstream", 4 * arrayBytes, megaScale, opt)); SimpleImmutableEntry<Duration, Data<T>> read = stream.runReadArrays();
break; showInit(totalBytes, megaScale, opt, init, read.getKey());
ok = checkSolutions(read.getValue(), config, Optional.empty());
tabulateCsv(opt.csv, mkCsvRow(nstreamResults, "Nstream", 4 * arrayBytes, megaScale, opt));
break;
}
case TRIAD: case TRIAD:
Duration triadResult = stream.runTriad(opt.numtimes); {
ok = checkSolutions(stream.data(), config, Optional.empty()); Duration triadResult = stream.runTriad(opt.numtimes);
int triadTotalBytes = 3 * arrayBytes * opt.numtimes; SimpleImmutableEntry<Duration, Data<T>> read = stream.runReadArrays();
double bandwidth = megaScale * (triadTotalBytes / durationToSeconds(triadResult)); showInit(totalBytes, megaScale, opt, init, read.getKey());
System.out.printf("Runtime (seconds): %.5f", durationToSeconds(triadResult)); ok = checkSolutions(read.getValue(), config, Optional.empty());
System.out.printf("Bandwidth (%s/s): %.3f ", gigaSuffix, bandwidth); int triadTotalBytes = 3 * arrayBytes * opt.numtimes;
break; double bandwidth = megaScale * (triadTotalBytes / durationToSeconds(triadResult));
System.out.printf("Runtime (seconds): %.5f", durationToSeconds(triadResult));
System.out.printf("Bandwidth (%s/s): %.3f ", gigaSuffix, bandwidth);
break;
}
default: default:
throw new AssertionError(); throw new AssertionError();
} }
@ -337,7 +382,7 @@ public class Main {
} }
} }
private static final String VERSION = "4.0"; private static final String VERSION = "5.0";
private static final float START_SCALAR = 0.4f; private static final float START_SCALAR = 0.4f;
private static final float START_A = 0.1f; private static final float START_A = 0.1f;

View File

@ -122,7 +122,7 @@ public final class AparapiStreams {
} }
@Override @Override
public Data<T> data() { public Data<T> readArrays() {
return kernels.syncAndDispose(); return kernels.syncAndDispose();
} }
} }

View File

@ -86,7 +86,7 @@ final class GenericPlainStream<T extends Number> extends JavaStream<T> {
} }
@Override @Override
public Data<T> data() { public Data<T> readArrays() {
return new Data<>(a, b, c); return new Data<>(a, b, c);
} }
} }

View File

@ -80,7 +80,7 @@ final class GenericStream<T extends Number> extends JavaStream<T> {
} }
@Override @Override
public Data<T> data() { public Data<T> readArrays() {
return new Data<>(a, b, c); return new Data<>(a, b, c);
} }
} }

View File

@ -78,7 +78,7 @@ final class SpecialisedDoubleStream extends JavaStream<Double> {
} }
@Override @Override
public Data<Double> data() { public Data<Double> readArrays() {
return new Data<>(boxed(a), boxed(b), boxed(c)); return new Data<>(boxed(a), boxed(b), boxed(c));
} }
} }

View File

@ -78,7 +78,7 @@ final class SpecialisedFloatStream extends JavaStream<Float> {
} }
@Override @Override
public Data<Float> data() { public Data<Float> readArrays() {
return new Data<>(boxed(a), boxed(b), boxed(c)); return new Data<>(boxed(a), boxed(b), boxed(c));
} }
} }

View File

@ -78,7 +78,7 @@ final class SpecialisedPlainDoubleStream extends JavaStream<Double> {
} }
@Override @Override
public Data<Double> data() { public Data<Double> readArrays() {
return new Data<>(boxed(a), boxed(b), boxed(c)); return new Data<>(boxed(a), boxed(b), boxed(c));
} }
} }

View File

@ -78,7 +78,7 @@ final class SpecialisedPlainFloatStream extends JavaStream<Float> {
} }
@Override @Override
public Data<Float> data() { public Data<Float> readArrays() {
return new Data<>(boxed(a), boxed(b), boxed(c)); return new Data<>(boxed(a), boxed(b), boxed(c));
} }
} }

View File

@ -4,8 +4,8 @@ import java.util.List;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import javastream.JavaStream; import javastream.JavaStream;
import javastream.Main.Config; import javastream.Main.Config;
import uk.ac.manchester.tornado.api.TaskSchedule; import uk.ac.manchester.tornado.api.TornadoExecutionPlan;
import uk.ac.manchester.tornado.api.TornadoRuntimeCI; import uk.ac.manchester.tornado.api.TornadoRuntimeInterface;
import uk.ac.manchester.tornado.api.common.TornadoDevice; import uk.ac.manchester.tornado.api.common.TornadoDevice;
import uk.ac.manchester.tornado.api.runtime.TornadoRuntime; import uk.ac.manchester.tornado.api.runtime.TornadoRuntime;
@ -13,18 +13,18 @@ abstract class GenericTornadoVMStream<T> extends JavaStream<T> {
protected final TornadoDevice device; protected final TornadoDevice device;
protected TaskSchedule copyTask; protected TornadoExecutionPlan copyTask;
protected TaskSchedule mulTask; protected TornadoExecutionPlan mulTask;
protected TaskSchedule addTask; protected TornadoExecutionPlan addTask;
protected TaskSchedule triadTask; protected TornadoExecutionPlan triadTask;
protected TaskSchedule nstreamTask; protected TornadoExecutionPlan nstreamTask;
protected TaskSchedule dotTask; protected TornadoExecutionPlan dotTask;
GenericTornadoVMStream(Config<T> config) { GenericTornadoVMStream(Config<T> config) {
super(config); super(config);
try { try {
TornadoRuntimeCI runtime = TornadoRuntime.getTornadoRuntime(); TornadoRuntimeInterface runtime = TornadoRuntime.getTornadoRuntime();
List<TornadoDevice> devices = TornadoVMStreams.enumerateDevices(runtime); List<TornadoDevice> devices = TornadoVMStreams.enumerateDevices(runtime);
device = devices.get(config.options.device); device = devices.get(config.options.device);
@ -42,10 +42,6 @@ abstract class GenericTornadoVMStream<T> extends JavaStream<T> {
} }
} }
protected static TaskSchedule mkSchedule() {
return new TaskSchedule("");
}
@Override @Override
public List<String> listDevices() { public List<String> listDevices() {
return TornadoVMStreams.enumerateDevices(TornadoRuntime.getTornadoRuntime()).stream() return TornadoVMStreams.enumerateDevices(TornadoRuntime.getTornadoRuntime()).stream()
@ -55,12 +51,12 @@ abstract class GenericTornadoVMStream<T> extends JavaStream<T> {
@Override @Override
public void initArrays() { public void initArrays() {
this.copyTask.warmup(); this.copyTask.withWarmUp();
this.mulTask.warmup(); this.mulTask.withWarmUp();
this.addTask.warmup(); this.addTask.withWarmUp();
this.triadTask.warmup(); this.triadTask.withWarmUp();
this.nstreamTask.warmup(); this.nstreamTask.withWarmUp();
this.dotTask.warmup(); this.dotTask.withWarmUp();
} }
@Override @Override

View File

@ -2,8 +2,11 @@ package javastream.tornadovm;
import java.util.Arrays; import java.util.Arrays;
import javastream.Main.Config; import javastream.Main.Config;
import uk.ac.manchester.tornado.api.TaskGraph;
import uk.ac.manchester.tornado.api.TornadoExecutionPlan;
import uk.ac.manchester.tornado.api.annotations.Parallel; import uk.ac.manchester.tornado.api.annotations.Parallel;
import uk.ac.manchester.tornado.api.annotations.Reduce; import uk.ac.manchester.tornado.api.annotations.Reduce;
import uk.ac.manchester.tornado.api.enums.DataTransferMode;
final class SpecialisedDouble extends GenericTornadoVMStream<Double> { final class SpecialisedDouble extends GenericTornadoVMStream<Double> {
@ -49,7 +52,7 @@ final class SpecialisedDouble extends GenericTornadoVMStream<Double> {
private final double[] a, b, c; private final double[] a, b, c;
private final double[] dotSum; private final double[] dotSum;
@SuppressWarnings({"PrimitiveArrayArgumentToVarargsMethod", "DuplicatedCode"}) @SuppressWarnings({"DuplicatedCode"})
SpecialisedDouble(Config<Double> config) { SpecialisedDouble(Config<Double> config) {
super(config); super(config);
final int size = config.options.arraysize; final int size = config.options.arraysize;
@ -58,12 +61,43 @@ final class SpecialisedDouble extends GenericTornadoVMStream<Double> {
b = new double[size]; b = new double[size];
c = new double[size]; c = new double[size];
dotSum = new double[1]; dotSum = new double[1];
this.copyTask = mkSchedule().task("", SpecialisedDouble::copy, size, a, c); this.copyTask =
this.mulTask = mkSchedule().task("", SpecialisedDouble::mul, size, b, c, scalar); new TornadoExecutionPlan(
this.addTask = mkSchedule().task("", SpecialisedDouble::add, size, a, b, c); new TaskGraph("copy")
this.triadTask = mkSchedule().task("", SpecialisedDouble::triad, size, a, b, c, scalar); .task("copy", SpecialisedDouble::copy, size, a, c)
this.nstreamTask = mkSchedule().task("", SpecialisedDouble::nstream, size, a, b, c, scalar); .transferToDevice(DataTransferMode.FIRST_EXECUTION, a, c)
this.dotTask = mkSchedule().task("", SpecialisedDouble::dot_, a, b, dotSum).streamOut(dotSum); .snapshot());
this.mulTask =
new TornadoExecutionPlan(
new TaskGraph("mul")
.task("mul", SpecialisedDouble::mul, size, b, c, scalar)
.transferToDevice(DataTransferMode.FIRST_EXECUTION, b, c)
.snapshot());
this.addTask =
new TornadoExecutionPlan(
new TaskGraph("add")
.task("add", SpecialisedDouble::add, size, a, b, c)
.transferToDevice(DataTransferMode.FIRST_EXECUTION, a, b, c)
.snapshot());
this.triadTask =
new TornadoExecutionPlan(
new TaskGraph("triad")
.task("triad", SpecialisedDouble::triad, size, a, b, c, scalar)
.transferToDevice(DataTransferMode.FIRST_EXECUTION, a, b, c)
.snapshot());
this.nstreamTask =
new TornadoExecutionPlan(
new TaskGraph("nstream")
.task("nstream", SpecialisedDouble::nstream, size, a, b, c, scalar)
.transferToDevice(DataTransferMode.FIRST_EXECUTION, a, b, c)
.snapshot());
this.dotTask =
new TornadoExecutionPlan(
new TaskGraph("dot")
.task("dot", SpecialisedDouble::dot_, a, b, dotSum)
.transferToDevice(DataTransferMode.FIRST_EXECUTION, a, b)
.transferToHost(DataTransferMode.EVERY_EXECUTION, new Object[] {dotSum})
.snapshot());
} }
@Override @Override
@ -72,7 +106,7 @@ final class SpecialisedDouble extends GenericTornadoVMStream<Double> {
Arrays.fill(a, config.initA); Arrays.fill(a, config.initA);
Arrays.fill(b, config.initB); Arrays.fill(b, config.initB);
Arrays.fill(c, config.initC); Arrays.fill(c, config.initC);
TornadoVMStreams.xferToDevice(device, a, b, c); TornadoVMStreams.allocAndXferToDevice(device, a, b, c);
} }
@Override @Override
@ -81,7 +115,7 @@ final class SpecialisedDouble extends GenericTornadoVMStream<Double> {
} }
@Override @Override
public Data<Double> data() { public Data<Double> readArrays() {
TornadoVMStreams.xferFromDevice(device, a, b, c); TornadoVMStreams.xferFromDevice(device, a, b, c);
return new Data<>(boxed(a), boxed(b), boxed(c)); return new Data<>(boxed(a), boxed(b), boxed(c));
} }

View File

@ -2,8 +2,11 @@ package javastream.tornadovm;
import java.util.Arrays; import java.util.Arrays;
import javastream.Main.Config; import javastream.Main.Config;
import uk.ac.manchester.tornado.api.TaskGraph;
import uk.ac.manchester.tornado.api.TornadoExecutionPlan;
import uk.ac.manchester.tornado.api.annotations.Parallel; import uk.ac.manchester.tornado.api.annotations.Parallel;
import uk.ac.manchester.tornado.api.annotations.Reduce; import uk.ac.manchester.tornado.api.annotations.Reduce;
import uk.ac.manchester.tornado.api.enums.DataTransferMode;
final class SpecialisedFloat extends GenericTornadoVMStream<Float> { final class SpecialisedFloat extends GenericTornadoVMStream<Float> {
@ -49,7 +52,7 @@ final class SpecialisedFloat extends GenericTornadoVMStream<Float> {
private final float[] a, b, c; private final float[] a, b, c;
private final float[] dotSum; private final float[] dotSum;
@SuppressWarnings({"PrimitiveArrayArgumentToVarargsMethod", "DuplicatedCode"}) @SuppressWarnings({"DuplicatedCode"})
SpecialisedFloat(Config<Float> config) { SpecialisedFloat(Config<Float> config) {
super(config); super(config);
final int size = config.options.arraysize; final int size = config.options.arraysize;
@ -58,12 +61,43 @@ final class SpecialisedFloat extends GenericTornadoVMStream<Float> {
b = new float[size]; b = new float[size];
c = new float[size]; c = new float[size];
dotSum = new float[1]; dotSum = new float[1];
this.copyTask = mkSchedule().task("", SpecialisedFloat::copy, size, a, c); this.copyTask =
this.mulTask = mkSchedule().task("", SpecialisedFloat::mul, size, b, c, scalar); new TornadoExecutionPlan(
this.addTask = mkSchedule().task("", SpecialisedFloat::add, size, a, b, c); new TaskGraph("copy")
this.triadTask = mkSchedule().task("", SpecialisedFloat::triad, size, a, b, c, scalar); .task("copy", SpecialisedFloat::copy, size, a, c)
this.nstreamTask = mkSchedule().task("", SpecialisedFloat::nstream, size, a, b, c, scalar); .transferToDevice(DataTransferMode.FIRST_EXECUTION, a, c)
this.dotTask = mkSchedule().task("", SpecialisedFloat::dot_, a, b, dotSum).streamOut(dotSum); .snapshot());
this.mulTask =
new TornadoExecutionPlan(
new TaskGraph("mul")
.task("mul", SpecialisedFloat::mul, size, b, c, scalar)
.transferToDevice(DataTransferMode.FIRST_EXECUTION, b, c)
.snapshot());
this.addTask =
new TornadoExecutionPlan(
new TaskGraph("add")
.task("add", SpecialisedFloat::add, size, a, b, c)
.transferToDevice(DataTransferMode.FIRST_EXECUTION, a, b, c)
.snapshot());
this.triadTask =
new TornadoExecutionPlan(
new TaskGraph("triad")
.task("triad", SpecialisedFloat::triad, size, a, b, c, scalar)
.transferToDevice(DataTransferMode.FIRST_EXECUTION, a, b, c)
.snapshot());
this.nstreamTask =
new TornadoExecutionPlan(
new TaskGraph("nstream")
.task("nstream", SpecialisedFloat::nstream, size, a, b, c, scalar)
.transferToDevice(DataTransferMode.FIRST_EXECUTION, a, b, c)
.snapshot());
this.dotTask =
new TornadoExecutionPlan(
new TaskGraph("dot")
.task("dot", SpecialisedFloat::dot_, a, b, dotSum)
.transferToDevice(DataTransferMode.FIRST_EXECUTION, a, b)
.transferToHost(DataTransferMode.EVERY_EXECUTION, new Object[] {dotSum})
.snapshot());
} }
@Override @Override
@ -72,7 +106,7 @@ final class SpecialisedFloat extends GenericTornadoVMStream<Float> {
Arrays.fill(a, config.initA); Arrays.fill(a, config.initA);
Arrays.fill(b, config.initB); Arrays.fill(b, config.initB);
Arrays.fill(c, config.initC); Arrays.fill(c, config.initC);
TornadoVMStreams.xferToDevice(device, a, b, c); TornadoVMStreams.allocAndXferToDevice(device, a, b, c);
} }
@Override @Override
@ -81,7 +115,7 @@ final class SpecialisedFloat extends GenericTornadoVMStream<Float> {
} }
@Override @Override
public Data<Float> data() { public Data<Float> readArrays() {
TornadoVMStreams.xferFromDevice(device, a, b, c); TornadoVMStreams.xferFromDevice(device, a, b, c);
return new Data<>(boxed(a), boxed(b), boxed(c)); return new Data<>(boxed(a), boxed(b), boxed(c));
} }

View File

@ -1,36 +1,46 @@
package javastream.tornadovm; package javastream.tornadovm;
import java.util.Arrays;
import java.util.List; import java.util.List;
import java.util.function.Function; import java.util.function.Function;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import java.util.stream.IntStream; import java.util.stream.IntStream;
import javastream.JavaStream; import javastream.JavaStream;
import javastream.Main.Config; import javastream.Main.Config;
import uk.ac.manchester.tornado.api.TornadoRuntimeCI; import uk.ac.manchester.tornado.api.TornadoRuntimeInterface;
import uk.ac.manchester.tornado.api.common.Event;
import uk.ac.manchester.tornado.api.common.TornadoDevice; import uk.ac.manchester.tornado.api.common.TornadoDevice;
import uk.ac.manchester.tornado.api.mm.TornadoGlobalObjectState; import uk.ac.manchester.tornado.api.memory.TornadoDeviceObjectState;
import uk.ac.manchester.tornado.api.memory.TornadoGlobalObjectState;
import uk.ac.manchester.tornado.api.runtime.TornadoRuntime; import uk.ac.manchester.tornado.api.runtime.TornadoRuntime;
public final class TornadoVMStreams { public final class TornadoVMStreams {
private TornadoVMStreams() {} private TornadoVMStreams() {}
static void xferToDevice(TornadoDevice device, Object... xs) { static void allocAndXferToDevice(TornadoDevice device, Object... xs) {
for (Object x : xs) { for (Object x : xs) {
TornadoGlobalObjectState state = TornadoRuntime.getTornadoRuntime().resolveObject(x); TornadoGlobalObjectState state = TornadoRuntime.getTornadoRuntime().resolveObject(x);
device.allocateObjects(
new Object[] {x}, 0, new TornadoDeviceObjectState[] {state.getDeviceState(device)});
List<Integer> writeEvent = device.ensurePresent(x, state.getDeviceState(device), null, 0, 0); List<Integer> writeEvent = device.ensurePresent(x, state.getDeviceState(device), null, 0, 0);
if (writeEvent != null) writeEvent.forEach(e -> device.resolveEvent(e).waitOn()); if (writeEvent != null) writeEvent.forEach(e -> device.resolveEvent(e).waitOn());
} }
} }
static void xferFromDevice(TornadoDevice device, Object... xs) { static void xferFromDevice(TornadoDevice device, Object... xs) {
for (Object x : xs) { Arrays.stream(xs)
TornadoGlobalObjectState state = TornadoRuntime.getTornadoRuntime().resolveObject(x); .map(
device.resolveEvent(device.streamOut(x, 0, state.getDeviceState(device), null)).waitOn(); x -> {
} TornadoGlobalObjectState state = TornadoRuntime.getTornadoRuntime().resolveObject(x);
return device.resolveEvent(
device.streamOut(x, 0, state.getDeviceState(device), null));
})
.collect(Collectors.toList())
.forEach(Event::waitOn);
} }
static List<TornadoDevice> enumerateDevices(TornadoRuntimeCI runtime) { static List<TornadoDevice> enumerateDevices(TornadoRuntimeInterface runtime) {
return IntStream.range(0, runtime.getNumDrivers()) return IntStream.range(0, runtime.getNumDrivers())
.mapToObj(runtime::getDriver) .mapToObj(runtime::getDriver)
.flatMap(d -> IntStream.range(0, d.getDeviceCount()).mapToObj(d::getDevice)) .flatMap(d -> IntStream.range(0, d.getDeviceCount()).mapToObj(d::getDevice))

View File

@ -1,415 +1,423 @@
# This file is machine-generated - editing it directly is not advised # This file is machine-generated - editing it directly is not advised
[[AMDGPU]] julia_version = "1.9.3"
deps = ["AbstractFFTs", "Adapt", "BinaryProvider", "CEnum", "GPUArrays", "GPUCompiler", "HIP_jll", "LLVM", "Libdl", "LinearAlgebra", "MacroTools", "Pkg", "Printf", "ROCmDeviceLibs_jll", "Random", "Requires", "Setfield", "Statistics", "hsa_rocr_jll"] manifest_format = "2.0"
git-tree-sha1 = "d59f1cf3f90ae6cf6626e8a21f337850cb3792f7" project_hash = "05982ec0602af8ada9509107382dd6c8b21db9b9"
[[deps.AMDGPU]]
deps = ["AbstractFFTs", "Adapt", "CEnum", "ExprTools", "GPUArrays", "GPUCompiler", "KernelAbstractions", "LLD_jll", "LLVM", "LLVM_jll", "Libdl", "LinearAlgebra", "MacroTools", "Pkg", "Preferences", "Printf", "Random", "SparseArrays", "SpecialFunctions", "Statistics", "UnsafeAtomicsLLVM"]
git-tree-sha1 = "95437cf4c0ad651ca8463475de8af6a6935e23bd"
uuid = "21141c5a-9bdb-4563-92ae-f87d6854732e" uuid = "21141c5a-9bdb-4563-92ae-f87d6854732e"
version = "0.2.17" version = "0.6.1"
[[AbstractFFTs]] [[deps.AbstractFFTs]]
deps = ["LinearAlgebra"] deps = ["LinearAlgebra"]
git-tree-sha1 = "485ee0867925449198280d4af84bdb46a2a404d0" git-tree-sha1 = "d92ad398961a3ed262d8bf04a1a2b8340f915fef"
uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c" uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c"
version = "1.0.1" version = "1.5.0"
[[Adapt]] [deps.AbstractFFTs.extensions]
deps = ["LinearAlgebra"] AbstractFFTsChainRulesCoreExt = "ChainRulesCore"
git-tree-sha1 = "84918055d15b3114ede17ac6a7182f68870c16f7" AbstractFFTsTestExt = "Test"
[deps.AbstractFFTs.weakdeps]
ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
[[deps.Adapt]]
deps = ["LinearAlgebra", "Requires"]
git-tree-sha1 = "76289dc51920fdc6e0013c872ba9551d54961c24"
uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
version = "3.3.1" version = "3.6.2"
weakdeps = ["StaticArrays"]
[[ArgParse]] [deps.Adapt.extensions]
AdaptStaticArraysExt = "StaticArrays"
[[deps.ArgParse]]
deps = ["Logging", "TextWrap"] deps = ["Logging", "TextWrap"]
git-tree-sha1 = "3102bce13da501c9104df33549f511cd25264d7d" git-tree-sha1 = "3102bce13da501c9104df33549f511cd25264d7d"
uuid = "c7e460c6-2fb9-53a9-8c5b-16f535851c63" uuid = "c7e460c6-2fb9-53a9-8c5b-16f535851c63"
version = "1.1.4" version = "1.1.4"
[[ArgTools]] [[deps.ArgTools]]
uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f" uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f"
version = "1.1.1"
[[Artifacts]] [[deps.Artifacts]]
uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33"
[[Base64]] [[deps.Atomix]]
deps = ["UnsafeAtomics"]
git-tree-sha1 = "c06a868224ecba914baa6942988e2f2aade419be"
uuid = "a9b6321e-bd34-4604-b9c9-b65b8de01458"
version = "0.1.0"
[[deps.Base64]]
uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f"
[[BinaryProvider]] [[deps.CEnum]]
deps = ["Libdl", "Logging", "SHA"] git-tree-sha1 = "eb4cb44a499229b3b8426dcfb5dd85333951ff90"
git-tree-sha1 = "ecdec412a9abc8db54c0efc5548c64dfce072058"
uuid = "b99e7846-7c00-51b0-8f62-c81ae34c0232"
version = "0.5.10"
[[Bzip2_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "19a35467a82e236ff51bc17a3a44b69ef35185a2"
uuid = "6e34b625-4abd-537c-b88f-471c36dfa7a0"
version = "1.0.8+0"
[[CEnum]]
git-tree-sha1 = "215a9aa4a1f23fbd05b92769fdd62559488d70e9"
uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82" uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82"
version = "0.4.1" version = "0.4.2"
[[ConstructionBase]] [[deps.CompilerSupportLibraries_jll]]
deps = ["LinearAlgebra"] deps = ["Artifacts", "Libdl"]
git-tree-sha1 = "f74e9d5388b8620b4cee35d4c5a618dd4dc547f4" uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae"
uuid = "187b0558-2788-49d3-abe0-74a17ed4e7c9" version = "1.0.5+0"
version = "1.3.0"
[[Dates]] [[deps.Dates]]
deps = ["Printf"] deps = ["Printf"]
uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" uuid = "ade2ca70-3891-5945-98fb-dc099432e06a"
[[Downloads]] [[deps.DocStringExtensions]]
deps = ["ArgTools", "LibCURL", "NetworkOptions"] deps = ["LibGit2"]
git-tree-sha1 = "2fb1e02f2b635d0845df5d7c167fec4dd739b00d"
uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae"
version = "0.9.3"
[[deps.Downloads]]
deps = ["ArgTools", "FileWatching", "LibCURL", "NetworkOptions"]
uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6" uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6"
version = "1.6.0"
[[Elfutils_jll]] [[deps.ExprTools]]
deps = ["Artifacts", "Bzip2_jll", "JLLWrappers", "Libdl", "Pkg", "XZ_jll", "Zlib_jll", "argp_standalone_jll", "fts_jll", "obstack_jll"] git-tree-sha1 = "27415f162e6028e81c72b82ef756bf321213b6ec"
git-tree-sha1 = "8f9fcde6d89b0a3ca51cb2028beab462705c5436"
uuid = "ab5a07f8-06af-567f-a878-e8bb879eba5a"
version = "0.182.0+0"
[[ExprTools]]
git-tree-sha1 = "b7e3d17636b348f005f11040025ae8c6f645fe92"
uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04" uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04"
version = "0.1.6" version = "0.1.10"
[[Future]] [[deps.FileWatching]]
deps = ["Random"] uuid = "7b1f6079-737a-58dc-b8bc-7a2ca5c1b5ee"
uuid = "9fa8497b-333b-5362-9e8d-4d0656e87820"
[[GPUArrays]] [[deps.GPUArrays]]
deps = ["Adapt", "LinearAlgebra", "Printf", "Random", "Serialization", "Statistics"] deps = ["Adapt", "GPUArraysCore", "LLVM", "LinearAlgebra", "Printf", "Random", "Reexport", "Serialization", "Statistics"]
git-tree-sha1 = "7772508f17f1d482fe0df72cabc5b55bec06bbe0" git-tree-sha1 = "8ad8f375ae365aa1eb2f42e2565a40b55a4b69a8"
uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
version = "8.1.2" version = "9.0.0"
[[GPUCompiler]] [[deps.GPUArraysCore]]
deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "TimerOutputs", "UUIDs"] deps = ["Adapt"]
git-tree-sha1 = "4ed2616d5e656c8716736b64da86755467f26cf5" git-tree-sha1 = "2d6ca471a6c7b536127afccfa7564b5b39227fe0"
uuid = "46192b85-c4d5-4398-a991-12ede77f4527"
version = "0.1.5"
[[deps.GPUCompiler]]
deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "Scratch", "TimerOutputs", "UUIDs"]
git-tree-sha1 = "5e4487558477f191c043166f8301dd0b4be4e2b2"
uuid = "61eb1bfa-7361-4325-ad38-22787b887f55" uuid = "61eb1bfa-7361-4325-ad38-22787b887f55"
version = "0.12.9" version = "0.24.5"
[[HIP_jll]] [[deps.InteractiveUtils]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "ROCmCompilerSupport_jll", "ROCmDeviceLibs_jll", "ROCmOpenCLRuntime_jll", "hsa_rocr_jll"]
git-tree-sha1 = "5097d8f7b6842156ab0928371b3d03fefd8decab"
uuid = "2696aab5-0948-5276-aa9a-2a86a37016b8"
version = "4.0.0+1"
[[InteractiveUtils]]
deps = ["Markdown"] deps = ["Markdown"]
uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
[[JLLWrappers]] [[deps.IrrationalConstants]]
deps = ["Preferences"] git-tree-sha1 = "630b497eafcc20001bba38a4651b327dcfc491d2"
git-tree-sha1 = "642a199af8b68253517b80bd3bfd17eb4e84df6e" uuid = "92d709cd-6900-40b7-9082-c6be49f344b6"
version = "0.2.2"
[[deps.JLLWrappers]]
deps = ["Artifacts", "Preferences"]
git-tree-sha1 = "7e5d6779a1e09a36db2a7b6cff50942a0a7d0fca"
uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210" uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210"
version = "1.3.0" version = "1.5.0"
[[LLVM]] [[deps.KernelAbstractions]]
deps = ["Adapt", "Atomix", "InteractiveUtils", "LinearAlgebra", "MacroTools", "PrecompileTools", "Requires", "SparseArrays", "StaticArrays", "UUIDs", "UnsafeAtomics", "UnsafeAtomicsLLVM"]
git-tree-sha1 = "4c5875e4c228247e1c2b087669846941fb6e0118"
uuid = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
version = "0.9.8"
[deps.KernelAbstractions.extensions]
EnzymeExt = "EnzymeCore"
[deps.KernelAbstractions.weakdeps]
EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869"
[[deps.LLD_jll]]
deps = ["Artifacts", "Libdl", "Pkg", "TOML", "Zlib_jll", "libLLVM_jll"]
uuid = "d55e3150-da41-5e91-b323-ecfd1eec6109"
version = "14.0.6+3"
[[deps.LLVM]]
deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Printf", "Unicode"] deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Printf", "Unicode"]
git-tree-sha1 = "7cc22e69995e2329cc047a879395b2b74647ab5f" git-tree-sha1 = "a9d2ce1d5007b1e8f6c5b89c5a31ff8bd146db5c"
uuid = "929cbde3-209d-540e-8aea-75f648917ca0" uuid = "929cbde3-209d-540e-8aea-75f648917ca0"
version = "4.7.0" version = "6.2.1"
[[LLVMExtra_jll]] [[deps.LLVMExtra_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML"]
git-tree-sha1 = "c5fc4bef251ecd37685bea1c4068a9cfa41e8b9a" git-tree-sha1 = "7ca6850ae880cc99b59b88517545f91a52020afa"
uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab" uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab"
version = "0.0.13+0" version = "0.0.25+0"
[[LibCURL]] [[deps.LLVM_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "TOML", "Zlib_jll", "libLLVM_jll"]
git-tree-sha1 = "c5131b433876973cf29a2d9ec426cc099567e68c"
uuid = "86de99a1-58d6-5da7-8064-bd56ce2e322c"
version = "14.0.6+4"
[[deps.LazyArtifacts]]
deps = ["Artifacts", "Pkg"]
uuid = "4af54fe1-eca0-43a8-85a7-787d91b784e3"
[[deps.LibCURL]]
deps = ["LibCURL_jll", "MozillaCACerts_jll"] deps = ["LibCURL_jll", "MozillaCACerts_jll"]
uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21" uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21"
version = "0.6.3"
[[LibCURL_jll]] [[deps.LibCURL_jll]]
deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"] deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"]
uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0" uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0"
version = "7.84.0+0"
[[LibGit2]] [[deps.LibGit2]]
deps = ["Base64", "NetworkOptions", "Printf", "SHA"] deps = ["Base64", "NetworkOptions", "Printf", "SHA"]
uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" uuid = "76f85450-5226-5b5a-8eaa-529ad045b433"
[[LibSSH2_jll]] [[deps.LibSSH2_jll]]
deps = ["Artifacts", "Libdl", "MbedTLS_jll"] deps = ["Artifacts", "Libdl", "MbedTLS_jll"]
uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8" uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8"
version = "1.10.2+0"
[[Libdl]] [[deps.Libdl]]
uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
[[Libgcrypt_jll]] [[deps.LinearAlgebra]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Libgpg_error_jll", "Pkg"] deps = ["Libdl", "OpenBLAS_jll", "libblastrampoline_jll"]
git-tree-sha1 = "64613c82a59c120435c067c2b809fc61cf5166ae"
uuid = "d4300ac3-e22c-5743-9152-c294e39db1e4"
version = "1.8.7+0"
[[Libglvnd_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libX11_jll", "Xorg_libXext_jll"]
git-tree-sha1 = "7739f837d6447403596a75d19ed01fd08d6f56bf"
uuid = "7e76a0d4-f3c7-5321-8279-8d96eeed0f29"
version = "1.3.0+3"
[[Libgpg_error_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "c333716e46366857753e273ce6a69ee0945a6db9"
uuid = "7add5ba3-2f88-524e-9cd5-f83b8a55f7b8"
version = "1.42.0+0"
[[Libiconv_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "42b62845d70a619f063a7da093d995ec8e15e778"
uuid = "94ce4f54-9a6c-5748-9c1c-f9c7231a4531"
version = "1.16.1+1"
[[LinearAlgebra]]
deps = ["Libdl"]
uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
[[Logging]] [[deps.LogExpFunctions]]
deps = ["DocStringExtensions", "IrrationalConstants", "LinearAlgebra"]
git-tree-sha1 = "7d6dd4e9212aebaeed356de34ccf262a3cd415aa"
uuid = "2ab3a3ac-af41-5b50-aa03-7779005ae688"
version = "0.3.26"
[deps.LogExpFunctions.extensions]
LogExpFunctionsChainRulesCoreExt = "ChainRulesCore"
LogExpFunctionsChangesOfVariablesExt = "ChangesOfVariables"
LogExpFunctionsInverseFunctionsExt = "InverseFunctions"
[deps.LogExpFunctions.weakdeps]
ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
ChangesOfVariables = "9e997f8a-9a97-42d5-a9f1-ce6bfc15e2c0"
InverseFunctions = "3587e190-3f89-42d0-90ee-14403ec27112"
[[deps.Logging]]
uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"
[[MacroTools]] [[deps.MacroTools]]
deps = ["Markdown", "Random"] deps = ["Markdown", "Random"]
git-tree-sha1 = "3d3e902b31198a27340d0bf00d6ac452866021cf" git-tree-sha1 = "9ee1618cbf5240e6d4e0371d6f24065083f60c48"
uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
version = "0.5.9" version = "0.5.11"
[[Markdown]] [[deps.Markdown]]
deps = ["Base64"] deps = ["Base64"]
uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" uuid = "d6f4376e-aef5-505a-96c1-9c027394607a"
[[MbedTLS_jll]] [[deps.MbedTLS_jll]]
deps = ["Artifacts", "Libdl"] deps = ["Artifacts", "Libdl"]
uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1" uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1"
version = "2.28.2+0"
[[MozillaCACerts_jll]] [[deps.MozillaCACerts_jll]]
uuid = "14a3606d-f60d-562e-9121-12d972cd8159" uuid = "14a3606d-f60d-562e-9121-12d972cd8159"
version = "2022.10.11"
[[NUMA_jll]] [[deps.NetworkOptions]]
deps = ["Libdl", "Pkg"]
git-tree-sha1 = "778f9bd14400cff2c32ed357e12766ac0e3d766e"
uuid = "7f51dc2b-bb24-59f8-b771-bb1490e4195d"
version = "2.0.13+1"
[[NetworkOptions]]
uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908" uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908"
version = "1.2.0"
[[OrderedCollections]] [[deps.OpenBLAS_jll]]
git-tree-sha1 = "85f8e6578bf1f9ee0d11e7bb1b1456435479d47c" deps = ["Artifacts", "CompilerSupportLibraries_jll", "Libdl"]
uuid = "4536629a-c528-5b80-bd46-f80d51c5b363"
version = "0.3.21+4"
[[deps.OpenLibm_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "05823500-19ac-5b8b-9628-191a04bc5112"
version = "0.8.1+0"
[[deps.OpenSpecFun_jll]]
deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "13652491f6856acfd2db29360e1bbcd4565d04f1"
uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e"
version = "0.5.5+0"
[[deps.OrderedCollections]]
git-tree-sha1 = "2e73fe17cac3c62ad1aebe70d44c963c3cfdc3e3"
uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
version = "1.4.1" version = "1.6.2"
[[Parameters]] [[deps.Parameters]]
deps = ["OrderedCollections", "UnPack"] deps = ["OrderedCollections", "UnPack"]
git-tree-sha1 = "34c0e9ad262e5f7fc75b10a9952ca7692cfc5fbe" git-tree-sha1 = "34c0e9ad262e5f7fc75b10a9952ca7692cfc5fbe"
uuid = "d96e819e-fc66-5662-9728-84c9c7592b0a" uuid = "d96e819e-fc66-5662-9728-84c9c7592b0a"
version = "0.12.3" version = "0.12.3"
[[Pkg]] [[deps.Pkg]]
deps = ["Artifacts", "Dates", "Downloads", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"] deps = ["Artifacts", "Dates", "Downloads", "FileWatching", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"]
uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
version = "1.9.2"
[[Preferences]] [[deps.PrecompileTools]]
deps = ["Preferences"]
git-tree-sha1 = "03b4c25b43cb84cee5c90aa9b5ea0a78fd848d2f"
uuid = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
version = "1.2.0"
[[deps.Preferences]]
deps = ["TOML"] deps = ["TOML"]
git-tree-sha1 = "00cfd92944ca9c760982747e9a1d0d5d86ab1e5a" git-tree-sha1 = "00805cd429dcb4870060ff49ef443486c262e38e"
uuid = "21216c6a-2e73-6563-6e65-726566657250" uuid = "21216c6a-2e73-6563-6e65-726566657250"
version = "1.2.2" version = "1.4.1"
[[Printf]] [[deps.Printf]]
deps = ["Unicode"] deps = ["Unicode"]
uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7"
[[REPL]] [[deps.REPL]]
deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"] deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"]
uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb"
[[ROCmCompilerSupport_jll]] [[deps.Random]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "ROCmDeviceLibs_jll", "hsa_rocr_jll"] deps = ["SHA", "Serialization"]
git-tree-sha1 = "56ddcfb5d8b60c9f8c1bc619886f8d363fd1926d"
uuid = "8fbdd1d2-db62-5cd0-981e-905da1486e17"
version = "4.0.0+1"
[[ROCmDeviceLibs_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Zlib_jll"]
git-tree-sha1 = "d764f0f28b5af89aa004871a6a38e5d061f77257"
uuid = "873c0968-716b-5aa7-bb8d-d1e2e2aeff2d"
version = "4.0.0+0"
[[ROCmOpenCLRuntime_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Libglvnd_jll", "Pkg", "ROCmCompilerSupport_jll", "ROCmDeviceLibs_jll", "Xorg_libX11_jll", "Xorg_xorgproto_jll", "hsa_rocr_jll"]
git-tree-sha1 = "f9e3e2cb40a7990535efa7da9b9dd0e0b458a973"
uuid = "10ae2a08-2eea-53f8-8c20-eec175020e9f"
version = "4.0.0+1"
[[Random]]
deps = ["Serialization"]
uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
[[Requires]] [[deps.Reexport]]
git-tree-sha1 = "45e428421666073eab6f2da5c9d310d99bb12f9b"
uuid = "189a3867-3050-52da-a836-e630ba90ab69"
version = "1.2.2"
[[deps.Requires]]
deps = ["UUIDs"] deps = ["UUIDs"]
git-tree-sha1 = "8f82019e525f4d5c669692772a6f4b0a58b06a6a" git-tree-sha1 = "838a3a4188e2ded87a4f9f184b4b0d78a1e91cb7"
uuid = "ae029012-a4dd-5104-9daa-d747884805df" uuid = "ae029012-a4dd-5104-9daa-d747884805df"
version = "1.3.0"
[[deps.SHA]]
uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce"
version = "0.7.0"
[[deps.Scratch]]
deps = ["Dates"]
git-tree-sha1 = "30449ee12237627992a99d5e30ae63e4d78cd24a"
uuid = "6c6a2e73-6563-6170-7368-637461726353"
version = "1.2.0" version = "1.2.0"
[[SHA]] [[deps.Serialization]]
uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce"
[[Serialization]]
uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
[[Setfield]] [[deps.Sockets]]
deps = ["ConstructionBase", "Future", "MacroTools", "Requires"]
git-tree-sha1 = "fca29e68c5062722b5b4435594c3d1ba557072a3"
uuid = "efcf1570-3423-57d1-acb7-fd33fddbac46"
version = "0.7.1"
[[Sockets]]
uuid = "6462fe0b-24de-5631-8697-dd941f90decc" uuid = "6462fe0b-24de-5631-8697-dd941f90decc"
[[SparseArrays]] [[deps.SparseArrays]]
deps = ["LinearAlgebra", "Random"] deps = ["Libdl", "LinearAlgebra", "Random", "Serialization", "SuiteSparse_jll"]
uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
[[Statistics]] [[deps.SpecialFunctions]]
deps = ["IrrationalConstants", "LogExpFunctions", "OpenLibm_jll", "OpenSpecFun_jll"]
git-tree-sha1 = "e2cfc4012a19088254b3950b85c3c1d8882d864d"
uuid = "276daf66-3868-5448-9aa4-cd146d93841b"
version = "2.3.1"
[deps.SpecialFunctions.extensions]
SpecialFunctionsChainRulesCoreExt = "ChainRulesCore"
[deps.SpecialFunctions.weakdeps]
ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
[[deps.StaticArrays]]
deps = ["LinearAlgebra", "Random", "StaticArraysCore"]
git-tree-sha1 = "d5fb407ec3179063214bc6277712928ba78459e2"
uuid = "90137ffa-7385-5640-81b9-e52037218182"
version = "1.6.4"
weakdeps = ["Statistics"]
[deps.StaticArrays.extensions]
StaticArraysStatisticsExt = "Statistics"
[[deps.StaticArraysCore]]
git-tree-sha1 = "36b3d696ce6366023a0ea192b4cd442268995a0d"
uuid = "1e83bf80-4336-4d27-bf5d-d5a4f845583c"
version = "1.4.2"
[[deps.Statistics]]
deps = ["LinearAlgebra", "SparseArrays"] deps = ["LinearAlgebra", "SparseArrays"]
uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
version = "1.9.0"
[[TOML]] [[deps.SuiteSparse_jll]]
deps = ["Artifacts", "Libdl", "Pkg", "libblastrampoline_jll"]
uuid = "bea87d4a-7f5b-5778-9afe-8cc45184846c"
version = "5.10.1+6"
[[deps.TOML]]
deps = ["Dates"] deps = ["Dates"]
uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76" uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76"
version = "1.0.3"
[[Tar]] [[deps.Tar]]
deps = ["ArgTools", "SHA"] deps = ["ArgTools", "SHA"]
uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e" uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e"
version = "1.10.0"
[[TextWrap]] [[deps.TextWrap]]
git-tree-sha1 = "9250ef9b01b66667380cf3275b3f7488d0e25faf" git-tree-sha1 = "9250ef9b01b66667380cf3275b3f7488d0e25faf"
uuid = "b718987f-49a8-5099-9789-dcd902bef87d" uuid = "b718987f-49a8-5099-9789-dcd902bef87d"
version = "1.0.1" version = "1.0.1"
[[TimerOutputs]] [[deps.TimerOutputs]]
deps = ["ExprTools", "Printf"] deps = ["ExprTools", "Printf"]
git-tree-sha1 = "7cb456f358e8f9d102a8b25e8dfedf58fa5689bc" git-tree-sha1 = "f548a9e9c490030e545f72074a41edfd0e5bcdd7"
uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
version = "0.5.13" version = "0.5.23"
[[UUIDs]] [[deps.UUIDs]]
deps = ["Random", "SHA"] deps = ["Random", "SHA"]
uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
[[UnPack]] [[deps.UnPack]]
git-tree-sha1 = "387c1f73762231e86e0c9c5443ce3b4a0a9a0c2b" git-tree-sha1 = "387c1f73762231e86e0c9c5443ce3b4a0a9a0c2b"
uuid = "3a884ed6-31ef-47d7-9d2a-63182c4928ed" uuid = "3a884ed6-31ef-47d7-9d2a-63182c4928ed"
version = "1.0.2" version = "1.0.2"
[[Unicode]] [[deps.Unicode]]
uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
[[XML2_jll]] [[deps.UnsafeAtomics]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Libiconv_jll", "Pkg", "Zlib_jll"] git-tree-sha1 = "6331ac3440856ea1988316b46045303bef658278"
git-tree-sha1 = "1acf5bdf07aa0907e0a37d3718bb88d4b687b74a" uuid = "013be700-e6cd-48c3-b4a1-df204f14c38f"
uuid = "02c8fc9c-b97f-50b9-bbe4-9be30ff0a78a" version = "0.2.1"
version = "2.9.12+0"
[[XSLT_jll]] [[deps.UnsafeAtomicsLLVM]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Libgcrypt_jll", "Libgpg_error_jll", "Libiconv_jll", "Pkg", "XML2_jll", "Zlib_jll"] deps = ["LLVM", "UnsafeAtomics"]
git-tree-sha1 = "91844873c4085240b95e795f692c4cec4d805f8a" git-tree-sha1 = "323e3d0acf5e78a56dfae7bd8928c989b4f3083e"
uuid = "aed1982a-8fda-507f-9586-7b0439959a61" uuid = "d80eeb9a-aca5-4d75-85e5-170c8b632249"
version = "1.1.34+0" version = "0.1.3"
[[XZ_jll]] [[deps.Zlib_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "a921669cd9a45c23031fd4eb904f5cc3d20de415"
uuid = "ffd25f8a-64ca-5728-b0f7-c24cf3aae800"
version = "5.2.5+2"
[[Xorg_libX11_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libxcb_jll", "Xorg_xtrans_jll"]
git-tree-sha1 = "5be649d550f3f4b95308bf0183b82e2582876527"
uuid = "4f6342f7-b3d2-589e-9d20-edeb45f2b2bc"
version = "1.6.9+4"
[[Xorg_libXau_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "4e490d5c960c314f33885790ed410ff3a94ce67e"
uuid = "0c0b7dd1-d40b-584c-a123-a41640f87eec"
version = "1.0.9+4"
[[Xorg_libXdmcp_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "4fe47bd2247248125c428978740e18a681372dd4"
uuid = "a3789734-cfe1-5b06-b2d0-1dd0d9d62d05"
version = "1.1.3+4"
[[Xorg_libXext_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libX11_jll"]
git-tree-sha1 = "b7c0aa8c376b31e4852b360222848637f481f8c3"
uuid = "1082639a-0dae-5f34-9b06-72781eeb8cb3"
version = "1.3.4+4"
[[Xorg_libpthread_stubs_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "6783737e45d3c59a4a4c4091f5f88cdcf0908cbb"
uuid = "14d82f49-176c-5ed1-bb49-ad3f5cbd8c74"
version = "0.1.0+3"
[[Xorg_libxcb_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "XSLT_jll", "Xorg_libXau_jll", "Xorg_libXdmcp_jll", "Xorg_libpthread_stubs_jll"]
git-tree-sha1 = "daf17f441228e7a3833846cd048892861cff16d6"
uuid = "c7cfdc94-dc32-55de-ac96-5a1b8d977c5b"
version = "1.13.0+3"
[[Xorg_xorgproto_jll]]
deps = ["Libdl", "Pkg"]
git-tree-sha1 = "9a9eb8ce756fe0bca01b4be16da770e18d264972"
uuid = "c4d99508-4286-5418-9131-c86396af500b"
version = "2019.2.0+2"
[[Xorg_xtrans_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "79c31e7844f6ecf779705fbc12146eb190b7d845"
uuid = "c5fb5394-a638-5e4d-96e5-b29de1b5cf10"
version = "1.4.0+3"
[[Zlib_jll]]
deps = ["Libdl"] deps = ["Libdl"]
uuid = "83775a58-1f1d-513f-b197-d71354ab007a" uuid = "83775a58-1f1d-513f-b197-d71354ab007a"
version = "1.2.13+0"
[[argp_standalone_jll]] [[deps.libLLVM_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] deps = ["Artifacts", "Libdl"]
git-tree-sha1 = "feaf9f6293003c2bf53056fd6930d677ed340b34" uuid = "8f36deef-c2a5-5394-99ed-8e07531fb29a"
uuid = "c53206cc-00f7-50bf-ad1e-3ae1f6e49bc3" version = "14.0.6+3"
version = "1.3.1+0"
[[fts_jll]] [[deps.libblastrampoline_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] deps = ["Artifacts", "Libdl"]
git-tree-sha1 = "78732b942383d2cb521df8a1a0814911144e663d" uuid = "8e850b90-86db-534c-a0d3-1478176c7d93"
uuid = "d65627f6-89bd-53e8-8ab5-8b75ff535eee" version = "5.8.0+0"
version = "1.2.7+1"
[[hsa_rocr_jll]] [[deps.nghttp2_jll]]
deps = ["Artifacts", "Elfutils_jll", "JLLWrappers", "Libdl", "NUMA_jll", "Pkg", "Zlib_jll", "hsakmt_roct_jll"]
git-tree-sha1 = "df8d73efec8b1e53ad527d208f5343c0368f0fcd"
uuid = "dd59ff1a-a01a-568d-8b29-0669330f116a"
version = "4.0.0+0"
[[hsakmt_roct_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "NUMA_jll", "Pkg"]
git-tree-sha1 = "ea54f6be23c6d25613a0872ec23dc5a0b77b4a00"
uuid = "1cecccd7-a9b6-5045-9cdc-a44c19b16d76"
version = "4.2.0+0"
[[nghttp2_jll]]
deps = ["Artifacts", "Libdl"] deps = ["Artifacts", "Libdl"]
uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d" uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d"
version = "1.48.0+0"
[[obstack_jll]] [[deps.p7zip_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "1c4a6b66e934fc6db4649cb2910c72f53bbfea7e"
uuid = "c88a4935-d25e-5644-aacc-5db6f1b8ef79"
version = "1.2.2+0"
[[p7zip_jll]]
deps = ["Artifacts", "Libdl"] deps = ["Artifacts", "Libdl"]
uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0" uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0"
version = "17.4.0+0"

View File

@ -4,4 +4,4 @@ ArgParse = "c7e460c6-2fb9-53a9-8c5b-16f535851c63"
Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a" Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a"
[compat] [compat]
julia = "1.6" julia = "1.9"

View File

@ -1,332 +1,555 @@
# This file is machine-generated - editing it directly is not advised # This file is machine-generated - editing it directly is not advised
[[AbstractFFTs]] julia_version = "1.9.3"
manifest_format = "2.0"
project_hash = "6909ef39c97ad6037791040bed70b7aa111e1f64"
[[deps.AbstractFFTs]]
deps = ["LinearAlgebra"] deps = ["LinearAlgebra"]
git-tree-sha1 = "485ee0867925449198280d4af84bdb46a2a404d0" git-tree-sha1 = "d92ad398961a3ed262d8bf04a1a2b8340f915fef"
uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c" uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c"
version = "1.0.1" version = "1.5.0"
[[Adapt]] [deps.AbstractFFTs.extensions]
deps = ["LinearAlgebra"] AbstractFFTsChainRulesCoreExt = "ChainRulesCore"
git-tree-sha1 = "84918055d15b3114ede17ac6a7182f68870c16f7" AbstractFFTsTestExt = "Test"
[deps.AbstractFFTs.weakdeps]
ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
[[deps.Adapt]]
deps = ["LinearAlgebra", "Requires"]
git-tree-sha1 = "76289dc51920fdc6e0013c872ba9551d54961c24"
uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
version = "3.3.1" version = "3.6.2"
weakdeps = ["StaticArrays"]
[[ArgParse]] [deps.Adapt.extensions]
AdaptStaticArraysExt = "StaticArrays"
[[deps.ArgParse]]
deps = ["Logging", "TextWrap"] deps = ["Logging", "TextWrap"]
git-tree-sha1 = "3102bce13da501c9104df33549f511cd25264d7d" git-tree-sha1 = "3102bce13da501c9104df33549f511cd25264d7d"
uuid = "c7e460c6-2fb9-53a9-8c5b-16f535851c63" uuid = "c7e460c6-2fb9-53a9-8c5b-16f535851c63"
version = "1.1.4" version = "1.1.4"
[[ArgTools]] [[deps.ArgTools]]
uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f" uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f"
version = "1.1.1"
[[Artifacts]] [[deps.Artifacts]]
uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33"
[[BFloat16s]] [[deps.Atomix]]
deps = ["LinearAlgebra", "Printf", "Random", "Test"] deps = ["UnsafeAtomics"]
git-tree-sha1 = "a598ecb0d717092b5539dbbe890c98bac842b072" git-tree-sha1 = "c06a868224ecba914baa6942988e2f2aade419be"
uuid = "ab4f0b2a-ad5b-11e8-123f-65d77653426b" uuid = "a9b6321e-bd34-4604-b9c9-b65b8de01458"
version = "0.2.0" version = "0.1.0"
[[Base64]] [[deps.BFloat16s]]
deps = ["LinearAlgebra", "Printf", "Random", "Test"]
git-tree-sha1 = "dbf84058d0a8cbbadee18d25cf606934b22d7c66"
uuid = "ab4f0b2a-ad5b-11e8-123f-65d77653426b"
version = "0.4.2"
[[deps.Base64]]
uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f"
[[CEnum]] [[deps.CEnum]]
git-tree-sha1 = "215a9aa4a1f23fbd05b92769fdd62559488d70e9" git-tree-sha1 = "eb4cb44a499229b3b8426dcfb5dd85333951ff90"
uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82" uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82"
version = "0.4.1" version = "0.4.2"
[[CUDA]] [[deps.CUDA]]
deps = ["AbstractFFTs", "Adapt", "BFloat16s", "CEnum", "CompilerSupportLibraries_jll", "ExprTools", "GPUArrays", "GPUCompiler", "LLVM", "LazyArtifacts", "Libdl", "LinearAlgebra", "Logging", "Printf", "Random", "Random123", "RandomNumbers", "Reexport", "Requires", "SparseArrays", "SpecialFunctions", "TimerOutputs"] deps = ["AbstractFFTs", "Adapt", "BFloat16s", "CEnum", "CUDA_Driver_jll", "CUDA_Runtime_Discovery", "CUDA_Runtime_jll", "Crayons", "DataFrames", "ExprTools", "GPUArrays", "GPUCompiler", "KernelAbstractions", "LLVM", "LazyArtifacts", "Libdl", "LinearAlgebra", "Logging", "NVTX", "Preferences", "PrettyTables", "Printf", "Random", "Random123", "RandomNumbers", "Reexport", "Requires", "SparseArrays", "Statistics", "UnsafeAtomicsLLVM"]
git-tree-sha1 = "1f8ebf85abb7d1eff965730e592794a27c1350d8" git-tree-sha1 = "f062a48c26ae027f70c44f48f244862aec47bf99"
uuid = "052768ef-5323-5732-b1bb-66c8b64840ba" uuid = "052768ef-5323-5732-b1bb-66c8b64840ba"
version = "3.6.0" version = "5.0.0"
[[ChainRulesCore]] [deps.CUDA.extensions]
deps = ["Compat", "LinearAlgebra", "SparseArrays"] SpecialFunctionsExt = "SpecialFunctions"
git-tree-sha1 = "4c26b4e9e91ca528ea212927326ece5918a04b47"
uuid = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
version = "1.11.2"
[[ChangesOfVariables]] [deps.CUDA.weakdeps]
deps = ["ChainRulesCore", "LinearAlgebra", "Test"] SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"
git-tree-sha1 = "bf98fa45a0a4cee295de98d4c1462be26345b9a1"
uuid = "9e997f8a-9a97-42d5-a9f1-ce6bfc15e2c0"
version = "0.1.2"
[[Compat]] [[deps.CUDA_Driver_jll]]
deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "SHA", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"] deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "Pkg"]
git-tree-sha1 = "44c37b4636bc54afac5c574d2d02b625349d6582" git-tree-sha1 = "35a37bb72b35964f2895c12c687ae263b4ac170c"
uuid = "4ee394cb-3365-5eb0-8335-949819d2adfc"
version = "0.6.0+3"
[[deps.CUDA_Runtime_Discovery]]
deps = ["Libdl"]
git-tree-sha1 = "bcc4a23cbbd99c8535a5318455dcf0f2546ec536"
uuid = "1af6417a-86b4-443c-805f-a4643ffb695f"
version = "0.2.2"
[[deps.CUDA_Runtime_jll]]
deps = ["Artifacts", "CUDA_Driver_jll", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML"]
git-tree-sha1 = "bfe5a693a11522d58392f742243f2b50dc27afd6"
uuid = "76a88914-d11a-5bdc-97e0-2f5a05c973a2"
version = "0.9.2+0"
[[deps.ColorTypes]]
deps = ["FixedPointNumbers", "Random"]
git-tree-sha1 = "eb7f0f8307f71fac7c606984ea5fb2817275d6e4"
uuid = "3da002f7-5984-5a60-b8a6-cbb66c0b333f"
version = "0.11.4"
[[deps.Colors]]
deps = ["ColorTypes", "FixedPointNumbers", "Reexport"]
git-tree-sha1 = "fc08e5930ee9a4e03f84bfb5211cb54e7769758a"
uuid = "5ae59095-9a9b-59fe-a467-6f913c188581"
version = "0.12.10"
[[deps.Compat]]
deps = ["UUIDs"]
git-tree-sha1 = "e460f044ca8b99be31d35fe54fc33a5c33dd8ed7"
uuid = "34da2185-b29b-5c13-b0c7-acf172513d20" uuid = "34da2185-b29b-5c13-b0c7-acf172513d20"
version = "3.41.0" version = "4.9.0"
weakdeps = ["Dates", "LinearAlgebra"]
[[CompilerSupportLibraries_jll]] [deps.Compat.extensions]
CompatLinearAlgebraExt = "LinearAlgebra"
[[deps.CompilerSupportLibraries_jll]]
deps = ["Artifacts", "Libdl"] deps = ["Artifacts", "Libdl"]
uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae" uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae"
version = "1.0.5+0"
[[Dates]] [[deps.Crayons]]
git-tree-sha1 = "249fe38abf76d48563e2f4556bebd215aa317e15"
uuid = "a8cc5b0e-0ffa-5ad4-8c14-923d3ee1735f"
version = "4.1.1"
[[deps.DataAPI]]
git-tree-sha1 = "8da84edb865b0b5b0100c0666a9bc9a0b71c553c"
uuid = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a"
version = "1.15.0"
[[deps.DataFrames]]
deps = ["Compat", "DataAPI", "DataStructures", "Future", "InlineStrings", "InvertedIndices", "IteratorInterfaceExtensions", "LinearAlgebra", "Markdown", "Missings", "PooledArrays", "PrecompileTools", "PrettyTables", "Printf", "REPL", "Random", "Reexport", "SentinelArrays", "SortingAlgorithms", "Statistics", "TableTraits", "Tables", "Unicode"]
git-tree-sha1 = "04c738083f29f86e62c8afc341f0967d8717bdb8"
uuid = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
version = "1.6.1"
[[deps.DataStructures]]
deps = ["Compat", "InteractiveUtils", "OrderedCollections"]
git-tree-sha1 = "3dbd312d370723b6bb43ba9d02fc36abade4518d"
uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
version = "0.18.15"
[[deps.DataValueInterfaces]]
git-tree-sha1 = "bfc1187b79289637fa0ef6d4436ebdfe6905cbd6"
uuid = "e2d170a0-9d28-54be-80f0-106bbe20a464"
version = "1.0.0"
[[deps.Dates]]
deps = ["Printf"] deps = ["Printf"]
uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" uuid = "ade2ca70-3891-5945-98fb-dc099432e06a"
[[DelimitedFiles]] [[deps.Downloads]]
deps = ["Mmap"] deps = ["ArgTools", "FileWatching", "LibCURL", "NetworkOptions"]
uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab"
[[Distributed]]
deps = ["Random", "Serialization", "Sockets"]
uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b"
[[DocStringExtensions]]
deps = ["LibGit2"]
git-tree-sha1 = "b19534d1895d702889b219c382a6e18010797f0b"
uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae"
version = "0.8.6"
[[Downloads]]
deps = ["ArgTools", "LibCURL", "NetworkOptions"]
uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6" uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6"
version = "1.6.0"
[[ExprTools]] [[deps.ExprTools]]
git-tree-sha1 = "b7e3d17636b348f005f11040025ae8c6f645fe92" git-tree-sha1 = "27415f162e6028e81c72b82ef756bf321213b6ec"
uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04" uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04"
version = "0.1.6" version = "0.1.10"
[[GPUArrays]] [[deps.FileWatching]]
deps = ["Adapt", "LinearAlgebra", "Printf", "Random", "Serialization", "Statistics"] uuid = "7b1f6079-737a-58dc-b8bc-7a2ca5c1b5ee"
git-tree-sha1 = "7772508f17f1d482fe0df72cabc5b55bec06bbe0"
[[deps.FixedPointNumbers]]
deps = ["Statistics"]
git-tree-sha1 = "335bfdceacc84c5cdf16aadc768aa5ddfc5383cc"
uuid = "53c48c17-4a7d-5ca2-90c5-79b7896eea93"
version = "0.8.4"
[[deps.Future]]
deps = ["Random"]
uuid = "9fa8497b-333b-5362-9e8d-4d0656e87820"
[[deps.GPUArrays]]
deps = ["Adapt", "GPUArraysCore", "LLVM", "LinearAlgebra", "Printf", "Random", "Reexport", "Serialization", "Statistics"]
git-tree-sha1 = "8ad8f375ae365aa1eb2f42e2565a40b55a4b69a8"
uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
version = "8.1.2" version = "9.0.0"
[[GPUCompiler]] [[deps.GPUArraysCore]]
deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "TimerOutputs", "UUIDs"] deps = ["Adapt"]
git-tree-sha1 = "2cac236070c2c4b36de54ae9146b55ee2c34ac7a" git-tree-sha1 = "2d6ca471a6c7b536127afccfa7564b5b39227fe0"
uuid = "46192b85-c4d5-4398-a991-12ede77f4527"
version = "0.1.5"
[[deps.GPUCompiler]]
deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "Scratch", "TimerOutputs", "UUIDs"]
git-tree-sha1 = "5e4487558477f191c043166f8301dd0b4be4e2b2"
uuid = "61eb1bfa-7361-4325-ad38-22787b887f55" uuid = "61eb1bfa-7361-4325-ad38-22787b887f55"
version = "0.13.10" version = "0.24.5"
[[InteractiveUtils]] [[deps.InlineStrings]]
deps = ["Parsers"]
git-tree-sha1 = "9cc2baf75c6d09f9da536ddf58eb2f29dedaf461"
uuid = "842dd82b-1e85-43dc-bf29-5d0ee9dffc48"
version = "1.4.0"
[[deps.InteractiveUtils]]
deps = ["Markdown"] deps = ["Markdown"]
uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
[[InverseFunctions]] [[deps.InvertedIndices]]
deps = ["Test"] git-tree-sha1 = "0dc7b50b8d436461be01300fd8cd45aa0274b038"
git-tree-sha1 = "a7254c0acd8e62f1ac75ad24d5db43f5f19f3c65" uuid = "41ab1584-1d38-5bbf-9106-f11c6c58b48f"
uuid = "3587e190-3f89-42d0-90ee-14403ec27112"
version = "0.1.2"
[[IrrationalConstants]]
git-tree-sha1 = "7fd44fd4ff43fc60815f8e764c0f352b83c49151"
uuid = "92d709cd-6900-40b7-9082-c6be49f344b6"
version = "0.1.1"
[[JLLWrappers]]
deps = ["Preferences"]
git-tree-sha1 = "642a199af8b68253517b80bd3bfd17eb4e84df6e"
uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210"
version = "1.3.0" version = "1.3.0"
[[LLVM]] [[deps.IteratorInterfaceExtensions]]
deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Printf", "Unicode"] git-tree-sha1 = "a3f24677c21f5bbe9d2a714f95dcd58337fb2856"
git-tree-sha1 = "7cc22e69995e2329cc047a879395b2b74647ab5f" uuid = "82899510-4779-5014-852e-03e436cf321d"
uuid = "929cbde3-209d-540e-8aea-75f648917ca0" version = "1.0.0"
version = "4.7.0"
[[LLVMExtra_jll]] [[deps.JLLWrappers]]
deps = ["Artifacts", "Preferences"]
git-tree-sha1 = "7e5d6779a1e09a36db2a7b6cff50942a0a7d0fca"
uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210"
version = "1.5.0"
[[deps.JuliaNVTXCallbacks_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "c5fc4bef251ecd37685bea1c4068a9cfa41e8b9a" git-tree-sha1 = "af433a10f3942e882d3c671aacb203e006a5808f"
uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab" uuid = "9c1d0b0a-7046-5b2e-a33f-ea22f176ac7e"
version = "0.0.13+0" version = "0.2.1+0"
[[LazyArtifacts]] [[deps.KernelAbstractions]]
deps = ["Adapt", "Atomix", "InteractiveUtils", "LinearAlgebra", "MacroTools", "PrecompileTools", "Requires", "SparseArrays", "StaticArrays", "UUIDs", "UnsafeAtomics", "UnsafeAtomicsLLVM"]
git-tree-sha1 = "4c5875e4c228247e1c2b087669846941fb6e0118"
uuid = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
version = "0.9.8"
[deps.KernelAbstractions.extensions]
EnzymeExt = "EnzymeCore"
[deps.KernelAbstractions.weakdeps]
EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869"
[[deps.LLVM]]
deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Printf", "Unicode"]
git-tree-sha1 = "a9d2ce1d5007b1e8f6c5b89c5a31ff8bd146db5c"
uuid = "929cbde3-209d-540e-8aea-75f648917ca0"
version = "6.2.1"
[[deps.LLVMExtra_jll]]
deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML"]
git-tree-sha1 = "7ca6850ae880cc99b59b88517545f91a52020afa"
uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab"
version = "0.0.25+0"
[[deps.LaTeXStrings]]
git-tree-sha1 = "f2355693d6778a178ade15952b7ac47a4ff97996"
uuid = "b964fa9f-0449-5b57-a5c2-d3ea65f4040f"
version = "1.3.0"
[[deps.LazyArtifacts]]
deps = ["Artifacts", "Pkg"] deps = ["Artifacts", "Pkg"]
uuid = "4af54fe1-eca0-43a8-85a7-787d91b784e3" uuid = "4af54fe1-eca0-43a8-85a7-787d91b784e3"
[[LibCURL]] [[deps.LibCURL]]
deps = ["LibCURL_jll", "MozillaCACerts_jll"] deps = ["LibCURL_jll", "MozillaCACerts_jll"]
uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21" uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21"
version = "0.6.3"
[[LibCURL_jll]] [[deps.LibCURL_jll]]
deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"] deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"]
uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0" uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0"
version = "7.84.0+0"
[[LibGit2]] [[deps.LibGit2]]
deps = ["Base64", "NetworkOptions", "Printf", "SHA"] deps = ["Base64", "NetworkOptions", "Printf", "SHA"]
uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" uuid = "76f85450-5226-5b5a-8eaa-529ad045b433"
[[LibSSH2_jll]] [[deps.LibSSH2_jll]]
deps = ["Artifacts", "Libdl", "MbedTLS_jll"] deps = ["Artifacts", "Libdl", "MbedTLS_jll"]
uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8" uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8"
version = "1.10.2+0"
[[Libdl]] [[deps.Libdl]]
uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
[[LinearAlgebra]] [[deps.LinearAlgebra]]
deps = ["Libdl"] deps = ["Libdl", "OpenBLAS_jll", "libblastrampoline_jll"]
uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
[[LogExpFunctions]] [[deps.Logging]]
deps = ["ChainRulesCore", "ChangesOfVariables", "DocStringExtensions", "InverseFunctions", "IrrationalConstants", "LinearAlgebra"]
git-tree-sha1 = "e5718a00af0ab9756305a0392832c8952c7426c1"
uuid = "2ab3a3ac-af41-5b50-aa03-7779005ae688"
version = "0.3.6"
[[Logging]]
uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"
[[Markdown]] [[deps.MacroTools]]
deps = ["Markdown", "Random"]
git-tree-sha1 = "9ee1618cbf5240e6d4e0371d6f24065083f60c48"
uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
version = "0.5.11"
[[deps.Markdown]]
deps = ["Base64"] deps = ["Base64"]
uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" uuid = "d6f4376e-aef5-505a-96c1-9c027394607a"
[[MbedTLS_jll]] [[deps.MbedTLS_jll]]
deps = ["Artifacts", "Libdl"] deps = ["Artifacts", "Libdl"]
uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1" uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1"
version = "2.28.2+0"
[[Mmap]] [[deps.Missings]]
uuid = "a63ad114-7e13-5084-954f-fe012c677804" deps = ["DataAPI"]
git-tree-sha1 = "f66bdc5de519e8f8ae43bdc598782d35a25b1272"
uuid = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28"
version = "1.1.0"
[[MozillaCACerts_jll]] [[deps.MozillaCACerts_jll]]
uuid = "14a3606d-f60d-562e-9121-12d972cd8159" uuid = "14a3606d-f60d-562e-9121-12d972cd8159"
version = "2022.10.11"
[[NetworkOptions]] [[deps.NVTX]]
deps = ["Colors", "JuliaNVTXCallbacks_jll", "Libdl", "NVTX_jll"]
git-tree-sha1 = "8bc9ce4233be3c63f8dcd78ccaf1b63a9c0baa34"
uuid = "5da4648a-3479-48b8-97b9-01cb529c0a1f"
version = "0.3.3"
[[deps.NVTX_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "ce3269ed42816bf18d500c9f63418d4b0d9f5a3b"
uuid = "e98f9f5b-d649-5603-91fd-7774390e6439"
version = "3.1.0+2"
[[deps.NetworkOptions]]
uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908" uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908"
version = "1.2.0"
[[OpenLibm_jll]] [[deps.OpenBLAS_jll]]
deps = ["Artifacts", "Libdl"] deps = ["Artifacts", "CompilerSupportLibraries_jll", "Libdl"]
uuid = "05823500-19ac-5b8b-9628-191a04bc5112" uuid = "4536629a-c528-5b80-bd46-f80d51c5b363"
version = "0.3.21+4"
[[OpenSpecFun_jll]] [[deps.OrderedCollections]]
deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "Libdl", "Pkg"] git-tree-sha1 = "2e73fe17cac3c62ad1aebe70d44c963c3cfdc3e3"
git-tree-sha1 = "13652491f6856acfd2db29360e1bbcd4565d04f1"
uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e"
version = "0.5.5+0"
[[OrderedCollections]]
git-tree-sha1 = "85f8e6578bf1f9ee0d11e7bb1b1456435479d47c"
uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
version = "1.4.1" version = "1.6.2"
[[Parameters]] [[deps.Parameters]]
deps = ["OrderedCollections", "UnPack"] deps = ["OrderedCollections", "UnPack"]
git-tree-sha1 = "34c0e9ad262e5f7fc75b10a9952ca7692cfc5fbe" git-tree-sha1 = "34c0e9ad262e5f7fc75b10a9952ca7692cfc5fbe"
uuid = "d96e819e-fc66-5662-9728-84c9c7592b0a" uuid = "d96e819e-fc66-5662-9728-84c9c7592b0a"
version = "0.12.3" version = "0.12.3"
[[Pkg]] [[deps.Parsers]]
deps = ["Artifacts", "Dates", "Downloads", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"] deps = ["Dates", "PrecompileTools", "UUIDs"]
git-tree-sha1 = "716e24b21538abc91f6205fd1d8363f39b442851"
uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0"
version = "2.7.2"
[[deps.Pkg]]
deps = ["Artifacts", "Dates", "Downloads", "FileWatching", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"]
uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
version = "1.9.2"
[[Preferences]] [[deps.PooledArrays]]
deps = ["DataAPI", "Future"]
git-tree-sha1 = "36d8b4b899628fb92c2749eb488d884a926614d3"
uuid = "2dfb63ee-cc39-5dd5-95bd-886bf059d720"
version = "1.4.3"
[[deps.PrecompileTools]]
deps = ["Preferences"]
git-tree-sha1 = "03b4c25b43cb84cee5c90aa9b5ea0a78fd848d2f"
uuid = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
version = "1.2.0"
[[deps.Preferences]]
deps = ["TOML"] deps = ["TOML"]
git-tree-sha1 = "00cfd92944ca9c760982747e9a1d0d5d86ab1e5a" git-tree-sha1 = "00805cd429dcb4870060ff49ef443486c262e38e"
uuid = "21216c6a-2e73-6563-6e65-726566657250" uuid = "21216c6a-2e73-6563-6e65-726566657250"
version = "1.2.2" version = "1.4.1"
[[Printf]] [[deps.PrettyTables]]
deps = ["Crayons", "LaTeXStrings", "Markdown", "Printf", "Reexport", "StringManipulation", "Tables"]
git-tree-sha1 = "ee094908d720185ddbdc58dbe0c1cbe35453ec7a"
uuid = "08abe8d2-0d0c-5749-adfa-8a2ac140af0d"
version = "2.2.7"
[[deps.Printf]]
deps = ["Unicode"] deps = ["Unicode"]
uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7"
[[REPL]] [[deps.REPL]]
deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"] deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"]
uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb"
[[Random]] [[deps.Random]]
deps = ["Serialization"] deps = ["SHA", "Serialization"]
uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
[[Random123]] [[deps.Random123]]
deps = ["Libdl", "Random", "RandomNumbers"] deps = ["Random", "RandomNumbers"]
git-tree-sha1 = "0e8b146557ad1c6deb1367655e052276690e71a3" git-tree-sha1 = "552f30e847641591ba3f39fd1bed559b9deb0ef3"
uuid = "74087812-796a-5b5d-8853-05524746bad3" uuid = "74087812-796a-5b5d-8853-05524746bad3"
version = "1.4.2" version = "1.6.1"
[[RandomNumbers]] [[deps.RandomNumbers]]
deps = ["Random", "Requires"] deps = ["Random", "Requires"]
git-tree-sha1 = "043da614cc7e95c703498a491e2c21f58a2b8111" git-tree-sha1 = "043da614cc7e95c703498a491e2c21f58a2b8111"
uuid = "e6cf234a-135c-5ec9-84dd-332b85af5143" uuid = "e6cf234a-135c-5ec9-84dd-332b85af5143"
version = "1.5.3" version = "1.5.3"
[[Reexport]] [[deps.Reexport]]
git-tree-sha1 = "45e428421666073eab6f2da5c9d310d99bb12f9b" git-tree-sha1 = "45e428421666073eab6f2da5c9d310d99bb12f9b"
uuid = "189a3867-3050-52da-a836-e630ba90ab69" uuid = "189a3867-3050-52da-a836-e630ba90ab69"
version = "1.2.2" version = "1.2.2"
[[Requires]] [[deps.Requires]]
deps = ["UUIDs"] deps = ["UUIDs"]
git-tree-sha1 = "8f82019e525f4d5c669692772a6f4b0a58b06a6a" git-tree-sha1 = "838a3a4188e2ded87a4f9f184b4b0d78a1e91cb7"
uuid = "ae029012-a4dd-5104-9daa-d747884805df" uuid = "ae029012-a4dd-5104-9daa-d747884805df"
version = "1.3.0"
[[deps.SHA]]
uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce"
version = "0.7.0"
[[deps.Scratch]]
deps = ["Dates"]
git-tree-sha1 = "30449ee12237627992a99d5e30ae63e4d78cd24a"
uuid = "6c6a2e73-6563-6170-7368-637461726353"
version = "1.2.0" version = "1.2.0"
[[SHA]] [[deps.SentinelArrays]]
uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" deps = ["Dates", "Random"]
git-tree-sha1 = "04bdff0b09c65ff3e06a05e3eb7b120223da3d39"
uuid = "91c51154-3ec4-41a3-a24f-3f23e20d615c"
version = "1.4.0"
[[Serialization]] [[deps.Serialization]]
uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
[[SharedArrays]] [[deps.Sockets]]
deps = ["Distributed", "Mmap", "Random", "Serialization"]
uuid = "1a1011a3-84de-559e-8e89-a11a2f7dc383"
[[Sockets]]
uuid = "6462fe0b-24de-5631-8697-dd941f90decc" uuid = "6462fe0b-24de-5631-8697-dd941f90decc"
[[SparseArrays]] [[deps.SortingAlgorithms]]
deps = ["LinearAlgebra", "Random"] deps = ["DataStructures"]
git-tree-sha1 = "c60ec5c62180f27efea3ba2908480f8055e17cee"
uuid = "a2af1166-a08f-5f64-846c-94a0d3cef48c"
version = "1.1.1"
[[deps.SparseArrays]]
deps = ["Libdl", "LinearAlgebra", "Random", "Serialization", "SuiteSparse_jll"]
uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
[[SpecialFunctions]] [[deps.StaticArrays]]
deps = ["ChainRulesCore", "IrrationalConstants", "LogExpFunctions", "OpenLibm_jll", "OpenSpecFun_jll"] deps = ["LinearAlgebra", "Random", "StaticArraysCore"]
git-tree-sha1 = "e08890d19787ec25029113e88c34ec20cac1c91e" git-tree-sha1 = "d5fb407ec3179063214bc6277712928ba78459e2"
uuid = "276daf66-3868-5448-9aa4-cd146d93841b" uuid = "90137ffa-7385-5640-81b9-e52037218182"
version = "2.0.0" version = "1.6.4"
weakdeps = ["Statistics"]
[[Statistics]] [deps.StaticArrays.extensions]
StaticArraysStatisticsExt = "Statistics"
[[deps.StaticArraysCore]]
git-tree-sha1 = "36b3d696ce6366023a0ea192b4cd442268995a0d"
uuid = "1e83bf80-4336-4d27-bf5d-d5a4f845583c"
version = "1.4.2"
[[deps.Statistics]]
deps = ["LinearAlgebra", "SparseArrays"] deps = ["LinearAlgebra", "SparseArrays"]
uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
version = "1.9.0"
[[TOML]] [[deps.StringManipulation]]
deps = ["PrecompileTools"]
git-tree-sha1 = "a04cabe79c5f01f4d723cc6704070ada0b9d46d5"
uuid = "892a3eda-7b42-436c-8928-eab12a02cf0e"
version = "0.3.4"
[[deps.SuiteSparse_jll]]
deps = ["Artifacts", "Libdl", "Pkg", "libblastrampoline_jll"]
uuid = "bea87d4a-7f5b-5778-9afe-8cc45184846c"
version = "5.10.1+6"
[[deps.TOML]]
deps = ["Dates"] deps = ["Dates"]
uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76" uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76"
version = "1.0.3"
[[Tar]] [[deps.TableTraits]]
deps = ["IteratorInterfaceExtensions"]
git-tree-sha1 = "c06b2f539df1c6efa794486abfb6ed2022561a39"
uuid = "3783bdb8-4a98-5b6b-af9a-565f29a5fe9c"
version = "1.0.1"
[[deps.Tables]]
deps = ["DataAPI", "DataValueInterfaces", "IteratorInterfaceExtensions", "LinearAlgebra", "OrderedCollections", "TableTraits"]
git-tree-sha1 = "a1f34829d5ac0ef499f6d84428bd6b4c71f02ead"
uuid = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
version = "1.11.0"
[[deps.Tar]]
deps = ["ArgTools", "SHA"] deps = ["ArgTools", "SHA"]
uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e" uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e"
version = "1.10.0"
[[Test]] [[deps.Test]]
deps = ["InteractiveUtils", "Logging", "Random", "Serialization"] deps = ["InteractiveUtils", "Logging", "Random", "Serialization"]
uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
[[TextWrap]] [[deps.TextWrap]]
git-tree-sha1 = "9250ef9b01b66667380cf3275b3f7488d0e25faf" git-tree-sha1 = "9250ef9b01b66667380cf3275b3f7488d0e25faf"
uuid = "b718987f-49a8-5099-9789-dcd902bef87d" uuid = "b718987f-49a8-5099-9789-dcd902bef87d"
version = "1.0.1" version = "1.0.1"
[[TimerOutputs]] [[deps.TimerOutputs]]
deps = ["ExprTools", "Printf"] deps = ["ExprTools", "Printf"]
git-tree-sha1 = "7cb456f358e8f9d102a8b25e8dfedf58fa5689bc" git-tree-sha1 = "f548a9e9c490030e545f72074a41edfd0e5bcdd7"
uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
version = "0.5.13" version = "0.5.23"
[[UUIDs]] [[deps.UUIDs]]
deps = ["Random", "SHA"] deps = ["Random", "SHA"]
uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
[[UnPack]] [[deps.UnPack]]
git-tree-sha1 = "387c1f73762231e86e0c9c5443ce3b4a0a9a0c2b" git-tree-sha1 = "387c1f73762231e86e0c9c5443ce3b4a0a9a0c2b"
uuid = "3a884ed6-31ef-47d7-9d2a-63182c4928ed" uuid = "3a884ed6-31ef-47d7-9d2a-63182c4928ed"
version = "1.0.2" version = "1.0.2"
[[Unicode]] [[deps.Unicode]]
uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
[[Zlib_jll]] [[deps.UnsafeAtomics]]
git-tree-sha1 = "6331ac3440856ea1988316b46045303bef658278"
uuid = "013be700-e6cd-48c3-b4a1-df204f14c38f"
version = "0.2.1"
[[deps.UnsafeAtomicsLLVM]]
deps = ["LLVM", "UnsafeAtomics"]
git-tree-sha1 = "323e3d0acf5e78a56dfae7bd8928c989b4f3083e"
uuid = "d80eeb9a-aca5-4d75-85e5-170c8b632249"
version = "0.1.3"
[[deps.Zlib_jll]]
deps = ["Libdl"] deps = ["Libdl"]
uuid = "83775a58-1f1d-513f-b197-d71354ab007a" uuid = "83775a58-1f1d-513f-b197-d71354ab007a"
version = "1.2.13+0"
[[nghttp2_jll]] [[deps.libblastrampoline_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "8e850b90-86db-534c-a0d3-1478176c7d93"
version = "5.8.0+0"
[[deps.nghttp2_jll]]
deps = ["Artifacts", "Libdl"] deps = ["Artifacts", "Libdl"]
uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d" uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d"
version = "1.48.0+0"
[[p7zip_jll]] [[deps.p7zip_jll]]
deps = ["Artifacts", "Libdl"] deps = ["Artifacts", "Libdl"]
uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0" uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0"
version = "17.4.0+0"

View File

@ -4,4 +4,4 @@ CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a" Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a"
[compat] [compat]
julia = "1.6" julia = "1.9"

File diff suppressed because it is too large Load Diff

View File

@ -8,4 +8,4 @@ Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a"
ROCKernels = "7eb9e9f0-4bd3-4c4c-8bef-26bd9629d9b9" ROCKernels = "7eb9e9f0-4bd3-4c4c-8bef-26bd9629d9b9"
[compat] [compat]
julia = "1.6" julia = "1.9"

File diff suppressed because it is too large Load Diff

View File

@ -16,4 +16,4 @@ ROCKernels = "7eb9e9f0-4bd3-4c4c-8bef-26bd9629d9b9"
oneAPI = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b" oneAPI = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b"
[compat] [compat]
julia = "1.6" julia = "1.9"

View File

@ -1,31 +1,35 @@
# This file is machine-generated - editing it directly is not advised # This file is machine-generated - editing it directly is not advised
[[ArgParse]] julia_version = "1.9.3"
manifest_format = "2.0"
project_hash = "fbff310f722a52622a273a48a8a6b3b64f06b029"
[[deps.ArgParse]]
deps = ["Logging", "TextWrap"] deps = ["Logging", "TextWrap"]
git-tree-sha1 = "3102bce13da501c9104df33549f511cd25264d7d" git-tree-sha1 = "3102bce13da501c9104df33549f511cd25264d7d"
uuid = "c7e460c6-2fb9-53a9-8c5b-16f535851c63" uuid = "c7e460c6-2fb9-53a9-8c5b-16f535851c63"
version = "1.1.4" version = "1.1.4"
[[Logging]] [[deps.Logging]]
uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"
[[OrderedCollections]] [[deps.OrderedCollections]]
git-tree-sha1 = "85f8e6578bf1f9ee0d11e7bb1b1456435479d47c" git-tree-sha1 = "2e73fe17cac3c62ad1aebe70d44c963c3cfdc3e3"
uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
version = "1.4.1" version = "1.6.2"
[[Parameters]] [[deps.Parameters]]
deps = ["OrderedCollections", "UnPack"] deps = ["OrderedCollections", "UnPack"]
git-tree-sha1 = "34c0e9ad262e5f7fc75b10a9952ca7692cfc5fbe" git-tree-sha1 = "34c0e9ad262e5f7fc75b10a9952ca7692cfc5fbe"
uuid = "d96e819e-fc66-5662-9728-84c9c7592b0a" uuid = "d96e819e-fc66-5662-9728-84c9c7592b0a"
version = "0.12.3" version = "0.12.3"
[[TextWrap]] [[deps.TextWrap]]
git-tree-sha1 = "9250ef9b01b66667380cf3275b3f7488d0e25faf" git-tree-sha1 = "9250ef9b01b66667380cf3275b3f7488d0e25faf"
uuid = "b718987f-49a8-5099-9789-dcd902bef87d" uuid = "b718987f-49a8-5099-9789-dcd902bef87d"
version = "1.0.1" version = "1.0.1"
[[UnPack]] [[deps.UnPack]]
git-tree-sha1 = "387c1f73762231e86e0c9c5443ce3b4a0a9a0c2b" git-tree-sha1 = "387c1f73762231e86e0c9c5443ce3b4a0a9a0c2b"
uuid = "3a884ed6-31ef-47d7-9d2a-63182c4928ed" uuid = "3a884ed6-31ef-47d7-9d2a-63182c4928ed"
version = "1.0.2" version = "1.0.2"

View File

@ -3,4 +3,4 @@ ArgParse = "c7e460c6-2fb9-53a9-8c5b-16f535851c63"
Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a" Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a"
[compat] [compat]
julia = "1.6" julia = "1.9"

View File

@ -1,335 +1,441 @@
# This file is machine-generated - editing it directly is not advised # This file is machine-generated - editing it directly is not advised
[[Adapt]] julia_version = "1.9.3"
deps = ["LinearAlgebra"] manifest_format = "2.0"
git-tree-sha1 = "84918055d15b3114ede17ac6a7182f68870c16f7" project_hash = "01f328e925b86927b3f24c30aee6ecdce5bd28cc"
uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
version = "3.3.1"
[[ArgParse]] [[deps.Adapt]]
deps = ["LinearAlgebra", "Requires"]
git-tree-sha1 = "76289dc51920fdc6e0013c872ba9551d54961c24"
uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
version = "3.6.2"
weakdeps = ["StaticArrays"]
[deps.Adapt.extensions]
AdaptStaticArraysExt = "StaticArrays"
[[deps.ArgParse]]
deps = ["Logging", "TextWrap"] deps = ["Logging", "TextWrap"]
git-tree-sha1 = "3102bce13da501c9104df33549f511cd25264d7d" git-tree-sha1 = "3102bce13da501c9104df33549f511cd25264d7d"
uuid = "c7e460c6-2fb9-53a9-8c5b-16f535851c63" uuid = "c7e460c6-2fb9-53a9-8c5b-16f535851c63"
version = "1.1.4" version = "1.1.4"
[[ArgTools]] [[deps.ArgTools]]
uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f" uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f"
version = "1.1.1"
[[Artifacts]] [[deps.Artifacts]]
uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33"
[[Base64]] [[deps.Atomix]]
deps = ["UnsafeAtomics"]
git-tree-sha1 = "c06a868224ecba914baa6942988e2f2aade419be"
uuid = "a9b6321e-bd34-4604-b9c9-b65b8de01458"
version = "0.1.0"
[[deps.Base64]]
uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f"
[[CEnum]] [[deps.CEnum]]
git-tree-sha1 = "215a9aa4a1f23fbd05b92769fdd62559488d70e9" git-tree-sha1 = "eb4cb44a499229b3b8426dcfb5dd85333951ff90"
uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82" uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82"
version = "0.4.1" version = "0.4.2"
[[ChainRulesCore]] [[deps.CompilerSupportLibraries_jll]]
deps = ["Compat", "LinearAlgebra", "SparseArrays"]
git-tree-sha1 = "4c26b4e9e91ca528ea212927326ece5918a04b47"
uuid = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
version = "1.11.2"
[[ChangesOfVariables]]
deps = ["ChainRulesCore", "LinearAlgebra", "Test"]
git-tree-sha1 = "bf98fa45a0a4cee295de98d4c1462be26345b9a1"
uuid = "9e997f8a-9a97-42d5-a9f1-ce6bfc15e2c0"
version = "0.1.2"
[[Compat]]
deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "SHA", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"]
git-tree-sha1 = "44c37b4636bc54afac5c574d2d02b625349d6582"
uuid = "34da2185-b29b-5c13-b0c7-acf172513d20"
version = "3.41.0"
[[CompilerSupportLibraries_jll]]
deps = ["Artifacts", "Libdl"] deps = ["Artifacts", "Libdl"]
uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae" uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae"
version = "1.0.5+0"
[[Dates]] [[deps.Dates]]
deps = ["Printf"] deps = ["Printf"]
uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" uuid = "ade2ca70-3891-5945-98fb-dc099432e06a"
[[DelimitedFiles]] [[deps.DocStringExtensions]]
deps = ["Mmap"]
uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab"
[[Distributed]]
deps = ["Random", "Serialization", "Sockets"]
uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b"
[[DocStringExtensions]]
deps = ["LibGit2"] deps = ["LibGit2"]
git-tree-sha1 = "b19534d1895d702889b219c382a6e18010797f0b" git-tree-sha1 = "2fb1e02f2b635d0845df5d7c167fec4dd739b00d"
uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae" uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae"
version = "0.8.6" version = "0.9.3"
[[Downloads]] [[deps.Downloads]]
deps = ["ArgTools", "LibCURL", "NetworkOptions"] deps = ["ArgTools", "FileWatching", "LibCURL", "NetworkOptions"]
uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6" uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6"
version = "1.6.0"
[[ExprTools]] [[deps.ExprTools]]
git-tree-sha1 = "b7e3d17636b348f005f11040025ae8c6f645fe92" git-tree-sha1 = "27415f162e6028e81c72b82ef756bf321213b6ec"
uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04" uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04"
version = "0.1.6" version = "0.1.10"
[[GPUArrays]] [[deps.FileWatching]]
deps = ["Adapt", "LinearAlgebra", "Printf", "Random", "Serialization", "Statistics"] uuid = "7b1f6079-737a-58dc-b8bc-7a2ca5c1b5ee"
git-tree-sha1 = "7772508f17f1d482fe0df72cabc5b55bec06bbe0"
[[deps.GPUArrays]]
deps = ["Adapt", "GPUArraysCore", "LLVM", "LinearAlgebra", "Printf", "Random", "Reexport", "Serialization", "Statistics"]
git-tree-sha1 = "2e57b4a4f9cc15e85a24d603256fe08e527f48d1"
uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
version = "8.1.2" version = "8.8.1"
[[GPUCompiler]] [[deps.GPUArraysCore]]
deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "TimerOutputs", "UUIDs"] deps = ["Adapt"]
git-tree-sha1 = "2cac236070c2c4b36de54ae9146b55ee2c34ac7a" git-tree-sha1 = "2d6ca471a6c7b536127afccfa7564b5b39227fe0"
uuid = "46192b85-c4d5-4398-a991-12ede77f4527"
version = "0.1.5"
[[deps.GPUCompiler]]
deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "Scratch", "TimerOutputs", "UUIDs"]
git-tree-sha1 = "72b2e3c2ba583d1a7aa35129e56cf92e07c083e3"
uuid = "61eb1bfa-7361-4325-ad38-22787b887f55" uuid = "61eb1bfa-7361-4325-ad38-22787b887f55"
version = "0.13.10" version = "0.21.4"
[[InteractiveUtils]] [[deps.InteractiveUtils]]
deps = ["Markdown"] deps = ["Markdown"]
uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
[[InverseFunctions]] [[deps.IrrationalConstants]]
deps = ["Test"] git-tree-sha1 = "630b497eafcc20001bba38a4651b327dcfc491d2"
git-tree-sha1 = "a7254c0acd8e62f1ac75ad24d5db43f5f19f3c65"
uuid = "3587e190-3f89-42d0-90ee-14403ec27112"
version = "0.1.2"
[[IrrationalConstants]]
git-tree-sha1 = "7fd44fd4ff43fc60815f8e764c0f352b83c49151"
uuid = "92d709cd-6900-40b7-9082-c6be49f344b6" uuid = "92d709cd-6900-40b7-9082-c6be49f344b6"
version = "0.1.1" version = "0.2.2"
[[JLLWrappers]] [[deps.JLLWrappers]]
deps = ["Preferences"] deps = ["Artifacts", "Preferences"]
git-tree-sha1 = "642a199af8b68253517b80bd3bfd17eb4e84df6e" git-tree-sha1 = "7e5d6779a1e09a36db2a7b6cff50942a0a7d0fca"
uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210" uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210"
version = "1.3.0" version = "1.5.0"
[[LLVM]] [[deps.KernelAbstractions]]
deps = ["Adapt", "Atomix", "InteractiveUtils", "LinearAlgebra", "MacroTools", "PrecompileTools", "Requires", "SparseArrays", "StaticArrays", "UUIDs", "UnsafeAtomics", "UnsafeAtomicsLLVM"]
git-tree-sha1 = "4c5875e4c228247e1c2b087669846941fb6e0118"
uuid = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
version = "0.9.8"
[deps.KernelAbstractions.extensions]
EnzymeExt = "EnzymeCore"
[deps.KernelAbstractions.weakdeps]
EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869"
[[deps.LLVM]]
deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Printf", "Unicode"] deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Printf", "Unicode"]
git-tree-sha1 = "7cc22e69995e2329cc047a879395b2b74647ab5f" git-tree-sha1 = "a9d2ce1d5007b1e8f6c5b89c5a31ff8bd146db5c"
uuid = "929cbde3-209d-540e-8aea-75f648917ca0" uuid = "929cbde3-209d-540e-8aea-75f648917ca0"
version = "4.7.0" version = "6.2.1"
[[LLVMExtra_jll]] [[deps.LLVMExtra_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML"]
git-tree-sha1 = "c5fc4bef251ecd37685bea1c4068a9cfa41e8b9a" git-tree-sha1 = "7ca6850ae880cc99b59b88517545f91a52020afa"
uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab" uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab"
version = "0.0.13+0" version = "0.0.25+0"
[[LibCURL]] [[deps.LazyArtifacts]]
deps = ["Artifacts", "Pkg"]
uuid = "4af54fe1-eca0-43a8-85a7-787d91b784e3"
[[deps.LibCURL]]
deps = ["LibCURL_jll", "MozillaCACerts_jll"] deps = ["LibCURL_jll", "MozillaCACerts_jll"]
uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21" uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21"
version = "0.6.3"
[[LibCURL_jll]] [[deps.LibCURL_jll]]
deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"] deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"]
uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0" uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0"
version = "7.84.0+0"
[[LibGit2]] [[deps.LibGit2]]
deps = ["Base64", "NetworkOptions", "Printf", "SHA"] deps = ["Base64", "NetworkOptions", "Printf", "SHA"]
uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" uuid = "76f85450-5226-5b5a-8eaa-529ad045b433"
[[LibSSH2_jll]] [[deps.LibSSH2_jll]]
deps = ["Artifacts", "Libdl", "MbedTLS_jll"] deps = ["Artifacts", "Libdl", "MbedTLS_jll"]
uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8" uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8"
version = "1.10.2+0"
[[Libdl]] [[deps.Libdl]]
uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
[[LinearAlgebra]] [[deps.LinearAlgebra]]
deps = ["Libdl"] deps = ["Libdl", "OpenBLAS_jll", "libblastrampoline_jll"]
uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
[[LogExpFunctions]] [[deps.LogExpFunctions]]
deps = ["ChainRulesCore", "ChangesOfVariables", "DocStringExtensions", "InverseFunctions", "IrrationalConstants", "LinearAlgebra"] deps = ["DocStringExtensions", "IrrationalConstants", "LinearAlgebra"]
git-tree-sha1 = "e5718a00af0ab9756305a0392832c8952c7426c1" git-tree-sha1 = "7d6dd4e9212aebaeed356de34ccf262a3cd415aa"
uuid = "2ab3a3ac-af41-5b50-aa03-7779005ae688" uuid = "2ab3a3ac-af41-5b50-aa03-7779005ae688"
version = "0.3.6" version = "0.3.26"
[[Logging]] [deps.LogExpFunctions.extensions]
LogExpFunctionsChainRulesCoreExt = "ChainRulesCore"
LogExpFunctionsChangesOfVariablesExt = "ChangesOfVariables"
LogExpFunctionsInverseFunctionsExt = "InverseFunctions"
[deps.LogExpFunctions.weakdeps]
ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
ChangesOfVariables = "9e997f8a-9a97-42d5-a9f1-ce6bfc15e2c0"
InverseFunctions = "3587e190-3f89-42d0-90ee-14403ec27112"
[[deps.Logging]]
uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"
[[Markdown]] [[deps.MacroTools]]
deps = ["Markdown", "Random"]
git-tree-sha1 = "9ee1618cbf5240e6d4e0371d6f24065083f60c48"
uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
version = "0.5.11"
[[deps.Markdown]]
deps = ["Base64"] deps = ["Base64"]
uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" uuid = "d6f4376e-aef5-505a-96c1-9c027394607a"
[[MbedTLS_jll]] [[deps.MbedTLS_jll]]
deps = ["Artifacts", "Libdl"] deps = ["Artifacts", "Libdl"]
uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1" uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1"
version = "2.28.2+0"
[[Mmap]] [[deps.MozillaCACerts_jll]]
uuid = "a63ad114-7e13-5084-954f-fe012c677804"
[[MozillaCACerts_jll]]
uuid = "14a3606d-f60d-562e-9121-12d972cd8159" uuid = "14a3606d-f60d-562e-9121-12d972cd8159"
version = "2022.10.11"
[[NEO_jll]] [[deps.NEO_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "gmmlib_jll", "libigc_jll", "oneAPI_Level_Zero_Headers_jll"] deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML", "gmmlib_jll", "libigc_jll", "oneAPI_Level_Zero_Headers_jll"]
git-tree-sha1 = "15deea2649d70f1bbaedf0aa87c9fa20fb21f22c" git-tree-sha1 = "9846d87fd254cdaa1879dff93999e1bc32ed2658"
uuid = "700fe977-ac61-5f37-bbc8-c6c4b2b6a9fd" uuid = "700fe977-ac61-5f37-bbc8-c6c4b2b6a9fd"
version = "21.44.21506+0" version = "23.17.26241+0"
[[NetworkOptions]] [[deps.NetworkOptions]]
uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908" uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908"
version = "1.2.0"
[[OpenLibm_jll]] [[deps.OpenBLAS_jll]]
deps = ["Artifacts", "CompilerSupportLibraries_jll", "Libdl"]
uuid = "4536629a-c528-5b80-bd46-f80d51c5b363"
version = "0.3.21+4"
[[deps.OpenLibm_jll]]
deps = ["Artifacts", "Libdl"] deps = ["Artifacts", "Libdl"]
uuid = "05823500-19ac-5b8b-9628-191a04bc5112" uuid = "05823500-19ac-5b8b-9628-191a04bc5112"
version = "0.8.1+0"
[[OpenSpecFun_jll]] [[deps.OpenSpecFun_jll]]
deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "Libdl", "Pkg"] deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "13652491f6856acfd2db29360e1bbcd4565d04f1" git-tree-sha1 = "13652491f6856acfd2db29360e1bbcd4565d04f1"
uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e" uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e"
version = "0.5.5+0" version = "0.5.5+0"
[[OrderedCollections]] [[deps.OrderedCollections]]
git-tree-sha1 = "85f8e6578bf1f9ee0d11e7bb1b1456435479d47c" git-tree-sha1 = "2e73fe17cac3c62ad1aebe70d44c963c3cfdc3e3"
uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
version = "1.4.1" version = "1.6.2"
[[Parameters]] [[deps.Parameters]]
deps = ["OrderedCollections", "UnPack"] deps = ["OrderedCollections", "UnPack"]
git-tree-sha1 = "34c0e9ad262e5f7fc75b10a9952ca7692cfc5fbe" git-tree-sha1 = "34c0e9ad262e5f7fc75b10a9952ca7692cfc5fbe"
uuid = "d96e819e-fc66-5662-9728-84c9c7592b0a" uuid = "d96e819e-fc66-5662-9728-84c9c7592b0a"
version = "0.12.3" version = "0.12.3"
[[Pkg]] [[deps.Pkg]]
deps = ["Artifacts", "Dates", "Downloads", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"] deps = ["Artifacts", "Dates", "Downloads", "FileWatching", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"]
uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
version = "1.9.2"
[[Preferences]] [[deps.PrecompileTools]]
deps = ["Preferences"]
git-tree-sha1 = "03b4c25b43cb84cee5c90aa9b5ea0a78fd848d2f"
uuid = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
version = "1.2.0"
[[deps.Preferences]]
deps = ["TOML"] deps = ["TOML"]
git-tree-sha1 = "00cfd92944ca9c760982747e9a1d0d5d86ab1e5a" git-tree-sha1 = "00805cd429dcb4870060ff49ef443486c262e38e"
uuid = "21216c6a-2e73-6563-6e65-726566657250" uuid = "21216c6a-2e73-6563-6e65-726566657250"
version = "1.2.2" version = "1.4.1"
[[Printf]] [[deps.Printf]]
deps = ["Unicode"] deps = ["Unicode"]
uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7"
[[REPL]] [[deps.REPL]]
deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"] deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"]
uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb"
[[Random]] [[deps.Random]]
deps = ["Serialization"] deps = ["SHA", "Serialization"]
uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
[[SHA]] [[deps.Reexport]]
git-tree-sha1 = "45e428421666073eab6f2da5c9d310d99bb12f9b"
uuid = "189a3867-3050-52da-a836-e630ba90ab69"
version = "1.2.2"
[[deps.Requires]]
deps = ["UUIDs"]
git-tree-sha1 = "838a3a4188e2ded87a4f9f184b4b0d78a1e91cb7"
uuid = "ae029012-a4dd-5104-9daa-d747884805df"
version = "1.3.0"
[[deps.SHA]]
uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce"
version = "0.7.0"
[[SPIRV_LLVM_Translator_jll]] [[deps.SPIRV_LLVM_Translator_unified_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML"]
git-tree-sha1 = "8cca87d57f6ddf19373cc9791fddc741406c8fbf" git-tree-sha1 = "fe95f28a96975bd1d473e9273873b36402b79a54"
uuid = "4a5d46fc-d8cf-5151-a261-86b458210efb" uuid = "85f0d8ed-5b39-5caa-b1ae-7472de402361"
version = "11.0.0+2" version = "0.3.0+0"
[[SPIRV_Tools_jll]] [[deps.SPIRV_Tools_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] deps = ["Artifacts", "JLLWrappers", "Libdl"]
git-tree-sha1 = "c0324b7e07bc4649f755bfe7e00f7c6ed6aa353f" git-tree-sha1 = "c5ab754aa7d71ea015783a9884a25e196860707c"
uuid = "6ac6d60f-d740-5983-97d7-a4482c0689f4" uuid = "6ac6d60f-d740-5983-97d7-a4482c0689f4"
version = "2021.2.0+0" version = "2023.2.0+0"
[[Serialization]] [[deps.Scratch]]
deps = ["Dates"]
git-tree-sha1 = "30449ee12237627992a99d5e30ae63e4d78cd24a"
uuid = "6c6a2e73-6563-6170-7368-637461726353"
version = "1.2.0"
[[deps.Serialization]]
uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
[[SharedArrays]] [[deps.Sockets]]
deps = ["Distributed", "Mmap", "Random", "Serialization"]
uuid = "1a1011a3-84de-559e-8e89-a11a2f7dc383"
[[Sockets]]
uuid = "6462fe0b-24de-5631-8697-dd941f90decc" uuid = "6462fe0b-24de-5631-8697-dd941f90decc"
[[SparseArrays]] [[deps.SparseArrays]]
deps = ["LinearAlgebra", "Random"] deps = ["Libdl", "LinearAlgebra", "Random", "Serialization", "SuiteSparse_jll"]
uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
[[SpecialFunctions]] [[deps.SpecialFunctions]]
deps = ["ChainRulesCore", "IrrationalConstants", "LogExpFunctions", "OpenLibm_jll", "OpenSpecFun_jll"] deps = ["IrrationalConstants", "LogExpFunctions", "OpenLibm_jll", "OpenSpecFun_jll"]
git-tree-sha1 = "f0bccf98e16759818ffc5d97ac3ebf87eb950150" git-tree-sha1 = "e2cfc4012a19088254b3950b85c3c1d8882d864d"
uuid = "276daf66-3868-5448-9aa4-cd146d93841b" uuid = "276daf66-3868-5448-9aa4-cd146d93841b"
version = "1.8.1" version = "2.3.1"
[[Statistics]] [deps.SpecialFunctions.extensions]
SpecialFunctionsChainRulesCoreExt = "ChainRulesCore"
[deps.SpecialFunctions.weakdeps]
ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
[[deps.StaticArrays]]
deps = ["LinearAlgebra", "Random", "StaticArraysCore"]
git-tree-sha1 = "d5fb407ec3179063214bc6277712928ba78459e2"
uuid = "90137ffa-7385-5640-81b9-e52037218182"
version = "1.6.4"
weakdeps = ["Statistics"]
[deps.StaticArrays.extensions]
StaticArraysStatisticsExt = "Statistics"
[[deps.StaticArraysCore]]
git-tree-sha1 = "36b3d696ce6366023a0ea192b4cd442268995a0d"
uuid = "1e83bf80-4336-4d27-bf5d-d5a4f845583c"
version = "1.4.2"
[[deps.Statistics]]
deps = ["LinearAlgebra", "SparseArrays"] deps = ["LinearAlgebra", "SparseArrays"]
uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
version = "1.9.0"
[[TOML]] [[deps.SuiteSparse_jll]]
deps = ["Artifacts", "Libdl", "Pkg", "libblastrampoline_jll"]
uuid = "bea87d4a-7f5b-5778-9afe-8cc45184846c"
version = "5.10.1+6"
[[deps.TOML]]
deps = ["Dates"] deps = ["Dates"]
uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76" uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76"
version = "1.0.3"
[[Tar]] [[deps.Tar]]
deps = ["ArgTools", "SHA"] deps = ["ArgTools", "SHA"]
uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e" uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e"
version = "1.10.0"
[[Test]] [[deps.TextWrap]]
deps = ["InteractiveUtils", "Logging", "Random", "Serialization"]
uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
[[TextWrap]]
git-tree-sha1 = "9250ef9b01b66667380cf3275b3f7488d0e25faf" git-tree-sha1 = "9250ef9b01b66667380cf3275b3f7488d0e25faf"
uuid = "b718987f-49a8-5099-9789-dcd902bef87d" uuid = "b718987f-49a8-5099-9789-dcd902bef87d"
version = "1.0.1" version = "1.0.1"
[[TimerOutputs]] [[deps.TimerOutputs]]
deps = ["ExprTools", "Printf"] deps = ["ExprTools", "Printf"]
git-tree-sha1 = "7cb456f358e8f9d102a8b25e8dfedf58fa5689bc" git-tree-sha1 = "f548a9e9c490030e545f72074a41edfd0e5bcdd7"
uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
version = "0.5.13" version = "0.5.23"
[[UUIDs]] [[deps.UUIDs]]
deps = ["Random", "SHA"] deps = ["Random", "SHA"]
uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
[[UnPack]] [[deps.UnPack]]
git-tree-sha1 = "387c1f73762231e86e0c9c5443ce3b4a0a9a0c2b" git-tree-sha1 = "387c1f73762231e86e0c9c5443ce3b4a0a9a0c2b"
uuid = "3a884ed6-31ef-47d7-9d2a-63182c4928ed" uuid = "3a884ed6-31ef-47d7-9d2a-63182c4928ed"
version = "1.0.2" version = "1.0.2"
[[Unicode]] [[deps.Unicode]]
uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
[[Zlib_jll]] [[deps.UnsafeAtomics]]
deps = ["Libdl"] git-tree-sha1 = "6331ac3440856ea1988316b46045303bef658278"
uuid = "83775a58-1f1d-513f-b197-d71354ab007a" uuid = "013be700-e6cd-48c3-b4a1-df204f14c38f"
[[gmmlib_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "0d5e5461d21b14853b4c332045c57d2601c403bd"
uuid = "09858cae-167c-5acb-9302-fddc6874d481"
version = "21.2.1+0"
[[libigc_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "4f7a6c63ee113ee6da9a6afd06c77eb44998b1f3"
uuid = "94295238-5935-5bd7-bb0f-b00942e9bdd5"
version = "1.0.8744+0"
[[nghttp2_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d"
[[oneAPI]]
deps = ["Adapt", "CEnum", "ExprTools", "GPUArrays", "GPUCompiler", "LLVM", "LinearAlgebra", "NEO_jll", "Printf", "Random", "SPIRV_LLVM_Translator_jll", "SPIRV_Tools_jll", "SpecialFunctions", "oneAPI_Level_Zero_Headers_jll", "oneAPI_Level_Zero_Loader_jll"]
git-tree-sha1 = "efabcff2a259b0f1b10505db99aa18fc2de181ce"
uuid = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b"
version = "0.2.1" version = "0.2.1"
[[oneAPI_Level_Zero_Headers_jll]] [[deps.UnsafeAtomicsLLVM]]
deps = ["LLVM", "UnsafeAtomics"]
git-tree-sha1 = "323e3d0acf5e78a56dfae7bd8928c989b4f3083e"
uuid = "d80eeb9a-aca5-4d75-85e5-170c8b632249"
version = "0.1.3"
[[deps.Zlib_jll]]
deps = ["Libdl"]
uuid = "83775a58-1f1d-513f-b197-d71354ab007a"
version = "1.2.13+0"
[[deps.gmmlib_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "e106a6eed53928cd1864f544562ea991b5f11464" git-tree-sha1 = "228b09be83d88cc5d2236ef7b516d988d2639dfc"
uuid = "09858cae-167c-5acb-9302-fddc6874d481"
version = "22.3.0+0"
[[deps.libblastrampoline_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "8e850b90-86db-534c-a0d3-1478176c7d93"
version = "5.8.0+0"
[[deps.libigc_jll]]
deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML"]
git-tree-sha1 = "7c0b5fa2ff90d96af106fd4a67ff6923cd3f9cb9"
uuid = "94295238-5935-5bd7-bb0f-b00942e9bdd5"
version = "1.0.13822+0"
[[deps.nghttp2_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d"
version = "1.48.0+0"
[[deps.oneAPI]]
deps = ["Adapt", "CEnum", "ExprTools", "GPUArrays", "GPUCompiler", "KernelAbstractions", "LLVM", "LinearAlgebra", "NEO_jll", "Preferences", "Printf", "Random", "SPIRV_LLVM_Translator_unified_jll", "SPIRV_Tools_jll", "SpecialFunctions", "UnsafeAtomicsLLVM", "oneAPI_Level_Zero_Headers_jll", "oneAPI_Level_Zero_Loader_jll", "oneAPI_Support_jll"]
git-tree-sha1 = "9e6a675faf3ea27d08018c9bd0a03596003ff5cf"
uuid = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b"
version = "1.3.0"
[[deps.oneAPI_Level_Zero_Headers_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "412efcf5d55c65d3352c3915cffec1e53955570f"
uuid = "f4bc562b-d309-54f8-9efb-476e56f0410d" uuid = "f4bc562b-d309-54f8-9efb-476e56f0410d"
version = "1.2.43+0" version = "1.6.3+0"
[[oneAPI_Level_Zero_Loader_jll]] [[deps.oneAPI_Level_Zero_Loader_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "oneAPI_Level_Zero_Headers_jll"] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "oneAPI_Level_Zero_Headers_jll"]
git-tree-sha1 = "0f0fd4a92c4785454e4929c2e4db22c3d03d6889" git-tree-sha1 = "87980483b19f0a00c8d62e8b6682acac1894c638"
uuid = "13eca655-d68d-5b81-8367-6d99d727ab01" uuid = "13eca655-d68d-5b81-8367-6d99d727ab01"
version = "1.5.0+0" version = "1.11.0+0"
[[p7zip_jll]] [[deps.oneAPI_Support_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "oneAPI_Level_Zero_Loader_jll"]
git-tree-sha1 = "39a73e1fcd9a33eeadfd69f9027e9c62d3c58219"
uuid = "b049733a-a71d-5ed3-8eba-7d323ac00b36"
version = "0.2.2+0"
[[deps.p7zip_jll]]
deps = ["Artifacts", "Libdl"] deps = ["Artifacts", "Libdl"]
uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0" uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0"
version = "17.4.0+0"

View File

@ -4,4 +4,4 @@ Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a"
oneAPI = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b" oneAPI = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b"
[compat] [compat]
julia = "1.6" julia = "1.9"

View File

@ -20,6 +20,18 @@ end
@enum Benchmark All Triad Nstream @enum Benchmark All Triad Nstream
function run_init_arrays!(data::StreamData{T,C}, context, init::Tuple{T,T,T})::Float64 where {T,C}
return @elapsed init_arrays!(data, context, init)
end
function run_read_data(data::StreamData{T,C}, context)::Tuple{Float64,VectorData{T}} where {T,C}
elapsed = @elapsed begin
result = read_data(data, context)
end
return (elapsed, result)
end
function run_all!(data::StreamData{T,C}, context, times::Int)::Tuple{Timings,T} where {T,C} function run_all!(data::StreamData{T,C}, context, times::Int)::Tuple{Timings,T} where {T,C}
timings = Timings(times) timings = Timings(times)
lastSum::T = 0 lastSum::T = 0
@ -39,11 +51,7 @@ function run_triad!(data::StreamData{T,C}, context, times::Int)::Float64 where {
end end
end end
function run_nstream!( function run_nstream!(data::StreamData{T,C}, context, times::Int)::Vector{Float64} where {T,C}
data::StreamData{T,C},
context,
times::Int,
)::Vector{Float64} where {T,C}
timings::Vector{Float64} = zeros(times) timings::Vector{Float64} = zeros(times)
for i = 1:times for i = 1:times
@inbounds timings[i] = @elapsed nstream!(data, context) @inbounds timings[i] = @elapsed nstream!(data, context)
@ -93,9 +101,7 @@ function check_solutions(
error = abs((dot - gold_sum) / gold_sum) error = abs((dot - gold_sum) / gold_sum)
failed = error > 1.0e-8 failed = error > 1.0e-8
if failed if failed
println( println("Validation failed on sum. Error $error \nSum was $dot but should be $gold_sum")
"Validation failed on sum. Error $error \nSum was $dot but should be $gold_sum",
)
end end
!failed !failed
end : true end : true
@ -158,7 +164,7 @@ end
const DefaultInit = (0.1, 0.2, 0.0) const DefaultInit = (0.1, 0.2, 0.0)
const DefaultScalar = 0.4 const DefaultScalar = 0.4
const Version = "4.0" const Version = "5.0"
function main() function main()
@ -166,7 +172,7 @@ function main()
parse_options(config) parse_options(config)
if config.list if config.list
for (i, (_,repr, impl)) in enumerate(devices()) for (i, (_, repr, impl)) in enumerate(devices())
println("[$i] ($impl) $repr") println("[$i] ($impl) $repr")
end end
exit(0) exit(0)
@ -175,9 +181,7 @@ function main()
ds = devices() ds = devices()
# TODO implement substring device match # TODO implement substring device match
if config.device < 1 || config.device > length(ds) if config.device < 1 || config.device > length(ds)
error( error("Device $(config.device) out of range (1..$(length(ds))), NOTE: Julia is 1-indexed")
"Device $(config.device) out of range (1..$(length(ds))), NOTE: Julia is 1-indexed",
)
else else
device = ds[config.device] device = ds[config.device]
end end
@ -220,10 +224,10 @@ function main()
end end
function mk_row(xs::Vector{Float64}, name::String, total_bytes::Int) function mk_row(xs::Vector{Float64}, name::String, total_bytes::Int)
tail = Base.rest(xs) tail = Iterators.rest(xs)
min = Iterators.minimum(tail) min = Base.minimum(tail)
max = Iterators.maximum(tail) max = Base.maximum(tail)
avg = Iterators.sum(tail) / Iterators.length(tail) avg = Base.sum(tail) / Base.length(tail)
mbps = mega_scale * total_bytes / min mbps = mega_scale * total_bytes / min
if config.csv if config.csv
return [ return [
@ -257,16 +261,42 @@ function main()
end end
end end
function show_init(init::Float64, read::Float64)
setup = [("Init", init, 3 * array_bytes), ("Read", read, 3 * array_bytes)]
if config.csv
tabulate(
map(
x -> [
("phase", x[1]),
("n_elements", config.arraysize),
("sizeof", x[3]),
("max_m$(config.mibibytes ? "i" : "")bytes_per_sec", mega_scale * total_bytes / x[2]),
("runtime", x[2]),
],
setup,
)...,
)
else
for (name, elapsed, total_bytes) in setup
println(
"$name: $(round(elapsed; digits=5)) s (=$(round(( mega_scale * total_bytes) / elapsed; digits = 5)) M$(config.mibibytes ? "i" : "")Bytes/sec)",
)
end
end
end
init::Tuple{type,type,type} = DefaultInit init::Tuple{type,type,type} = DefaultInit
scalar::type = DefaultScalar scalar::type = DefaultScalar
GC.enable(false) GC.enable(false)
(data, context) = make_stream(config.arraysize, scalar, device, config.csv) (data, context) = make_stream(config.arraysize, scalar, device, config.csv)
init_arrays!(data, context, init) tInit = run_init_arrays!(data, context, init)
if benchmark == All if benchmark == All
(timings, sum) = run_all!(data, context, config.numtimes) (timings, sum) = run_all!(data, context, config.numtimes)
valid = check_solutions(read_data(data, context), config.numtimes, init, benchmark, sum) (tRead, result) = run_read_data(data, context)
show_init(tInit, tRead)
valid = check_solutions(result, config.numtimes, init, benchmark, sum)
tabulate( tabulate(
mk_row(timings.copy, "Copy", 2 * array_bytes), mk_row(timings.copy, "Copy", 2 * array_bytes),
mk_row(timings.mul, "Mul", 2 * array_bytes), mk_row(timings.mul, "Mul", 2 * array_bytes),
@ -276,13 +306,15 @@ function main()
) )
elseif benchmark == Nstream elseif benchmark == Nstream
timings = run_nstream!(data, context, config.numtimes) timings = run_nstream!(data, context, config.numtimes)
valid = (tRead, result) = run_read_data(data, context)
check_solutions(read_data(data, context), config.numtimes, init, benchmark, nothing) show_init(tInit, tRead)
valid = check_solutions(result, config.numtimes, init, benchmark, nothing)
tabulate(mk_row(timings, "Nstream", 4 * array_bytes)) tabulate(mk_row(timings, "Nstream", 4 * array_bytes))
elseif benchmark == Triad elseif benchmark == Triad
elapsed = run_triad!(data, context, config.numtimes) elapsed = run_triad!(data, context, config.numtimes)
valid = (tRead, result) = run_read_data(data, context)
check_solutions(read_data(data, context), config.numtimes, init, benchmark, nothing) show_init(tInit, tRead)
valid = check_solutions(result, config.numtimes, init, benchmark, nothing)
total_bytes = 3 * array_bytes * config.numtimes total_bytes = 3 * array_bytes * config.numtimes
bandwidth = mega_scale * (total_bytes / elapsed) bandwidth = mega_scale * (total_bytes / elapsed)
println("Runtime (seconds): $(round(elapsed; digits=5))") println("Runtime (seconds): $(round(elapsed; digits=5))")
@ -290,7 +322,6 @@ function main()
else else
error("Bad benchmark $(benchmark)") error("Bad benchmark $(benchmark)")
end end
GC.enable(true) GC.enable(true)
if !valid if !valid

View File

@ -3,5 +3,6 @@
for BACKEND in "." "AMDGPU" "CUDA" "oneAPI" "Threaded" "KernelAbstractions" for BACKEND in "." "AMDGPU" "CUDA" "oneAPI" "Threaded" "KernelAbstractions"
do do
julia --project="$BACKEND" -e 'import Pkg; Pkg.resolve(); Pkg.instantiate(); Pkg.update(); Pkg.gc();' echo "Updating subproject $BACKEND"
julia --project="$BACKEND" -e 'import Pkg; Pkg.resolve(); Pkg.instantiate(); Pkg.update(); Pkg.gc();'
done done

View File

@ -1,4 +1,4 @@
// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, // Copyright (c) 2015-23 Tom Deakin, Simon McIntosh-Smith, Wei-Chen (Tom) Lin
// University of Bristol HPC // University of Bristol HPC
// //
// For full license terms please see the LICENSE file distributed with this // For full license terms please see the LICENSE file distributed with this
@ -14,9 +14,9 @@ KokkosStream<T>::KokkosStream(
{ {
Kokkos::initialize(); Kokkos::initialize();
d_a = new Kokkos::View<T*>("d_a", ARRAY_SIZE); d_a = new Kokkos::View<T*>(Kokkos::ViewAllocateWithoutInitializing("d_a"), ARRAY_SIZE);
d_b = new Kokkos::View<T*>("d_b", ARRAY_SIZE); d_b = new Kokkos::View<T*>(Kokkos::ViewAllocateWithoutInitializing("d_b"), ARRAY_SIZE);
d_c = new Kokkos::View<T*>("d_c", ARRAY_SIZE); d_c = new Kokkos::View<T*>(Kokkos::ViewAllocateWithoutInitializing("d_c"), ARRAY_SIZE);
hm_a = new typename Kokkos::View<T*>::HostMirror(); hm_a = new typename Kokkos::View<T*>::HostMirror();
hm_b = new typename Kokkos::View<T*>::HostMirror(); hm_b = new typename Kokkos::View<T*>::HostMirror();
hm_c = new typename Kokkos::View<T*>::HostMirror(); hm_c = new typename Kokkos::View<T*>::HostMirror();
@ -140,7 +140,7 @@ T KokkosStream<T>::dot()
Kokkos::View<T*> a(*d_a); Kokkos::View<T*> a(*d_a);
Kokkos::View<T*> b(*d_b); Kokkos::View<T*> b(*d_b);
T sum = 0.0; T sum{};
Kokkos::parallel_reduce(array_size, KOKKOS_LAMBDA (const long index, T &tmp) Kokkos::parallel_reduce(array_size, KOKKOS_LAMBDA (const long index, T &tmp)
{ {

View File

@ -10,9 +10,6 @@
#include <stdexcept> #include <stdexcept>
#include <Kokkos_Core.hpp> #include <Kokkos_Core.hpp>
#include <Kokkos_Parallel.hpp>
#include <Kokkos_View.hpp>
#include "Stream.h" #include "Stream.h"
#define IMPLEMENTATION_STRING "Kokkos" #define IMPLEMENTATION_STRING "Kokkos"

View File

@ -1,32 +1,38 @@
register_flag_optional(CMAKE_CXX_COMPILER register_flag_optional(CMAKE_CXX_COMPILER
"Any CXX compiler that is supported by CMake detection and RAJA. "Any CXX compiler that is supported by CMake detection and RAJA.
See https://github.com/kokkos/kokkos#primary-tested-compilers-on-x86-are" See https://github.com/kokkos/kokkos#primary-tested-compilers-on-x86-are"
"c++") "c++")
register_flag_required(KOKKOS_IN_TREE register_flag_optional(KOKKOS_IN_TREE
"Absolute path to the *source* distribution directory of Kokkos. "Absolute path to the *source* distribution directory of Kokkos.
Remember to append Kokkos specific flags as well, for example: Remember to append Kokkos specific flags as well, for example:
-DKOKKOS_IN_TREE=... -DKokkos_ENABLE_OPENMP=ON -DKokkos_ARCH_ZEN=ON ... -DKOKKOS_IN_TREE=... -DKokkos_ENABLE_OPENMP=ON -DKokkos_ARCH_ZEN=ON ...
See https://github.com/kokkos/kokkos/blob/master/BUILD.md for all available options" "")
See https://github.com/kokkos/kokkos/blob/master/BUILD.md for all available options") register_flag_optional(KOKKOS_IN_PACKAGE
"Absolute path to package R-Path containing Kokkos libs.
Use this instead of KOKKOS_IN_TREE if Kokkos is from a package manager like Spack." "")
# compiler vendor and arch specific flags # compiler vendor and arch specific flags
set(KOKKOS_FLAGS_CPU_INTEL -qopt-streaming-stores=always) set(KOKKOS_FLAGS_CPU_INTEL -qopt-streaming-stores=always)
macro(setup) macro(setup)
set(CMAKE_CXX_STANDARD 14) set(CMAKE_CXX_STANDARD 17) # Kokkos 4+ requires CXX >= 17
cmake_policy(SET CMP0074 NEW) #see https://github.com/kokkos/kokkos/blob/master/BUILD.md cmake_policy(SET CMP0074 NEW) #see https://github.com/kokkos/kokkos/blob/master/BUILD.md
message(STATUS "Building using in-tree Kokkos source at `${KOKKOS_IN_TREE}`")
if (EXISTS "${KOKKOS_IN_TREE}") if (EXISTS "${KOKKOS_IN_TREE}")
message(STATUS "Build using in-tree Kokkos source at `${KOKKOS_IN_TREE}`")
add_subdirectory(${KOKKOS_IN_TREE} ${CMAKE_BINARY_DIR}/kokkos) add_subdirectory(${KOKKOS_IN_TREE} ${CMAKE_BINARY_DIR}/kokkos)
register_link_library(Kokkos::kokkos) register_link_library(Kokkos::kokkos)
else () elseif (EXISTS "${KOKKOS_IN_PACKAGE}")
message(FATAL_ERROR "`${KOKKOS_IN_TREE}` does not exist") message(STATUS "Build using packaged Kokkos at `${KOKKOS_IN_PACKAGE}`")
set (Kokkos_DIR "${KOKKOS_IN_PACKAGE}/lib64/cmake/Kokkos")
find_package(Kokkos REQUIRED)
register_link_library(Kokkos::kokkos)
else()
message(FATAL_ERROR "Neither `KOKKOS_IN_TREE`, or `KOKKOS_IN_PACKAGE` was set!")
endif () endif ()
register_append_compiler_and_arch_specific_cxx_flags( register_append_compiler_and_arch_specific_cxx_flags(
@ -36,5 +42,3 @@ macro(setup)
) )
endmacro() endmacro()

View File

@ -15,7 +15,7 @@
#include <iomanip> #include <iomanip>
#include <cstring> #include <cstring>
#define VERSION_STRING "4.0" #define VERSION_STRING "5.0"
#include "Stream.h" #include "Stream.h"
@ -49,6 +49,8 @@
#include "SYCLStream2020.h" #include "SYCLStream2020.h"
#elif defined(OMP) #elif defined(OMP)
#include "OMPStream.h" #include "OMPStream.h"
#elif defined(FUTHARK)
#include "FutharkStream.h"
#endif #endif
// Default size of 2^25 // Default size of 2^25
@ -222,10 +224,10 @@ void run()
{ {
// MiB = 2^20 // MiB = 2^20
std::cout << std::setprecision(1) << std::fixed std::cout << std::setprecision(1) << std::fixed
<< "Array size: " << ARRAY_SIZE*sizeof(T)*pow(2.0, -20.0) << " MiB" << "Array size: " << ARRAY_SIZE*sizeof(T)*std::pow(2.0, -20.0) << " MiB"
<< " (=" << ARRAY_SIZE*sizeof(T)*pow(2.0, -30.0) << " GiB)" << std::endl; << " (=" << ARRAY_SIZE*sizeof(T)*std::pow(2.0, -30.0) << " GiB)" << std::endl;
std::cout << "Total size: " << 3.0*ARRAY_SIZE*sizeof(T)*pow(2.0, -20.0) << " MiB" std::cout << "Total size: " << 3.0*ARRAY_SIZE*sizeof(T)*std::pow(2.0, -20.0) << " MiB"
<< " (=" << 3.0*ARRAY_SIZE*sizeof(T)*pow(2.0, -30.0) << " GiB)" << std::endl; << " (=" << 3.0*ARRAY_SIZE*sizeof(T)*std::pow(2.0, -30.0) << " GiB)" << std::endl;
} }
else else
{ {
@ -298,12 +300,18 @@ void run()
// Use the OpenMP implementation // Use the OpenMP implementation
stream = new OMPStream<T>(ARRAY_SIZE, deviceIndex); stream = new OMPStream<T>(ARRAY_SIZE, deviceIndex);
#elif defined(FUTHARK)
// Use the Futhark implementation
stream = new FutharkStream<T>(ARRAY_SIZE, deviceIndex);
#endif #endif
auto init1 = std::chrono::high_resolution_clock::now();
stream->init_arrays(startA, startB, startC); stream->init_arrays(startA, startB, startC);
auto init2 = std::chrono::high_resolution_clock::now();
// Result of the Dot kernel, if used. // Result of the Dot kernel, if used.
T sum = 0.0; T sum{};
std::vector<std::vector<double>> timings; std::vector<std::vector<double>> timings;
@ -327,7 +335,54 @@ void run()
std::vector<T> c(ARRAY_SIZE); std::vector<T> c(ARRAY_SIZE);
auto read1 = std::chrono::high_resolution_clock::now();
stream->read_arrays(a, b, c); stream->read_arrays(a, b, c);
auto read2 = std::chrono::high_resolution_clock::now();
auto initElapsedS = std::chrono::duration_cast<std::chrono::duration<double>>(read2 - read1).count();
auto readElapsedS = std::chrono::duration_cast<std::chrono::duration<double>>(init2 - init1).count();
auto initBWps = ((mibibytes ? std::pow(2.0, -20.0) : 1.0E-6) * (3 * sizeof(T) * ARRAY_SIZE)) / initElapsedS;
auto readBWps = ((mibibytes ? std::pow(2.0, -20.0) : 1.0E-6) * (3 * sizeof(T) * ARRAY_SIZE)) / readElapsedS;
if (output_as_csv)
{
std::cout
<< "phase" << csv_separator
<< "n_elements" << csv_separator
<< "sizeof" << csv_separator
<< ((mibibytes) ? "max_mibytes_per_sec" : "max_mbytes_per_sec") << csv_separator
<< "runtime" << std::endl;
std::cout
<< "Init" << csv_separator
<< ARRAY_SIZE << csv_separator
<< sizeof(T) << csv_separator
<< initBWps << csv_separator
<< initElapsedS << std::endl;
std::cout
<< "Read" << csv_separator
<< ARRAY_SIZE << csv_separator
<< sizeof(T) << csv_separator
<< readBWps << csv_separator
<< readElapsedS << std::endl;
}
else
{
std::cout << "Init: "
<< std::setw(7)
<< initElapsedS
<< " s (="
<< initBWps
<< (mibibytes ? " MiBytes/sec" : " MBytes/sec")
<< ")" << std::endl;
std::cout << "Read: "
<< std::setw(7)
<< readElapsedS
<< " s (="
<< readBWps
<< (mibibytes ? " MiBytes/sec" : " MBytes/sec")
<< ")" << std::endl;
}
check_solution<T>(num_times, a, b, c, sum); check_solution<T>(num_times, a, b, c, sum);
// Display timing results // Display timing results
@ -393,7 +448,7 @@ void run()
<< num_times << csv_separator << num_times << csv_separator
<< ARRAY_SIZE << csv_separator << ARRAY_SIZE << csv_separator
<< sizeof(T) << csv_separator << sizeof(T) << csv_separator
<< ((mibibytes) ? pow(2.0, -20.0) : 1.0E-6) * sizes[i] / (*minmax.first) << csv_separator << ((mibibytes) ? std::pow(2.0, -20.0) : 1.0E-6) * sizes[i] / (*minmax.first) << csv_separator
<< *minmax.first << csv_separator << *minmax.first << csv_separator
<< *minmax.second << csv_separator << *minmax.second << csv_separator
<< average << average
@ -404,7 +459,7 @@ void run()
std::cout std::cout
<< std::left << std::setw(12) << labels[i] << std::left << std::setw(12) << labels[i]
<< std::left << std::setw(12) << std::setprecision(3) << << std::left << std::setw(12) << std::setprecision(3) <<
((mibibytes) ? pow(2.0, -20.0) : 1.0E-6) * sizes[i] / (*minmax.first) ((mibibytes) ? std::pow(2.0, -20.0) : 1.0E-6) * sizes[i] / (*minmax.first)
<< std::left << std::setw(12) << std::setprecision(5) << *minmax.first << std::left << std::setw(12) << std::setprecision(5) << *minmax.first
<< std::left << std::setw(12) << std::setprecision(5) << *minmax.second << std::left << std::setw(12) << std::setprecision(5) << *minmax.second
<< std::left << std::setw(12) << std::setprecision(5) << average << std::left << std::setw(12) << std::setprecision(5) << average
@ -415,7 +470,7 @@ void run()
{ {
// Display timing results // Display timing results
double total_bytes = 3 * sizeof(T) * ARRAY_SIZE * num_times; double total_bytes = 3 * sizeof(T) * ARRAY_SIZE * num_times;
double bandwidth = ((mibibytes) ? pow(2.0, -30.0) : 1.0E-9) * (total_bytes / timings[0][0]); double bandwidth = ((mibibytes) ? std::pow(2.0, -30.0) : 1.0E-9) * (total_bytes / timings[0][0]);
if (output_as_csv) if (output_as_csv)
{ {
@ -461,7 +516,7 @@ void check_solution(const unsigned int ntimes, std::vector<T>& a, std::vector<T>
T goldA = startA; T goldA = startA;
T goldB = startB; T goldB = startB;
T goldC = startC; T goldC = startC;
T goldSum = 0.0; T goldSum{};
const T scalar = startScalar; const T scalar = startScalar;
@ -487,15 +542,15 @@ void check_solution(const unsigned int ntimes, std::vector<T>& a, std::vector<T>
goldSum = goldA * goldB * ARRAY_SIZE; goldSum = goldA * goldB * ARRAY_SIZE;
// Calculate the average error // Calculate the average error
double errA = std::accumulate(a.begin(), a.end(), 0.0, [&](double sum, const T val){ return sum + fabs(val - goldA); }); long double errA = std::accumulate(a.begin(), a.end(), T{}, [&](double sum, const T val){ return sum + std::fabs(val - goldA); });
errA /= a.size(); errA /= a.size();
double errB = std::accumulate(b.begin(), b.end(), 0.0, [&](double sum, const T val){ return sum + fabs(val - goldB); }); long double errB = std::accumulate(b.begin(), b.end(), T{}, [&](double sum, const T val){ return sum + std::fabs(val - goldB); });
errB /= b.size(); errB /= b.size();
double errC = std::accumulate(c.begin(), c.end(), 0.0, [&](double sum, const T val){ return sum + fabs(val - goldC); }); long double errC = std::accumulate(c.begin(), c.end(), T{}, [&](double sum, const T val){ return sum + std::fabs(val - goldC); });
errC /= c.size(); errC /= c.size();
double errSum = fabs((sum - goldSum)/goldSum); long double errSum = std::fabs((sum - goldSum)/goldSum);
double epsi = std::numeric_limits<T>::epsilon() * 100.0; long double epsi = std::numeric_limits<T>::epsilon() * 100.0;
if (errA > epsi) if (errA > epsi)
std::cerr std::cerr

View File

@ -260,7 +260,7 @@ T OCLStream<T>::dot()
); );
cl::copy(queue, d_sum, sums.begin(), sums.end()); cl::copy(queue, d_sum, sums.begin(), sums.end());
T sum = 0.0; T sum{};
for (T val : sums) for (T val : sums)
sum += val; sum += val;

View File

@ -220,7 +220,7 @@ void OMPStream<T>::nstream()
template <class T> template <class T>
T OMPStream<T>::dot() T OMPStream<T>::dot()
{ {
T sum = 0.0; T sum{};
#ifdef OMP_TARGET_GPU #ifdef OMP_TARGET_GPU
int array_size = this->array_size; int array_size = this->array_size;

View File

@ -131,7 +131,7 @@ T RAJAStream<T>::dot()
T* RAJA_RESTRICT a = d_a; T* RAJA_RESTRICT a = d_a;
T* RAJA_RESTRICT b = d_b; T* RAJA_RESTRICT b = d_b;
RAJA::ReduceSum<reduce_policy, T> sum(0.0); RAJA::ReduceSum<reduce_policy, T> sum(T{});
forall<policy>(range, [=] RAJA_DEVICE (RAJA::Index_type index) forall<policy>(range, [=] RAJA_DEVICE (RAJA::Index_type index)
{ {

View File

@ -1,25 +1,26 @@
register_flag_optional(CMAKE_CXX_COMPILER register_flag_optional(CMAKE_CXX_COMPILER
"Any CXX compiler that is supported by CMake detection and RAJA. "Any CXX compiler that is supported by CMake detection and RAJA.
See https://raja.readthedocs.io/en/main/getting_started.html#build-and-install" See https://raja.readthedocs.io/en/main/getting_started.html#build-and-install"
"c++") "c++")
register_flag_required(RAJA_IN_TREE register_flag_optional(RAJA_IN_TREE
"Absolute path to the *source* distribution directory of RAJA. "Absolute path to the *source* distribution directory of RAJA.
Make sure to use the release version of RAJA or clone RAJA recursively with submodules. Make sure to use the release version of RAJA or clone RAJA recursively with submodules.
Remember to append RAJA specific flags as well, for example: Remember to append RAJA specific flags as well, for example:
-DRAJA_IN_TREE=... -DENABLE_OPENMP=ON -DENABLE_CUDA=ON ... -DRAJA_IN_TREE=... -DENABLE_OPENMP=ON -DENABLE_CUDA=ON ...
See https://github.com/LLNL/RAJA/blob/08cbbafd2d21589ebf341f7275c229412d0fe903/CMakeLists.txt#L44 for all available options See https://github.com/LLNL/RAJA/blob/08cbbafd2d21589ebf341f7275c229412d0fe903/CMakeLists.txt#L44 for all available options
") " "")
register_flag_optional(RAJA_IN_PACKAGE
"Use if Raja is part of a package dependency:
Path to installation" "")
register_flag_optional(TARGET register_flag_optional(TARGET
"Target offload device, implemented values are CPU, NVIDIA" "Target offload device, implemented values are CPU, NVIDIA"
CPU) CPU)
register_flag_optional(CUDA_TOOLKIT_ROOT_DIR register_flag_optional(CUDA_TOOLKIT_ROOT_DIR
"[TARGET==NVIDIA only] Path to the CUDA toolkit directory (e.g `/opt/cuda-11.2`) if the ENABLE_CUDA flag is specified for RAJA" "") "[TARGET==NVIDIA only] Path to the CUDA toolkit directory (e.g `/opt/cuda-11.2`) if the RAJA_ENABLE_CUDA or ENABLE_CUDA flag is specified for RAJA" "")
# XXX CMake 3.18 supports CMAKE_CUDA_ARCHITECTURES/CUDA_ARCHITECTURES but we support older CMakes # XXX CMake 3.18 supports CMAKE_CUDA_ARCHITECTURES/CUDA_ARCHITECTURES but we support older CMakes
register_flag_optional(CUDA_ARCH register_flag_optional(CUDA_ARCH
@ -57,7 +58,20 @@ macro(setup)
set(ENABLE_BENCHMARKS OFF CACHE BOOL "") set(ENABLE_BENCHMARKS OFF CACHE BOOL "")
set(ENABLE_CUDA ${ENABLE_CUDA} CACHE BOOL "" FORCE) set(ENABLE_CUDA ${ENABLE_CUDA} CACHE BOOL "" FORCE)
if (ENABLE_CUDA) # RAJA >= v2022.03.0 switched to prefixed variables, we keep the legacy ones for backwards compatibiity
set(RAJA_ENABLE_TESTS OFF CACHE BOOL "")
set(RAJA_ENABLE_EXAMPLES OFF CACHE BOOL "")
set(RAJA_ENABLE_REPRODUCERS OFF CACHE BOOL "")
set(RAJA_ENABLE_EXERCISES OFF CACHE BOOL "")
set(RAJA_ENABLE_DOCUMENTATION OFF CACHE BOOL "")
set(RAJA_ENABLE_BENCHMARKS OFF CACHE BOOL "")
set(RAJA_ENABLE_CUDA ${RAJA_ENABLE_CUDA} CACHE BOOL "" FORCE)
if (ENABLE_CUDA OR RAJA_ENABLE_CUDA)
# RAJA still needs ENABLE_CUDA for internal use, so if either is on, assert both.
set(RAJA_ENABLE_CUDA ON)
set(ENABLE_CUDA ON)
# XXX CMake 3.18 supports CMAKE_CUDA_ARCHITECTURES/CUDA_ARCHITECTURES but we support older CMakes # XXX CMake 3.18 supports CMAKE_CUDA_ARCHITECTURES/CUDA_ARCHITECTURES but we support older CMakes
if(POLICY CMP0104) if(POLICY CMP0104)
@ -69,6 +83,10 @@ macro(setup)
set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "-forward-unknown-to-host-compiler -extended-lambda -arch=${CUDA_ARCH}" ${CUDA_EXTRA_FLAGS}) set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "-forward-unknown-to-host-compiler -extended-lambda -arch=${CUDA_ARCH}" ${CUDA_EXTRA_FLAGS})
list(APPEND CMAKE_CUDA_FLAGS) list(APPEND CMAKE_CUDA_FLAGS)
# See https://github.com/LLNL/RAJA/pull/1302
# And https://github.com/LLNL/RAJA/pull/1339
set(RAJA_ENABLE_VECTORIZATION OFF CACHE BOOL "")
message(STATUS "NVCC flags: ${CMAKE_CUDA_FLAGS}") message(STATUS "NVCC flags: ${CMAKE_CUDA_FLAGS}")
endif () endif ()
@ -76,8 +94,14 @@ macro(setup)
register_link_library(RAJA) register_link_library(RAJA)
# RAJA's cmake screws with where the binary will end up, resetting it here: # RAJA's cmake screws with where the binary will end up, resetting it here:
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
elseif (EXISTS "${RAJA_IN_PACKAGE}")
message(STATUS "Building using packaged Raja at `${RAJA_IN_PACKAGE}`")
find_package(RAJA REQUIRED)
register_link_library(RAJA)
else () else ()
message(FATAL_ERROR "`${RAJA_IN_TREE}` does not exist") message(FATAL_ERROR "Neither `${RAJA_IN_TREE}` or `${RAJA_IN_PACKAGE}` exists")
endif () endif ()

File diff suppressed because it is too large Load Diff

View File

@ -1,25 +1,25 @@
[package] [package]
name = "rust-stream" name = "rust-stream"
version = "4.0.0" version = "5.0.0"
authors = ["Wei-Chen Lin <wl14928@bristol.ac.uk>"] authors = ["Wei-Chen Lin <wl14928@bristol.ac.uk>"]
edition = "2018" edition = "2018"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies] [dependencies]
num-traits = "0.2.14" num-traits = "0.2.15"
structopt = "0.3.13" structopt = "0.3.26"
tabular = "0.1.4" tabular = "0.2.0"
rayon = "1.5.1" rayon = "1.5.3"
crossbeam = "0.8.1" crossbeam = "0.8.2"
num_cpus = "1.13.0" num_cpus = "1.13.1"
rustversion = "1.0" rustversion = "1.0.9"
libc = "0.2.97" libc = "0.2.134"
core_affinity = "0.5.10" core_affinity = "0.5.10"
colour = "0.6.0" colour = "0.6.0"
[dev-dependencies] [dev-dependencies]
rstest = "0.10.0" rstest = "0.13.0"
[build-dependencies] [build-dependencies]
rustversion = "1.0" rustversion = "1.0"

View File

@ -54,7 +54,7 @@ use_field_init_shorthand = false
force_explicit_abi = true force_explicit_abi = true
condense_wildcard_suffixes = false condense_wildcard_suffixes = false
color = "Auto" color = "Auto"
required_version = "1.4.38" required_version = "1.6.0"
unstable_features = false unstable_features = false
disable_all_formatting = false disable_all_formatting = false
skip_children = false skip_children = false

View File

@ -174,7 +174,7 @@ where StreamData<T, D, A>: RustStream<T> {
); );
} }
stream.init_arrays(); let init = stream.run_init_arrays();
let tabulate = |xs: &Vec<Duration>, name: &str, t_size: usize| -> Vec<(&str, String)> { let tabulate = |xs: &Vec<Duration>, name: &str, t_size: usize| -> Vec<(&str, String)> {
let tail = &xs[1..]; // tail only let tail = &xs[1..]; // tail only
@ -235,10 +235,47 @@ where StreamData<T, D, A>: RustStream<T> {
}; };
}; };
let show_setup = |init: Duration, read: Duration| {
let setup = vec![
("Init", init.as_secs_f64(), 3 * array_bytes),
("Read", read.as_secs_f64(), 3 * array_bytes),
];
if option.csv {
tabulate_all(
setup
.iter()
.map(|(name, elapsed, t_size)| {
vec![
("phase", name.to_string()),
("n_elements", option.arraysize.to_string()),
("sizeof", t_size.to_string()),
(
if option.mibibytes { "max_mibytes_per_sec" } else { "max_mbytes_per_sec" },
(mega_scale * (*t_size as f64) / elapsed).to_string(),
),
("runtime", elapsed.to_string()),
]
})
.collect::<Vec<_>>(),
);
} else {
for (name, elapsed, t_size) in setup {
println!(
"{}: {:.5} s (={:.5} {})",
name,
elapsed,
mega_scale * (t_size as f64) / elapsed,
if option.mibibytes { "MiBytes/sec" } else { "MBytes/sec" }
);
}
}
};
let solutions_correct = match benchmark { let solutions_correct = match benchmark {
Benchmark::All => { Benchmark::All => {
let (results, sum) = stream.run_all(option.numtimes); let (results, sum) = stream.run_all(option.numtimes);
stream.read_arrays(); let read = stream.run_read_arrays();
show_setup(init, read);
let correct = check_solution(benchmark, option.numtimes, &stream, Some(sum)); let correct = check_solution(benchmark, option.numtimes, &stream, Some(sum));
tabulate_all(vec![ tabulate_all(vec![
tabulate(&results.copy, "Copy", 2 * array_bytes), tabulate(&results.copy, "Copy", 2 * array_bytes),
@ -251,14 +288,16 @@ where StreamData<T, D, A>: RustStream<T> {
} }
Benchmark::NStream => { Benchmark::NStream => {
let results = stream.run_nstream(option.numtimes); let results = stream.run_nstream(option.numtimes);
stream.read_arrays(); let read = stream.run_read_arrays();
show_setup(init, read);
let correct = check_solution(benchmark, option.numtimes, &stream, None); let correct = check_solution(benchmark, option.numtimes, &stream, None);
tabulate_all(vec![tabulate(&results, "Nstream", 4 * array_bytes)]); tabulate_all(vec![tabulate(&results, "Nstream", 4 * array_bytes)]);
correct correct
} }
Benchmark::Triad => { Benchmark::Triad => {
let results = stream.run_triad(option.numtimes); let results = stream.run_triad(option.numtimes);
stream.read_arrays(); let read = stream.run_read_arrays();
show_setup(init, read);
let correct = check_solution(benchmark, option.numtimes, &stream, None); let correct = check_solution(benchmark, option.numtimes, &stream, None);
let total_bytes = 3 * array_bytes * option.numtimes; let total_bytes = 3 * array_bytes * option.numtimes;
let bandwidth = giga_scale * (total_bytes as f64 / results.as_secs_f64()); let bandwidth = giga_scale * (total_bytes as f64 / results.as_secs_f64());

View File

@ -132,6 +132,18 @@ pub trait RustStream<T: Default> {
fn nstream(&mut self); fn nstream(&mut self);
fn dot(&mut self) -> T; fn dot(&mut self) -> T;
fn run_init_arrays(&mut self) -> Duration {
timed(|| {
self.init_arrays();
})
}
fn run_read_arrays(&mut self) -> Duration {
timed(|| {
self.read_arrays();
})
}
fn run_all(&mut self, n: usize) -> (AllTiming<Vec<Duration>>, T) { fn run_all(&mut self, n: usize) -> (AllTiming<Vec<Duration>>, T) {
let mut timings: AllTiming<Vec<Duration>> = AllTiming { let mut timings: AllTiming<Vec<Duration>> = AllTiming {
copy: vec![Duration::default(); n], copy: vec![Duration::default(); n],

View File

@ -2,10 +2,10 @@ use rstest::rstest;
#[rstest] #[rstest]
fn test_main( fn test_main(
#[values(0, 1, 2, 3, 4)] device: usize, // #[values(0, 1, 2, 3, 4)] device: usize, //
#[values("", "--pin")] pin: &str, // #[values("", "--pin")] pin: &str, //
#[values("", "--malloc")] malloc: &str, // #[values("", "--malloc")] malloc: &str, //
#[values("", "--init")] init: &str, // #[values("", "--init")] init: &str, //
#[values("", "--triad-only", "--nstream-only")] option: &str, // #[values("", "--triad-only", "--nstream-only")] option: &str, //
) { ) {
let line = format!( let line = format!(

View File

@ -1 +0,0 @@
{"name":"sbt","version":"1.5.2","bspVersion":"2.0.0-M5","languages":["scala"],"argv":["/usr/lib/jvm/java-11-openjdk-11.0.11.0.9-2.fc33.x86_64/bin/java","-Xms100m","-Xmx100m","-classpath","/home/tom/.local/share/JetBrains/Toolbox/apps/IDEA-U/ch-0/211.7142.45.plugins/Scala/launcher/sbt-launch.jar","xsbt.boot.Boot","-bsp","--sbt-launch-jar=/home/tom/.local/share/JetBrains/Toolbox/apps/IDEA-U/ch-0/211.7142.45.plugins/Scala/launcher/sbt-launch.jar"]}

View File

@ -1 +1,2 @@
target/ target/
.bsp/

View File

@ -1,4 +1,4 @@
version = "3.0.0-RC2" version = "3.7.14"
runner.dialect = scala3 runner.dialect = scala3
style = defaultWithAlign style = defaultWithAlign

View File

@ -3,14 +3,19 @@ lazy val mainCls = Some("scalastream.App")
lazy val root = (project in file(".")) lazy val root = (project in file("."))
.enablePlugins(NativeImagePlugin) .enablePlugins(NativeImagePlugin)
.settings( .settings(
scalaVersion := "3.0.0", scalaVersion := "3.3.1",
version := "4.0", version := "5.0",
organization := "uk.ac.bristol.uob-hpc", organization := "uk.ac.bristol.uob-hpc",
organizationName := "University of Bristol", organizationName := "University of Bristol",
Compile / mainClass := mainCls, Compile / mainClass := mainCls,
assembly / mainClass := mainCls, assembly / mainClass := mainCls,
scalacOptions ~= filterConsoleScalacOptions, scalacOptions ~= filterConsoleScalacOptions,
assembly / assemblyJarName := "scala-stream.jar", assembly / assemblyJarName := "scala-stream.jar",
assembly / assemblyMergeStrategy := {
case PathList("module-info.class") => MergeStrategy.discard
case PathList("META-INF", "versions", xs @ _, "module-info.class") => MergeStrategy.discard
case x => (ThisBuild / assemblyMergeStrategy).value(x)
},
nativeImageOptions := Seq( nativeImageOptions := Seq(
"--no-fallback", "--no-fallback",
"-H:ReflectionConfigurationFiles=../../reflect-config.json" "-H:ReflectionConfigurationFiles=../../reflect-config.json"
@ -22,8 +27,8 @@ lazy val root = (project in file("."))
// Lazy val implementation in Scala 3 triggers an exception in nativeImage, use 2_13 for arg parsing for now otherwise we can't get to the benchmarking part // Lazy val implementation in Scala 3 triggers an exception in nativeImage, use 2_13 for arg parsing for now otherwise we can't get to the benchmarking part
("com.github.scopt" %% "scopt" % "4.0.1").cross(CrossVersion.for3Use2_13), ("com.github.scopt" %% "scopt" % "4.0.1").cross(CrossVersion.for3Use2_13),
// par also uses lazy val at some point, so it doesn't work in nativeImage // par also uses lazy val at some point, so it doesn't work in nativeImage
"org.scala-lang.modules" %% "scala-parallel-collections" % "1.0.3", "org.scala-lang.modules" %% "scala-parallel-collections" % "1.0.4",
"net.openhft" % "affinity" % "3.21ea1", "net.openhft" % "affinity" % "3.23.2",
"org.slf4j" % "slf4j-simple" % "1.7.30" // for affinity "org.slf4j" % "slf4j-simple" % "2.0.5" // for affinity
) )
) )

View File

@ -1 +1 @@
sbt.version=1.5.2 sbt.version=1.9.2

View File

@ -1,6 +1,6 @@
addSbtPlugin("com.timushev.sbt" % "sbt-updates" % "0.5.3") addSbtPlugin("com.timushev.sbt" % "sbt-updates" % "0.5.3")
addSbtPlugin("io.github.davidgregory084" % "sbt-tpolecat" % "0.1.17") addSbtPlugin("io.github.davidgregory084" % "sbt-tpolecat" % "0.1.20")
addSbtPlugin("org.scalameta" % "sbt-native-image" % "0.3.0") addSbtPlugin("org.scalameta" % "sbt-native-image" % "0.3.0")
addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.15.0") addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "2.1.3")
addSbtPlugin("ch.epfl.scala" % "sbt-scalafix" % "0.9.27") addSbtPlugin("ch.epfl.scala" % "sbt-scalafix" % "0.9.27")
addSbtPlugin("org.scalameta" % "sbt-scalafmt" % "2.4.2") addSbtPlugin("org.scalameta" % "sbt-scalafmt" % "2.4.3")

View File

@ -14,6 +14,7 @@ transparent trait ScalaStream[@specialized(Float, Double) A]:
def config: Config[A] def config: Config[A]
def initArrays(): Unit def initArrays(): Unit
def readArrays(): Unit = ()
def copy(): Unit def copy(): Unit
def mul(): Unit def mul(): Unit
def add(): Unit def add(): Unit
@ -27,6 +28,8 @@ transparent trait ScalaStream[@specialized(Float, Double) A]:
val end = System.nanoTime() val end = System.nanoTime()
FiniteDuration(end - start, TimeUnit.NANOSECONDS) -> r FiniteDuration(end - start, TimeUnit.NANOSECONDS) -> r
inline def runInitArrays(): FiniteDuration = timed(initArrays())._1
inline def runReadArrays(): FiniteDuration = timed(readArrays())._1
inline def runAll(times: Int)(using Fractional[A]): (Timings[Vector[FiniteDuration]], A) = inline def runAll(times: Int)(using Fractional[A]): (Timings[Vector[FiniteDuration]], A) =
val copy = ArrayBuffer.fill[FiniteDuration](times)(Duration.Zero) val copy = ArrayBuffer.fill[FiniteDuration](times)(Duration.Zero)
val mul = ArrayBuffer.fill[FiniteDuration](times)(Duration.Zero) val mul = ArrayBuffer.fill[FiniteDuration](times)(Duration.Zero)
@ -62,7 +65,6 @@ transparent trait ScalaStream[@specialized(Float, Double) A]:
def data(): Data[A] def data(): Data[A]
trait Fractional[@specialized(Double, Float) A]: trait Fractional[@specialized(Double, Float) A]:
def toFractional(f: Float): A def toFractional(f: Float): A
def toFractional(f: Double): A def toFractional(f: Double): A
@ -77,13 +79,13 @@ trait Fractional[@specialized(Double, Float) A]:
extension (x: Int) inline def fractional = toFractional(x.toFloat) extension (x: Int) inline def fractional = toFractional(x.toFloat)
extension (x: Long) inline def fractional = toFractional(x.toDouble) extension (x: Long) inline def fractional = toFractional(x.toDouble)
extension (x: A) extension (x: A)
inline def +(y: A) = add(x, y) inline def +(y: A) = add(x, y)
inline def -(y: A) = sub(x, y) inline def -(y: A) = sub(x, y)
inline def *(y: A) = mul(x, y) inline def *(y: A) = mul(x, y)
inline def /(y: A) = div(x, y) inline def /(y: A) = div(x, y)
inline def >(y: A) = compare(x, y) > 0 inline def >(y: A) = compare(x, y) > 0
inline def <(y: A) = compare(x, y) < 0 inline def <(y: A) = compare(x, y) < 0
inline def abs_ = abs(x) inline def abs_ = abs(x)
end Fractional end Fractional
given FloatFractional: Fractional[Float] with given FloatFractional: Fractional[Float] with
@ -108,7 +110,7 @@ given DoubleFractional: Fractional[Double] with
object App: object App:
final val Version: String = "4.0" final val Version: String = "5.0"
case class Config[@specialized(Double, Float) A]( case class Config[@specialized(Double, Float) A](
options: Options, options: Options,
@ -204,7 +206,7 @@ object App:
validateXs("c", vec.c, goldC) validateXs("c", vec.c, goldC)
dotSum.foreach { sum => dotSum.foreach { sum =>
val goldSum = (goldA * goldB) * (config.options.arraysize).fractional val goldSum = (goldA * goldB) * config.options.arraysize.fractional
val error = ((sum - goldSum) / goldSum).abs_ val error = ((sum - goldSum) / goldSum).abs_
if error > 1.fractional / 100000000.fractional then if error > 1.fractional / 100000000.fractional then
Console.err.println( Console.err.println(
@ -238,10 +240,10 @@ object App:
) )
println(s"Running ${config.benchmark match { println(s"Running ${config.benchmark match {
case Benchmark.All => "kernels" case Benchmark.All => "kernels"
case Benchmark.Triad => "triad" case Benchmark.Triad => "triad"
case Benchmark.NStream => "nstream" case Benchmark.NStream => "nstream"
}} ${opt.numtimes} times") }} ${opt.numtimes} times")
if config.benchmark == Benchmark.Triad then println(s"Number of elements: ${opt.arraysize}") if config.benchmark == Benchmark.Triad then println(s"Number of elements: ${opt.arraysize}")
@ -288,11 +290,38 @@ object App:
println(header.map(_._1.padTo(padding, ' ')).mkString(sep)) println(header.map(_._1.padTo(padding, ' ')).mkString(sep))
println(rows.map(_.map(_._2.padTo(padding, ' ')).mkString(sep)).mkString("\n")) println(rows.map(_.map(_._2.padTo(padding, ' ')).mkString(sep)).mkString("\n"))
def showInit(init: FiniteDuration, read: FiniteDuration): Unit = {
val setup =
Vector(("Init", init.seconds, 3 * arrayBytes), ("Read", read.seconds, 3 * arrayBytes))
if opt.csv then
tabulate(
setup.map((name, elapsed, totalBytes) =>
Vector(
"phase" -> name,
"n_elements" -> opt.arraysize.toString,
"sizeof" -> arrayBytes.toString,
s"max_m${if opt.mibibytes then "i" else ""}bytes_per_sec" ->
(megaScale * totalBytes.toDouble / elapsed).toString,
"runtime" -> elapsed.toString
)
): _*
)
else
for (name, elapsed, totalBytes) <- setup do
println(
f"$name: $elapsed%.5f s (=${megaScale * totalBytes.toDouble / elapsed}%.5f M${
if opt.mibibytes then "i" else ""
}Bytes/sec)"
)
}
val stream = mkStream(config) val stream = mkStream(config)
stream.initArrays() val init = stream.runInitArrays()
config.benchmark match config.benchmark match
case Benchmark.All => case Benchmark.All =>
val (results, sum) = stream.runAll(opt.numtimes) val (results, sum) = stream.runAll(opt.numtimes)
val read = stream.runReadArrays()
showInit(init, read)
validate(stream.data(), config, Some(sum)) validate(stream.data(), config, Some(sum))
tabulate( tabulate(
mkRow(results.copy, "Copy", 2 * arrayBytes), mkRow(results.copy, "Copy", 2 * arrayBytes),
@ -303,10 +332,14 @@ object App:
) )
case Benchmark.NStream => case Benchmark.NStream =>
val result = stream.runNStream(opt.numtimes) val result = stream.runNStream(opt.numtimes)
val read = stream.runReadArrays()
showInit(init, read)
validate(stream.data(), config) validate(stream.data(), config)
tabulate(mkRow(result, "Nstream", 4 * arrayBytes)) tabulate(mkRow(result, "Nstream", 4 * arrayBytes))
case Benchmark.Triad => case Benchmark.Triad =>
val results = stream.runTriad(opt.numtimes) val results = stream.runTriad(opt.numtimes)
val read = stream.runReadArrays()
showInit(init, read)
val totalBytes = 3 * arrayBytes * opt.numtimes val totalBytes = 3 * arrayBytes * opt.numtimes
val bandwidth = megaScale * (totalBytes / results.seconds) val bandwidth = megaScale * (totalBytes / results.seconds)
println(f"Runtime (seconds): ${results.seconds}%.5f") println(f"Runtime (seconds): ${results.seconds}%.5f")

View File

@ -6,64 +6,76 @@
#include "STDDataStream.h" #include "STDDataStream.h"
#include <algorithm>
#include <execution>
#include <numeric>
// There are three execution policies:
// auto exe_policy = std::execution::seq;
// auto exe_policy = std::execution::par;
auto exe_policy = std::execution::par_unseq;
template <class T> template <class T>
STDDataStream<T>::STDDataStream(const int ARRAY_SIZE, int device) STDDataStream<T>::STDDataStream(const int ARRAY_SIZE, int device)
noexcept : array_size{ARRAY_SIZE}, a(array_size), b(array_size), c(array_size) noexcept : array_size{ARRAY_SIZE},
a(alloc_raw<T>(ARRAY_SIZE)), b(alloc_raw<T>(ARRAY_SIZE)), c(alloc_raw<T>(ARRAY_SIZE))
{ {
std::cout << "Backing storage typeid: " << typeid(a).name() << std::endl;
#ifdef USE_ONEDPL
std::cout << "Using oneDPL backend: ";
#if ONEDPL_USE_DPCPP_BACKEND
std::cout << "SYCL USM (device=" << exe_policy.queue().get_device().get_info<sycl::info::device::name>() << ")";
#elif ONEDPL_USE_TBB_BACKEND
std::cout << "TBB " TBB_VERSION_STRING;
#elif ONEDPL_USE_OPENMP_BACKEND
std::cout << "OpenMP";
#else
std::cout << "Default";
#endif
std::cout << std::endl;
#endif
}
template<class T>
STDDataStream<T>::~STDDataStream() {
dealloc_raw(a);
dealloc_raw(b);
dealloc_raw(c);
} }
template <class T> template <class T>
void STDDataStream<T>::init_arrays(T initA, T initB, T initC) void STDDataStream<T>::init_arrays(T initA, T initB, T initC)
{ {
std::fill(exe_policy, a.begin(), a.end(), initA); std::fill(exe_policy, a, a + array_size, initA);
std::fill(exe_policy, b.begin(), b.end(), initB); std::fill(exe_policy, b, b + array_size, initB);
std::fill(exe_policy, c.begin(), c.end(), initC); std::fill(exe_policy, c, c + array_size, initC);
} }
template <class T> template <class T>
void STDDataStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c) void STDDataStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
{ {
h_a = a; std::copy(a, a + array_size, h_a.begin());
h_b = b; std::copy(b, b + array_size, h_b.begin());
h_c = c; std::copy(c, c + array_size, h_c.begin());
} }
template <class T> template <class T>
void STDDataStream<T>::copy() void STDDataStream<T>::copy()
{ {
// c[i] = a[i] // c[i] = a[i]
std::copy(exe_policy, a.begin(), a.end(), c.begin()); std::copy(exe_policy, a, a + array_size, c);
} }
template <class T> template <class T>
void STDDataStream<T>::mul() void STDDataStream<T>::mul()
{ {
// b[i] = scalar * c[i]; // b[i] = scalar * c[i];
std::transform(exe_policy, c.begin(), c.end(), b.begin(), [scalar = startScalar](T ci){ return scalar*ci; }); std::transform(exe_policy, c, c + array_size, b, [scalar = startScalar](T ci){ return scalar*ci; });
} }
template <class T> template <class T>
void STDDataStream<T>::add() void STDDataStream<T>::add()
{ {
// c[i] = a[i] + b[i]; // c[i] = a[i] + b[i];
std::transform(exe_policy, a.begin(), a.end(), b.begin(), c.begin(), std::plus<T>()); std::transform(exe_policy, a, a + array_size, b, c, std::plus<T>());
} }
template <class T> template <class T>
void STDDataStream<T>::triad() void STDDataStream<T>::triad()
{ {
// a[i] = b[i] + scalar * c[i]; // a[i] = b[i] + scalar * c[i];
std::transform(exe_policy, b.begin(), b.end(), c.begin(), a.begin(), [scalar = startScalar](T bi, T ci){ return bi+scalar*ci; }); std::transform(exe_policy, b, b + array_size, c, a, [scalar = startScalar](T bi, T ci){ return bi+scalar*ci; });
} }
template <class T> template <class T>
@ -73,8 +85,8 @@ void STDDataStream<T>::nstream()
// Need to do in two stages with C++11 STL. // Need to do in two stages with C++11 STL.
// 1: a[i] += b[i] // 1: a[i] += b[i]
// 2: a[i] += scalar * c[i]; // 2: a[i] += scalar * c[i];
std::transform(exe_policy, a.begin(), a.end(), b.begin(), a.begin(), [](T ai, T bi){ return ai + bi; }); std::transform(exe_policy, a, a + array_size, b, a, [](T ai, T bi){ return ai + bi; });
std::transform(exe_policy, a.begin(), a.end(), c.begin(), a.begin(), [scalar = startScalar](T ai, T ci){ return ai + scalar*ci; }); std::transform(exe_policy, a, a + array_size, c, a, [scalar = startScalar](T ai, T ci){ return ai + scalar*ci; });
} }
@ -82,7 +94,7 @@ template <class T>
T STDDataStream<T>::dot() T STDDataStream<T>::dot()
{ {
// sum = 0; sum += a[i]*b[i]; return sum; // sum = 0; sum += a[i]*b[i]; return sum;
return std::transform_reduce(exe_policy, a.begin(), a.end(), b.begin(), 0.0); return std::transform_reduce(exe_policy, a, a + array_size, b, T{});
} }
void listDevices(void) void listDevices(void)
@ -101,4 +113,3 @@ std::string getDeviceDriver(const int)
} }
template class STDDataStream<float>; template class STDDataStream<float>;
template class STDDataStream<double>; template class STDDataStream<double>;

View File

@ -5,6 +5,7 @@
// source code // source code
#pragma once #pragma once
#include "dpl_shim.h"
#include <iostream> #include <iostream>
#include <stdexcept> #include <stdexcept>
@ -21,14 +22,11 @@ class STDDataStream : public Stream<T>
int array_size; int array_size;
// Device side pointers // Device side pointers
std::vector<T> a; T *a, *b, *c;
std::vector<T> b;
std::vector<T> c;
public: public:
STDDataStream(const int, int) noexcept; STDDataStream(const int, int) noexcept;
~STDDataStream() = default; ~STDDataStream();
virtual void copy() override; virtual void copy() override;
virtual void add() override; virtual void add() override;

View File

@ -19,15 +19,35 @@ register_flag_optional(NVHPC_OFFLOAD
ccall - Compile for all supported compute capabilities" ccall - Compile for all supported compute capabilities"
"") "")
register_flag_optional(USE_TBB
"No-op if ONE_TBB_DIR is set. Link against an in-tree oneTBB via FetchContent_Declare, see top level CMakeLists.txt for details."
"OFF")
register_flag_optional(USE_ONEDPL
"Link oneDPL which implements C++17 executor policies (via execution_policy_tag) for different backends.
Possible values are:
OPENMP - Implements policies using OpenMP.
CMake will handle any flags needed to enable OpenMP if the compiler supports it.
TBB - Implements policies using TBB.
TBB must be linked via USE_TBB or be available in LD_LIBRARY_PATH.
DPCPP - Implements policies through SYCL2020.
This requires the DPC++ compiler (other SYCL compilers are untested), required SYCL flags are added automatically."
"OFF")
macro(setup) macro(setup)
set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD 17)
if (NVHPC_OFFLOAD) if (NVHPC_OFFLOAD)
set(NVHPC_FLAGS -stdpar -gpu=${NVHPC_OFFLOAD}) set(NVHPC_FLAGS -stdpar -gpu=${NVHPC_OFFLOAD})
# propagate flags to linker so that it links with the gpu stuff as well # propagate flags to linker so that it links with the gpu stuff as well
register_append_cxx_flags(ANY ${NVHPC_FLAGS}) register_append_cxx_flags(ANY ${NVHPC_FLAGS})
register_append_link_flags(${NVHPC_FLAGS}) register_append_link_flags(${NVHPC_FLAGS})
endif () endif ()
if (USE_TBB)
register_link_library(TBB::tbb)
endif ()
if (USE_ONEDPL)
register_definitions(USE_ONEDPL)
register_link_library(oneDPL)
endif ()
endmacro() endmacro()

View File

@ -6,50 +6,66 @@
#include "STDIndicesStream.h" #include "STDIndicesStream.h"
#include <algorithm> #ifndef ALIGNMENT
#include <execution> #define ALIGNMENT (2*1024*1024) // 2MB
#include <numeric> #endif
// There are three execution policies:
// auto exe_policy = std::execution::seq;
// auto exe_policy = std::execution::par;
auto exe_policy = std::execution::par_unseq;
template <class T> template <class T>
STDIndicesStream<T>::STDIndicesStream(const int ARRAY_SIZE, int device) STDIndicesStream<T>::STDIndicesStream(const int ARRAY_SIZE, int device)
noexcept : array_size{ARRAY_SIZE}, range(0, array_size), a(array_size), b(array_size), c(array_size) noexcept : array_size{ARRAY_SIZE}, range(0, array_size),
a(alloc_raw<T>(ARRAY_SIZE)), b(alloc_raw<T>(ARRAY_SIZE)), c(alloc_raw<T>(ARRAY_SIZE))
{ {
std::cout << "Backing storage typeid: " << typeid(a).name() << std::endl;
#ifdef USE_ONEDPL
std::cout << "Using oneDPL backend: ";
#if ONEDPL_USE_DPCPP_BACKEND
std::cout << "SYCL USM (device=" << exe_policy.queue().get_device().get_info<sycl::info::device::name>() << ")";
#elif ONEDPL_USE_TBB_BACKEND
std::cout << "TBB " TBB_VERSION_STRING;
#elif ONEDPL_USE_OPENMP_BACKEND
std::cout << "OpenMP";
#else
std::cout << "Default";
#endif
std::cout << std::endl;
#endif
}
template<class T>
STDIndicesStream<T>::~STDIndicesStream() {
dealloc_raw(a);
dealloc_raw(b);
dealloc_raw(c);
} }
template <class T> template <class T>
void STDIndicesStream<T>::init_arrays(T initA, T initB, T initC) void STDIndicesStream<T>::init_arrays(T initA, T initB, T initC)
{ {
std::fill(exe_policy, a.begin(), a.end(), initA); std::fill(exe_policy, a, a + array_size, initA);
std::fill(exe_policy, b.begin(), b.end(), initB); std::fill(exe_policy, b, b + array_size, initB);
std::fill(exe_policy, c.begin(), c.end(), initC); std::fill(exe_policy, c, c + array_size, initC);
} }
template <class T> template <class T>
void STDIndicesStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c) void STDIndicesStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
{ {
h_a = a; std::copy(a, a + array_size, h_a.begin());
h_b = b; std::copy(b, b + array_size, h_b.begin());
h_c = c; std::copy(c, c + array_size, h_c.begin());
} }
template <class T> template <class T>
void STDIndicesStream<T>::copy() void STDIndicesStream<T>::copy()
{ {
// c[i] = a[i] // c[i] = a[i]
std::copy(exe_policy, a.begin(), a.end(), c.begin()); std::copy(exe_policy, a, a + array_size, c);
} }
template <class T> template <class T>
void STDIndicesStream<T>::mul() void STDIndicesStream<T>::mul()
{ {
// b[i] = scalar * c[i]; // b[i] = scalar * c[i];
std::transform(exe_policy, range.begin(), range.end(), b.begin(), [&, scalar = startScalar](int i) { std::transform(exe_policy, range.begin(), range.end(), b, [c = this->c, scalar = startScalar](int i) {
return scalar * c[i]; return scalar * c[i];
}); });
} }
@ -58,7 +74,7 @@ template <class T>
void STDIndicesStream<T>::add() void STDIndicesStream<T>::add()
{ {
// c[i] = a[i] + b[i]; // c[i] = a[i] + b[i];
std::transform(exe_policy, range.begin(), range.end(), c.begin(), [&](int i) { std::transform(exe_policy, range.begin(), range.end(), c, [a = this->a, b = this->b](int i) {
return a[i] + b[i]; return a[i] + b[i];
}); });
} }
@ -67,7 +83,7 @@ template <class T>
void STDIndicesStream<T>::triad() void STDIndicesStream<T>::triad()
{ {
// a[i] = b[i] + scalar * c[i]; // a[i] = b[i] + scalar * c[i];
std::transform(exe_policy, range.begin(), range.end(), a.begin(), [&, scalar = startScalar](int i) { std::transform(exe_policy, range.begin(), range.end(), a, [b = this->b, c = this->c, scalar = startScalar](int i) {
return b[i] + scalar * c[i]; return b[i] + scalar * c[i];
}); });
} }
@ -79,7 +95,7 @@ void STDIndicesStream<T>::nstream()
// Need to do in two stages with C++11 STL. // Need to do in two stages with C++11 STL.
// 1: a[i] += b[i] // 1: a[i] += b[i]
// 2: a[i] += scalar * c[i]; // 2: a[i] += scalar * c[i];
std::transform(exe_policy, range.begin(), range.end(), a.begin(), [&, scalar = startScalar](int i) { std::transform(exe_policy, range.begin(), range.end(), a, [a = this->a, b = this->b, c = this->c, scalar = startScalar](int i) {
return a[i] + b[i] + scalar * c[i]; return a[i] + b[i] + scalar * c[i];
}); });
} }
@ -89,7 +105,7 @@ template <class T>
T STDIndicesStream<T>::dot() T STDIndicesStream<T>::dot()
{ {
// sum = 0; sum += a[i]*b[i]; return sum; // sum = 0; sum += a[i]*b[i]; return sum;
return std::transform_reduce(exe_policy, a.begin(), a.end(), b.begin(), 0.0); return std::transform_reduce(exe_policy, a, a + array_size, b, T{});
} }
void listDevices(void) void listDevices(void)
@ -108,4 +124,3 @@ std::string getDeviceDriver(const int)
} }
template class STDIndicesStream<float>; template class STDIndicesStream<float>;
template class STDIndicesStream<double>; template class STDIndicesStream<double>;

View File

@ -5,6 +5,7 @@
// source code // source code
#pragma once #pragma once
#include "dpl_shim.h"
#include <iostream> #include <iostream>
#include <stdexcept> #include <stdexcept>
@ -12,40 +13,57 @@
#define IMPLEMENTATION_STRING "STD (index-oriented)" #define IMPLEMENTATION_STRING "STD (index-oriented)"
// A lightweight counting iterator which will be used by the STL algorithms // A lightweight counting iterator which will be used by the STL algorithms
// NB: C++ <= 17 doesn't have this built-in, and it's only added later in ranges-v3 (C++2a) which this // NB: C++ <= 17 doesn't have this built-in, and it's only added later in ranges-v3 (C++2a) which this
// implementation doesn't target // implementation doesn't target
template <typename N> template <typename N>
class ranged { class ranged {
N from, to;
public: public:
ranged(N from, N to ): from(from), to(to) {} class iterator {
class iterator { friend class ranged;
N num;
public: public:
using difference_type = N; using difference_type = N;
using value_type = N; using value_type = N;
using pointer = const N*; using pointer = const N*;
using reference = const N&; using reference = N;
using iterator_category = std::random_access_iterator_tag; using iterator_category = std::random_access_iterator_tag;
explicit iterator(N _num = 0) : num(_num) {}
iterator& operator++() { num++; return *this; } // XXX This is not part of the iterator spec, it gets picked up by oneDPL if enabled.
iterator operator++(int) { iterator retval = *this; ++(*this); return retval; } // Without this, the DPL SYCL backend collects the iterator data on the host and copies to the device.
iterator operator+(const value_type v) const { return iterator(num + v); } // This type is unused for any nother STL impl.
using is_passed_directly = std::true_type;
bool operator==(iterator other) const { return num == other.num; } reference operator *() const { return i_; }
bool operator!=(iterator other) const { return *this != other; } iterator &operator ++() { ++i_; return *this; }
bool operator<(iterator other) const { return num < other.num; } iterator operator ++(int) { iterator copy(*this); ++i_; return copy; }
reference operator*() const { return num;} iterator &operator --() { --i_; return *this; }
difference_type operator-(const iterator &it) const { return num - it.num; } iterator operator --(int) { iterator copy(*this); --i_; return copy; }
value_type operator[](const difference_type &i) const { return num + i; }
}; iterator &operator +=(N by) { i_+=by; return *this; }
iterator begin() { return iterator(from); }
iterator end() { return iterator(to >= from? to+1 : to-1); } value_type operator[](const difference_type &i) const { return i_ + i; }
difference_type operator-(const iterator &it) const { return i_ - it.i_; }
iterator operator+(const value_type v) const { return iterator(i_ + v); }
bool operator ==(const iterator &other) const { return i_ == other.i_; }
bool operator !=(const iterator &other) const { return i_ != other.i_; }
bool operator < (const iterator &other) const { return i_ < other.i_; }
protected:
explicit iterator(N start) : i_ (start) {}
private:
N i_;
};
[[nodiscard]] iterator begin() const { return begin_; }
[[nodiscard]] iterator end() const { return end_; }
ranged(N begin, N end) : begin_(begin), end_(end) {}
private:
iterator begin_;
iterator end_;
}; };
template <class T> template <class T>
@ -59,14 +77,11 @@ class STDIndicesStream : public Stream<T>
ranged<int> range; ranged<int> range;
// Device side pointers // Device side pointers
std::vector<T> a; T *a, *b, *c;
std::vector<T> b;
std::vector<T> c;
public: public:
STDIndicesStream(const int, int) noexcept; STDIndicesStream(const int, int) noexcept;
~STDIndicesStream() = default; ~STDIndicesStream();
virtual void copy() override; virtual void copy() override;
virtual void add() override; virtual void add() override;

View File

@ -19,15 +19,35 @@ register_flag_optional(NVHPC_OFFLOAD
ccall - Compile for all supported compute capabilities" ccall - Compile for all supported compute capabilities"
"") "")
register_flag_optional(USE_TBB
"Link against an in-tree oneTBB via FetchContent_Declare, see top level CMakeLists.txt for details."
"OFF")
register_flag_optional(USE_ONEDPL
"Link oneDPL which implements C++17 executor policies (via execution_policy_tag) for different backends.
Possible values are:
OPENMP - Implements policies using OpenMP.
CMake will handle any flags needed to enable OpenMP if the compiler supports it.
TBB - Implements policies using TBB.
TBB must be linked via USE_TBB or be available in LD_LIBRARY_PATH.
DPCPP - Implements policies through SYCL2020.
This requires the DPC++ compiler (other SYCL compilers are untested), required SYCL flags are added automatically."
"OFF")
macro(setup) macro(setup)
set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD 17)
if (NVHPC_OFFLOAD) if (NVHPC_OFFLOAD)
set(NVHPC_FLAGS -stdpar -gpu=${NVHPC_OFFLOAD}) set(NVHPC_FLAGS -stdpar -gpu=${NVHPC_OFFLOAD})
# propagate flags to linker so that it links with the gpu stuff as well # propagate flags to linker so that it links with the gpu stuff as well
register_append_cxx_flags(ANY ${NVHPC_FLAGS}) register_append_cxx_flags(ANY ${NVHPC_FLAGS})
register_append_link_flags(${NVHPC_FLAGS}) register_append_link_flags(${NVHPC_FLAGS})
endif () endif ()
if (USE_TBB)
register_link_library(TBB::tbb)
endif ()
if (USE_ONEDPL)
register_definitions(USE_ONEDPL)
register_link_library(oneDPL)
endif ()
endmacro() endmacro()

View File

@ -5,25 +5,45 @@
// source code // source code
#include "STDRangesStream.hpp" #include "STDRangesStream.hpp"
#include <algorithm>
#include <execution>
#include <ranges> #include <ranges>
#ifndef ALIGNMENT
#define ALIGNMENT (2*1024*1024) // 2MB
#endif
template <class T> template <class T>
STDRangesStream<T>::STDRangesStream(const int ARRAY_SIZE, int device) STDRangesStream<T>::STDRangesStream(const int ARRAY_SIZE, int device)
: array_size{ARRAY_SIZE} noexcept : array_size{ARRAY_SIZE},
a(alloc_raw<T>(ARRAY_SIZE)), b(alloc_raw<T>(ARRAY_SIZE)), c(alloc_raw<T>(ARRAY_SIZE))
{ {
a = std::vector<T>(array_size); std::cout << "Backing storage typeid: " << typeid(a).name() << std::endl;
b = std::vector<T>(array_size); #ifdef USE_ONEDPL
c = std::vector<T>(array_size); std::cout << "Using oneDPL backend: ";
#if ONEDPL_USE_DPCPP_BACKEND
std::cout << "SYCL USM (device=" << exe_policy.queue().get_device().get_info<sycl::info::device::name>() << ")";
#elif ONEDPL_USE_TBB_BACKEND
std::cout << "TBB " TBB_VERSION_STRING;
#elif ONEDPL_USE_OPENMP_BACKEND
std::cout << "OpenMP";
#else
std::cout << "Default";
#endif
std::cout << std::endl;
#endif
}
template<class T>
STDRangesStream<T>::~STDRangesStream() {
dealloc_raw(a);
dealloc_raw(b);
dealloc_raw(c);
} }
template <class T> template <class T>
void STDRangesStream<T>::init_arrays(T initA, T initB, T initC) void STDRangesStream<T>::init_arrays(T initA, T initB, T initC)
{ {
std::for_each_n( std::for_each_n(
std::execution::par_unseq, exe_policy,
std::views::iota(0).begin(), array_size, // loop range std::views::iota(0).begin(), array_size, // loop range
[&] (int i) { [&] (int i) {
a[i] = initA; a[i] = initA;
@ -37,16 +57,16 @@ template <class T>
void STDRangesStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c) void STDRangesStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
{ {
// Element-wise copy. // Element-wise copy.
h_a = a; std::copy(a, a + array_size, h_a.begin());
h_b = b; std::copy(b, b + array_size, h_b.begin());
h_c = c; std::copy(c, c + array_size, h_c.begin());
} }
template <class T> template <class T>
void STDRangesStream<T>::copy() void STDRangesStream<T>::copy()
{ {
std::for_each_n( std::for_each_n(
std::execution::par_unseq, exe_policy,
std::views::iota(0).begin(), array_size, std::views::iota(0).begin(), array_size,
[&] (int i) { [&] (int i) {
c[i] = a[i]; c[i] = a[i];
@ -60,7 +80,7 @@ void STDRangesStream<T>::mul()
const T scalar = startScalar; const T scalar = startScalar;
std::for_each_n( std::for_each_n(
std::execution::par_unseq, exe_policy,
std::views::iota(0).begin(), array_size, std::views::iota(0).begin(), array_size,
[&] (int i) { [&] (int i) {
b[i] = scalar * c[i]; b[i] = scalar * c[i];
@ -72,7 +92,7 @@ template <class T>
void STDRangesStream<T>::add() void STDRangesStream<T>::add()
{ {
std::for_each_n( std::for_each_n(
std::execution::par_unseq, exe_policy,
std::views::iota(0).begin(), array_size, std::views::iota(0).begin(), array_size,
[&] (int i) { [&] (int i) {
c[i] = a[i] + b[i]; c[i] = a[i] + b[i];
@ -86,7 +106,7 @@ void STDRangesStream<T>::triad()
const T scalar = startScalar; const T scalar = startScalar;
std::for_each_n( std::for_each_n(
std::execution::par_unseq, exe_policy,
std::views::iota(0).begin(), array_size, std::views::iota(0).begin(), array_size,
[&] (int i) { [&] (int i) {
a[i] = b[i] + scalar * c[i]; a[i] = b[i] + scalar * c[i];
@ -100,7 +120,7 @@ void STDRangesStream<T>::nstream()
const T scalar = startScalar; const T scalar = startScalar;
std::for_each_n( std::for_each_n(
std::execution::par_unseq, exe_policy,
std::views::iota(0).begin(), array_size, std::views::iota(0).begin(), array_size,
[&] (int i) { [&] (int i) {
a[i] += b[i] + scalar * c[i]; a[i] += b[i] + scalar * c[i];
@ -114,8 +134,8 @@ T STDRangesStream<T>::dot()
// sum += a[i] * b[i]; // sum += a[i] * b[i];
return return
std::transform_reduce( std::transform_reduce(
std::execution::par_unseq, exe_policy,
a.begin(), a.end(), b.begin(), 0.0); a, a + array_size, b, T{});
} }
void listDevices(void) void listDevices(void)
@ -135,4 +155,3 @@ std::string getDeviceDriver(const int)
template class STDRangesStream<float>; template class STDRangesStream<float>;
template class STDRangesStream<double>; template class STDRangesStream<double>;

Some files were not shown because too many files have changed in this diff Show More