Merge pull request #169 from UoB-HPC/develop

Merge develop for v5.0
This commit is contained in:
Tom Deakin 2023-10-12 11:11:33 +01:00 committed by GitHub
commit f3801aeac2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
115 changed files with 7984 additions and 1855 deletions

View File

@ -12,12 +12,12 @@ on:
jobs:
test-rust:
runs-on: ubuntu-18.04
runs-on: ubuntu-22.04
defaults:
run:
working-directory: ./src/rust/rust-stream
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v4
- name: Setup project
run: rustup install nightly
- name: Compile project
@ -28,12 +28,12 @@ jobs:
run: ./target/release/rust-stream --arraysize 2048
test-java:
runs-on: ubuntu-18.04
runs-on: ubuntu-22.04
defaults:
run:
working-directory: ./src/java/java-stream
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v4
- name: Test build project
run: ./mvnw clean package
- name: Test run
@ -41,12 +41,12 @@ jobs:
run: java -jar target/java-stream.jar --arraysize 2048
test-julia:
runs-on: ubuntu-18.04
runs-on: ubuntu-22.04
defaults:
run:
working-directory: ./src/julia/JuliaStream.jl
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v4
- name: Setup project
run: julia --project -e 'import Pkg; Pkg.instantiate()'
- name: Test run PlainStream.jl
@ -70,14 +70,22 @@ jobs:
test-cpp:
runs-on: ubuntu-18.04
runs-on: ubuntu-22.04
steps:
- uses: actions/checkout@v2
- name: Maximize build space
uses: easimon/maximize-build-space@v8
with:
root-reserve-mb: 8192
swap-size-mb: 512
remove-android: 'true'
remove-codeql: 'true'
- uses: actions/checkout@v4
- name: Cache compiler
if: ${{ !env.ACT }}
id: prepare-compilers
uses: actions/cache@v2
uses: actions/cache@v3
with:
path: ./compilers
key: ${{ runner.os }}-${{ hashFiles('./src/ci-prepare-bionic.sh') }}
@ -90,9 +98,9 @@ jobs:
run: source ./src/ci-prepare-bionic.sh ./compilers VARS false || true
# Enable tmate debugging of manually-triggered workflows if the input option was provided
- name: Setup tmate session
uses: mxschmitt/action-tmate@v3
if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.debug_enabled }}
# - name: Setup tmate session
# uses: mxschmitt/action-tmate@v3
# if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.debug_enabled }}
- name: Test compile gcc @ CMake 3.13
if: ${{ ! cancelled() }}
@ -167,4 +175,65 @@ jobs:
run: ./src/ci-test-compile.sh ./build dpcpp all ${{ env.CMAKE_3_18_BIN }}
- name: Test compile hipsycl @ CMake 3.18
if: ${{ ! cancelled() }}
run: ./src/ci-test-compile.sh ./build hipsycl all ${{ env.CMAKE_3_18_BIN }}
run: ./src/ci-test-compile.sh ./build hipsycl all ${{ env.CMAKE_3_18_BIN }}
- name: Test compile gcc @ CMake 3.20
if: ${{ ! cancelled() }}
run: ./src/ci-test-compile.sh ./build gcc all ${{ env.CMAKE_3_20_BIN }}
- name: Test compile clang @ CMake 3.20
if: ${{ ! cancelled() }}
run: ./src/ci-test-compile.sh ./build clang all ${{ env.CMAKE_3_20_BIN }}
- name: Test compile nvhpc @ CMake 3.20
if: ${{ ! cancelled() }}
run: ./src/ci-test-compile.sh ./build nvhpc all ${{ env.CMAKE_3_20_BIN }}
- name: Test compile aocc @ CMake 3.20
if: ${{ ! cancelled() }}
run: ./src/ci-test-compile.sh ./build aocc all ${{ env.CMAKE_3_20_BIN }}
- name: Test compile aomp @ CMake 3.20
if: ${{ ! cancelled() }}
run: ./src/ci-test-compile.sh ./build aomp all ${{ env.CMAKE_3_20_BIN }}
- name: Test compile hip @ CMake 3.20
if: ${{ ! cancelled() }}
run: ./src/ci-test-compile.sh ./build hip all ${{ env.CMAKE_3_20_BIN }}
- name: Test compile dpcpp @ CMake 3.20
if: ${{ ! cancelled() }}
run: ./src/ci-test-compile.sh ./build dpcpp all ${{ env.CMAKE_3_20_BIN }}
- name: Test compile hipsycl @ CMake 3.20
if: ${{ ! cancelled() }}
run: ./src/ci-test-compile.sh ./build hipsycl all ${{ env.CMAKE_3_20_BIN }}
- name: Test compile gcc @ CMake 3.24
if: ${{ ! cancelled() }}
run: ./src/ci-test-compile.sh ./build gcc all ${{ env.CMAKE_3_24_BIN }}
- name: Test compile clang @ CMake 3.24
if: ${{ ! cancelled() }}
run: ./src/ci-test-compile.sh ./build clang all ${{ env.CMAKE_3_24_BIN }}
- name: Test compile nvhpc @ CMake 3.24
if: ${{ ! cancelled() }}
run: ./src/ci-test-compile.sh ./build nvhpc all ${{ env.CMAKE_3_24_BIN }}
- name: Test compile aocc @ CMake 3.24
if: ${{ ! cancelled() }}
run: ./src/ci-test-compile.sh ./build aocc all ${{ env.CMAKE_3_24_BIN }}
- name: Test compile aomp @ CMake 3.24
if: ${{ ! cancelled() }}
run: ./src/ci-test-compile.sh ./build aomp all ${{ env.CMAKE_3_24_BIN }}
- name: Test compile hip @ CMake 3.24
if: ${{ ! cancelled() }}
run: ./src/ci-test-compile.sh ./build hip all ${{ env.CMAKE_3_24_BIN }}
- name: Test compile dpcpp @ CMake 3.24
if: ${{ ! cancelled() }}
run: ./src/ci-test-compile.sh ./build dpcpp all ${{ env.CMAKE_3_24_BIN }}
- name: Test compile hipsycl @ CMake 3.24
if: ${{ ! cancelled() }}
run: ./src/ci-test-compile.sh ./build hipsycl all ${{ env.CMAKE_3_24_BIN }}
test-futhark:
runs-on: ubuntu-22.04
steps:
- uses: actions/checkout@v4
- name: Prepare Futhark compiler
uses: diku-dk/install-futhark@HEAD
with:
version: 'latest'
- run: cmake -Bbuild -H. -DMODEL=futhark -DFUTHARK_BACKEND=multicore
- run: cmake --build build

6
.gitignore vendored
View File

@ -10,12 +10,18 @@ sycl-stream
hip-stream
tbb-stream
src/fortran/BabelStream
src/fortran/BabelStream.*
*.o
*.bc
*.sycl
*.tar
*.gz
*.a
*.mod
*.cub
*.ptx
KokkosCore_config.*

View File

@ -2,8 +2,32 @@
All notable changes to this project will be documented in this file.
## Unreleased
### Added
- Ability to build Kokkos and RAJA versions against existing packages.
- Thrust managed memory.
- HIP managed memory.
- New implementation using SYCL2020 USM (sycl2020-usm) and renamed original `sycl2020` to `sycl2020-acc`.
- New implementation in Fortran
- New implementation in [Futhark](https://futhark-lang.org/)
- Data initialisation and read-back timing for all models, including Java, Scala, Julia, and Rust
- Add support for the latest Aparapi (3.0.0) and TornadoVM (0.15.x) for Java
- JuliaStream.jl published to registry (pending #113)
### Changed
- Fix std-data/std-indices compatibility with oneDPL, NVHPC, and AdaptiveCpp (a.k.a. hipSYCL).
- RAJA CUDA CMake build issues resolved.
- Kokkos build updates (CXX version upgraded to C++17).
- Fix CUDA memory limit check.
- Fix CUDA CMake options for `-DMEM` and `-DCMAKE_CUDA_FLAGS`.
- Use long double for `check_solution` in case of large problem size.
- OneAPI DPCPP compiler is deprecated in favour of ICPX, so added new build option to SYCL 2020 version.
- Updates to the HIP kernels and API usage.
- Number of thread-blocks in CUDA dot kernel implementation changed to 1024.
- Fix compatibility of `sycl2020` (now `sycl2020-acc`) with AdaptiveCpp.
- Bumped Julia compat to 1.9
- Bumped Scala to 3.3.1
- Bumped Rust to 1.74.0-nightly (13e6f24b9 2023-09-23)
- Upgrade CI to Ubuntu 22.04
## [v4.0] - 2021-12-22

View File

@ -1,6 +1,10 @@
cmake_minimum_required(VERSION 3.13 FATAL_ERROR)
project(BabelStream VERSION 4.0 LANGUAGES CXX)
if (CMAKE_VERSION VERSION_GREATER_EQUAL "3.24.0")
cmake_policy(SET CMP0135 NEW)
endif ()
project(BabelStream VERSION 5.0 LANGUAGES CXX C)
# uncomment for debugging build issues:
#set(CMAKE_VERBOSE_MAKEFILE ON)
@ -27,8 +31,6 @@ endmacro()
# the final executable name
set(EXE_NAME babelstream)
# select default build type
set(CMAKE_BUILD_TYPE "Release")
# for chrono and some basic CXX features, models can overwrite this if required
set(CMAKE_CXX_STANDARD 11)
@ -71,6 +73,75 @@ hint_flag(CXX_EXTRA_LINKER_FLAGS "
# Honor user's CXX_EXTRA_LINK_FLAGS
set(CXX_EXTRA_LINK_FLAGS ${CXX_EXTRA_FLAGS} ${CXX_EXTRA_LINK_FLAGS})
option(USE_TBB "Enable the oneTBB library for *supported* models. Enabling this on models that
don't explicitly link against TBB is a no-op, see description of your selected
model on how this is used." OFF)
option(FETCH_TBB "Fetch (download) the oneTBB library for *supported* models. This uses CMake's
FetchContent feature. Specify version by setting FETCH_TBB_VERSION" OFF)
set(FETCH_TBB_VERSION "v2021.10.0" CACHE STRING "Specify version of oneTBB to use if FETCH_TBB is ON")
if (FETCH_TBB)
FetchContent_Declare(
TBB
GIT_REPOSITORY https://github.com/oneapi-src/oneTBB.git
GIT_TAG "${FETCH_TBB_VERSION}"
)
# Don't fail builds on warnings (TBB has -Wall while not being free of warnings from unused symbols...)
set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)
set(TBB_STRICT OFF)
# Not using FetchContent_MakeAvailable (CMake>= 3.14) because we need EXCLUDE_FROM_ALL
FetchContent_GetProperties(TBB)
if (NOT TBB_POPULATED)
FetchContent_Populate(TBB)
add_subdirectory(${tbb_SOURCE_DIR} ${tbb_BINARY_DIR} EXCLUDE_FROM_ALL)
endif ()
endif ()
option(USE_ONEDPL "Enable the oneDPL library for *supported* models. Enabling this on models that
don't explicitly link against DPL is a no-op, see description of your selected
model on how this is used." OFF)
option(FETCH_ONEDPL "Fetch (download) the oneDPL library for *supported* models. This uses CMake's
FetchContent feature. Specify version by setting FETCH_ONEDPL_VERSION" OFF)
set(FETCH_ONEDPL_VERSION "oneDPL-2022.2.0-rc1" CACHE STRING "Specify version of oneTBB to use if FETCH_ONEDPL is ON")
if (FETCH_ONEDPL)
FetchContent_Declare(
oneDPL
GIT_REPOSITORY https://github.com/oneapi-src/oneDPL.git
GIT_TAG "${FETCH_ONEDPL_VERSION}"
)
string(TOLOWER ${USE_ONEDPL} ONEDPL_BACKEND)
# XXX oneDPL looks for omp instead of openmp, which mismatches(!) with ONEDPL_PAR_BACKEND if using find_package
if (ONEDPL_BACKEND STREQUAL "openmp")
set(ONEDPL_BACKEND omp)
endif ()
# Not using FetchContent_MakeAvailable (CMake>= 3.14) because we need EXCLUDE_FROM_ALL
FetchContent_GetProperties(oneDPL)
if (NOT oneDPL_POPULATED)
FetchContent_Populate(oneDPL)
if (USE_TBB)
macro(find_package NAME)
if ("${NAME}" STREQUAL "TBB")
message(STATUS "Discarding oneDPL's call to find_package(${NAME} ${ARGN})")
else ()
_find_package(${NAME} ${ARGN})
endif ()
endmacro()
endif ()
add_subdirectory(${onedpl_SOURCE_DIR} ${onedpl_BINARY_DIR} EXCLUDE_FROM_ALL)
# Fixup oneDPL's omission on setting DPCPP definitions.
# We do this after the creation of the oneDPL target.
if (ONEDPL_BACKEND MATCHES "^(dpcpp|dpcpp_only)$")
target_compile_definitions(oneDPL INTERFACE ONEDPL_USE_DPCPP_BACKEND=1)
endif ()
endif ()
endif ()
# include our macros
include(cmake/register_models.cmake)
@ -84,12 +155,14 @@ register_model(hip HIP HIPStream.cpp)
register_model(cuda CUDA CUDAStream.cu)
register_model(kokkos KOKKOS KokkosStream.cpp)
register_model(sycl SYCL SYCLStream.cpp)
register_model(sycl2020 SYCL2020 SYCLStream2020.cpp)
register_model(sycl2020-acc SYCL2020 SYCLStream2020.cpp)
register_model(sycl2020-usm SYCL2020 SYCLStream2020.cpp)
register_model(acc ACC ACCStream.cpp)
# defining RAJA collides with the RAJA namespace so USE_RAJA
register_model(raja USE_RAJA RAJAStream.cpp)
register_model(tbb TBB TBBStream.cpp)
register_model(thrust THRUST ThrustStream.cu) # Thrust uses cu, even for rocThrust
register_model(futhark FUTHARK FutharkStream.cpp)
set(USAGE ON CACHE BOOL "Whether to print all custom flags for the selected model")
@ -101,6 +174,12 @@ else ()
message(STATUS "Selected model : ${MODEL}")
endif ()
if (MODEL STREQUAL "sycl2020")
message(FATAL_ERROR "
Model sycl2020 has been renamed to sycl2020-acc, and a new sycl2020-usm model is now available.
Please use sycl2020-acc for SYCL2020 style accessors and sycl2020-usm for USM")
endif ()
# load the $MODEL.cmake file and setup the correct IMPL_* based on $MODEL
load_model(${MODEL})
@ -151,6 +230,7 @@ include_directories(src)
add_executable(${EXE_NAME} ${IMPL_SOURCES} src/main.cpp)
target_link_libraries(${EXE_NAME} PUBLIC ${LINK_LIBRARIES})
target_compile_definitions(${EXE_NAME} PUBLIC ${IMPL_DEFINITIONS})
target_include_directories(${EXE_NAME} PUBLIC ${IMPL_DIRECTORIES})
if (CXX_EXTRA_LIBRARIES)
target_link_libraries(${EXE_NAME} PUBLIC ${CXX_EXTRA_LIBRARIES})

View File

@ -38,9 +38,10 @@ BabelStream is currently implemented in the following parallel programming model
- C++ Parallel STL
- Kokkos
- RAJA
- SYCL and SYCL 2020
- SYCL and SYCL2020 (USM and accessors)
- TBB
- Thrust (via CUDA or HIP)
- Futhark
This project also contains implementations in alternative languages with different build systems:
* Julia - [JuliaStream.jl](./src/julia/JuliaStream.jl)
@ -101,7 +102,7 @@ The source for each model's implementations are located in `./src/<model>`.
Currently available models are:
```
omp;ocl;std;std20;hip;cuda;kokkos;sycl;sycl2020;acc;raja;tbb;thrust
omp;ocl;std-data;std-indices;std-ranges;hip;cuda;kokkos;sycl;sycl2020;acc;raja;tbb;thrust;futhark
```
#### Overriding default flags
@ -165,7 +166,7 @@ The `MODEL` variant selects one implementation of BabelStream to build.
Currently available models are:
```
omp;ocl;std;std20;hip;cuda;kokkos;sycl;sycl2020;acc;raja;tbb;thrust
omp;ocl;std-data;std-indices;std-ranges;hip;cuda;kokkos;sycl;sycl2020-acc;sycl2020-usm;acc;raja;tbb;thrust
```
### GNU Make

4
src/.gitignore vendored
View File

@ -16,6 +16,8 @@
**/*.gz
**/*.a
**/*.swp
**/KokkosCore_Config_*
**/.DS_Store
@ -26,4 +28,4 @@ cmake-build-*/
CMakeFiles/
.idea/
.vscode/
.directory
.directory

View File

@ -149,7 +149,7 @@ void ACCStream<T>::nstream()
template <class T>
T ACCStream<T>::dot()
{
T sum = 0.0;
T sum{};
int array_size = this->array_size;
T * restrict a = this->a;

View File

@ -83,6 +83,8 @@ get() {
if [ ! -f "$name" ] || [ "$FORCE_DOWNLOAD" = true ]; then
echo "$name not found, downloading..."
wget -q --show-progress --progress=bar:force:noscroll "$pkg_url" -O "$name"
else
echo "$name found, skipping download..."
fi
fi
}
@ -92,13 +94,15 @@ get_and_untar() {
local pkg_url="$2"
if [ "$SETUP" = true ]; then
if [ ! -f "$name" ] || [ "$FORCE_DOWNLOAD" = true ]; then
echo "$name not found, downloading..."
echo "$name not found, downloading ($pkg_url)..."
wget -q --show-progress --progress=bar:force:noscroll "$pkg_url" -O "$name"
fi
echo "Preparing to extract $name ..."
tar -xf "$name"
echo "$name extracted, deleting archive ..."
rm -f "$name" # delete for space
else
echo "Skipping setup for $name ($pkg_url)..."
fi
}
@ -119,10 +123,10 @@ verify_dir_exists() {
setup_aocc() {
echo "Preparing AOCC"
local aocc_ver="2.3.0"
local aocc_ver="4.0.0"
local tarball="aocc-$aocc_ver.tar.xz"
# XXX it's actually XZ compressed, so it should be tar.xz
local AOCC_URL="http://developer.amd.com/wordpress/media/files/aocc-compiler-2.3.0.tar"
local AOCC_URL="https://download.amd.com/developer/eula/aocc-compiler/aocc-compiler-${aocc_ver}.tar"
# local AOCC_URL="http://localhost:8000/aocc-compiler-2.3.0.tar"
get_and_untar "$tarball" "$AOCC_URL"
@ -134,20 +138,26 @@ setup_aocc() {
setup_nvhpc() {
echo "Preparing Nvidia HPC SDK"
local tarball="nvhpc.tar.gz"
# local url="http://localhost:8000/nvhpc_2021_219_Linux_x86_64_cuda_11.4.tar.gz"
local url="https://developer.download.nvidia.com/hpc-sdk/21.9/nvhpc_2021_219_Linux_x86_64_cuda_11.4.tar.gz"
local nvhpc_ver="23.1" # TODO FIXME > 23.1 has a bug with -A
local nvhpc_release="2023_231"
local cuda_ver="12.0"
local tarball="nvhpc_$nvhpc_ver.tar.gz"
local url="https://developer.download.nvidia.com/hpc-sdk/$nvhpc_ver/nvhpc_${nvhpc_release}_Linux_x86_64_cuda_$cuda_ver.tar.gz"
get_and_untar "$tarball" "$url"
local sdk_dir="$PWD/nvhpc_2021_219_Linux_x86_64_cuda_11.4/install_components/Linux_x86_64/21.9"
local sdk_dir="$PWD/nvhpc_${nvhpc_release}_Linux_x86_64_cuda_$cuda_ver/install_components/Linux_x86_64/$nvhpc_ver"
local bin_dir="$sdk_dir/compilers/bin"
"$bin_dir/makelocalrc" "$bin_dir" -x
"$bin_dir/makelocalrc" -d "$bin_dir" -x -gpp g++-12 -gcc gcc-12 -g77 gfortran-12
export_var NVHPC_SDK_DIR "$sdk_dir"
export_var NVHPC_CUDA_DIR "$sdk_dir/cuda/11.4"
export_var NVHPC_CUDA_DIR "$sdk_dir/cuda/$cuda_ver"
export_var NVHPC_NVCXX "$bin_dir/nvc++"
export_var NVHPC_NVCC "$sdk_dir/cuda/11.4/bin/nvcc"
export_var NVHPC_NVCC "$bin_dir/nvcc"
export_var NVHPC_CUDA_VER "$cuda_ver"
# export_var NVHPC_NVCC "$sdk_dir/cuda/$cuda_ver/bin/nvcc"
echo "Installed CUDA versions:"
ls "$sdk_dir/cuda"
@ -160,7 +170,8 @@ setup_nvhpc() {
setup_aomp() {
echo "Preparing AOMP"
local AOMP_URL="https://github.com/ROCm-Developer-Tools/aomp/releases/download/rel_11.12-0/aomp_Ubuntu1804_11.12-0_amd64.deb"
local aomp_ver="18.0-0"
local AOMP_URL="https://github.com/ROCm-Developer-Tools/aomp/releases/download/rel_${aomp_ver}/aomp_Ubuntu2204_${aomp_ver}_amd64.deb"
# local AOMP_URL="http://0.0.0.0:8000/aomp_Ubuntu1804_11.12-0_amd64.deb"
get_and_install_deb "aomp" "aomp" "$AOMP_URL"
@ -183,9 +194,10 @@ setup_oclcpu() {
setup_kokkos() {
echo "Preparing Kokkos"
local kokkos_ver="3.3.01"
local kokkos_ver="4.1.00"
local tarball="kokkos-$kokkos_ver.tar.gz"
local url="https://github.com/kokkos/kokkos/archive/$kokkos_ver.tar.gz"
# local url="http://localhost:8000/$kokkos_ver.tar.gz"
@ -197,10 +209,10 @@ setup_kokkos() {
setup_raja() {
echo "Preparing RAJA"
local raja_ver="0.13.0"
local raja_ver="2023.06.1"
local tarball="raja-$raja_ver.tar.gz"
local url="https://github.com/LLNL/RAJA/releases/download/v0.13.0/RAJA-v$raja_ver.tar.gz"
local url="https://github.com/LLNL/RAJA/releases/download/v$raja_ver/RAJA-v$raja_ver.tar.gz"
# local url="http://localhost:8000/RAJA-v$raja_ver.tar.gz"
get_and_untar "$tarball" "$url"
@ -211,7 +223,7 @@ setup_raja() {
setup_tbb() {
echo "Preparing TBB"
local tbb_ver="2021.2.0"
local tbb_ver="2021.9.0"
local tarball="oneapi-tbb-$tbb_ver-lin.tgz"
local url="https://github.com/oneapi-src/oneTBB/releases/download/v$tbb_ver/oneapi-tbb-$tbb_ver-lin.tgz"
@ -225,9 +237,9 @@ setup_tbb() {
setup_clang_gcc() {
sudo apt-get install -y -qq gcc-10-offload-nvptx gcc-10-offload-amdgcn libtbb2 libtbb-dev g++-10 clang libomp-dev
sudo apt-get install -y -qq gcc-12-offload-nvptx gcc-12-offload-amdgcn libtbb2 libtbb-dev g++-12 clang libomp-dev libc6
export_var GCC_CXX "$(which g++-10)"
export_var GCC_CXX "$(which g++-12)"
verify_bin_exists "$GCC_CXX"
"$GCC_CXX" --version
@ -248,7 +260,11 @@ setup_clang_gcc() {
}
setup_rocm() {
sudo apt-get install -y -qq rocm-dev rocthrust-dev
if [ "$SETUP" = true ]; then
sudo apt-get install -y rocm-dev rocthrust-dev
else
echo "Skipping apt setup for ROCm"
fi
export_var ROCM_PATH "/opt/rocm"
export_var PATH "$ROCM_PATH/bin:$PATH" # ROCm needs this for many of their libraries' CMake build to work
export_var HIP_CXX "$ROCM_PATH/bin/hipcc"
@ -259,7 +275,7 @@ setup_rocm() {
setup_dpcpp() {
local nightly="20210106"
local nightly="20230615"
local tarball="dpcpp-$nightly.tar.gz"
local url="https://github.com/intel/llvm/releases/download/sycl-nightly/$nightly/dpcpp-compiler.tar.gz"
@ -276,22 +292,22 @@ setup_dpcpp() {
setup_hipsycl() {
sudo apt-get install -y -qq libboost-fiber-dev libboost-context-dev
local hipsycl_ver="0.9.0"
local hipsycl_ver="0.9.1"
local tarball="v$hipsycl_ver.tar.gz"
local install_dir="$PWD/hipsycl_dist_$hipsycl_ver"
local url="https://github.com/illuhad/hipSYCL/archive/v$hipsycl_ver.tar.gz"
# local url="http://localhost:8000/hipSYCL-$hipsycl_ver.tar.gz"
local url="https://github.com/AdaptiveCpp/AdaptiveCpp/archive/v$hipsycl_ver.tar.gz"
# local url="http://localhost:8000/AdaptiveCpp-$hipsycl_ver.tar.gz"
get_and_untar "$tarball" "$url"
if [ "$SETUP" = true ]; then
local src="$PWD/hipSYCL-$hipsycl_ver"
local src="$PWD/AdaptiveCpp-$hipsycl_ver"
rm -rf "$src/build"
rm -rf "$install_dir"
cmake "-B$src/build" "-H$src" \
-DCMAKE_C_COMPILER="$(which gcc-10)" \
-DCMAKE_CXX_COMPILER="$(which g++-10)" \
-DCMAKE_C_COMPILER="$(which gcc-12)" \
-DCMAKE_CXX_COMPILER="$(which g++-12)" \
-DCMAKE_INSTALL_PREFIX="$install_dir" \
-DWITH_ROCM_BACKEND=OFF \
-DWITH_CUDA_BACKEND=OFF \
@ -306,25 +322,20 @@ setup_hipsycl() {
check_size
}
setup_computecpp() {
echo "TODO ComputeCpp requires registration+login to download"
}
if [ "${GITHUB_ACTIONS:-false}" = true ]; then
echo "Running in GitHub Actions, defaulting to special export"
TERM=xterm
export TERM=xterm
# drop the lock in case we got one from a failed run
rm /var/lib/dpkg/lock-frontend || true
rm /var/cache/apt/archives/lock || true
wget -q -O - "https://repo.radeon.com/rocm/rocm.gpg.key" | sudo apt-key add -
echo "deb http://archive.ubuntu.com/ubuntu focal main universe" | sudo tee -a /etc/apt/sources.list
echo 'deb [arch=amd64] https://repo.radeon.com/rocm/apt/4.5 ubuntu main' | sudo tee /etc/apt/sources.list.d/rocm.list
rm -rf /var/lib/dpkg/lock-frontend || true
rm -rf /var/cache/apt/archives/lock || true
mkdir --parents --mode=0755 /etc/apt/keyrings
wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | gpg --dearmor | sudo tee /etc/apt/keyrings/rocm.gpg > /dev/null
echo 'deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/5.7 jammy main' | sudo tee /etc/apt/sources.list.d/rocm.list
echo -e 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600' | sudo tee /etc/apt/preferences.d/rocm-pin-600
sudo apt-get update -qq
sudo apt-get install -y -qq cmake
sudo apt-get install cmake gcc g++ libelf-dev libdrm-amdgpu1 libnuma-dev
if [ "$SETUP" = true ]; then
echo "Deleting extra packages for space in 2 seconds..."
@ -334,6 +345,7 @@ if [ "${GITHUB_ACTIONS:-false}" = true ]; then
sudo apt-get autoremove -y
check_size
fi
sudo apt-get upgrade -qq
else
echo "Running locally, defaulting to standard export"
fi
@ -362,6 +374,18 @@ setup_cmake() {
verify_bin_exists "$CMAKE_3_18_BIN"
"$CMAKE_3_18_BIN" --version
get "cmake-3.20.sh" "$cmake_release/v3.20.4/cmake-3.20.4-linux-x86_64.sh"
chmod +x "./cmake-3.20.sh" && "./cmake-3.20.sh" --skip-license --include-subdir
export_var CMAKE_3_20_BIN "$PWD/cmake-3.20.4-linux-x86_64/bin/cmake"
verify_bin_exists "$CMAKE_3_20_BIN"
"$CMAKE_3_20_BIN" --version
get "cmake-3.24.sh" "$cmake_release/v3.24.4/cmake-3.24.4-linux-x86_64.sh"
chmod +x "./cmake-3.24.sh" && "./cmake-3.24.sh" --skip-license --include-subdir
export_var CMAKE_3_24_BIN "$PWD/cmake-3.24.4-linux-x86_64/bin/cmake"
verify_bin_exists "$CMAKE_3_24_BIN"
"$CMAKE_3_24_BIN" --version
check_size
}
@ -379,6 +403,10 @@ if [ "$PARALLEL" = true ]; then
setup_tbb &
wait
else
# these need apt
setup_clang_gcc
setup_rocm
setup_hipsycl
setup_cmake
setup_aocc
setup_oclcpu
@ -388,10 +416,6 @@ else
setup_kokkos
setup_raja
setup_tbb
# these need apt
setup_clang_gcc
setup_rocm
setup_hipsycl
fi
echo "Done!"

View File

@ -120,9 +120,20 @@ run_build() {
# CLANG_OMP_OFFLOAD_NVIDIA=false
###
NV_ARCH_CC="70"
AMD_ARCH="gfx_903"
NV_ARCH="sm_70"
NV_ARCH_CCXY="cuda11.4,cc80"
NV_ARCH="sm_${NV_ARCH_CC}"
NV_ARCH_CCXY="cuda${NVHPC_CUDA_VER:?},cc80"
check_cmake_ver(){
local current=$("$CMAKE_BIN" --version | head -n 1 | cut -d ' ' -f3)
local required=$1
if [ "$(printf '%s\n' "$required" "$current" | sort -V | head -n1)" = "$required" ]; then
return 0
else
return 1
fi
}
build_gcc() {
local name="gcc_build"
@ -135,49 +146,61 @@ build_gcc() {
"./$BUILD_DIR/omp_$name/omp-stream" -s 1048576 -n 10
fi
# some distributions like Ubuntu bionic implements std par with TBB, so conditionally link it here
run_build $name "${GCC_CXX:?}" std-data "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}"
run_build $name "${GCC_CXX:?}" std-indices "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}"
run_build $name "${GCC_CXX:?}" std-ranges "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}"
for use_onedpl in OFF OPENMP TBB; do
case "$use_onedpl" in
OFF) dpl_conditional_flags="-DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}" ;;
*) dpl_conditional_flags="-DFETCH_ONEDPL=ON -DFETCH_TBB=ON -DUSE_TBB=ON -DCXX_EXTRA_FLAGS=-D_GLIBCXX_USE_TBB_PAR_BACKEND=0" ;;
esac
# some distributions like Ubuntu bionic implements std par with TBB, so conditionally link it here
run_build $name "${GCC_CXX:?}" std-data "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl"
run_build $name "${GCC_CXX:?}" std-indices "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl"
run_build $name "${GCC_CXX:?}" std-ranges "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl"
done
run_build $name "${GCC_CXX:?}" tbb "$cxx -DONE_TBB_DIR=$TBB_LIB"
run_build $name "${GCC_CXX:?}" tbb "$cxx" # build TBB again with the system TBB
run_build $name "${GCC_CXX:?}" tbb "$cxx -DUSE_VECTOR=ON" # build with vectors
if [ "${GCC_OMP_OFFLOAD_AMD:-false}" != "false" ]; then
run_build "amd_$name" "${GCC_CXX:?}" acc "$cxx -DCXX_EXTRA_FLAGS=-foffload=amdgcn-amdhsa"
run_build "amd_$name" "${GCC_CXX:?}" acc "$cxx -DCXX_EXTRA_FLAGS=-foffload=amdgcn-amdhsa;-fno-stack-protector;-fcf-protection=none"
run_build "amd_$name" "${GCC_CXX:?}" omp "$cxx -DOFFLOAD=AMD:$AMD_ARCH"
fi
if [ "${GCC_OMP_OFFLOAD_NVIDIA:-false}" != "false" ]; then
run_build "nvidia_$name" "${GCC_CXX:?}" acc "$cxx -DCXX_EXTRA_FLAGS=-foffload=nvptx-none"
run_build "nvidia_$name" "${GCC_CXX:?}" acc "$cxx -DCXX_EXTRA_FLAGS=-foffload=nvptx-none;-fno-stack-protector;-fcf-protection=none"
run_build "nvidia_$name" "${GCC_CXX:?}" omp "$cxx -DOFFLOAD=NVIDIA:$NV_ARCH"
fi
run_build $name "${GCC_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH"
run_build $name "${GCC_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DMEM=MANAGED"
run_build $name "${GCC_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DMEM=PAGEFAULT"
# run_build $name "${CC_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_CUDA=ON"
run_build "cuda_$name" "${GCC_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON"
if check_cmake_ver "3.16.0"; then
# run_build $name "${CC_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_CUDA=ON"
run_build "cuda_$name" "${GCC_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON"
else
echo "Skipping Kokkos models due to CMake version requirement"
fi
run_build $name "${GCC_CXX:?}" ocl "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}"
run_build $name "${GCC_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?}"
if check_cmake_ver "3.20.0"; then
run_build $name "${GCC_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?} -DENABLE_OPENMP=ON"
else
echo "Skipping RAJA models due to CMake version requirement"
fi
# FIXME fails due to https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100102
# FIXME we also got https://github.com/NVIDIA/nccl/issues/494
if check_cmake_ver "3.20.0"; then
run_build "cuda_$name" "${GCC_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?} \
-DENABLE_CUDA=ON \
-DTARGET=NVIDIA \
-DCUDA_TOOLKIT_ROOT_DIR=${NVHPC_CUDA_DIR:?} \
-DCUDA_ARCH=$NV_ARCH"
else
echo "Skipping RAJA models due to CMake version requirement"
fi
# run_build "cuda_$name" "${GCC_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?} \
# -DENABLE_CUDA=ON \
# -DTARGET=NVIDIA \
# -DCUDA_TOOLKIT_ROOT_DIR=${NVHPC_CUDA_DIR:?} \
# -DCUDA_ARCH=$NV_ARCH"
# CMake >= 3.15 only due to Nvidia's Thrust CMake requirements
local current=$("$CMAKE_BIN" --version | head -n 1 | cut -d ' ' -f3)
local required="3.15.0"
if [ "$(printf '%s\n' "$required" "$current" | sort -V | head -n1)" = "$required" ]; then
run_build $name "${GCC_CXX:?}" thrust "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/include -DTHRUST_IMPL=CUDA -DBACKEND=CUDA"
run_build $name "${GCC_CXX:?}" thrust "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/include -DTHRUST_IMPL=CUDA -DBACKEND=OMP"
run_build $name "${GCC_CXX:?}" thrust "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/include -DTHRUST_IMPL=CUDA -DBACKEND=CPP"
if check_cmake_ver "3.18.0"; then # CMake >= 3.15 only due to Nvidia's Thrust CMake requirements
run_build $name "${GCC_CXX:?}" thrust "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH_CC -DSDK_DIR=$NVHPC_CUDA_DIR/lib64/cmake -DTHRUST_IMPL=CUDA -DBACKEND=CUDA"
# run_build $name "${GCC_CXX:?}" thrust "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/lib64/cmake -DTHRUST_IMPL=CUDA -DBACKEND=OMP" # FIXME
run_build $name "${GCC_CXX:?}" thrust "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH_CC -DSDK_DIR=$NVHPC_CUDA_DIR/lib64/cmake -DTHRUST_IMPL=CUDA -DBACKEND=CPP"
# FIXME CUDA Thrust + TBB throws the following error:
# /usr/lib/gcc/x86_64-linux-gnu/9/include/avx512fintrin.h(9146): error: identifier "__builtin_ia32_rndscaless_round" is undefined
@ -187,9 +210,9 @@ build_gcc() {
# /usr/lib/gcc/x86_64-linux-gnu/9/include/avx512dqintrin.h(1365): error: identifier "__builtin_ia32_fpclassss" is undefined
# /usr/lib/gcc/x86_64-linux-gnu/9/include/avx512dqintrin.h(1372): error: identifier "__builtin_ia32_fpclasssd" is undefined
# run_build $name "${GCC_CXX:?}" THRUST "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/include -DTHRUST_IMPL=CUDA -DBACKEND=TBB"
# run_build $name "${GCC_CXX:?}" THRUST "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/lib64/cmake -DTHRUST_IMPL=CUDA -DBACKEND=TBB"
else
echo "CMake version ${current} < ${required}, skipping Thrust models"
echo "Skipping Thrust models due to CMake version requirement"
fi
}
@ -207,28 +230,39 @@ build_clang() {
run_build "nvidia_$name" "${GCC_CXX:?}" omp "$cxx -DOFFLOAD=NVIDIA:$NV_ARCH"
fi
if check_cmake_ver "3.20.0"; then
run_build $name "${CLANG_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?} -DENABLE_OPENMP=ON"
else
echo "Skipping RAJA models due to CMake version requirement"
fi
run_build $name "${CLANG_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH"
run_build $name "${CLANG_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DMEM=MANAGED"
run_build $name "${CLANG_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DMEM=PAGEFAULT"
run_build $name "${CLANG_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON"
if check_cmake_ver "3.16.0"; then
run_build $name "${CLANG_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON"
else
echo "Skipping Kokkos models due to CMake version requirement"
fi
run_build $name "${CLANG_CXX:?}" ocl "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}"
run_build $name "${CLANG_CXX:?}" std-data "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}"
run_build $name "${CLANG_CXX:?}" std-indices "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}"
# run_build $name "${LANG_CXX:?}" std20 "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" # not yet supported
run_build $name "${CLANG_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?}"
run_build $name "${CLANG_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH"
run_build $name "${CLANG_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DMEM=MANAGED"
run_build $name "${CLANG_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DMEM=PAGEFAULT"
run_build $name "${CLANG_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON"
run_build $name "${CLANG_CXX:?}" ocl "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}"
run_build $name "${CLANG_CXX:?}" std-data "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}"
run_build $name "${CLANG_CXX:?}" std-indices "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}"
# run_build $name "${LANG_CXX:?}" std20 "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" # not yet supported
for use_onedpl in OFF OPENMP TBB; do
case "$use_onedpl" in
OFF) dpl_conditional_flags="-DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" ;;
*) dpl_conditional_flags="-DFETCH_ONEDPL=ON -DFETCH_TBB=ON -DUSE_TBB=ON -DCXX_EXTRA_FLAGS=-D_GLIBCXX_USE_TBB_PAR_BACKEND=0" ;;
esac
run_build $name "${CLANG_CXX:?}" std-data "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl"
run_build $name "${CLANG_CXX:?}" std-indices "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl"
# run_build $name "${CLANG_CXX:?}" std-ranges "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl" # not yet supported
done
run_build $name "${CLANG_CXX:?}" tbb "$cxx -DONE_TBB_DIR=$TBB_LIB"
run_build $name "${CLANG_CXX:?}" tbb "$cxx" # build TBB again with the system TBB
run_build $name "${CLANG_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?}"
run_build $name "${CLANG_CXX:?}" tbb "$cxx -DUSE_VECTOR=ON" # build with vectors
if check_cmake_ver "3.20.0"; then
run_build $name "${CLANG_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?} -DENABLE_OPENMP=ON"
else
echo "Skipping RAJA models due to CMake version requirement"
fi
# no clang /w RAJA+cuda because it needs nvcc which needs gcc
}
@ -237,6 +271,7 @@ build_nvhpc() {
local cxx="-DCMAKE_CXX_COMPILER=${NVHPC_NVCXX:?}"
run_build $name "${NVHPC_NVCXX:?}" std-data "$cxx -DNVHPC_OFFLOAD=$NV_ARCH_CCXY"
run_build $name "${NVHPC_NVCXX:?}" std-indices "$cxx -DNVHPC_OFFLOAD=$NV_ARCH_CCXY"
run_build $name "${NVHPC_NVCXX:?}" acc "$cxx -DTARGET_DEVICE=gpu -DTARGET_PROCESSOR=px -DCUDA_ARCH=$NV_ARCH_CCXY"
run_build $name "${NVHPC_NVCXX:?}" acc "$cxx -DTARGET_DEVICE=multicore -DTARGET_PROCESSOR=zen"
}
@ -254,6 +289,8 @@ build_hip() {
local name="hip_build"
run_build $name "${HIP_CXX:?}" hip "-DCMAKE_CXX_COMPILER=${HIP_CXX:?}"
run_build $name "${HIP_CXX:?}" hip "-DCMAKE_CXX_COMPILER=${HIP_CXX:?} -DMEM=MANAGED"
run_build $name "${HIP_CXX:?}" hip "-DCMAKE_CXX_COMPILER=${HIP_CXX:?} -DMEM=PAGEFAULT"
run_build $name "${GCC_CXX:?}" thrust "-DCMAKE_CXX_COMPILER=${HIP_CXX:?} -DSDK_DIR=$ROCM_PATH -DTHRUST_IMPL=ROCM"
}
@ -275,15 +312,18 @@ build_icpc() {
local cxx="-DCMAKE_CXX_COMPILER=${ICPC_CXX:?}"
run_build $name "${ICPC_CXX:?}" omp "$cxx"
run_build $name "${ICPC_CXX:?}" ocl "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}"
run_build $name "${ICPC_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?}"
run_build $name "${ICPC_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON"
}
if check_cmake_ver "3.20.0"; then
run_build $name "${ICPC_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?} -DENABLE_OPENMP=ON"
else
echo "Skipping RAJA models due to CMake version requirement"
fi
if check_cmake_ver "3.16.0"; then
run_build $name "${ICPC_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON"
else
echo "Skipping Kokkos models due to CMake version requirement"
fi
# Build the SYCL model with Codeplay ComputeCpp (compute++).
# Requires GCC_CXX (host C++ compiler), COMPUTECPP_DIR (ComputeCpp install
# root) and OCL_LIB (OpenCL library) to be set; ${VAR:?} aborts otherwise.
build_computecpp() {
run_build computecpp_build "compute++" sycl "-DCMAKE_CXX_COMPILER=${GCC_CXX:?} \
-DSYCL_COMPILER=COMPUTECPP \
-DSYCL_COMPILER_DIR=${COMPUTECPP_DIR:?} \
-DOpenCL_LIBRARY=${OCL_LIB:?}"
}
build_dpcpp() {

View File

@ -42,41 +42,57 @@ CUDAStream<T>::CUDAStream(const int ARRAY_SIZE, const int device_index)
// Print out device information
std::cout << "Using CUDA device " << getDeviceName(device_index) << std::endl;
std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl;
#if defined(MANAGED)
std::cout << "Memory: MANAGED" << std::endl;
#elif defined(PAGEFAULT)
std::cout << "Memory: PAGEFAULT" << std::endl;
#else
std::cout << "Memory: DEFAULT" << std::endl;
#endif
array_size = ARRAY_SIZE;
// Query device for sensible dot kernel block count
cudaDeviceProp props;
cudaGetDeviceProperties(&props, device_index);
check_error();
dot_num_blocks = props.multiProcessorCount * 4;
// Allocate the host array for partial sums for dot kernels
sums = (T*)malloc(sizeof(T) * DOT_NUM_BLOCKS);
sums = (T*)malloc(sizeof(T) * dot_num_blocks);
size_t array_bytes = sizeof(T);
array_bytes *= ARRAY_SIZE;
size_t total_bytes = array_bytes * 4;
std::cout << "Reduction kernel config: " << dot_num_blocks << " groups of (fixed) size " << TBSIZE << std::endl;
// Check buffers fit on the device
cudaDeviceProp props;
cudaGetDeviceProperties(&props, 0);
if (props.totalGlobalMem < 3*ARRAY_SIZE*sizeof(T))
if (props.totalGlobalMem < total_bytes)
throw std::runtime_error("Device does not have enough memory for all 3 buffers");
// Create device buffers
#if defined(MANAGED)
cudaMallocManaged(&d_a, ARRAY_SIZE*sizeof(T));
cudaMallocManaged(&d_a, array_bytes);
check_error();
cudaMallocManaged(&d_b, ARRAY_SIZE*sizeof(T));
cudaMallocManaged(&d_b, array_bytes);
check_error();
cudaMallocManaged(&d_c, ARRAY_SIZE*sizeof(T));
cudaMallocManaged(&d_c, array_bytes);
check_error();
cudaMallocManaged(&d_sum, DOT_NUM_BLOCKS*sizeof(T));
cudaMallocManaged(&d_sum, dot_num_blocks*sizeof(T));
check_error();
#elif defined(PAGEFAULT)
d_a = (T*)malloc(sizeof(T)*ARRAY_SIZE);
d_b = (T*)malloc(sizeof(T)*ARRAY_SIZE);
d_c = (T*)malloc(sizeof(T)*ARRAY_SIZE);
d_sum = (T*)malloc(sizeof(T)*DOT_NUM_BLOCKS);
d_a = (T*)malloc(array_bytes);
d_b = (T*)malloc(array_bytes);
d_c = (T*)malloc(array_bytes);
d_sum = (T*)malloc(sizeof(T)*dot_num_blocks);
#else
cudaMalloc(&d_a, ARRAY_SIZE*sizeof(T));
cudaMalloc(&d_a, array_bytes);
check_error();
cudaMalloc(&d_b, ARRAY_SIZE*sizeof(T));
cudaMalloc(&d_b, array_bytes);
check_error();
cudaMalloc(&d_c, ARRAY_SIZE*sizeof(T));
cudaMalloc(&d_c, array_bytes);
check_error();
cudaMalloc(&d_sum, DOT_NUM_BLOCKS*sizeof(T));
cudaMalloc(&d_sum, dot_num_blocks*sizeof(T));
check_error();
#endif
}
@ -237,7 +253,7 @@ __global__ void dot_kernel(const T * a, const T * b, T * sum, int array_size)
int i = blockDim.x * blockIdx.x + threadIdx.x;
const size_t local_i = threadIdx.x;
tb_sum[local_i] = 0.0;
tb_sum[local_i] = {};
for (; i < array_size; i += blockDim.x*gridDim.x)
tb_sum[local_i] += a[i] * b[i];
@ -257,19 +273,19 @@ __global__ void dot_kernel(const T * a, const T * b, T * sum, int array_size)
template <class T>
T CUDAStream<T>::dot()
{
dot_kernel<<<DOT_NUM_BLOCKS, TBSIZE>>>(d_a, d_b, d_sum, array_size);
dot_kernel<<<dot_num_blocks, TBSIZE>>>(d_a, d_b, d_sum, array_size);
check_error();
#if defined(MANAGED) || defined(PAGEFAULT)
cudaDeviceSynchronize();
check_error();
#else
cudaMemcpy(sums, d_sum, DOT_NUM_BLOCKS*sizeof(T), cudaMemcpyDeviceToHost);
cudaMemcpy(sums, d_sum, dot_num_blocks*sizeof(T), cudaMemcpyDeviceToHost);
check_error();
#endif
T sum = 0.0;
for (int i = 0; i < DOT_NUM_BLOCKS; i++)
for (int i = 0; i < dot_num_blocks; i++)
{
#if defined(MANAGED) || defined(PAGEFAULT)
sum += d_sum[i];

View File

@ -13,16 +13,9 @@
#include "Stream.h"
#if defined(PAGEFAULT)
#define IMPLEMENTATION_STRING "CUDA - Page Fault"
#elif defined(MANAGED)
#define IMPLEMENTATION_STRING "CUDA - Managed Memory"
#else
#define IMPLEMENTATION_STRING "CUDA"
#endif
#define IMPLEMENTATION_STRING "CUDA"
#define TBSIZE 1024
#define DOT_NUM_BLOCKS 256
template <class T>
class CUDAStream : public Stream<T>
@ -40,6 +33,8 @@ class CUDAStream : public Stream<T>
T *d_c;
T *d_sum;
// Number of blocks for dot kernel
int dot_num_blocks;
public:

View File

@ -29,10 +29,11 @@ macro(setup)
endif()
enable_language(CUDA)
register_definitions(MEM=${MEM})
register_definitions(${MEM})
# add -forward-unknown-to-host-compiler for compatibility reasons
set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "-forward-unknown-to-host-compiler -arch=${CUDA_ARCH}" ${CUDA_EXTRA_FLAGS})
set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "-forward-unknown-to-host-compiler" "-arch=${CUDA_ARCH}" ${CUDA_EXTRA_FLAGS})
string(REPLACE ";" " " CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS}")
# CMake defaults to -O2 for CUDA at Release, let's wipe that and use the global RELEASE_FLAG
# appended later

76
src/dpl_shim.h Normal file
View File

@ -0,0 +1,76 @@
// Shim used by the std-data/std-indices models: provides a single execution
// policy (`exe_policy`) plus raw-buffer helpers (`alloc_raw`/`dealloc_raw`),
// selected at compile time between oneDPL (SYCL or host backend) and the
// plain C++17 parallel STL.
#pragma once
#include <cstdlib>
#include <cstddef>
// Alignment for host allocations; 2MB matches a typical huge-page size.
#ifndef ALIGNMENT
#define ALIGNMENT (2*1024*1024) // 2MB
#endif
#ifdef USE_ONEDPL
// oneDPL C++17 PSTL
#include <oneapi/dpl/execution>
#include <oneapi/dpl/algorithm>
#include <oneapi/dpl/numeric>
#if ONEDPL_USE_DPCPP_BACKEND
#include <CL/sycl.hpp>
// SYCL (DPC++) backend: run on the default-selected device and serve
// allocations as USM shared memory from the policy's queue.
const static auto exe_policy = oneapi::dpl::execution::device_policy<>{
oneapi::dpl::execution::make_device_policy(cl::sycl::default_selector{})
};
template<typename T>
T *alloc_raw(size_t size) { return sycl::malloc_shared<T>(size, exe_policy.queue()); }
template<typename T>
void dealloc_raw(T *ptr) { sycl::free(ptr, exe_policy.queue()); }
#else
// Host (e.g. TBB/OpenMP) oneDPL backend; `dpl` is the namespace alias the
// oneDPL headers provide for oneapi::dpl — TODO confirm it is defined in all
// supported oneDPL versions.
// auto exe_policy = dpl::execution::seq;
// auto exe_policy = dpl::execution::par;
static constexpr auto exe_policy = dpl::execution::par_unseq;
#define USE_STD_PTR_ALLOC_DEALLOC
#endif
#else
// Normal C++17 PSTL
#include <algorithm>
#include <execution>
#include <numeric>
// auto exe_policy = std::execution::seq;
// auto exe_policy = std::execution::par;
static constexpr auto exe_policy = std::execution::par_unseq;
#define USE_STD_PTR_ALLOC_DEALLOC
#endif
// Host-pointer allocation paths shared by the non-device policies above.
#ifdef USE_STD_PTR_ALLOC_DEALLOC
#if defined(__HIPSYCL__) || defined(__OPENSYCL__)
#include <CL/sycl.hpp>
// TODO We temporarily use malloc_shared/free here for hipSYCL stdpar because there's a linking issue if we let it hijack new/delete
// for this to work, we compile with --hipsycl-stdpar-system-usm so that hijacking is disabled
static cl::sycl::queue queue{cl::sycl::default_selector_v};
template <typename T> T *alloc_raw(size_t size) { return cl::sycl::malloc_shared<T>(size, queue); }
template <typename T> void dealloc_raw(T *ptr) { cl::sycl::free(ptr, queue); }
#else
// Plain host path: aligned malloc/free.
// NOTE(review): C11 aligned_alloc requires the size to be a multiple of the
// alignment; glibc is lenient but other libcs may not be — confirm.
template<typename T>
T *alloc_raw(size_t size) { return (T *) aligned_alloc(ALIGNMENT, sizeof(T) * size); }
template<typename T>
void dealloc_raw(T *ptr) { free(ptr); }
#endif
#endif

105
src/fortran/ArrayStream.F90 Normal file
View File

@ -0,0 +1,105 @@
! BabelStream "Array" backend: implements the STREAM kernels with whole-array
! Fortran assignments, leaving any parallelisation entirely to the compiler.
! NOTE(review): the arrays are fixed to REAL64 rather than StreamRealKind,
! so USE_FLOAT appears to have no effect on this backend — confirm intended.
module ArrayStream
use, intrinsic :: ISO_Fortran_env
use BabelStreamTypes
implicit none
character(len=5), parameter :: implementation_name = "Array"
! Number of elements in each work array.
integer(kind=StreamIntKind) :: N
! The three STREAM work arrays (module-global state).
real(kind=REAL64), allocatable :: A(:), B(:), C(:)
contains
! Device enumeration is meaningless for this host-only backend; say so.
subroutine list_devices()
implicit none
integer :: num
write(*,'(a36,a5)') "Listing devices is not supported by ", implementation_name
end subroutine list_devices
! Only the host exists; any non-default device request is reported, not honoured.
subroutine set_device(dev)
implicit none
integer, intent(in) :: dev
write(*,'(a32,a5)') "Device != 0 is not supported by ", implementation_name
end subroutine set_device
! Allocate A, B and C with array_size elements each; stops on failure.
subroutine alloc(array_size)
implicit none
integer(kind=StreamIntKind) :: array_size
integer :: err
N = array_size
allocate( A(1:N), B(1:N), C(1:N), stat=err)
if (err .ne. 0) then
write(*,'(a20,i3)') 'allocate returned ',err
stop 1
endif
end subroutine alloc
! Release the three work arrays; stops on failure.
subroutine dealloc()
implicit none
integer :: err
deallocate( A, B, C, stat=err)
if (err .ne. 0) then
write(*,'(a20,i3)') 'deallocate returned ',err
stop 1
endif
end subroutine dealloc
! Fill each array with its starting scalar value.
subroutine init_arrays(initA, initB, initC)
implicit none
real(kind=REAL64), intent(in) :: initA, initB, initC
A = initA
B = initB
C = initC
end subroutine init_arrays
! Copy the work arrays into the caller's host buffers for verification.
subroutine read_arrays(h_A, h_B, h_C)
implicit none
real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
h_A = A
h_B = B
h_C = C
end subroutine read_arrays
! STREAM copy kernel: C = A.
subroutine copy()
implicit none
C = A
end subroutine copy
! STREAM add kernel: C = A + B.
subroutine add()
implicit none
C = A + B
end subroutine add
! STREAM mul (scale) kernel: B = scalar * C.
subroutine mul(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
scalar = startScalar
B = scalar * C
end subroutine mul
! STREAM triad kernel: A = B + scalar * C.
subroutine triad(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
scalar = startScalar
A = B + scalar * C
end subroutine triad
! nstream kernel: A = A + B + scalar * C.
subroutine nstream(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
scalar = startScalar
A = A + B + scalar * C
end subroutine nstream
! Dot kernel: returns sum(A*B) via the intrinsic dot_product.
function dot() result(s)
implicit none
real(kind=REAL64) :: s
s = dot_product(A,B)
end function dot
end module ArrayStream

View File

@ -0,0 +1,21 @@
! Central kind definitions shared by every Fortran BabelStream backend.
module BabelStreamTypes
use, intrinsic :: ISO_Fortran_env, only: REAL64,REAL32,INT64,INT32
implicit none
! Element kind for the STREAM arrays: REAL32 when built with -DUSE_FLOAT,
! otherwise REAL64 (the default). StreamRealName is the printable label.
#ifdef USE_FLOAT
integer, parameter :: StreamRealKind = REAL32
character(len=6) :: StreamRealName = "REAL32"
#else
integer, parameter :: StreamRealKind = REAL64
character(len=6) :: StreamRealName = "REAL64"
#endif
! Integer kind for array sizes and indices: INT64 by default; INT32 only on
! explicit request (no overflow checking, as the warning below says).
#ifdef USE_INT32
#warning There is no checking for overflowing INT32, so be careful.
integer, parameter :: StreamIntKind = INT32
#else
integer, parameter :: StreamIntKind = INT64
#endif
end module BabelStreamTypes

View File

@ -0,0 +1,230 @@
! BabelStream "CUDAKernel" backend: CUDA Fortran using !$cuf kernel do
! directives so the compiler generates the GPU kernels, over device-resident
! (or, with USE_MANAGED, unified/managed) arrays.
module CUDAKernelStream
use, intrinsic :: ISO_Fortran_env
use BabelStreamTypes
implicit none
character(len=10), parameter :: implementation_name = "CUDAKernel"
! Number of elements per array.
integer(kind=StreamIntKind) :: N
! Work arrays live on the GPU; USE_MANAGED selects managed (unified) memory
! instead of explicit device memory.
#ifdef USE_MANAGED
real(kind=REAL64), allocatable, managed :: A(:), B(:), C(:)
#else
real(kind=REAL64), allocatable, device :: A(:), B(:), C(:)
#endif
contains
! Print how many CUDA devices are visible; stops on a CUDA API error.
! NOTE(review): the i1 edit descriptor prints one digit, so counts above 9
! will not display correctly.
subroutine list_devices()
use cudafor
implicit none
integer :: num, err
err = cudaGetDeviceCount(num)
if (err.ne.0) then
write(*,'(a)') "cudaGetDeviceCount failed"
write(*,'(a)') cudaGetErrorString(err)
stop
else if (num.eq.0) then
write(*,'(a17)') "No devices found."
else
write(*,'(a10,i1,a8)') "There are ",num," devices."
end if
end subroutine list_devices
! Validate dev (0-based, must be < count) and select it as the current CUDA
! device; stops on any failure.
subroutine set_device(dev)
use cudafor
implicit none
integer, intent(in) :: dev
integer :: num, err
err = cudaGetDeviceCount(num)
if (err.ne.0) then
write(*,'(a)') "cudaGetDeviceCount failed"
write(*,'(a)') cudaGetErrorString(err)
stop
else if (num.eq.0) then
write(*,'(a17)') "No devices found."
stop
else if (dev.ge.num) then
write(*,'(a21)') "Invalid device index."
stop
else
err = cudaSetDevice(dev)
if (err.ne.0) then
write(*,'(a)') "cudaSetDevice failed"
write(*,'(a)') cudaGetErrorString(err)
stop
end if
end if
end subroutine set_device
! Allocate the three GPU arrays with array_size elements each.
subroutine alloc(array_size)
implicit none
integer(kind=StreamIntKind) :: array_size
integer :: err
N = array_size
allocate( A(1:N), B(1:N), C(1:N), stat=err)
if (err .ne. 0) then
write(*,'(a20,i3)') 'allocate returned ',err
stop 1
endif
end subroutine alloc
! Free the GPU arrays.
subroutine dealloc()
implicit none
integer :: err
deallocate( A, B, C, stat=err)
if (err .ne. 0) then
write(*,'(a20,i3)') 'deallocate returned ',err
stop 1
endif
end subroutine dealloc
! Fill A, B, C with their starting scalars (array assignment to device
! memory), then synchronize so later timings start from an idle device.
subroutine init_arrays(initA, initB, initC)
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
implicit none
real(kind=REAL64), intent(in) :: initA, initB, initC
integer(kind=StreamIntKind) :: i
integer :: err
A = initA
B = initB
C = initC
err = cudaDeviceSynchronize()
if (err.ne.0) then
write(*,'(a)') "cudaDeviceSynchronize failed"
write(*,'(a)') cudaGetErrorString(err)
stop
endif
end subroutine init_arrays
! Copy the device arrays into the host buffers for verification.
subroutine read_arrays(h_A, h_B, h_C)
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
implicit none
real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
integer(kind=StreamIntKind) :: i
integer :: err
h_A = A
h_B = B
h_C = C
err = cudaDeviceSynchronize()
if (err.ne.0) then
write(*,'(a)') "cudaDeviceSynchronize failed"
write(*,'(a)') cudaGetErrorString(err)
stop
endif
end subroutine read_arrays
! STREAM copy: C = A as a CUF kernel; blocks until the kernel completes.
subroutine copy()
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
implicit none
integer(kind=StreamIntKind) :: i
integer :: err
!$cuf kernel do <<< *, * >>>
do i=1,N
C(i) = A(i)
end do
err = cudaDeviceSynchronize()
if (err.ne.0) then
write(*,'(a)') "cudaDeviceSynchronize failed"
write(*,'(a)') cudaGetErrorString(err)
stop
endif
end subroutine copy
! STREAM add: C = A + B as a CUF kernel.
subroutine add()
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
implicit none
integer(kind=StreamIntKind) :: i
integer :: err
!$cuf kernel do <<< *, * >>>
do i=1,N
C(i) = A(i) + B(i)
end do
err = cudaDeviceSynchronize()
if (err.ne.0) then
write(*,'(a)') "cudaDeviceSynchronize failed"
write(*,'(a)') cudaGetErrorString(err)
stop
endif
end subroutine add
! STREAM mul (scale): B = scalar * C as a CUF kernel.
subroutine mul(startScalar)
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
integer :: err
scalar = startScalar
!$cuf kernel do <<< *, * >>>
do i=1,N
B(i) = scalar * C(i)
end do
err = cudaDeviceSynchronize()
if (err.ne.0) then
write(*,'(a)') "cudaDeviceSynchronize failed"
write(*,'(a)') cudaGetErrorString(err)
stop
endif
end subroutine mul
! STREAM triad: A = B + scalar * C as a CUF kernel.
subroutine triad(startScalar)
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
integer :: err
scalar = startScalar
!$cuf kernel do <<< *, * >>>
do i=1,N
A(i) = B(i) + scalar * C(i)
end do
err = cudaDeviceSynchronize()
if (err.ne.0) then
write(*,'(a)') "cudaDeviceSynchronize failed"
write(*,'(a)') cudaGetErrorString(err)
stop
endif
end subroutine triad
! nstream: A = A + B + scalar * C as a CUF kernel.
subroutine nstream(startScalar)
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
integer :: err
scalar = startScalar
!$cuf kernel do <<< *, * >>>
do i=1,N
A(i) = A(i) + B(i) + scalar * C(i)
end do
err = cudaDeviceSynchronize()
if (err.ne.0) then
write(*,'(a)') "cudaDeviceSynchronize failed"
write(*,'(a)') cudaGetErrorString(err)
stop
endif
end subroutine nstream
! Dot: r = sum(A*B); the CUF kernel directive is relied on to generate the
! reduction for the scalar accumulation into r.
function dot() result(r)
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
implicit none
real(kind=REAL64) :: r
integer(kind=StreamIntKind) :: i
integer :: err
r = real(0,kind=REAL64)
!$cuf kernel do <<< *, * >>>
do i=1,N
r = r + A(i) * B(i)
end do
err = cudaDeviceSynchronize()
if (err.ne.0) then
write(*,'(a)') "cudaDeviceSynchronize failed"
write(*,'(a)') cudaGetErrorString(err)
stop
endif
end function dot
end module CUDAKernelStream

309
src/fortran/CUDAStream.F90 Normal file
View File

@ -0,0 +1,309 @@
! Hand-written CUDA Fortran global kernels used by the CUDAStream module.
! Each kernel computes a 1-based global element index from the launch
! configuration and guards with i <= N so over-provisioned grids stay in
! bounds (Fortran is case-insensitive: N here is the dummy argument n).
module CUDAFortranKernels
use, intrinsic :: ISO_Fortran_env
use BabelStreamTypes
contains
! STREAM copy kernel: C(i) = A(i).
attributes(global) subroutine do_copy(n,A,C)
implicit none
integer(kind=StreamIntKind), intent(in), value :: n
real(kind=REAL64), intent(in) :: A(n)
real(kind=REAL64), intent(out) :: C(n)
integer(kind=StreamIntKind) :: i
i = blockDim%x * (blockIdx%x - 1) + threadIdx%x
if (i <= N) then
C(i) = A(i)
endif
end subroutine do_copy
! STREAM add kernel: C(i) = A(i) + B(i).
attributes(global) subroutine do_add(n,A,B,C)
implicit none
integer(kind=StreamIntKind), intent(in), value :: n
real(kind=REAL64), intent(in) :: A(n), B(n)
real(kind=REAL64), intent(out) :: C(n)
integer(kind=StreamIntKind) :: i
i = blockDim%x * (blockIdx%x - 1) + threadIdx%x
if (i <= N) then
C(i) = A(i) + B(i)
endif
end subroutine do_add
! STREAM mul (scale) kernel: B(i) = scalar * C(i).
attributes(global) subroutine do_mul(n,scalar,B,C)
implicit none
integer(kind=StreamIntKind), intent(in), value :: n
real(kind=REAL64), intent(in), value :: scalar
real(kind=REAL64), intent(out) :: B(n)
real(kind=REAL64), intent(in) :: C(n)
integer(kind=StreamIntKind) :: i
i = blockDim%x * (blockIdx%x - 1) + threadIdx%x
if (i <= N) then
B(i) = scalar * C(i)
endif
end subroutine do_mul
! STREAM triad kernel: A(i) = B(i) + scalar * C(i).
attributes(global) subroutine do_triad(n,scalar,A,B,C)
implicit none
integer(kind=StreamIntKind), intent(in), value :: n
real(kind=REAL64), intent(in), value :: scalar
real(kind=REAL64), intent(out) :: A(n)
real(kind=REAL64), intent(in) :: B(n), C(n)
integer(kind=StreamIntKind) :: i
i = blockDim%x * (blockIdx%x - 1) + threadIdx%x
if (i <= N) then
A(i) = B(i) + scalar * C(i)
endif
end subroutine do_triad
! nstream kernel: A(i) = A(i) + B(i) + scalar * C(i).
attributes(global) subroutine do_nstream(n,scalar,A,B,C)
implicit none
integer(kind=StreamIntKind), intent(in), value :: n
real(kind=REAL64), intent(in), value :: scalar
real(kind=REAL64), intent(inout) :: A(n)
real(kind=REAL64), intent(in) :: B(n), C(n)
integer(kind=StreamIntKind) :: i
i = blockDim%x * (blockIdx%x - 1) + threadIdx%x
if (i <= N) then
A(i) = A(i) + B(i) + scalar * C(i)
endif
end subroutine do_nstream
! Disabled dot kernel; CUDAStream computes dot with a CUF kernel instead.
#if 0
attributes(global) subroutine do_dot(n,A,B,r)
implicit none
integer(kind=StreamIntKind), intent(in), value :: n
real(kind=REAL64), intent(in) :: A(n), B(n)
real(kind=REAL64), intent(out) :: r
integer(kind=StreamIntKind) :: i
r = real(0,kind=REAL64)
!$cuf kernel do <<< *, * >>>
do i=1,N
r = r + A(i) * B(i)
end do
end subroutine do_dot
#endif
end module CUDAFortranKernels
! BabelStream "CUDA" backend: launches the hand-written kernels from
! CUDAFortranKernels with an explicit grid/block configuration, over
! device-resident (or, with USE_MANAGED, managed) arrays.
module CUDAStream
use, intrinsic :: ISO_Fortran_env
use BabelStreamTypes
use cudafor, only: dim3
implicit none
character(len=4), parameter :: implementation_name = "CUDA"
! Number of elements per array.
integer(kind=StreamIntKind) :: N
#ifdef USE_MANAGED
real(kind=REAL64), allocatable, managed :: A(:), B(:), C(:)
#else
real(kind=REAL64), allocatable, device :: A(:), B(:), C(:)
#endif
! Launch configuration, set once in alloc().
type(dim3) :: grid, tblock
contains
! Print how many CUDA devices are visible; stops on a CUDA API error.
! NOTE(review): the i1 edit descriptor prints one digit, so counts above 9
! will not display correctly.
subroutine list_devices()
use cudafor
implicit none
integer :: num, err
err = cudaGetDeviceCount(num)
if (err.ne.0) then
write(*,'(a)') "cudaGetDeviceCount failed"
write(*,'(a)') cudaGetErrorString(err)
stop
else if (num.eq.0) then
write(*,'(a17)') "No devices found."
else
write(*,'(a10,i1,a8)') "There are ",num," devices."
end if
end subroutine list_devices
! Validate dev (0-based, must be < count) and select it as the current CUDA
! device; stops on any failure.
subroutine set_device(dev)
use cudafor
implicit none
integer, intent(in) :: dev
integer :: num, err
err = cudaGetDeviceCount(num)
if (err.ne.0) then
write(*,'(a)') "cudaGetDeviceCount failed"
write(*,'(a)') cudaGetErrorString(err)
stop
else if (num.eq.0) then
write(*,'(a17)') "No devices found."
stop
else if (dev.ge.num) then
write(*,'(a21)') "Invalid device index."
stop
else
err = cudaSetDevice(dev)
if (err.ne.0) then
write(*,'(a)') "cudaSetDevice failed"
write(*,'(a)') cudaGetErrorString(err)
stop
end if
end if
end subroutine set_device
! Allocate the GPU arrays and derive the launch configuration:
! 128 threads/block and enough blocks to cover N elements.
! NOTE(review): ceiling(real(N)/tblock%x) goes through default (single
! precision) real; for very large N the rounding could mis-size the grid —
! confirm, e.g. by computing the block count in integer arithmetic.
subroutine alloc(array_size)
implicit none
integer(kind=StreamIntKind) :: array_size
integer :: err
N = array_size
allocate( A(1:N), B(1:N), C(1:N), stat=err)
if (err .ne. 0) then
write(*,'(a20,i3)') 'allocate returned ',err
stop 1
endif
! move to separate subroutine later
tblock = dim3(128,1,1)
grid = dim3(ceiling(real(N)/tblock%x),1,1)
end subroutine alloc
! Free the GPU arrays.
subroutine dealloc()
implicit none
integer :: err
deallocate( A, B, C, stat=err)
if (err .ne. 0) then
write(*,'(a20,i3)') 'deallocate returned ',err
stop 1
endif
end subroutine dealloc
! Fill A, B, C with their starting scalars, then synchronize so later
! timings start from an idle device.
subroutine init_arrays(initA, initB, initC)
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
implicit none
real(kind=REAL64), intent(in) :: initA, initB, initC
integer(kind=StreamIntKind) :: i
integer :: err
A = initA
B = initB
C = initC
err = cudaDeviceSynchronize()
if (err.ne.0) then
write(*,'(a)') "cudaDeviceSynchronize failed"
write(*,'(a)') cudaGetErrorString(err)
stop
endif
end subroutine init_arrays
! Copy the device arrays into the host buffers for verification.
subroutine read_arrays(h_A, h_B, h_C)
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
implicit none
real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
integer(kind=StreamIntKind) :: i
integer :: err
h_A = A
h_B = B
h_C = C
err = cudaDeviceSynchronize()
if (err.ne.0) then
write(*,'(a)') "cudaDeviceSynchronize failed"
write(*,'(a)') cudaGetErrorString(err)
stop
endif
end subroutine read_arrays
! STREAM copy: launch do_copy, then block until it finishes.
subroutine copy()
use CUDAFortranKernels, only: do_copy
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
implicit none
integer :: err
call do_copy<<<grid, tblock>>>(N, A, C)
err = cudaDeviceSynchronize()
if (err.ne.0) then
write(*,'(a)') "cudaDeviceSynchronize failed"
write(*,'(a)') cudaGetErrorString(err)
stop
endif
end subroutine copy
! STREAM add: launch do_add.
subroutine add()
use CUDAFortranKernels, only: do_add
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
implicit none
integer :: err
call do_add<<<grid, tblock>>>(N, A, B, C)
err = cudaDeviceSynchronize()
if (err.ne.0) then
write(*,'(a)') "cudaDeviceSynchronize failed"
write(*,'(a)') cudaGetErrorString(err)
stop
endif
end subroutine add
! STREAM mul (scale): launch do_mul with the benchmark scalar.
subroutine mul(startScalar)
use CUDAFortranKernels, only: do_mul
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer :: err
scalar = startScalar
call do_mul<<<grid, tblock>>>(N, scalar, B, C)
err = cudaDeviceSynchronize()
if (err.ne.0) then
write(*,'(a)') "cudaDeviceSynchronize failed"
write(*,'(a)') cudaGetErrorString(err)
stop
endif
end subroutine mul
! STREAM triad: launch do_triad with the benchmark scalar.
subroutine triad(startScalar)
use CUDAFortranKernels, only: do_triad
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer :: err
scalar = startScalar
call do_triad<<<grid, tblock>>>(N, scalar, A, B, C)
err = cudaDeviceSynchronize()
if (err.ne.0) then
write(*,'(a)') "cudaDeviceSynchronize failed"
write(*,'(a)') cudaGetErrorString(err)
stop
endif
end subroutine triad
! nstream: launch do_nstream with the benchmark scalar.
subroutine nstream(startScalar)
use CUDAFortranKernels, only: do_nstream
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer :: err
scalar = startScalar
call do_nstream<<<grid, tblock>>>(N, scalar, A, B, C)
err = cudaDeviceSynchronize()
if (err.ne.0) then
write(*,'(a)') "cudaDeviceSynchronize failed"
write(*,'(a)') cudaGetErrorString(err)
stop
endif
end subroutine nstream
! Dot: r = sum(A*B). The hand-written do_dot kernel is disabled; instead a
! CUF kernel directive is used so the compiler generates the reduction.
function dot() result(r)
!use CUDAFortranKernels, only: do_dot
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
implicit none
real(kind=REAL64) :: r
integer :: err
integer(kind=StreamIntKind) :: i
!call do_dot<<<grid, tblock>>>(N, B, C, r)
r = real(0,kind=REAL64)
!$cuf kernel do <<< *, * >>>
do i=1,N
r = r + A(i) * B(i)
end do
err = cudaDeviceSynchronize()
if (err.ne.0) then
write(*,'(a)') "cudaDeviceSynchronize failed"
write(*,'(a)') cudaGetErrorString(err)
stop
endif
end function dot
end module CUDAStream

View File

@ -0,0 +1,139 @@
! BabelStream "DoConcurrent" backend: every kernel is a DO CONCURRENT loop,
! leaving the parallelisation strategy to the compiler. With USE_DEVICE the
! arrays carry the CUDA Fortran `device` attribute (for offloaded builds).
module DoConcurrentStream
use, intrinsic :: ISO_Fortran_env
use BabelStreamTypes
implicit none
character(len=12), parameter :: implementation_name = "DoConcurrent"
! Number of elements per array.
integer(kind=StreamIntKind) :: N
#ifdef USE_DEVICE
real(kind=REAL64), allocatable, device :: A(:), B(:), C(:)
#else
real(kind=REAL64), allocatable :: A(:), B(:), C(:)
#endif
contains
! Device enumeration is not available through DO CONCURRENT; say so.
subroutine list_devices()
implicit none
integer :: num
write(*,'(a36,a12)') "Listing devices is not supported by ", implementation_name
end subroutine list_devices
! Device selection is not available through DO CONCURRENT; report only.
subroutine set_device(dev)
implicit none
integer, intent(in) :: dev
write(*,'(a32,a12)') "Device != 0 is not supported by ", implementation_name
end subroutine set_device
! Allocate A, B and C with array_size elements each; stops on failure.
subroutine alloc(array_size)
implicit none
integer(kind=StreamIntKind) :: array_size
integer :: err
N = array_size
allocate( A(1:N), B(1:N), C(1:N), stat=err)
if (err .ne. 0) then
write(*,'(a20,i3)') 'allocate returned ',err
stop 1
endif
end subroutine alloc
! Release the work arrays; stops on failure.
subroutine dealloc()
implicit none
integer :: err
deallocate( A, B, C, stat=err)
if (err .ne. 0) then
write(*,'(a20,i3)') 'deallocate returned ',err
stop 1
endif
end subroutine dealloc
! Fill each array with its starting scalar value.
subroutine init_arrays(initA, initB, initC)
implicit none
real(kind=REAL64), intent(in) :: initA, initB, initC
integer(kind=StreamIntKind) :: i
do concurrent (i=1:N)
A(i) = initA
B(i) = initB
C(i) = initC
end do
end subroutine init_arrays
! Copy the work arrays into the caller's host buffers for verification.
subroutine read_arrays(h_A, h_B, h_C)
implicit none
real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
integer(kind=StreamIntKind) :: i
do concurrent (i=1:N) !shared(A,B,C)
h_A(i) = A(i)
h_B(i) = B(i)
h_C(i) = C(i)
end do
end subroutine read_arrays
! STREAM copy kernel: C = A.
subroutine copy()
implicit none
integer(kind=StreamIntKind) :: i
do concurrent (i=1:N) !shared(A,C)
C(i) = A(i)
end do
end subroutine copy
! STREAM add kernel: C = A + B.
subroutine add()
implicit none
integer(kind=StreamIntKind) :: i
do concurrent (i=1:N) !shared(A,B,C)
C(i) = A(i) + B(i)
end do
end subroutine add
! STREAM mul (scale) kernel: B = scalar * C.
subroutine mul(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
scalar = startScalar
do concurrent (i=1:N) !shared(B,C)
B(i) = scalar * C(i)
end do
end subroutine mul
! STREAM triad kernel: A = B + scalar * C.
subroutine triad(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
scalar = startScalar
do concurrent (i=1:N) !shared(A,B,C)
A(i) = B(i) + scalar * C(i)
end do
end subroutine triad
! nstream kernel: A = A + B + scalar * C.
subroutine nstream(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
scalar = startScalar
do concurrent (i=1:N) !shared(A,B,C)
A(i) = A(i) + B(i) + scalar * C(i)
end do
end subroutine nstream
! Dot kernel: s = sum(A*B). Under CRAY_THREAD_DOCONCURRENT the loop falls
! back to a plain (serial) DO, avoiding the unsupported reduction.
function dot() result(s)
implicit none
real(kind=REAL64) :: s
integer(kind=StreamIntKind) :: i
! reduction omitted because NVF infers it and other compilers do not support
s = real(0,kind=REAL64)
#ifdef CRAY_THREAD_DOCONCURRENT
do i=1,N
#else
do concurrent (i=1:N) !shared(A,B)
#endif
s = s + A(i) * B(i)
end do
end function dot
end module DoConcurrentStream

109
src/fortran/Makefile Normal file
View File

@ -0,0 +1,109 @@
# Fortran BabelStream build.
# COMPILER selects a make.inc.<name> fragment (compiler flags); defaults to gcc.
# IMPLEMENTATION selects the backend module to compile and link; defaults to
# Sequential. Output binary: BabelStream.$(COMPILER).$(IMPLEMENTATION).
ifeq ($(COMPILER),nvhpc)
include make.inc.nvhpc
else ifeq ($(COMPILER),oneapi)
include make.inc.oneapi
else ifeq ($(COMPILER),gcc)
include make.inc.gcc
else ifeq ($(COMPILER),amd)
include make.inc.amd
else ifeq ($(COMPILER),arm)
include make.inc.arm
else ifeq ($(COMPILER),cray)
include make.inc.cray
else ifeq ($(COMPILER),fj)
include make.inc.fj
else
$(info Set COMPILER={nvhpc,oneapi,amd,arm,cray,fj,gcc}. Default is gcc.)
include make.inc.gcc
COMPILER=gcc
endif
# Version string reported by the benchmark; uncomment the next line to use
# 32-bit array indices (see BabelStreamTypes.F90).
FCFLAGS += -DVERSION_STRING="5.0"
#FCFLAGS += -DUSE_INT32
# Map IMPLEMENTATION to its preprocessor define, per-model flags, and the
# object that provides the Stream implementation.
ifeq ($(IMPLEMENTATION),DoConcurrent)
FCFLAGS += -DUSE_DOCONCURRENT $(DOCONCURRENT_FLAG)
IMPLEMENTATION_OBJECT = DoConcurrentStream.o
else ifeq ($(IMPLEMENTATION),Array)
FCFLAGS += -DUSE_ARRAY $(ARRAY_FLAG)
IMPLEMENTATION_OBJECT = ArrayStream.o
else ifeq ($(IMPLEMENTATION),OpenMP)
FCFLAGS += -DUSE_OPENMP $(OPENMP_FLAG)
IMPLEMENTATION_OBJECT = OpenMPStream.o
else ifeq ($(IMPLEMENTATION),OpenMPWorkshare)
FCFLAGS += -DUSE_OPENMPWORKSHARE $(OPENMP_FLAG)
IMPLEMENTATION_OBJECT = OpenMPWorkshareStream.o
else ifeq ($(IMPLEMENTATION),OpenMPTarget)
FCFLAGS += -DUSE_OPENMPTARGET $(OPENMP_FLAG)
IMPLEMENTATION_OBJECT = OpenMPTargetStream.o
else ifeq ($(IMPLEMENTATION),OpenMPTargetLoop)
FCFLAGS += -DUSE_OPENMPTARGETLOOP $(OPENMP_FLAG)
IMPLEMENTATION_OBJECT = OpenMPTargetLoopStream.o
else ifeq ($(IMPLEMENTATION),OpenMPTaskloop)
FCFLAGS += -DUSE_OPENMPTASKLOOP $(OPENMP_FLAG)
IMPLEMENTATION_OBJECT = OpenMPTaskloopStream.o
else ifeq ($(IMPLEMENTATION),OpenACC)
FCFLAGS += -DUSE_OPENACC $(OPENACC_FLAG)
IMPLEMENTATION_OBJECT = OpenACCStream.o
else ifeq ($(IMPLEMENTATION),OpenACCArray)
FCFLAGS += -DUSE_OPENACCARRAY $(OPENACC_FLAG)
IMPLEMENTATION_OBJECT = OpenACCArrayStream.o
else ifeq ($(IMPLEMENTATION),CUDA)
FCFLAGS += -DUSE_CUDA $(CUDA_FLAG)
IMPLEMENTATION_OBJECT = CUDAStream.o
else ifeq ($(IMPLEMENTATION),CUDAKernel)
FCFLAGS += -DUSE_CUDAKERNEL $(CUDA_FLAG)
IMPLEMENTATION_OBJECT = CUDAKernelStream.o
else ifeq ($(IMPLEMENTATION),Sequential)
FCFLAGS += -DUSE_SEQUENTIAL $(SEQUENTIAL_FLAG)
IMPLEMENTATION_OBJECT = SequentialStream.o
else
$(info Set IMPLEMENTATION={DoConcurrent,Array,OpenMP,OpenMPWorkshare,OpenMPTarget,OpenMPTargetLoop,OpenMPTaskloop,OpenACC,OpenACCArray,CUDA,CUDAKernel}.)
FCFLAGS += -DUSE_SEQUENTIAL $(SEQUENTIAL_FLAG)
IMPLEMENTATION=Sequential
IMPLEMENTATION_OBJECT = SequentialStream.o
endif
all: BabelStream.$(COMPILER).$(IMPLEMENTATION)
# Link the driver with the chosen backend object; BabelStreamTypes.o is a
# dependency of every backend and is linked explicitly.
BabelStream.$(COMPILER).$(IMPLEMENTATION): main.F90 $(IMPLEMENTATION_OBJECT)
$(FC) $(FCFLAGS) $^ BabelStreamTypes.o -o $@
# The shared kind-definitions module must be built before any backend.
BabelStreamTypes.o BabelStreamTypes.mod: BabelStreamTypes.F90
$(FC) $(FCFLAGS) -c $<
%.o: %.F90 BabelStreamTypes.mod
$(FC) $(FCFLAGS) -c $<
# Remove objects and module files for every backend (compilers differ on
# .mod file case, hence both spellings).
clean:
-rm -f main.o BabelStreamUtil.mod babelstreamutil.mod
-rm -f BabelStreamTypes.o BabelStreamTypes.mod babelstreamtypes.mod
-rm -f DoConcurrentStream.o DoConcurrentStream.mod doconcurrentstream.mod
-rm -f ArrayStream.o ArrayStream.mod arraystream.mod
-rm -f SequentialStream.o SequentialStream.mod sequentialstream.mod
-rm -f OpenMPStream.o OpenMPStream.mod openmpstream.mod
-rm -f OpenMPWorkshareStream.o OpenMPWorkshareStream.mod openmpworksharestream.mod
-rm -f OpenMPTaskloopStream.o OpenMPTaskloopStream.mod openmptaskloopstream.mod
-rm -f OpenMPTargetStream.o OpenMPTargetStream.mod openmptargetstream.mod
-rm -f OpenMPTargetLoopStream.o OpenMPTargetLoopStream.mod openmptargetloopstream.mod
-rm -f OpenACCStream.o OpenACCStream.mod openaccstream.mod
-rm -f OpenACCArrayStream.o OpenACCArrayStream.mod openaccarraystream.mod
-rm -f CUDAStream.o CUDAStream.mod cudastream.mod CUDAFortranKernels.mod cudafortrankernels.mod
-rm -f CUDAKernelStream.o CUDAKernelStream.mod cudakernelstream.mod
-rm -f *.modmic *.mod *.o *.cub *.ptx
realclean: clean
-rm -f BabelStream.*

View File

@ -0,0 +1,144 @@
! BabelStream "OpenACCArray" backend: whole-array assignments inside
! !$acc kernels regions; without USE_MANAGED the arrays are explicitly
! mapped to the device with enter/exit data.
module OpenACCArrayStream
use, intrinsic :: ISO_Fortran_env
use BabelStreamTypes
implicit none
character(len=12), parameter :: implementation_name = "OpenACCArray"
! Number of elements per array.
integer(kind=StreamIntKind) :: N
real(kind=REAL64), allocatable :: A(:), B(:), C(:)
contains
! Print how many devices of the current OpenACC device type are available.
subroutine list_devices()
use openacc
implicit none
integer :: num
num = acc_get_num_devices(acc_get_device_type())
if (num.eq.0) then
write(*,'(a17)') "No devices found."
else
write(*,'(a10,i1,a8)') "There are ",num," devices."
end if
end subroutine list_devices
! Validate dev against the device count and select it.
! NOTE(review): the bounds check uses .gt. while the CUDA backends use
! .ge.; if OpenACC device numbers are 0-based, dev == num slips through
! here — confirm the intended numbering.
subroutine set_device(dev)
use openacc
implicit none
integer, intent(in) :: dev
integer :: num
num = acc_get_num_devices(acc_get_device_type())
if (num.eq.0) then
write(*,'(a17)') "No devices found."
stop
else if (dev.gt.num) then
write(*,'(a21)') "Invalid device index."
stop
else
call acc_set_device_num(dev, acc_get_device_type())
end if
end subroutine set_device
! Allocate the host arrays and (unless USE_MANAGED) create their device
! copies. NOTE(review): the enter data executes before the allocate status
! check — on allocation failure it acts on unallocated arrays; confirm.
subroutine alloc(array_size)
implicit none
integer(kind=StreamIntKind) :: array_size
integer :: err
N = array_size
allocate( A(1:N), B(1:N), C(1:N), stat=err)
#ifndef USE_MANAGED
!$acc enter data create(A,B,C)
#endif
if (err .ne. 0) then
write(*,'(a20,i3)') 'allocate returned ',err
stop 1
endif
end subroutine alloc
! Delete the device copies (unless USE_MANAGED) and free the host arrays.
subroutine dealloc()
implicit none
integer :: err
#ifndef USE_MANAGED
!$acc exit data delete(A,B,C)
#endif
deallocate( A, B, C, stat=err)
if (err .ne. 0) then
write(*,'(a20,i3)') 'deallocate returned ',err
stop 1
endif
end subroutine dealloc
! Fill each array with its starting scalar value on the device.
subroutine init_arrays(initA, initB, initC)
implicit none
real(kind=REAL64), intent(in) :: initA, initB, initC
!$acc kernels
A = initA
B = initB
C = initC
!$acc end kernels
end subroutine init_arrays
! Copy the arrays into the caller's host buffers for verification.
subroutine read_arrays(h_A, h_B, h_C)
implicit none
real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
!$acc kernels
h_A = A
h_B = B
h_C = C
!$acc end kernels
end subroutine read_arrays
! STREAM copy kernel: C = A.
subroutine copy()
implicit none
!$acc kernels
C = A
!$acc end kernels
end subroutine copy
! STREAM add kernel: C = A + B.
subroutine add()
implicit none
!$acc kernels
C = A + B
!$acc end kernels
end subroutine add
! STREAM mul (scale) kernel: B = scalar * C.
subroutine mul(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
scalar = startScalar
!$acc kernels
B = scalar * C
!$acc end kernels
end subroutine mul
! STREAM triad kernel: A = B + scalar * C.
subroutine triad(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
scalar = startScalar
!$acc kernels
A = B + scalar * C
!$acc end kernels
end subroutine triad
! nstream kernel: A = A + B + scalar * C.
subroutine nstream(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
scalar = startScalar
!$acc kernels
A = A + B + scalar * C
!$acc end kernels
end subroutine nstream
! Dot kernel: s = sum(A*B) via dot_product inside a kernels region.
function dot() result(s)
implicit none
real(kind=REAL64) :: s
!$acc kernels
s = dot_product(A,B)
!$acc end kernels
end function dot
end module OpenACCArrayStream

View File

@ -0,0 +1,161 @@
! OpenACC implementation of BabelStream using explicit do loops under
! !$acc parallel loop directives.
module OpenACCStream
  use, intrinsic :: ISO_Fortran_env
  use BabelStreamTypes
  implicit none

  character(len=7), parameter :: implementation_name = "OpenACC"

  ! Number of elements in each stream array.
  integer(kind=StreamIntKind) :: N

  ! The three stream arrays; device copies are managed explicitly
  ! unless the build defines USE_MANAGED.
  real(kind=REAL64), allocatable :: A(:), B(:), C(:)

contains

  ! Report the number of OpenACC devices of the current device type.
  subroutine list_devices()
    use openacc
    implicit none
    integer :: num
    num = acc_get_num_devices(acc_get_device_type())
    if (num.eq.0) then
      write(*,'(a17)') "No devices found."
    else
      ! i0 prints the count at its natural width; the previous i1
      ! descriptor rendered counts of 10 or more as '*'.
      write(*,'(a,i0,a)') "There are ",num," devices."
    end if
  end subroutine list_devices

  ! Select OpenACC device 'dev' of the default device type.
  ! Stops the program if no devices exist or the index is out of range.
  subroutine set_device(dev)
    use openacc
    implicit none
    integer, intent(in) :: dev
    integer :: num
    num = acc_get_num_devices(acc_get_device_type())
    if (num.eq.0) then
      write(*,'(a17)') "No devices found."
      stop
    else if (dev.gt.num) then
      write(*,'(a21)') "Invalid device index."
      stop
    else
      call acc_set_device_num(dev, acc_get_device_type())
    end if
  end subroutine set_device

  ! Allocate A, B and C with array_size elements each and create their
  ! device copies.
  subroutine alloc(array_size)
    implicit none
    integer(kind=StreamIntKind) :: array_size
    integer :: err
    N = array_size
    allocate( A(1:N), B(1:N), C(1:N), stat=err)
    ! Validate the allocation BEFORE the data directive: entering data
    ! on unallocated arrays is invalid.
    if (err .ne. 0) then
      write(*,'(a20,i3)') 'allocate returned ',err
      stop 1
    endif
#ifndef USE_MANAGED
    !$acc enter data create(A,B,C)
#endif
  end subroutine alloc

  ! Release the device copies, then deallocate the host arrays.
  subroutine dealloc()
    implicit none
    integer :: err
#ifndef USE_MANAGED
    !$acc exit data delete(A,B,C)
#endif
    deallocate( A, B, C, stat=err)
    if (err .ne. 0) then
      write(*,'(a20,i3)') 'deallocate returned ',err
      stop 1
    endif
  end subroutine dealloc

  ! Fill the arrays with their starting values on the device.
  subroutine init_arrays(initA, initB, initC)
    implicit none
    real(kind=REAL64), intent(in) :: initA, initB, initC
    integer(kind=StreamIntKind) :: i
    !$acc parallel loop
    do i=1,N
      A(i) = initA
      B(i) = initB
      C(i) = initC
    end do
  end subroutine init_arrays

  ! Copy the device arrays into the host buffers for verification.
  subroutine read_arrays(h_A, h_B, h_C)
    implicit none
    real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
    integer(kind=StreamIntKind) :: i
    !$acc parallel loop
    do i=1,N
      h_A(i) = A(i)
      h_B(i) = B(i)
      h_C(i) = C(i)
    end do
  end subroutine read_arrays

  ! STREAM copy kernel: C = A
  subroutine copy()
    implicit none
    integer(kind=StreamIntKind) :: i
    !$acc parallel loop
    do i=1,N
      C(i) = A(i)
    end do
  end subroutine copy

  ! STREAM add kernel: C = A + B
  subroutine add()
    implicit none
    integer(kind=StreamIntKind) :: i
    !$acc parallel loop
    do i=1,N
      C(i) = A(i) + B(i)
    end do
  end subroutine add

  ! STREAM scale kernel: B = scalar * C
  subroutine mul(startScalar)
    implicit none
    real(kind=REAL64), intent(in) :: startScalar
    real(kind=REAL64) :: scalar
    integer(kind=StreamIntKind) :: i
    scalar = startScalar
    !$acc parallel loop
    do i=1,N
      B(i) = scalar * C(i)
    end do
  end subroutine mul

  ! STREAM triad kernel: A = B + scalar * C
  subroutine triad(startScalar)
    implicit none
    real(kind=REAL64), intent(in) :: startScalar
    real(kind=REAL64) :: scalar
    integer(kind=StreamIntKind) :: i
    scalar = startScalar
    !$acc parallel loop
    do i=1,N
      A(i) = B(i) + scalar * C(i)
    end do
  end subroutine triad

  ! nstream kernel: A = A + B + scalar * C
  subroutine nstream(startScalar)
    implicit none
    real(kind=REAL64), intent(in) :: startScalar
    real(kind=REAL64) :: scalar
    integer(kind=StreamIntKind) :: i
    scalar = startScalar
    !$acc parallel loop
    do i=1,N
      A(i) = A(i) + B(i) + scalar * C(i)
    end do
  end subroutine nstream

  ! Dot product of A and B via a parallel reduction on the device.
  function dot() result(s)
    implicit none
    real(kind=REAL64) :: s
    integer(kind=StreamIntKind) :: i
    s = real(0,kind=REAL64)
    !$acc parallel loop reduction(+:s)
    do i=1,N
      s = s + A(i) * B(i)
    end do
  end function dot

end module OpenACCStream

View File

@ -0,0 +1,137 @@
! Host-only OpenMP implementation of BabelStream: each kernel is a do
! loop under an !$omp parallel do simd directive.
module OpenMPStream
use, intrinsic :: ISO_Fortran_env
use BabelStreamTypes
implicit none
character(len=6), parameter :: implementation_name = "OpenMP"
! Number of elements in each stream array.
integer(kind=StreamIntKind) :: N
! The three stream arrays shared by all kernels.
real(kind=REAL64), allocatable :: A(:), B(:), C(:)
contains
! Device enumeration is meaningless for a host-only implementation.
subroutine list_devices()
implicit none
write(*,'(a36,a12)') "Listing devices is not supported by ", implementation_name
end subroutine list_devices
! Device selection is meaningless for a host-only implementation.
subroutine set_device(dev)
implicit none
integer, intent(in) :: dev
write(*,'(a32,a12)') "Device != 0 is not supported by ", implementation_name
end subroutine set_device
! Allocate the three stream arrays with array_size elements each.
subroutine alloc(array_size)
implicit none
integer(kind=StreamIntKind) :: array_size
integer :: err
N = array_size
allocate( A(1:N), B(1:N), C(1:N), stat=err)
if (err .ne. 0) then
write(*,'(a20,i3)') 'allocate returned ',err
stop 1
endif
end subroutine alloc
! Free the three stream arrays.
subroutine dealloc()
implicit none
integer :: err
deallocate( A, B, C, stat=err)
if (err .ne. 0) then
write(*,'(a20,i3)') 'deallocate returned ',err
stop 1
endif
end subroutine dealloc
! Fill the arrays with their starting values.
subroutine init_arrays(initA, initB, initC)
implicit none
real(kind=REAL64), intent(in) :: initA, initB, initC
integer(kind=StreamIntKind) :: i
!$omp parallel do simd
do i=1,N
A(i) = initA
B(i) = initB
C(i) = initC
end do
end subroutine init_arrays
! Copy the working arrays into the caller's buffers for verification.
subroutine read_arrays(h_A, h_B, h_C)
implicit none
real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
integer(kind=StreamIntKind) :: i
!$omp parallel do simd
do i=1,N
h_A(i) = A(i)
h_B(i) = B(i)
h_C(i) = C(i)
end do
end subroutine read_arrays
! STREAM copy kernel: C = A
subroutine copy()
implicit none
integer(kind=StreamIntKind) :: i
!$omp parallel do simd
do i=1,N
C(i) = A(i)
end do
end subroutine copy
! STREAM add kernel: C = A + B
subroutine add()
implicit none
integer(kind=StreamIntKind) :: i
!$omp parallel do simd
do i=1,N
C(i) = A(i) + B(i)
end do
end subroutine add
! STREAM scale kernel: B = scalar * C
subroutine mul(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
scalar = startScalar
!$omp parallel do simd
do i=1,N
B(i) = scalar * C(i)
end do
end subroutine mul
! STREAM triad kernel: A = B + scalar * C
subroutine triad(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
scalar = startScalar
!$omp parallel do simd
do i=1,N
A(i) = B(i) + scalar * C(i)
end do
end subroutine triad
! nstream kernel: A = A + B + scalar * C
subroutine nstream(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
scalar = startScalar
!$omp parallel do simd
do i=1,N
A(i) = A(i) + B(i) + scalar * C(i)
end do
end subroutine nstream
! Dot product of A and B via an OpenMP sum reduction.
function dot() result(s)
implicit none
real(kind=REAL64) :: s
integer(kind=StreamIntKind) :: i
s = real(0,kind=REAL64)
!$omp parallel do simd reduction(+:s)
do i=1,N
s = s + A(i) * B(i)
end do
end function dot
end module OpenMPStream

View File

@ -0,0 +1,162 @@
! OpenMP target-offload implementation of BabelStream using the
! combined !$omp target teams loop construct for every kernel.
module OpenMPTargetLoopStream
  use, intrinsic :: ISO_Fortran_env
  use BabelStreamTypes
  implicit none

  character(len=16), parameter :: implementation_name = "OpenMPTargetLoop"

  ! Number of elements in each stream array.
  integer(kind=StreamIntKind) :: N

  ! The three stream arrays; device copies are mapped explicitly
  ! unless the build defines USE_MANAGED.
  real(kind=REAL64), allocatable :: A(:), B(:), C(:)

contains

  ! Report the number of available OpenMP target devices.
  subroutine list_devices()
    use omp_lib
    implicit none
    integer :: num
    num = omp_get_num_devices()
    if (num.eq.0) then
      write(*,'(a17)') "No devices found."
    else
      ! i0 prints the count at its natural width; the previous i1
      ! descriptor rendered counts of 10 or more as '*'.
      write(*,'(a,i0,a)') "There are ",num," devices."
    end if
  end subroutine list_devices

  ! Make device 'dev' the default OpenMP target device.
  ! Stops the program if no devices exist or the index is out of range.
  subroutine set_device(dev)
    use omp_lib
    implicit none
    integer, intent(in) :: dev
    integer :: num
    num = omp_get_num_devices()
    if (num.eq.0) then
      write(*,'(a17)') "No devices found."
      stop
    else if (dev.gt.num) then
      write(*,'(a21)') "Invalid device index."
      stop
    else
      call omp_set_default_device(dev)
    end if
  end subroutine set_device

  ! Allocate A, B and C with array_size elements each and map them to
  ! the target device.
  subroutine alloc(array_size)
    implicit none
    integer(kind=StreamIntKind) :: array_size
    integer :: err
    N = array_size
    allocate( A(1:N), B(1:N), C(1:N), stat=err)
    ! Validate the allocation BEFORE mapping: 'target enter data' on
    ! unallocated arrays is invalid.
    if (err .ne. 0) then
      write(*,'(a20,i3)') 'allocate returned ',err
      stop 1
    endif
#ifndef USE_MANAGED
    !$omp target enter data map(alloc: A,B,C)
#endif
  end subroutine alloc

  ! Unmap the device copies, then deallocate the host arrays.
  subroutine dealloc()
    implicit none
    integer :: err
#ifndef USE_MANAGED
    !$omp target exit data map(delete: A,B,C)
#endif
    deallocate( A, B, C, stat=err)
    if (err .ne. 0) then
      write(*,'(a20,i3)') 'deallocate returned ',err
      stop 1
    endif
  end subroutine dealloc

  ! Fill the arrays with their starting values on the device.
  subroutine init_arrays(initA, initB, initC)
    implicit none
    real(kind=REAL64), intent(in) :: initA, initB, initC
    integer(kind=StreamIntKind) :: i
    !$omp target teams loop
    do i=1,N
      A(i) = initA
      B(i) = initB
      C(i) = initC
    end do
  end subroutine init_arrays

  ! Copy the device arrays into the host buffers for verification.
  subroutine read_arrays(h_A, h_B, h_C)
    implicit none
    real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
    integer(kind=StreamIntKind) :: i
    ! this might need to use a copy API instead...
    !$omp target teams loop
    do i=1,N
      h_A(i) = A(i)
      h_B(i) = B(i)
      h_C(i) = C(i)
    end do
  end subroutine read_arrays

  ! STREAM copy kernel: C = A
  subroutine copy()
    implicit none
    integer(kind=StreamIntKind) :: i
    !$omp target teams loop
    do i=1,N
      C(i) = A(i)
    end do
  end subroutine copy

  ! STREAM add kernel: C = A + B
  subroutine add()
    implicit none
    integer(kind=StreamIntKind) :: i
    !$omp target teams loop
    do i=1,N
      C(i) = A(i) + B(i)
    end do
  end subroutine add

  ! STREAM scale kernel: B = scalar * C
  subroutine mul(startScalar)
    implicit none
    real(kind=REAL64), intent(in) :: startScalar
    real(kind=REAL64) :: scalar
    integer(kind=StreamIntKind) :: i
    scalar = startScalar
    !$omp target teams loop
    do i=1,N
      B(i) = scalar * C(i)
    end do
  end subroutine mul

  ! STREAM triad kernel: A = B + scalar * C
  subroutine triad(startScalar)
    implicit none
    real(kind=REAL64), intent(in) :: startScalar
    real(kind=REAL64) :: scalar
    integer(kind=StreamIntKind) :: i
    scalar = startScalar
    !$omp target teams loop
    do i=1,N
      A(i) = B(i) + scalar * C(i)
    end do
  end subroutine triad

  ! nstream kernel: A = A + B + scalar * C
  subroutine nstream(startScalar)
    implicit none
    real(kind=REAL64), intent(in) :: startScalar
    real(kind=REAL64) :: scalar
    integer(kind=StreamIntKind) :: i
    scalar = startScalar
    !$omp target teams loop
    do i=1,N
      A(i) = A(i) + B(i) + scalar * C(i)
    end do
  end subroutine nstream

  ! Dot product of A and B via a device-side sum reduction.
  function dot() result(s)
    implicit none
    real(kind=REAL64) :: s
    integer(kind=StreamIntKind) :: i
    s = real(0,kind=REAL64)
    !$omp target teams loop reduction(+:s)
    do i=1,N
      s = s + A(i) * B(i)
    end do
  end function dot

end module OpenMPTargetLoopStream

View File

@ -0,0 +1,163 @@
! OpenMP target-offload implementation of BabelStream using the
! combined !$omp target teams distribute parallel do simd construct.
module OpenMPTargetStream
  use, intrinsic :: ISO_Fortran_env
  use BabelStreamTypes
  implicit none

  character(len=12), parameter :: implementation_name = "OpenMPTarget"

  ! Number of elements in each stream array.
  integer(kind=StreamIntKind) :: N

  ! The three stream arrays; device copies are mapped explicitly
  ! unless the build defines USE_MANAGED.
  real(kind=REAL64), allocatable :: A(:), B(:), C(:)

contains

  ! Report the number of available OpenMP target devices.
  subroutine list_devices()
    use omp_lib
    implicit none
    integer :: num
    num = omp_get_num_devices()
    if (num.eq.0) then
      write(*,'(a17)') "No devices found."
    else
      ! i0 prints the count at its natural width; the previous i1
      ! descriptor rendered counts of 10 or more as '*'.
      write(*,'(a,i0,a)') "There are ",num," devices."
    end if
  end subroutine list_devices

  ! Make device 'dev' the default OpenMP target device.
  ! Stops the program if no devices exist or the index is out of range.
  subroutine set_device(dev)
    use omp_lib
    implicit none
    integer, intent(in) :: dev
    integer :: num
    num = omp_get_num_devices()
    if (num.eq.0) then
      write(*,'(a17)') "No devices found."
      stop
    else if (dev.gt.num) then
      write(*,'(a21)') "Invalid device index."
      stop
    else
      call omp_set_default_device(dev)
    end if
  end subroutine set_device

  ! Allocate A, B and C with array_size elements each and map them to
  ! the target device.
  subroutine alloc(array_size)
    implicit none
    integer(kind=StreamIntKind) :: array_size
    integer :: err
    N = array_size
    allocate( A(1:N), B(1:N), C(1:N), stat=err)
    ! Validate the allocation BEFORE mapping: 'target enter data' on
    ! unallocated arrays is invalid.
    if (err .ne. 0) then
      write(*,'(a20,i3)') 'allocate returned ',err
      stop 1
    endif
#ifndef USE_MANAGED
    !$omp target enter data map(alloc: A,B,C)
#endif
  end subroutine alloc

  ! Unmap the device copies, then deallocate the host arrays.
  subroutine dealloc()
    implicit none
    integer :: err
#ifndef USE_MANAGED
    !$omp target exit data map(delete: A,B,C)
#endif
    deallocate( A, B, C, stat=err)
    if (err .ne. 0) then
      write(*,'(a20,i3)') 'deallocate returned ',err
      stop 1
    endif
  end subroutine dealloc

  ! Fill the arrays with their starting values on the device.
  subroutine init_arrays(initA, initB, initC)
    implicit none
    real(kind=REAL64), intent(in) :: initA, initB, initC
    integer(kind=StreamIntKind) :: i
    !$omp target teams distribute parallel do simd
    do i=1,N
      A(i) = initA
      B(i) = initB
      C(i) = initC
    end do
  end subroutine init_arrays

  ! Copy the device arrays into the host buffers for verification.
  subroutine read_arrays(h_A, h_B, h_C)
    implicit none
    real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
    integer(kind=StreamIntKind) :: i
    ! this might need to use a copy API instead...
    !$omp target teams distribute parallel do simd
    do i=1,N
      h_A(i) = A(i)
      h_B(i) = B(i)
      h_C(i) = C(i)
    end do
  end subroutine read_arrays

  ! STREAM copy kernel: C = A
  ! (A stray orphaned '!$omp barrier' after the loop was removed: it
  ! bound to the inactive host parallel region and had no effect, and
  ! no sibling kernel carried one.)
  subroutine copy()
    implicit none
    integer(kind=StreamIntKind) :: i
    !$omp target teams distribute parallel do simd
    do i=1,N
      C(i) = A(i)
    end do
  end subroutine copy

  ! STREAM add kernel: C = A + B
  subroutine add()
    implicit none
    integer(kind=StreamIntKind) :: i
    !$omp target teams distribute parallel do simd
    do i=1,N
      C(i) = A(i) + B(i)
    end do
  end subroutine add

  ! STREAM scale kernel: B = scalar * C
  subroutine mul(startScalar)
    implicit none
    real(kind=REAL64), intent(in) :: startScalar
    real(kind=REAL64) :: scalar
    integer(kind=StreamIntKind) :: i
    scalar = startScalar
    !$omp target teams distribute parallel do simd
    do i=1,N
      B(i) = scalar * C(i)
    end do
  end subroutine mul

  ! STREAM triad kernel: A = B + scalar * C
  subroutine triad(startScalar)
    implicit none
    real(kind=REAL64), intent(in) :: startScalar
    real(kind=REAL64) :: scalar
    integer(kind=StreamIntKind) :: i
    scalar = startScalar
    !$omp target teams distribute parallel do simd
    do i=1,N
      A(i) = B(i) + scalar * C(i)
    end do
  end subroutine triad

  ! nstream kernel: A = A + B + scalar * C
  subroutine nstream(startScalar)
    implicit none
    real(kind=REAL64), intent(in) :: startScalar
    real(kind=REAL64) :: scalar
    integer(kind=StreamIntKind) :: i
    scalar = startScalar
    !$omp target teams distribute parallel do simd
    do i=1,N
      A(i) = A(i) + B(i) + scalar * C(i)
    end do
  end subroutine nstream

  ! Dot product of A and B via a device-side sum reduction.
  function dot() result(s)
    implicit none
    real(kind=REAL64) :: s
    integer(kind=StreamIntKind) :: i
    s = real(0,kind=REAL64)
    !$omp target teams distribute parallel do simd reduction(+:s)
    do i=1,N
      s = s + A(i) * B(i)
    end do
  end function dot

end module OpenMPTargetStream

View File

@ -0,0 +1,169 @@
! Host-only OpenMP implementation of BabelStream where each kernel is a
! taskloop generated from a single master thread inside a parallel
! region. Taskloop's implicit taskgroup guarantees all tasks finish
! before the master region ends.
module OpenMPTaskloopStream
use, intrinsic :: ISO_Fortran_env
use BabelStreamTypes
implicit none
character(len=14), parameter :: implementation_name = "OpenMPTaskloop"
! Number of elements in each stream array.
integer(kind=StreamIntKind) :: N
! The three stream arrays shared by all kernels.
real(kind=REAL64), allocatable :: A(:), B(:), C(:)
contains
! Device enumeration is meaningless for a host-only implementation.
subroutine list_devices()
implicit none
write(*,'(a36,a12)') "Listing devices is not supported by ", implementation_name
end subroutine list_devices
! Device selection is meaningless for a host-only implementation.
subroutine set_device(dev)
implicit none
integer, intent(in) :: dev
write(*,'(a32,a12)') "Device != 0 is not supported by ", implementation_name
end subroutine set_device
! Allocate the three stream arrays with array_size elements each.
subroutine alloc(array_size)
implicit none
integer(kind=StreamIntKind) :: array_size
integer :: err
N = array_size
allocate( A(1:N), B(1:N), C(1:N), stat=err)
if (err .ne. 0) then
write(*,'(a20,i3)') 'allocate returned ',err
stop 1
endif
end subroutine alloc
! Free the three stream arrays.
subroutine dealloc()
implicit none
integer :: err
deallocate( A, B, C, stat=err)
if (err .ne. 0) then
write(*,'(a20,i3)') 'deallocate returned ',err
stop 1
endif
end subroutine dealloc
! Fill the arrays with their starting values.
subroutine init_arrays(initA, initB, initC)
implicit none
real(kind=REAL64), intent(in) :: initA, initB, initC
integer(kind=StreamIntKind) :: i
!$omp parallel
!$omp master
!$omp taskloop
do i=1,N
A(i) = initA
B(i) = initB
C(i) = initC
end do
!$omp end master
!$omp end parallel
end subroutine init_arrays
! Copy the working arrays into the caller's buffers for verification.
subroutine read_arrays(h_A, h_B, h_C)
implicit none
real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
integer(kind=StreamIntKind) :: i
!$omp parallel
!$omp master
!$omp taskloop
do i=1,N
h_A(i) = A(i)
h_B(i) = B(i)
h_C(i) = C(i)
end do
!$omp end master
!$omp end parallel
end subroutine read_arrays
! STREAM copy kernel: C = A
subroutine copy()
implicit none
integer(kind=StreamIntKind) :: i
!$omp parallel
!$omp master
!$omp taskloop
do i=1,N
C(i) = A(i)
end do
!$omp end master
!$omp end parallel
end subroutine copy
! STREAM add kernel: C = A + B
subroutine add()
implicit none
integer(kind=StreamIntKind) :: i
!$omp parallel
!$omp master
!$omp taskloop
do i=1,N
C(i) = A(i) + B(i)
end do
!$omp end master
!$omp end parallel
end subroutine add
! STREAM scale kernel: B = scalar * C
subroutine mul(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
scalar = startScalar
!$omp parallel
!$omp master
!$omp taskloop
do i=1,N
B(i) = scalar * C(i)
end do
!$omp end master
!$omp end parallel
end subroutine mul
! STREAM triad kernel: A = B + scalar * C
subroutine triad(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
scalar = startScalar
!$omp parallel
!$omp master
!$omp taskloop
do i=1,N
A(i) = B(i) + scalar * C(i)
end do
!$omp end master
!$omp end parallel
end subroutine triad
! nstream kernel: A = A + B + scalar * C
subroutine nstream(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
scalar = startScalar
!$omp parallel
!$omp master
!$omp taskloop
do i=1,N
A(i) = A(i) + B(i) + scalar * C(i)
end do
!$omp end master
!$omp end parallel
end subroutine nstream
! Dot product of A and B via a taskloop sum reduction (OpenMP 5.0).
function dot() result(s)
implicit none
real(kind=REAL64) :: s
integer(kind=StreamIntKind) :: i
s = real(0,kind=REAL64)
!$omp parallel
!$omp master
!$omp taskloop reduction(+:s)
do i=1,N
s = s + A(i) * B(i)
end do
!$omp end master
!$omp end parallel
end function dot
end module OpenMPTaskloopStream

View File

@ -0,0 +1,120 @@
! Host-only OpenMP implementation of BabelStream where each kernel is a
! whole-array assignment divided among threads by !$omp parallel
! workshare.
module OpenMPWorkshareStream
use, intrinsic :: ISO_Fortran_env
use BabelStreamTypes
implicit none
character(len=15), parameter :: implementation_name = "OpenMPWorkshare"
! Number of elements in each stream array.
integer(kind=StreamIntKind) :: N
! The three stream arrays shared by all kernels.
real(kind=REAL64), allocatable :: A(:), B(:), C(:)
contains
! Device enumeration is meaningless for a host-only implementation.
subroutine list_devices()
implicit none
write(*,'(a36,a12)') "Listing devices is not supported by ", implementation_name
end subroutine list_devices
! Device selection is meaningless for a host-only implementation.
subroutine set_device(dev)
implicit none
integer, intent(in) :: dev
write(*,'(a32,a12)') "Device != 0 is not supported by ", implementation_name
end subroutine set_device
! Allocate the three stream arrays with array_size elements each.
subroutine alloc(array_size)
implicit none
integer(kind=StreamIntKind) :: array_size
integer :: err
N = array_size
allocate( A(1:N), B(1:N), C(1:N), stat=err)
if (err .ne. 0) then
write(*,'(a20,i3)') 'allocate returned ',err
stop 1
endif
end subroutine alloc
! Free the three stream arrays.
subroutine dealloc()
implicit none
integer :: err
deallocate( A, B, C, stat=err)
if (err .ne. 0) then
write(*,'(a20,i3)') 'deallocate returned ',err
stop 1
endif
end subroutine dealloc
! Fill the arrays with their starting values.
subroutine init_arrays(initA, initB, initC)
implicit none
real(kind=REAL64), intent(in) :: initA, initB, initC
!$omp parallel workshare
A = initA
B = initB
C = initC
!$omp end parallel workshare
end subroutine init_arrays
! Copy the working arrays into the caller's buffers for verification.
subroutine read_arrays(h_A, h_B, h_C)
implicit none
real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
!$omp parallel workshare
h_A = A
h_B = B
h_C = C
!$omp end parallel workshare
end subroutine read_arrays
! STREAM copy kernel: C = A
subroutine copy()
implicit none
!$omp parallel workshare
C = A
!$omp end parallel workshare
end subroutine copy
! STREAM add kernel: C = A + B
subroutine add()
implicit none
!$omp parallel workshare
C = A + B
!$omp end parallel workshare
end subroutine add
! STREAM scale kernel: B = scalar * C
subroutine mul(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
scalar = startScalar
!$omp parallel workshare
B = scalar * C
!$omp end parallel workshare
end subroutine mul
! STREAM triad kernel: A = B + scalar * C
subroutine triad(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
scalar = startScalar
!$omp parallel workshare
A = B + scalar * C
!$omp end parallel workshare
end subroutine triad
! nstream kernel: A = A + B + scalar * C
subroutine nstream(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
scalar = startScalar
!$omp parallel workshare
A = A + B + scalar * C
!$omp end parallel workshare
end subroutine nstream
! Dot product of A and B; the intrinsic is divided among threads by
! the workshare construct.
function dot() result(s)
implicit none
real(kind=REAL64) :: s
!$omp parallel workshare
s = dot_product(A,B)
!$omp end parallel workshare
end function dot
end module OpenMPWorkshareStream

View File

@ -0,0 +1,130 @@
! Baseline single-threaded implementation of BabelStream: plain do
! loops with no parallel directives, used as the reference variant.
module SequentialStream
  use, intrinsic :: ISO_Fortran_env
  use BabelStreamTypes
  implicit none

  character(len=10), parameter :: implementation_name = "Sequential"

  ! Number of elements in each stream array.
  integer(kind=StreamIntKind) :: N

  ! The three stream arrays shared by all kernels.
  real(kind=REAL64), allocatable :: A(:), B(:), C(:)

contains

  ! Device enumeration is meaningless for a sequential implementation.
  ! (An unused local 'num' was removed.)
  subroutine list_devices()
    implicit none
    write(*,'(a36,a10)') "Listing devices is not supported by ", implementation_name
  end subroutine list_devices

  ! Device selection is meaningless for a sequential implementation.
  subroutine set_device(dev)
    implicit none
    integer, intent(in) :: dev
    write(*,'(a32,a10)') "Device != 0 is not supported by ", implementation_name
  end subroutine set_device

  ! Allocate the three stream arrays with array_size elements each.
  subroutine alloc(array_size)
    implicit none
    integer(kind=StreamIntKind) :: array_size
    integer :: err
    N = array_size
    allocate( A(1:N), B(1:N), C(1:N), stat=err)
    if (err .ne. 0) then
      write(*,'(a20,i3)') 'allocate returned ',err
      stop 1
    endif
  end subroutine alloc

  ! Free the three stream arrays.
  subroutine dealloc()
    implicit none
    integer :: err
    deallocate( A, B, C, stat=err)
    if (err .ne. 0) then
      write(*,'(a20,i3)') 'deallocate returned ',err
      stop 1
    endif
  end subroutine dealloc

  ! Fill the arrays with their starting values.
  subroutine init_arrays(initA, initB, initC)
    implicit none
    real(kind=REAL64), intent(in) :: initA, initB, initC
    integer(kind=StreamIntKind) :: i
    do i=1,N
      A(i) = initA
      B(i) = initB
      C(i) = initC
    end do
  end subroutine init_arrays

  ! Copy the working arrays into the caller's buffers for verification.
  subroutine read_arrays(h_A, h_B, h_C)
    implicit none
    real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
    integer(kind=StreamIntKind) :: i
    do i=1,N
      h_A(i) = A(i)
      h_B(i) = B(i)
      h_C(i) = C(i)
    end do
  end subroutine read_arrays

  ! STREAM copy kernel: C = A
  subroutine copy()
    implicit none
    integer(kind=StreamIntKind) :: i
    do i=1,N
      C(i) = A(i)
    end do
  end subroutine copy

  ! STREAM add kernel: C = A + B
  subroutine add()
    implicit none
    integer(kind=StreamIntKind) :: i
    do i=1,N
      C(i) = A(i) + B(i)
    end do
  end subroutine add

  ! STREAM scale kernel: B = scalar * C
  subroutine mul(startScalar)
    implicit none
    real(kind=REAL64), intent(in) :: startScalar
    real(kind=REAL64) :: scalar
    integer(kind=StreamIntKind) :: i
    scalar = startScalar
    do i=1,N
      B(i) = scalar * C(i)
    end do
  end subroutine mul

  ! STREAM triad kernel: A = B + scalar * C
  subroutine triad(startScalar)
    implicit none
    real(kind=REAL64), intent(in) :: startScalar
    real(kind=REAL64) :: scalar
    integer(kind=StreamIntKind) :: i
    scalar = startScalar
    do i=1,N
      A(i) = B(i) + scalar * C(i)
    end do
  end subroutine triad

  ! nstream kernel: A = A + B + scalar * C
  subroutine nstream(startScalar)
    implicit none
    real(kind=REAL64), intent(in) :: startScalar
    real(kind=REAL64) :: scalar
    integer(kind=StreamIntKind) :: i
    scalar = startScalar
    do i=1,N
      A(i) = A(i) + B(i) + scalar * C(i)
    end do
  end subroutine nstream

  ! Sequential dot product of A and B.
  function dot() result(s)
    implicit none
    real(kind=REAL64) :: s
    integer(kind=StreamIntKind) :: i
    s = real(0,kind=REAL64)
    do i=1,N
      s = s + A(i) * B(i)
    end do
  end function dot

end module SequentialStream

54
src/fortran/build.sh Executable file
View File

@ -0,0 +1,54 @@
#!/bin/bash
# Build every BabelStream Fortran implementation with every compiler
# toolchain detected on this machine.

# uncomment to disable GPU targets
#HAS_GPU=0

# Orin
#if [ "x${compiler}" == "xgcc" ] ; then
#  export MCPU=cortex-a78ae
#fi
#if [ "x${compiler}" == "xarm" ] ; then
#  export MCPU=cortex-a78
#fi

# Detect available compilers by probing PATH; test the exit status of
# 'which' directly instead of word-splitting its output.
COMPILERS="gcc"
if which nvfortran >/dev/null 2>&1 ; then
  COMPILERS="${COMPILERS} nvhpc"
fi
if which crayftn >/dev/null 2>&1 ; then
  COMPILERS="${COMPILERS} cray"
fi
if [ "$(uname -m)" == "aarch64" ] ; then
  if which armflang >/dev/null 2>&1 ; then
    COMPILERS="${COMPILERS} arm"
  fi
  if which frt >/dev/null 2>&1 ; then
    COMPILERS="${COMPILERS} fj"
  fi
elif [ "$(uname -m)" == "x86_64" ] ; then
  # Quote the command substitution: when lscpu is missing or the CPU is
  # not Intel it expands to nothing, and the unquoted test would abort
  # with "unary operator expected".
  if [ "$(lscpu 2>/dev/null | grep GenuineIntel | awk '{print $3}')" == "GenuineIntel" ] ; then
    COMPILERS="${COMPILERS} oneapi"
    if [ -f /opt/intel/oneapi/setvars.sh ] ; then
      . /opt/intel/oneapi/setvars.sh >& /dev/null
    fi
  else
    # ^ this detection can be improved
    COMPILERS="${COMPILERS} amd"
  fi
fi

for compiler in ${COMPILERS} ; do
  # CPU targets build everywhere; GPU targets only when HAS_GPU != 0.
  TARGETS="DoConcurrent Array OpenMP OpenMPTaskloop OpenMPWorkshare"
  if [ "${HAS_GPU}" != "0" ] ; then
    TARGETS="${TARGETS} OpenMPTarget OpenMPTargetLoop"
    if [ "x${compiler}" == "xnvhpc" ] ; then
      TARGETS="${TARGETS} CUDA CUDAKernel"
    fi
  fi
  # Only these compilers support the OpenACC variants.
  if [ "x${compiler}" == "xnvhpc" ] || [ "x${compiler}" == "xgcc" ] || [ "x${compiler}" == "xcray" ] ; then
    TARGETS="${TARGETS} OpenACC OpenACCArray"
  fi
  for implementation in ${TARGETS} ; do
    make COMPILER=${compiler} IMPLEMENTATION=${implementation}
  done
done

683
src/fortran/main.F90 Normal file
View File

@ -0,0 +1,683 @@
! Driver utilities shared by the BabelStream Fortran main program:
! run configuration, timing, benchmark loops and result verification.
module BabelStreamUtil
use, intrinsic :: ISO_Fortran_env, only: REAL64,INT64
use BabelStreamTypes
implicit none
! Elements per array (default 2^25; overridable via --arraysize).
integer(kind=StreamIntKind) :: array_size = 33554432
! Benchmark iterations (overridable via --numtimes; minimum 2).
integer(kind=StreamIntKind) :: num_times = 100
! Report bandwidth in MiB (2^20) instead of MB (10^6).
logical :: mibibytes = .false.
! Report in giga/gibi instead of mega/mebi units.
logical :: use_gigs = .false.
! Emit results as a CSV table.
logical :: csv = .false.
character(len=1), parameter :: csv_sep = ","
! Which kernels to run:
! 1 = All
! 2 = Triad
! 3 = Nstream
integer :: selection = 1
! Canonical starting values shared by all implementations and used by
! the verification step.
real(kind=REAL64), parameter :: startA = real(0.1d0,kind=REAL64)
real(kind=REAL64), parameter :: startB = real(0.2d0,kind=REAL64)
real(kind=REAL64), parameter :: startC = real(0.0d0,kind=REAL64)
real(kind=REAL64), parameter :: startScalar = real(0.4d0,kind=REAL64)
contains
! Return the current time in seconds as REAL64. The clock source is
! chosen at compile time: omp_get_wtime, cpu_time, or system_clock.
function get_wtime() result(t)
#if defined(USE_OMP_GET_WTIME)
  use omp_lib
  implicit none
  real(kind=REAL64) :: t
  t = omp_get_wtime()
#elif defined(USE_CPU_TIME)
  implicit none
  real(kind=REAL64) :: t
  ! Use a REAL64 intermediate so the timer keeps full resolution; the
  ! previous default-precision temporary truncated it.
  real(kind=REAL64) :: r
  call cpu_time(r)
  t = r
#else
  implicit none
  real(kind=REAL64) :: t
  integer(kind=INT64) :: c, r
  call system_clock(count = c, count_rate = r)
  t = real(c,REAL64) / real(r,REAL64)
#endif
end function get_wtime
! Parse command-line arguments into the module-level run parameters
! (array_size, num_times, selection, units, csv) and handle the
! informational options (--list, --device, --compiler-info, --help),
! which stop the program after printing.
subroutine parseArguments()
  use, intrinsic :: ISO_Fortran_env, only: compiler_version, compiler_options
#if defined(USE_DOCONCURRENT)
  use DoConcurrentStream, only: list_devices, set_device
#elif defined(USE_ARRAY)
  use ArrayStream, only: list_devices, set_device
#elif defined(USE_OPENMP)
  use OpenMPStream, only: list_devices, set_device
#elif defined(USE_OPENMPWORKSHARE)
  use OpenMPWorkshareStream, only: list_devices, set_device
#elif defined(USE_OPENMPTARGET)
  use OpenMPTargetStream, only: list_devices, set_device
#elif defined(USE_OPENMPTARGETLOOP)
  use OpenMPTargetLoopStream, only: list_devices, set_device
#elif defined(USE_OPENMPTASKLOOP)
  use OpenMPTaskloopStream, only: list_devices, set_device
#elif defined(USE_OPENACC)
  use OpenACCStream, only: list_devices, set_device
#elif defined(USE_OPENACCARRAY)
  use OpenACCArrayStream, only: list_devices, set_device
#elif defined(USE_CUDA)
  use CUDAStream, only: list_devices, set_device
#elif defined(USE_CUDAKERNEL)
  use CUDAKernelStream, only: list_devices, set_device
#elif defined(USE_SEQUENTIAL)
  use SequentialStream, only: list_devices, set_device
#endif
  implicit none
  integer :: i, argc
  integer :: arglen,err,pos(2)
  character(len=64) :: argtmp
  argc = command_argument_count()
  ! Each option is matched by checking that it starts at position 1 of
  ! the argument; value-taking options read the following argument.
  do i=1,argc
    call get_command_argument(i,argtmp,arglen,err)
    if (err.eq.0) then
      !
      ! list devices
      !
      pos(1) = index(argtmp,"--list")
      if (pos(1).eq.1) then
        call list_devices()
        stop
      endif
      !
      ! set device number
      !
      pos(1) = index(argtmp,"--device")
      if (pos(1).eq.1) then
        if (i+1.gt.argc) then
          print*,'You failed to provide a value for ',argtmp
          stop
        else
          call get_command_argument(i+1,argtmp,arglen,err)
          block
            integer :: dev
            read(argtmp,'(i15)') dev
            call set_device(dev)
          end block
        endif
        cycle
      endif
      !
      ! array size
      !
      pos(1) = index(argtmp,"--arraysize")
      pos(2) = index(argtmp,"-s")
      if (any(pos(:).eq.1) ) then
        if (i+1.gt.argc) then
          print*,'You failed to provide a value for ',argtmp
        else
          call get_command_argument(i+1,argtmp,arglen,err)
          block
            ! Read into an INT64 first so oversized requests can be
            ! detected before narrowing to StreamIntKind.
            integer(kind=INT64) :: big_size
            read(argtmp,'(i15)') big_size
            if (big_size .gt. HUGE(array_size)) then
              print*,'Array size does not fit into integer:'
              print*,big_size,'>',HUGE(array_size)
              print*,'Stop using USE_INT32'
              stop
            else
              array_size = INT(big_size,kind=StreamIntKind)
            endif
          end block
        endif
        cycle
      endif
      !
      ! number of iterations
      !
      pos(1) = index(argtmp,"--numtimes")
      pos(2) = index(argtmp,"-n")
      if (any(pos(:).eq.1) ) then
        if (i+1.gt.argc) then
          print*,'You failed to provide a value for ',argtmp
        else
          call get_command_argument(i+1,argtmp,arglen,err)
          read(argtmp,'(i15)') num_times
          if (num_times.lt.2) then
            write(*,'(a)') "Number of times must be 2 or more"
            stop
          end if
        endif
        cycle
      endif
      !
      ! precision
      !
      pos(1) = index(argtmp,"--float")
      if (pos(1).eq.1) then
        write(*,'(a46,a39)') "Sorry, you have to recompile with -DUSE_FLOAT ", &
                             "to run BabelStream in single precision."
        stop
      endif
      !
      ! selection (All, Triad, Nstream)
      !
      pos(1) = index(argtmp,"--triad-only")
      if (pos(1).eq.1) then
        selection = 2
        cycle
      endif
      pos(1) = index(argtmp,"--nstream-only")
      if (pos(1).eq.1) then
        selection = 3
        cycle
      endif
      !
      ! CSV
      !
      pos(1) = index(argtmp,"--csv")
      if (pos(1).eq.1) then
        csv = .true.
        !write(*,'(a39)') "Sorry, CSV support isn't available yet."
        !stop
        ! cycle for consistency with the other flag options
        cycle
      endif
      !
      ! units
      !
      pos(1) = index(argtmp,"--mibibytes")
      if (pos(1).eq.1) then
        mibibytes = .true.
        cycle
      endif
      !
      ! giga/gibi instead of mega/mebi
      !
      pos(1) = index(argtmp,"--gigs")
      if (pos(1).eq.1) then
        use_gigs = .true.
        cycle
      endif
      !
      ! compiler info
      !
      pos(1) = index(argtmp,"--compiler-info")
      if (pos(1).eq.1) then
        ! Two edit descriptors so label and value share one record;
        ! a single '(a)' triggered format reversion, splitting them
        ! over two lines.
        write(*,'(a,a)') 'Compiler version: ',compiler_version()
        write(*,'(a,a)') 'Compiler options: ',compiler_options()
        stop
      endif
      !
      ! help
      !
      pos(1) = index(argtmp,"--help")
      pos(2) = index(argtmp,"-h")
      if (any(pos(:).eq.1) ) then
        call get_command_argument(0,argtmp,arglen,err)
        write(*,'(a7,a,a10)') "Usage: ", trim(argtmp), " [OPTIONS]"
        write(*,'(a)') "Options:"
        write(*,'(a)') "  -h  --help               Print the message"
        write(*,'(a)') "      --list               List available devices"
        write(*,'(a)') "      --device     INDEX   Select device at INDEX"
        write(*,'(a)') "  -s  --arraysize  SIZE    Use SIZE elements in the array"
        write(*,'(a)') "  -n  --numtimes   NUM     Run the test NUM times (NUM >= 2)"
        !write(*,'(a)') "      --float              Use floats (rather than doubles)"
        write(*,'(a)') "      --triad-only         Only run triad"
        write(*,'(a)') "      --nstream-only       Only run nstream"
        write(*,'(a)') "      --csv                Output as csv table"
        write(*,'(a)') "      --mibibytes          Use MiB=2^20 for bandwidth calculation (default MB=10^6)"
        write(*,'(a)') "      --gigs               Use GiB=2^30 or GB=10^9 instead of MiB/MB"
        write(*,'(a)') "      --compiler-info      Print information about compiler and flags, then exit."
        stop
      endif
    end if
  end do
end subroutine parseArguments
! Run the full benchmark: copy, mul, add, triad and dot, num_times
! iterations each; kernel k's wall-time for iteration i goes into
! timings(k,i) and the final dot result into summ.
subroutine run_all(timings, summ)
#if defined(USE_DOCONCURRENT)
use DoConcurrentStream
#elif defined(USE_ARRAY)
use ArrayStream
#elif defined(USE_OPENMP)
use OpenMPStream
#elif defined(USE_OPENMPWORKSHARE)
use OpenMPWorkshareStream
#elif defined(USE_OPENMPTARGET)
use OpenMPTargetStream
#elif defined(USE_OPENMPTARGETLOOP)
use OpenMPTargetLoopStream
#elif defined(USE_OPENMPTASKLOOP)
use OpenMPTaskloopStream
#elif defined(USE_OPENACC)
use OpenACCStream
#elif defined(USE_OPENACCARRAY)
use OpenACCArrayStream
#elif defined(USE_CUDA)
use CUDAStream
#elif defined(USE_CUDAKERNEL)
use CUDAKernelStream
#elif defined(USE_SEQUENTIAL)
use SequentialStream
#endif
implicit none
! timings(k,i): elapsed seconds of kernel k (1=copy, 2=mul, 3=add,
! 4=triad, 5=dot) in iteration i.
real(kind=REAL64), intent(inout) :: timings(:,:)
! Result of the last dot product, checked later by check_solution.
real(kind=REAL64), intent(out) :: summ
real(kind=REAL64) :: t1, t2
integer(kind=StreamIntKind) :: i
do i=1,num_times
! copy: C = A
t1 = get_wtime()
call copy()
t2 = get_wtime()
timings(1,i) = t2-t1
! mul: B = scalar * C
t1 = get_wtime()
call mul(startScalar)
t2 = get_wtime()
timings(2,i) = t2-t1
! add: C = A + B
t1 = get_wtime()
call add()
t2 = get_wtime()
timings(3,i) = t2-t1
! triad: A = B + scalar * C
t1 = get_wtime()
call triad(startScalar)
t2 = get_wtime()
timings(4,i) = t2-t1
! dot: summ = sum(A*B)
t1 = get_wtime()
summ = dot()
t2 = get_wtime()
timings(5,i) = t2-t1
end do
end subroutine run_all
! Run only the triad kernel num_times times; iteration i's wall-time
! goes into timings(1,i).
subroutine run_triad(timings)
#if defined(USE_DOCONCURRENT)
use DoConcurrentStream
#elif defined(USE_ARRAY)
use ArrayStream
#elif defined(USE_OPENMP)
use OpenMPStream
#elif defined(USE_OPENMPWORKSHARE)
use OpenMPWorkshareStream
#elif defined(USE_OPENMPTARGET)
use OpenMPTargetStream
#elif defined(USE_OPENMPTARGETLOOP)
use OpenMPTargetLoopStream
#elif defined(USE_OPENMPTASKLOOP)
use OpenMPTaskloopStream
#elif defined(USE_OPENACC)
use OpenACCStream
#elif defined(USE_OPENACCARRAY)
use OpenACCArrayStream
#elif defined(USE_CUDA)
use CUDAStream
#elif defined(USE_CUDAKERNEL)
use CUDAKernelStream
#elif defined(USE_SEQUENTIAL)
use SequentialStream
#endif
implicit none
! timings(1,i): elapsed seconds of triad in iteration i.
real(kind=REAL64), intent(inout) :: timings(:,:)
real(kind=REAL64) :: t1, t2
integer(kind=StreamIntKind) :: i
do i=1,num_times
t1 = get_wtime()
call triad(startScalar)
t2 = get_wtime()
timings(1,i) = t2-t1
end do
end subroutine run_triad
subroutine run_nstream(timings)
! Run only the nstream kernel num_times times (--nstream-only mode),
! storing each iteration's runtime in timings(1,i).
! Exactly one backend module is selected at build time via a USE_* macro.
#if defined(USE_DOCONCURRENT)
use DoConcurrentStream
#elif defined(USE_ARRAY)
use ArrayStream
#elif defined(USE_OPENMP)
use OpenMPStream
#elif defined(USE_OPENMPWORKSHARE)
use OpenMPWorkshareStream
#elif defined(USE_OPENMPTARGET)
use OpenMPTargetStream
#elif defined(USE_OPENMPTARGETLOOP)
use OpenMPTargetLoopStream
#elif defined(USE_OPENMPTASKLOOP)
use OpenMPTaskloopStream
#elif defined(USE_OPENACC)
use OpenACCStream
#elif defined(USE_OPENACCARRAY)
use OpenACCArrayStream
#elif defined(USE_CUDA)
use CUDAStream
#elif defined(USE_CUDAKERNEL)
use CUDAKernelStream
#elif defined(USE_SEQUENTIAL)
use SequentialStream
#endif
implicit none
real(kind=REAL64), intent(inout) :: timings(:,:)
real(kind=REAL64) :: t1, t2
integer(kind=StreamIntKind) :: i
do i=1,num_times
t1 = get_wtime()
call nstream(startScalar)
t2 = get_wtime()
timings(1,i) = t2-t1
end do
end subroutine run_nstream
subroutine check_solution(A, B, C, summ)
  ! Validate the arrays read back from the device (and the dot result)
  ! against golden values obtained by replaying the kernel sequence
  ! serially on scalars.
  !
  !   A, B, C : host copies of the device arrays
  !   summ    : result of the last dot kernel (checked only when the full
  !             benchmark ran, i.e. selection == 1)
  !
  ! Module inputs: selection (1=all, 2=triad-only, 3=nstream-only),
  ! num_times, array_size, startA/startB/startC, startScalar.
  use, intrinsic :: IEEE_Arithmetic, only: IEEE_Is_Normal
  implicit none
  real(kind=REAL64), intent(in) :: A(:), B(:), C(:)
  real(kind=REAL64), intent(in) :: summ
  integer(kind=StreamIntKind) :: i
  real(kind=REAL64) :: goldA, goldB, goldC, goldSum
  real(kind=REAL64) :: scalar
  ! always use double because of accumulation error
  real(kind=REAL64) :: errA, errB, errC, errSum, epsi
  logical :: cleanA, cleanB, cleanC, cleanSum
  goldA = startA
  goldB = startB
  goldC = startC
  goldSum = 0.0d0
  scalar = startScalar
  ! Replay the kernel sequence: copy, mul, add, triad for the full run;
  ! triad only or nstream only for the restricted modes.
  do i=1,num_times
    if (selection.eq.1) then
      goldC = goldA
      goldB = scalar * goldC
      goldC = goldA + goldB
      goldA = goldB + scalar * goldC
    else if (selection.eq.2) then
      goldA = goldB + scalar * goldC
    else if (selection.eq.3) then
      goldA = goldA + goldB + scalar * goldC
    endif
  end do
  goldSum = goldA * goldB * array_size
  ! Reject NaN/Inf/subnormal values before averaging errors, since they
  ! would otherwise poison (or mask) the mean.
  cleanA = ALL(IEEE_Is_Normal(A))
  cleanB = ALL(IEEE_Is_Normal(B))
  cleanC = ALL(IEEE_Is_Normal(C))
  cleanSum = IEEE_Is_Normal(summ)
  ! Note: messages previously said "NaA"; fixed to "NaN" (same length,
  ! so the a51/a54 edit descriptors are unchanged).
  if (.not. cleanA) then
    write(*,'(a51)') "Validation failed on A. Contains NaN/Inf/Subnormal."
  end if
  if (.not. cleanB) then
    write(*,'(a51)') "Validation failed on B. Contains NaN/Inf/Subnormal."
  end if
  if (.not. cleanC) then
    write(*,'(a51)') "Validation failed on C. Contains NaN/Inf/Subnormal."
  end if
  if (.not. cleanSum) then
    write(*,'(a54,e20.12)') "Validation failed on Sum. Contains NaN/Inf/Subnormal: ",summ
  end if
  ! Mean absolute deviation from the golden scalar, per array.
  errA = SUM( ABS( A - goldA ) ) / array_size
  errB = SUM( ABS( B - goldB ) ) / array_size
  errC = SUM( ABS( C - goldC ) ) / array_size
  errSum = ABS( (summ - goldSum) / goldSum)
  epsi = epsilon(real(0,kind=StreamRealKind)) * 100.0d0
  if (errA .gt. epsi) then
    write(*,'(a38,e20.12)') "Validation failed on A. Average error ", errA
  end if
  if (errB .gt. epsi) then
    write(*,'(a38,e20.12)') "Validation failed on B. Average error ", errB
  end if
  if (errC .gt. epsi) then
    write(*,'(a38,e20.12)') "Validation failed on C. Average error ", errC
  end if
  if (selection.eq.1) then
    if (errSum .gt. 1.0e-8) then
      write(*,'(a38,e20.12)') "Validation failed on Sum. Error ", errSum
      ! Bug fix: the expected value is goldSum (this line previously
      ! printed errSum a second time).
      write(*,'(a8,e20.12,a15,e20.12)') "Sum was ",summ, " but should be ", goldSum
    end if
  endif
end subroutine check_solution
end module BabelStreamUtil
program BabelStream
! Entry point: parse CLI options, allocate and initialise the three
! benchmark arrays via the selected backend, run the chosen kernel set,
! read the arrays back, validate, and print a results table (plain or CSV).
use BabelStreamUtil
! Exactly one backend module is selected at build time via a USE_* macro.
#if defined(USE_DOCONCURRENT)
use DoConcurrentStream
#elif defined(USE_ARRAY)
use ArrayStream
#elif defined(USE_OPENMP)
use OpenMPStream
#elif defined(USE_OPENMPWORKSHARE)
use OpenMPWorkshareStream
#elif defined(USE_OPENMPTARGET)
use OpenMPTargetStream
#elif defined(USE_OPENMPTARGETLOOP)
use OpenMPTargetLoopStream
#elif defined(USE_OPENMPTASKLOOP)
use OpenMPTaskloopStream
#elif defined(USE_OPENACC)
use OpenACCStream
#elif defined(USE_OPENACCARRAY)
use OpenACCArrayStream
#elif defined(USE_CUDA)
use CUDAStream
#elif defined(USE_CUDAKERNEL)
use CUDAKernelStream
#elif defined(USE_SEQUENTIAL)
use SequentialStream
#endif
implicit none
integer :: element_size, err
real(kind=REAL64) :: scaling
character(len=3) :: label
real(kind=REAL64), allocatable :: timings(:,:)
real(kind=REAL64), allocatable :: h_A(:), h_B(:), h_C(:)
real(kind=REAL64) :: summ
real(kind=REAL64) :: init_tic, init_toc, read_tic, read_toc
call parseArguments()
! Bytes per array element of the configured precision.
element_size = storage_size(real(0,kind=StreamRealKind)) / 8
! Choose the bandwidth unit: decimal MB/GB or binary MiB/GiB.
if (mibibytes) then
if (use_gigs) then
scaling = 2.0d0**(-30)
label = "GiB"
else
scaling = 2.0d0**(-20)
label = "MiB"
endif
else
if (use_gigs) then
scaling = 1.0d-9
label = "GB"
else
scaling = 1.0d-6
label = "MB"
endif
endif
! Banner, suppressed in CSV mode.
if (.not.csv) then
write(*,'(a)') "BabelStream Fortran"
write(*,'(a9,f4.1)') "Version: ", VERSION_STRING
write(*,'(a16,a)') "Implementation: ", implementation_name
block
character(len=32) :: printout
write(printout,'(i9,1x,a5)') num_times,'times'
write(*,'(a16,a)') 'Running kernels ',ADJUSTL(printout)
end block
write(*,'(a11,a6)') 'Precision: ',ADJUSTL(StreamRealName)
write(*,'(a12,f9.1,a3)') 'Array size: ',1.0d0 * element_size * (array_size * scaling), label
write(*,'(a12,f9.1,a3)') 'Total size: ',3.0d0 * element_size * (array_size * scaling), label
endif ! csv
! Row k of timings holds per-iteration runtimes of kernel k.
allocate( timings(5,num_times) )
call alloc(array_size)
init_tic = get_wtime()
call init_arrays(startA, startB, startC)
init_toc = get_wtime()
summ = 0.0d0
if (.not.csv) then
write(*,'(a6,f9.1,a4,f9.1,a3,a9)') 'Init: ',init_toc-init_tic, 's (=', &
(3.0d0 * element_size * array_size * scaling) / (init_toc-init_tic), TRIM(label), 'ytes/sec)'
end if
! -1 marks timing slots that are never written (rows 2..5 in the
! triad-only and nstream-only modes).
timings = -1.0d0
! selection: 1 = all kernels, 2 = triad only, 3 = nstream only.
if (selection.eq.1) then
call run_all(timings, summ)
else if (selection.eq.2) then
call run_triad(timings)
else if (selection.eq.3) then
call run_nstream(timings)
endif
! Host-side copies of the device arrays, for validation.
allocate( h_A(1:array_size), h_B(1:array_size), h_C(1:array_size), stat=err)
if (err .ne. 0) then
write(*,'(a20,i3)') 'allocate returned ',err
stop 1
endif
read_tic = get_wtime()
call read_arrays(h_A, h_B, h_C)
read_toc = get_wtime()
if (.not.csv) then
write(*,'(a6,f9.1,a4,f9.1,a3,a9)') 'Read: ',read_toc-read_tic, 's (=', &
(3.0d0 * element_size * array_size * scaling) / (read_toc-read_tic), TRIM(label), 'ytes/sec)'
end if
call check_solution(h_A, h_B, h_C, summ)
! Results table. Iteration 1 is treated as warm-up and excluded from
! min/max/average (all slices start at index 2).
block
character(len=20) :: printout(8)
real(kind=REAL64) :: tmin,tmax,tavg,nbytes
if (csv) then
write(*,'(a,a1)',advance='no') 'function', csv_sep
write(*,'(a,a1)',advance='no') 'num_times', csv_sep
write(*,'(a,a1)',advance='no') 'n_elements',csv_sep
write(*,'(a,a1)',advance='no') 'sizeof', csv_sep
if (mibibytes) then
write(*,'(a,a1)',advance='no') 'max_mibytes_per_sec',csv_sep
else
write(*,'(a,a1)',advance='no') 'max_mbytes_per_sec', csv_sep
endif
write(*,'(a,a1)',advance='no') 'min_runtime',csv_sep
write(*,'(a,a1)',advance='no') 'max_runtime',csv_sep
write(*,'(a,a1)',advance='yes') 'avg_runtime'
else
write(printout(1),'(a8)') 'Function'
write(printout(2),'(a3,a8)') TRIM(label),'ytes/sec'
write(printout(3),'(a9)') 'Min (sec)'
write(printout(4),'(a3)') 'Max'
write(printout(5),'(a7)') 'Average'
write(*,'(5a12)') ADJUSTL(printout(1:5))
endif ! csv
if (selection.eq.1) then
block
! Arrays touched per element by each kernel:
! copy=2, mul=2, add=3, triad=3, dot=2.
integer, parameter :: sizes(5) = [2,2,3,3,2]
character(len=5), parameter :: labels(5) = ["Copy ", "Mul ", "Add ", "Triad", "Dot "]
integer :: i
do i=1,5
tmin = MINVAL(timings(i,2:num_times))
tmax = MAXVAL(timings(i,2:num_times))
tavg = SUM(timings(i,2:num_times)) / (num_times-1)
nbytes = element_size * REAL(array_size,kind=REAL64) * sizes(i)
write(printout(1),'(a)') labels(i)
if (csv) then
write(printout(2),'(i20)') num_times
write(printout(3),'(i20)') array_size
write(printout(4),'(i20)') element_size
write(printout(5),'(i20)') INT(scaling*nbytes/tmin)
write(printout(6),'(f20.8)') tmin
write(printout(7),'(f20.8)') tmax
write(printout(8),'(f20.8)') tavg
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(1))),csv_sep
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(2))),csv_sep
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(3))),csv_sep
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(4))),csv_sep
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(5))),csv_sep
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(6))),csv_sep
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(7))),csv_sep
write(*,'(a,a1)',advance='yes') TRIM(ADJUSTL(printout(8)))
else
write(printout(2),'(f12.3)') scaling*nbytes/tmin
write(printout(3),'(f12.5)') tmin
write(printout(4),'(f12.5)') tmax
write(printout(5),'(f12.5)') tavg
write(*,'(5a12)') ADJUSTL(printout(1:5))
endif
enddo
end block
else if ((selection.eq.2).or.(selection.eq.3)) then
tmin = MINVAL(timings(1,2:num_times))
tmax = MAXVAL(timings(1,2:num_times))
tavg = SUM(timings(1,2:num_times)) / (num_times-1)
! Triad touches 3 arrays per element; nstream touches 4.
if (selection.eq.2) then
nbytes = element_size * REAL(array_size,kind=REAL64) * 3
write(printout(1),'(a12)') "Triad"
else if (selection.eq.3) then
nbytes = element_size * REAL(array_size,kind=REAL64) * 4
write(printout(1),'(a12)') "Nstream"
endif
if (csv) then
write(printout(2),'(i20)') num_times
write(printout(3),'(i20)') array_size
write(printout(4),'(i20)') element_size
write(printout(5),'(i20)') INT(scaling*nbytes/tmin)
write(printout(6),'(f20.8)') tmin
write(printout(7),'(f20.8)') tmax
write(printout(8),'(f20.8)') tavg
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(1))),csv_sep
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(2))),csv_sep
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(3))),csv_sep
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(4))),csv_sep
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(5))),csv_sep
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(6))),csv_sep
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(7))),csv_sep
write(*,'(a,a1)',advance='yes') TRIM(ADJUSTL(printout(8)))
else
write(printout(2),'(f12.3)') scaling*nbytes/tmin
write(printout(3),'(f12.5)') tmin
write(printout(4),'(f12.5)') tmax
write(printout(5),'(f12.5)') tavg
write(*,'(5a12)') ADJUSTL(printout(1:5))
endif
endif
end block
call dealloc()
end program BabelStream

25
src/fortran/make.inc.amd Normal file
View File

@ -0,0 +1,25 @@
# AMD flang settings for the Fortran BabelStream build.
# The ROCm flang is kept as a commented-out alternative; AOCC flang is
# the effective compiler. (The original assigned FC twice, so the first
# assignment was dead code.)
#FC := /opt/rocm/llvm/bin/flang
FC := /global/u1/j/jhammond/AMD/aocc-compiler-3.2.0/bin/flang
FCFLAGS := -std=f2018 -O3
FCFLAGS += -Wall -Wno-unused-variable
# Allow the target architecture to be overridden, e.g. `make MARCH=znver3`.
ifdef MARCH
FCFLAGS += -march=$(MARCH)
else
FCFLAGS += -march=native
endif
DOCONCURRENT_FLAG = -fopenmp # libomp.so required
ARRAY_FLAG = -fopenmp # libomp.so required
OPENMP_FLAG = -fopenmp
#OPENMP_FLAG += -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx908
OPENACC_FLAG = -fopenacc
CUDA_FLAG =
SEQUENTIAL_FLAG =
# CUDA Fortran is nvfortran-only; fail fast with a clear error.
ifeq ($(IMPLEMENTATION),CUDA)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif
ifeq ($(IMPLEMENTATION),CUDAKernels)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif

39
src/fortran/make.inc.arm Normal file
View File

@ -0,0 +1,39 @@
# Arm Compiler for Linux (armflang) settings for the Fortran BabelStream build.
FC = armflang
FCFLAGS = -std=f2018 -O3
FCFLAGS += -Wall -Wno-unused-variable
# MARCH=neoverse-v1,neoverse-n1,icelake-server,znver3,cortex-a78
ARCH=$(shell uname -m)
# On AArch64 use -mcpu (overridable via MCPU); elsewhere use -march (MARCH).
ifeq ($(ARCH),aarch64)
ifdef MCPU
FCFLAGS += -mcpu=$(MCPU)
else
FCFLAGS += -mcpu=native
endif
else
ifdef MARCH
FCFLAGS += -march=$(MARCH)
else
FCFLAGS += -march=native
endif
endif
DOCONCURRENT_FLAG = -fopenmp
ARRAY_FLAG = -fopenmp
OPENMP_FLAG = -fopenmp
OPENACC_FLAG = -fopenacc
CUDA_FLAG =
SEQUENTIAL_FLAG =
# Unsupported implementations fail fast with a clear error.
ifeq ($(IMPLEMENTATION),OpenACC)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif
ifeq ($(IMPLEMENTATION),OpenACCArray)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif
ifeq ($(IMPLEMENTATION),CUDA)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif
ifeq ($(IMPLEMENTATION),CUDAKernels)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif

18
src/fortran/make.inc.cray Normal file
View File

@ -0,0 +1,18 @@
# Cray Fortran (via the ftn wrapper) settings for the Fortran BabelStream build.
FC := ftn
FCFLAGS = -e F -O3
# -h thread_do_concurrent parallelises DO CONCURRENT; the macro lets the
# source adapt to Cray's threading model.
DOCONCURRENT_FLAG = -h thread_do_concurrent -DCRAY_THREAD_DOCONCURRENT
ARRAY_FLAG = -h autothread
OPENMP_FLAG = -h omp
OPENACC_FLAG = -h acc
# CPU only
OPENACC_FLAG += -h omp
CUDA_FLAG =
SEQUENTIAL_FLAG =
# CUDA Fortran is nvfortran-only; fail fast with a clear error.
ifeq ($(IMPLEMENTATION),CUDA)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif
ifeq ($(IMPLEMENTATION),CUDAKernels)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif

21
src/fortran/make.inc.fj Normal file
View File

@ -0,0 +1,21 @@
# Fujitsu Fortran (frt, A64FX) settings for the Fortran BabelStream build.
FC := frt
FCFLAGS = -X08 -Kfast -KA64FX -KSVE -KARMV8_3_A -Kzfill=100 -Kprefetch_sequential=soft -Kprefetch_line=8 -Kprefetch_line_L2=16 -Koptmsg=2 -Keval -DUSE_OMP_GET_WTIME=1 # FJ Fortran system_clock is low resolution
DOCONCURRENT_FLAG = -Kparallel,reduction -DNOTSHARED
ARRAY_FLAG = -Kparallel,reduction
OPENMP_FLAG = -fopenmp
OPENACC_FLAG =
# CPU only
OPENACC_FLAG +=
CUDA_FLAG =
SEQUENTIAL_FLAG =
# Unsupported implementations fail fast. Fixed the guard below to
# compare against "OpenACC" (the spelling used by the build system and
# every other make.inc); the original "OPENACC" could never match.
ifeq ($(IMPLEMENTATION),OpenACC)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif
ifeq ($(IMPLEMENTATION),CUDA)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif
ifeq ($(IMPLEMENTATION),CUDAKernels)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif

33
src/fortran/make.inc.gcc Normal file
View File

@ -0,0 +1,33 @@
# GNU Fortran settings for the Fortran BabelStream build.
FC = gfortran
FCFLAGS = -std=f2018 -O3
FCFLAGS += -Wall -Wno-unused-dummy-argument -Wno-unused-variable
# MARCH=neoverse-v1,neoverse-n1,icelake-server,znver3,cortex-a78ae
ARCH=$(shell uname -m)
# On AArch64 use -mcpu (overridable via MCPU); elsewhere use -march (MARCH).
ifeq ($(ARCH),aarch64)
ifdef MCPU
FCFLAGS += -mcpu=$(MCPU)
else
FCFLAGS += -mcpu=native
endif
else
ifdef MARCH
FCFLAGS += -march=$(MARCH)
else
FCFLAGS += -march=native
endif
endif
# gfortran auto-parallelises DO CONCURRENT via -ftree-parallelize-loops.
DOCONCURRENT_FLAG = -ftree-parallelize-loops=4
ARRAY_FLAG =
OPENMP_FLAG = -fopenmp
OPENACC_FLAG = -fopenacc
CUDA_FLAG =
SEQUENTIAL_FLAG =
# CUDA Fortran is nvfortran-only; fail fast with a clear error.
ifeq ($(IMPLEMENTATION),CUDA)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif
ifeq ($(IMPLEMENTATION),CUDAKernels)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif

View File

@ -0,0 +1,70 @@
# NVIDIA HPC SDK (nvfortran) settings for the Fortran BabelStream build.
FC := nvfortran
#FCFLAGS := -O3 -Minform=inform -Minfo=all
FCFLAGS := -O3 -Minform=warn
#TARGET=gpu
TARGET=multicore
# Detect the GPU generation from nvidia-smi and map it to a -gpu=ccXX flag.
NVARCH=$(shell which nvidia-smi > /dev/null && nvidia-smi -q | grep "Product Architecture")
ifeq ($(findstring Ampere,$(NVARCH)),Ampere)
$(info Ampere detected)
GPU = cc80
endif
ifeq ($(findstring Turing,$(NVARCH)),Turing)
$(info Turing detected)
GPU = cc75
endif
ifeq ($(findstring Volta,$(NVARCH)),Volta)
$(info Volta detected)
GPU = cc70
endif
ifeq ($(findstring Pascal,$(NVARCH)),Pascal)
$(info Pascal detected)
GPU = cc60,cc61
endif
ifeq ($(shell which jetson_clocks > /dev/null && echo 1),1)
$(info Jetson AGX Orin detected)
# Orin is compute capability 8.7. The original "ccn87" is not a valid
# nvfortran -gpu token and would be rejected by the compiler.
GPU = cc87,cc86
# figure out Xavier later
#GPU = cc72
endif
ifeq ($(GPU),)
$(error Your GPU architecture could not be detected. Set it manually.)
endif
GPUFLAG = -gpu=$(GPU)
# MARCH=neoverse-v1,neoverse-n1,zen3
ARCH=$(shell uname -m)
# nvfortran's -tp only knows selected AArch64 CPUs; fall back to native.
ifdef MARCH
ifeq ($(ARCH),aarch64)
ifeq ($(MARCH),neoverse-n1)
FCFLAGS += -tp=$(MARCH)
else
ifeq ($(MARCH),neoverse-v1)
FCFLAGS += -tp=$(MARCH)
else
FCFLAGS += -tp=native
endif
endif
else
FCFLAGS += -tp=$(MARCH)
endif
else
FCFLAGS += -tp=native
endif
# this is to allow apples-to-apples comparison with DC in non-DC GPU impls
# set exactly one of these!
#MANAGED = -DUSE_MANAGED -gpu=managed
#DEVICE = -DUSE_DEVICE -cuda -gpu=nomanaged
DOCONCURRENT_FLAG = $(GPUFLAG) -stdpar=$(TARGET) $(DEVICE)
ARRAY_FLAG = $(GPUFLAG) -stdpar=$(TARGET) $(MANAGED)
OPENMP_FLAG = $(GPUFLAG) -mp=$(TARGET) $(MANAGED)
OPENACC_FLAG = $(GPUFLAG) -acc=$(TARGET) $(MANAGED)
CUDA_FLAG = $(GPUFLAG) -cuda -acc=gpu $(MANAGED)
SEQUENTIAL_FLAG =
ifeq ($(IMPLEMENTATION),OpenMPTaskloop)
$(error IMPLEMENTATION=OpenMPTaskloop is not supported by this compiler.)
endif

View File

@ -0,0 +1,32 @@
# Intel oneAPI settings for the Fortran BabelStream build.
# Defaults to the LLVM-based ifx; override with `make FC=ifort` for classic.
FC := ifx
FCFLAGS = -std18
FCFLAGS += -Ofast -xHOST
FCFLAGS += -qopt-zmm-usage=low
# Classic ifort-only options.
ifeq ($(FC),ifort)
FCFLAGS += -qopt-streaming-stores=always
PARALLEL = -parallel
endif
DOCONCURRENT_FLAG = -qopenmp $(PARALLEL)
ARRAY_FLAG = -qopenmp $(PARALLEL)
OPENMP_FLAG = -qopenmp
# ifx can offload OpenMP target regions to SPIR-V devices.
# NOTE(review): -DUSE_FLOAT=1 forces single precision for the offload
# build -- confirm this is intentional.
ifeq ($(FC),ifx)
OPENMP_FLAG += -fopenmp-targets=spir64 -DUSE_FLOAT=1
endif
OPENACC_FLAG =
CUDA_FLAG =
SEQUENTIAL_FLAG =
# Unsupported implementations fail fast with a clear error.
ifeq ($(IMPLEMENTATION),OpenACC)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif
ifeq ($(IMPLEMENTATION),OpenACCArray)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif
ifeq ($(IMPLEMENTATION),CUDA)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif
ifeq ($(IMPLEMENTATION),CUDAKernels)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif

35
src/fortran/run.sh Executable file
View File

@ -0,0 +1,35 @@
#!/bin/bash
# Run every built BabelStream.<compiler>.<implementation> binary found in
# the current directory with a fixed OpenMP/affinity setup, printing its
# shared-library dependencies and timing each run.
# Echo this script so the configuration is captured in the output log.
cat ./run.sh
# NOTE(review): NUM_HWTHREADS / MEMORY_BYTES / MEMORY_KILOS are computed
# but not used below -- presumably kept for the log; confirm.
if [ `uname -s` == Darwin ] ; then
NUM_HWTHREADS=`sysctl -n hw.ncpu`
MEMORY_BYTES=`sysctl -n hw.memsize`
else
NUM_HWTHREADS=`nproc`
MEMORY_KILOS=`grep MemTotal /proc/meminfo | awk '{print $2}'`
fi
# Array size in Mi-elements passed to the benchmark via -s.
M=128
export OMP_NUM_THREADS=8
export OMP_PROC_BIND=close
export OMP_PLACES=threads
export ACC_NUM_CORES=${OMP_NUM_THREADS}
# Pin to NUMA node 0 and to the first OMP_NUM_THREADS logical CPUs.
AFFCONTROL="numactl -N 0 -m 0 -C `seq -s "," 0 $((${OMP_NUM_THREADS}-1))`"
for compiler in gcc nvhpc cray oneapi arm amd fj ; do
#if [ "x$compiler" == "xgcc" ] ; then
# export LD_PRELOAD=/usr/lib/gcc/aarch64-linux-gnu/11/libgomp.so
#fi
for implementation in OpenMP OpenMPTaskloop OpenMPWorkshare DoConcurrent Array OpenACC OpenACCArray CUDA CUDAKernel ; do
if [ -f BabelStream.${compiler}.${implementation} ] ; then
echo "BabelStream.${compiler}.${implementation}"
ldd BabelStream.${compiler}.${implementation}
time $AFFCONTROL \
./BabelStream.${compiler}.${implementation} -s $((1024*1024*${M}))
fi
done
done

View File

@ -0,0 +1,212 @@
// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith,
// University of Bristol HPC
// Copyright (c) 2022 Troels Henriksen
// University of Copenhagen
//
// For full license terms please see the LICENSE file distributed with this
// source code
#include <cstdlib> // For aligned_alloc
#include <string>
#include "FutharkStream.h"
// Construct a Futhark context for the requested device and record the
// benchmark array size. Device selection is only applied for the
// OpenCL and CUDA backends; the other backends ignore it.
template <class T>
FutharkStream<T>::FutharkStream(const int ARRAY_SIZE, int device)
{
this->array_size = ARRAY_SIZE;
this->cfg = futhark_context_config_new();
// Futhark selects devices by name; "#<n>" selects the n-th device.
this->device = "#" + std::to_string(device);
#if defined(FUTHARK_BACKEND_cuda) || defined(FUTHARK_BACKEND_opencl)
futhark_context_config_set_device(cfg, this->device.c_str());
#endif
this->ctx = futhark_context_new(cfg);
// Device arrays are created later, in init_arrays().
this->a = NULL;
this->b = NULL;
this->c = NULL;
}
// Destructor (float specialisation): release the device arrays if they
// were created, then tear down the Futhark context and its config.
template <>
FutharkStream<float>::~FutharkStream()
{
if (this->a) {
futhark_free_f32_1d(this->ctx, (futhark_f32_1d*)this->a);
}
if (this->b) {
futhark_free_f32_1d(this->ctx, (futhark_f32_1d*)this->b);
}
if (this->c) {
futhark_free_f32_1d(this->ctx, (futhark_f32_1d*)this->c);
}
futhark_context_free(this->ctx);
futhark_context_config_free(this->cfg);
}
// Destructor (double specialisation): same as above for f64 arrays.
template <>
FutharkStream<double>::~FutharkStream()
{
if (this->a) {
futhark_free_f64_1d(this->ctx, (futhark_f64_1d*)this->a);
}
if (this->b) {
futhark_free_f64_1d(this->ctx, (futhark_f64_1d*)this->b);
}
if (this->c) {
futhark_free_f64_1d(this->ctx, (futhark_f64_1d*)this->c);
}
futhark_context_free(this->ctx);
futhark_context_config_free(this->cfg);
}
// Fill host-side staging buffers with the initial values and upload them
// to the Futhark context as the device arrays a, b and c.
// Uses std::vector instead of raw new[]/delete[] for RAII cleanup.
template <>
void FutharkStream<float>::init_arrays(float initA, float initB, float initC) {
  const int n = this->array_size;
  std::vector<float> a(n, initA);
  std::vector<float> b(n, initB);
  std::vector<float> c(n, initC);
  this->a = futhark_new_f32_1d(this->ctx, a.data(), n);
  this->b = futhark_new_f32_1d(this->ctx, b.data(), n);
  this->c = futhark_new_f32_1d(this->ctx, c.data(), n);
  // Wait for the uploads so the staging buffers can be released safely.
  futhark_context_sync(this->ctx);
}
// Fill host-side staging buffers with the initial values and upload them
// to the Futhark context as the device arrays a, b and c.
// Uses std::vector instead of raw new[]/delete[] for RAII cleanup.
template <>
void FutharkStream<double>::init_arrays(double initA, double initB, double initC) {
  const int n = this->array_size;
  std::vector<double> a(n, initA);
  std::vector<double> b(n, initB);
  std::vector<double> c(n, initC);
  this->a = futhark_new_f64_1d(this->ctx, a.data(), n);
  this->b = futhark_new_f64_1d(this->ctx, b.data(), n);
  this->c = futhark_new_f64_1d(this->ctx, c.data(), n);
  // Wait for the uploads so the staging buffers can be released safely.
  futhark_context_sync(this->ctx);
}
// Copy the three device arrays back into caller-provided host vectors
// (float specialisation). The sync ensures the transfers have completed
// before the host data is used.
template <>
void FutharkStream<float>::read_arrays(std::vector<float>& h_a, std::vector<float>& h_b, std::vector<float>& h_c) {
futhark_values_f32_1d(this->ctx, (futhark_f32_1d*)this->a, h_a.data());
futhark_values_f32_1d(this->ctx, (futhark_f32_1d*)this->b, h_b.data());
futhark_values_f32_1d(this->ctx, (futhark_f32_1d*)this->c, h_c.data());
futhark_context_sync(this->ctx);
}
// Same as above for the double specialisation.
template <>
void FutharkStream<double>::read_arrays(std::vector<double>& h_a, std::vector<double>& h_b, std::vector<double>& h_c) {
futhark_values_f64_1d(this->ctx, (futhark_f64_1d*)this->a, h_a.data());
futhark_values_f64_1d(this->ctx, (futhark_f64_1d*)this->b, h_b.data());
futhark_values_f64_1d(this->ctx, (futhark_f64_1d*)this->c, h_c.data());
futhark_context_sync(this->ctx);
}
// Copy kernel: c = a (see the 'copy' entry points in babelstream.fut).
// The old device array c is freed first; the entry call writes the new
// result array into this->c.
template <>
void FutharkStream<float>::copy() {
futhark_free_f32_1d(this->ctx, (futhark_f32_1d*)this->c);
futhark_entry_f32_copy(this->ctx, (futhark_f32_1d**)&this->c, (futhark_f32_1d*)this->a);
futhark_context_sync(this->ctx);
}
// Double specialisation of the copy kernel.
template <>
void FutharkStream<double>::copy() {
futhark_free_f64_1d(this->ctx, (futhark_f64_1d*)this->c);
futhark_entry_f64_copy(this->ctx, (futhark_f64_1d**)&this->c, (futhark_f64_1d*)this->a);
futhark_context_sync(this->ctx);
}
// Mul kernel: b = scalar * c (the scalar is baked into the f32_mul /
// f64_mul entry points in babelstream.fut). The old b is freed first.
template <>
void FutharkStream<float>::mul() {
futhark_free_f32_1d(this->ctx, (futhark_f32_1d*)this->b);
futhark_entry_f32_mul(this->ctx, (futhark_f32_1d**)&this->b, (futhark_f32_1d*)this->c);
futhark_context_sync(this->ctx);
}
// Double specialisation of the mul kernel.
template <>
void FutharkStream<double>::mul() {
futhark_free_f64_1d(this->ctx, (futhark_f64_1d*)this->b);
futhark_entry_f64_mul(this->ctx, (futhark_f64_1d**)&this->b, (futhark_f64_1d*)this->c);
futhark_context_sync(this->ctx);
}
// Add kernel: c = a + b (see the 'add' entry points in babelstream.fut).
// The old c is freed first; the entry call writes the result into this->c.
template <>
void FutharkStream<float>::add() {
futhark_free_f32_1d(this->ctx, (futhark_f32_1d*)this->c);
futhark_entry_f32_add(this->ctx, (futhark_f32_1d**)&this->c, (futhark_f32_1d*)this->a, (futhark_f32_1d*)this->b);
futhark_context_sync(this->ctx);
}
// Double specialisation of the add kernel.
template <>
void FutharkStream<double>::add() {
futhark_free_f64_1d(this->ctx, (futhark_f64_1d*)this->c);
futhark_entry_f64_add(this->ctx, (futhark_f64_1d**)&this->c, (futhark_f64_1d*)this->a, (futhark_f64_1d*)this->b);
futhark_context_sync(this->ctx);
}
// Triad kernel: a = b + scalar * c (f32_triad in babelstream.fut takes
// b and c and returns the new a). Bug fix: this specialisation
// previously freed c and called the entry with (a, b), which neither
// matches the triad definition nor the double specialisation below.
template <>
void FutharkStream<float>::triad() {
  futhark_free_f32_1d(this->ctx, (futhark_f32_1d*)this->a);
  futhark_entry_f32_triad(this->ctx, (futhark_f32_1d**)&this->a, (futhark_f32_1d*)this->b, (futhark_f32_1d*)this->c);
  futhark_context_sync(this->ctx);
}
// Triad kernel: a = b + scalar * c (f64_triad in babelstream.fut takes
// b and c and returns the new a). The old a is freed first.
template <>
void FutharkStream<double>::triad() {
futhark_free_f64_1d(this->ctx, (futhark_f64_1d*)this->a);
futhark_entry_f64_triad(this->ctx, (futhark_f64_1d**)&this->a, (futhark_f64_1d*)this->b, (futhark_f64_1d*)this->c);
futhark_context_sync(this->ctx);
}
// Nstream kernel: a = a + b + scalar * c. Bug fix: this previously
// invoked the two-argument triad entry and stored the result in c;
// babelstream.fut defines a dedicated f32_nstream entry taking (a, b, c).
// The entry consumes its unique 'a' argument, so the old device array
// must not be freed or used after the call -- the result replaces it.
template <>
void FutharkStream<float>::nstream() {
  futhark_f32_1d* d;
  futhark_entry_f32_nstream(this->ctx, &d,
                            (futhark_f32_1d*)this->a,
                            (futhark_f32_1d*)this->b,
                            (futhark_f32_1d*)this->c);
  this->a = d;
  futhark_context_sync(this->ctx);
}
// Nstream kernel: a = a + b + scalar * c. Bug fix: this previously
// invoked the two-argument triad entry and stored the result in c;
// babelstream.fut defines a dedicated f64_nstream entry taking (a, b, c).
// The entry consumes its unique 'a' argument, so the old device array
// must not be freed or used after the call -- the result replaces it.
template <>
void FutharkStream<double>::nstream() {
  futhark_f64_1d* d;
  futhark_entry_f64_nstream(this->ctx, &d,
                            (futhark_f64_1d*)this->a,
                            (futhark_f64_1d*)this->b,
                            (futhark_f64_1d*)this->c);
  this->a = d;
  futhark_context_sync(this->ctx);
}
// Dot kernel: returns sum(a[i] * b[i]) (see the 'dot' entry points in
// babelstream.fut). The sync ensures the scalar result is valid on the
// host before returning.
template <>
float FutharkStream<float>::dot() {
float res;
futhark_entry_f32_dot(this->ctx, &res, (futhark_f32_1d*)this->a, (futhark_f32_1d*)this->b);
futhark_context_sync(this->ctx);
return res;
}
// Double specialisation of the dot kernel.
template <>
double FutharkStream<double>::dot() {
double res;
futhark_entry_f64_dot(this->ctx, &res, (futhark_f64_1d*)this->a, (futhark_f64_1d*)this->b);
futhark_context_sync(this->ctx);
return res;
}
// The Futhark backends manage device choice through the context config,
// so there is no device list to print here.
void listDevices(void)
{
  static const char kMessage[] = "Device selection not supported.";
  std::cout << kMessage << std::endl;
}
template class FutharkStream<float>;
template class FutharkStream<double>;

View File

@ -0,0 +1,60 @@
// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith,
// University of Bristol HPC
// Copyright (c) 2022 Troels Henriksen
// University of Copenhagen
//
// For full license terms please see the LICENSE file distributed with this
// source code
#pragma once
#include <iostream>
#include <stdexcept>
#include "Stream.h"
#include "babelstream.h"
// Map the Futhark backend selected at build time to the implementation
// name reported in the benchmark banner.
#if defined(FUTHARK_BACKEND_c)
#define IMPLEMENTATION_STRING "Futhark (sequential)"
#elif defined(FUTHARK_BACKEND_multicore)
#define IMPLEMENTATION_STRING "Futhark (parallel CPU)"
#elif defined(FUTHARK_BACKEND_opencl)
// Fixed typo: "OpencL" -> "OpenCL".
#define IMPLEMENTATION_STRING "Futhark (OpenCL)"
#elif defined(FUTHARK_BACKEND_cuda)
#define IMPLEMENTATION_STRING "Futhark (CUDA)"
#else
#define IMPLEMENTATION_STRING "Futhark (unknown backend)"
#endif
// Stream implementation backed by C code generated by the Futhark
// compiler from babelstream.fut. The generated API is plain C, so the
// device arrays are held as void* and cast to the concrete
// futhark_f32_1d / futhark_f64_1d type in each specialisation.
template <class T>
class FutharkStream : public Stream<T>
{
protected:
// Size of arrays
int array_size;
// For device selection.
std::string device;
// Futhark stuff
struct futhark_context_config *cfg;
struct futhark_context *ctx;
// Device side arrays
void* a;
void* b;
void* c;
public:
FutharkStream(const int, int);
~FutharkStream();
virtual void copy() override;
virtual void add() override;
virtual void mul() override;
virtual void triad() override;
virtual void nstream() override;
virtual T dot() override;
virtual void init_arrays(T initA, T initB, T initC) override;
virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
};

View File

@ -0,0 +1,62 @@
-- Signatures of the five BabelStream kernels plus nstream, abstracted
-- over the element type t.
module type kernels = {
type t
val copy [n] : [n]t -> *[n]t
val mul [n] : t -> [n]t -> [n]t
val add [n] : [n]t -> [n]t -> [n]t
val triad [n] : t -> [n]t -> [n]t -> [n]t
val dot [n] : [n]t -> [n]t -> t
-- Uniqueness allows nstream to mutate the 'a' array.
val nstream [n] : t -> *[n]t -> [n]t -> [n]t -> [n]t
}
-- Generic implementation, parameterised by any real-number module P.
-- 'def copy = copy' binds the module's copy to the language builtin.
module kernels (P: real) : kernels with t = P.t = {
type t = P.t
def copy = copy
def mul scalar c = map (P.*scalar) c
def add = map2 (P.+)
def triad scalar b c = map2 (P.+) b (map (P.* scalar) c)
def dot a b = reduce (P.+) (P.i32 0) (map2 (P.*) a b)
def nstream scalar a b c = map2 (P.+) a (map2 (P.+) b (map (P.*scalar) c))
}
-- Monomorphic f32 entry points with the BabelStream scalar pre-applied.
module f32_kernels = kernels f32
def f32_start_scalar : f32 = 0.4
entry f32_copy = f32_kernels.copy
entry f32_mul = f32_kernels.mul f32_start_scalar
entry f32_add = f32_kernels.add
entry f32_triad = f32_kernels.triad f32_start_scalar
entry f32_nstream = f32_kernels.nstream f32_start_scalar
entry f32_dot = f32_kernels.dot
-- Monomorphic f64 entry points with the BabelStream scalar pre-applied.
module f64_kernels = kernels f64
def f64_start_scalar : f64 = 0.4
entry f64_copy = f64_kernels.copy
entry f64_mul = f64_kernels.mul f64_start_scalar
entry f64_add = f64_kernels.add
entry f64_triad = f64_kernels.triad f64_start_scalar
entry f64_nstream = f64_kernels.nstream f64_start_scalar
entry f64_dot = f64_kernels.dot
-- ==
-- entry: f32_copy f32_mul
-- random input { [33554432]f32 }
-- ==
-- entry: f32_add f32_dot f32_triad
-- random input { [33554432]f32 [33554432]f32 }
-- ==
-- entry: f32_nstream
-- random input { [33554432]f32 [33554432]f32 [33554432]f32 }
-- ==
-- entry: f64_copy f64_mul
-- random input { [33554432]f64 }
-- ==
-- entry: f64_add f64_dot f64_triad
-- random input { [33554432]f64 [33554432]f64 }
-- ==
-- entry: f64_nstream
-- random input { [33554432]f64 [33554432]f64 [33554432]f64 }

55
src/futhark/model.cmake Normal file
View File

@ -0,0 +1,55 @@
# Use
#
# cmake -Bbuild -H. -DMODEL=futhark -DFUTHARK_BACKEND=foo -DFUTHARK_COMPILER=foo/bar/bin/futhark
#
# to use the Futhark backend, where 'foo' must be one of 'multicore',
# 'c', 'opencl', or 'cuda'. Defaults to 'multicore'.
#
# Use -DFUTHARK_COMPILER to set the path to the Futhark compiler
# binary. Defaults to 'futhark' on the PATH.
# Build-time option: which code generator the Futhark compiler should use.
register_flag_optional(FUTHARK_BACKEND
"Use a specific Futhark backend, possible options are:
- c
- multicore
- opencl
- cuda"
"multicore")
# Build-time option: path to the futhark binary itself.
register_flag_optional(FUTHARK_COMPILER
"Absolute path to the Futhark compiler, defaults to the futhark compiler on PATH"
"futhark")
macro(setup)
# Generate babelstream.c/.h from the .fut source at build time, then
# link whatever runtime the chosen backend needs.
add_custom_command(
OUTPUT
${CMAKE_CURRENT_BINARY_DIR}/babelstream.c
${CMAKE_CURRENT_BINARY_DIR}/babelstream.h
COMMAND ${FUTHARK_COMPILER} ${FUTHARK_BACKEND}
--library src/futhark/babelstream.fut
-o ${CMAKE_CURRENT_BINARY_DIR}/babelstream
DEPENDS src/futhark/babelstream.fut
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
VERBATIM
)
# Backend-specific link dependencies.
if (${FUTHARK_BACKEND} STREQUAL "c")
# Nothing to do.
elseif (${FUTHARK_BACKEND} STREQUAL "multicore")
set(THREADS_PREFER_PTHREAD_FLAG ON)
find_package(Threads REQUIRED)
register_link_library(Threads::Threads)
elseif (${FUTHARK_BACKEND} STREQUAL "opencl")
find_package(OpenCL REQUIRED)
register_link_library(OpenCL::OpenCL)
elseif (${FUTHARK_BACKEND} STREQUAL "cuda")
find_package(CUDA REQUIRED)
register_link_library("nvrtc" "cuda" "cudart")
else ()
message(FATAL_ERROR "Unsupported Futhark backend: ${FUTHARK_BACKEND}")
endif()
endmacro()
macro(setup_target)
# Compile the generated babelstream.c into the benchmark executable and
# make the generated babelstream.h visible to FutharkStream.cpp.
target_sources(${EXE_NAME} PUBLIC "${CMAKE_CURRENT_BINARY_DIR}/babelstream.c")
include_directories("${CMAKE_CURRENT_BINARY_DIR}")
endmacro()

View File

@ -9,7 +9,7 @@
#include "hip/hip_runtime.h"
#define TBSIZE 1024
#define DOT_NUM_BLOCKS 256
void check_error(void)
{
@ -45,34 +45,63 @@ HIPStream<T>::HIPStream(const int ARRAY_SIZE, const int device_index)
// Print out device information
std::cout << "Using HIP device " << getDeviceName(device_index) << std::endl;
std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl;
#if defined(MANAGED)
std::cout << "Memory: MANAGED" << std::endl;
#elif defined(PAGEFAULT)
std::cout << "Memory: PAGEFAULT" << std::endl;
#else
std::cout << "Memory: DEFAULT" << std::endl;
#endif
array_size = ARRAY_SIZE;
// Round dot_num_blocks up to next multiple of (TBSIZE * dot_elements_per_lane)
dot_num_blocks = (array_size + (TBSIZE * dot_elements_per_lane - 1)) / (TBSIZE * dot_elements_per_lane);
// Allocate the host array for partial sums for dot kernels
sums = (T*)malloc(sizeof(T) * DOT_NUM_BLOCKS);
size_t array_bytes = sizeof(T);
array_bytes *= ARRAY_SIZE;
size_t total_bytes = array_bytes * 3;
// Allocate the host array for partial sums for dot kernels using hipHostMalloc.
// This creates an array on the host which is visible to the device. However, it requires
// synchronization (e.g. hipDeviceSynchronize) for the result to be available on the host
// after it has been passed through to a kernel.
hipHostMalloc(&sums, sizeof(T) * dot_num_blocks, hipHostMallocNonCoherent);
check_error();
// Check buffers fit on the device
hipDeviceProp_t props;
hipGetDeviceProperties(&props, 0);
if (props.totalGlobalMem < 3*ARRAY_SIZE*sizeof(T))
if (props.totalGlobalMem < std::size_t{3}*ARRAY_SIZE*sizeof(T))
throw std::runtime_error("Device does not have enough memory for all 3 buffers");
// Create device buffers
hipMalloc(&d_a, ARRAY_SIZE*sizeof(T));
// Create device buffers
#if defined(MANAGED)
hipMallocManaged(&d_a, array_bytes);
check_error();
hipMalloc(&d_b, ARRAY_SIZE*sizeof(T));
hipMallocManaged(&d_b, array_bytes);
check_error();
hipMalloc(&d_c, ARRAY_SIZE*sizeof(T));
hipMallocManaged(&d_c, array_bytes);
check_error();
hipMalloc(&d_sum, DOT_NUM_BLOCKS*sizeof(T));
#elif defined(PAGEFAULT)
d_a = (T*)malloc(array_bytes);
d_b = (T*)malloc(array_bytes);
d_c = (T*)malloc(array_bytes);
#else
hipMalloc(&d_a, array_bytes);
check_error();
hipMalloc(&d_b, array_bytes);
check_error();
hipMalloc(&d_c, array_bytes);
check_error();
#endif
}
template <class T>
HIPStream<T>::~HIPStream()
{
free(sums);
hipHostFree(sums);
check_error();
hipFree(d_a);
check_error();
@ -80,15 +109,13 @@ HIPStream<T>::~HIPStream()
check_error();
hipFree(d_c);
check_error();
hipFree(d_sum);
check_error();
}
template <typename T>
__global__ void init_kernel(T * a, T * b, T * c, T initA, T initB, T initC)
{
const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
const size_t i = blockDim.x * blockIdx.x + threadIdx.x;
a[i] = initA;
b[i] = initB;
c[i] = initC;
@ -97,7 +124,7 @@ __global__ void init_kernel(T * a, T * b, T * c, T initA, T initB, T initC)
template <class T>
void HIPStream<T>::init_arrays(T initA, T initB, T initC)
{
hipLaunchKernelGGL(HIP_KERNEL_NAME(init_kernel<T>), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_a, d_b, d_c, initA, initB, initC);
init_kernel<T><<<dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0>>>(d_a, d_b, d_c, initA, initB, initC);
check_error();
hipDeviceSynchronize();
check_error();
@ -106,27 +133,37 @@ void HIPStream<T>::init_arrays(T initA, T initB, T initC)
template <class T>
void HIPStream<T>::read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c)
{
// Copy device memory to host
#if defined(PAGEFAULT) || defined(MANAGED)
hipDeviceSynchronize();
for (int i = 0; i < array_size; i++)
{
a[i] = d_a[i];
b[i] = d_b[i];
c[i] = d_c[i];
}
#else
hipMemcpy(a.data(), d_a, a.size()*sizeof(T), hipMemcpyDeviceToHost);
check_error();
hipMemcpy(b.data(), d_b, b.size()*sizeof(T), hipMemcpyDeviceToHost);
check_error();
hipMemcpy(c.data(), d_c, c.size()*sizeof(T), hipMemcpyDeviceToHost);
check_error();
#endif
}
template <typename T>
__global__ void copy_kernel(const T * a, T * c)
{
const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
const size_t i = threadIdx.x + blockIdx.x * blockDim.x;
c[i] = a[i];
}
template <class T>
void HIPStream<T>::copy()
{
hipLaunchKernelGGL(HIP_KERNEL_NAME(copy_kernel<T>), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_a, d_c);
copy_kernel<T><<<dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0>>>(d_a, d_c);
check_error();
hipDeviceSynchronize();
check_error();
@ -136,14 +173,14 @@ template <typename T>
__global__ void mul_kernel(T * b, const T * c)
{
const T scalar = startScalar;
const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
const size_t i = threadIdx.x + blockIdx.x * blockDim.x;
b[i] = scalar * c[i];
}
template <class T>
void HIPStream<T>::mul()
{
hipLaunchKernelGGL(HIP_KERNEL_NAME(mul_kernel<T>), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_b, d_c);
mul_kernel<T><<<dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0>>>(d_b, d_c);
check_error();
hipDeviceSynchronize();
check_error();
@ -152,14 +189,14 @@ void HIPStream<T>::mul()
template <typename T>
__global__ void add_kernel(const T * a, const T * b, T * c)
{
const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
const size_t i = threadIdx.x + blockIdx.x * blockDim.x;
c[i] = a[i] + b[i];
}
template <class T>
void HIPStream<T>::add()
{
hipLaunchKernelGGL(HIP_KERNEL_NAME(add_kernel<T>), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_a, d_b, d_c);
add_kernel<T><<<dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0>>>(d_a, d_b, d_c);
check_error();
hipDeviceSynchronize();
check_error();
@ -169,14 +206,14 @@ template <typename T>
__global__ void triad_kernel(T * a, const T * b, const T * c)
{
const T scalar = startScalar;
const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
const size_t i = threadIdx.x + blockIdx.x * blockDim.x;
a[i] = b[i] + scalar * c[i];
}
template <class T>
void HIPStream<T>::triad()
{
hipLaunchKernelGGL(HIP_KERNEL_NAME(triad_kernel<T>), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_a, d_b, d_c);
triad_kernel<T><<<dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0>>>(d_a, d_b, d_c);
check_error();
hipDeviceSynchronize();
check_error();
@ -186,32 +223,32 @@ template <typename T>
__global__ void nstream_kernel(T * a, const T * b, const T * c)
{
const T scalar = startScalar;
const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
const size_t i = threadIdx.x + blockIdx.x * blockDim.x;
a[i] += b[i] + scalar * c[i];
}
template <class T>
void HIPStream<T>::nstream()
{
hipLaunchKernelGGL(HIP_KERNEL_NAME(nstream_kernel<T>), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_a, d_b, d_c);
nstream_kernel<T><<<dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0>>>(d_a, d_b, d_c);
check_error();
hipDeviceSynchronize();
check_error();
}
template <class T>
template <typename T>
__global__ void dot_kernel(const T * a, const T * b, T * sum, int array_size)
{
__shared__ T tb_sum[TBSIZE];
int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
const size_t local_i = hipThreadIdx_x;
const size_t local_i = threadIdx.x;
size_t i = blockDim.x * blockIdx.x + local_i;
tb_sum[local_i] = 0.0;
for (; i < array_size; i += hipBlockDim_x*hipGridDim_x)
tb_sum[local_i] = {};
for (; i < array_size; i += blockDim.x*gridDim.x)
tb_sum[local_i] += a[i] * b[i];
for (int offset = hipBlockDim_x / 2; offset > 0; offset /= 2)
for (size_t offset = blockDim.x / 2; offset > 0; offset /= 2)
{
__syncthreads();
if (local_i < offset)
@ -221,20 +258,19 @@ __global__ void dot_kernel(const T * a, const T * b, T * sum, int array_size)
}
if (local_i == 0)
sum[hipBlockIdx_x] = tb_sum[local_i];
sum[blockIdx.x] = tb_sum[local_i];
}
template <class T>
T HIPStream<T>::dot()
{
hipLaunchKernelGGL(HIP_KERNEL_NAME(dot_kernel<T>), dim3(DOT_NUM_BLOCKS), dim3(TBSIZE), 0, 0, d_a, d_b, d_sum, array_size);
dot_kernel<T><<<dim3(dot_num_blocks), dim3(TBSIZE), 0, 0>>>(d_a, d_b, sums, array_size);
check_error();
hipDeviceSynchronize();
check_error();
hipMemcpy(sums, d_sum, DOT_NUM_BLOCKS*sizeof(T), hipMemcpyDeviceToHost);
check_error();
T sum = 0.0;
for (int i = 0; i < DOT_NUM_BLOCKS; i++)
T sum{};
for (int i = 0; i < dot_num_blocks; i++)
sum += sums[i];
return sum;

View File

@ -14,13 +14,31 @@
#include "Stream.h"
#define IMPLEMENTATION_STRING "HIP"
#define DOT_READ_DWORDS_PER_LANE 4
template <class T>
class HIPStream : public Stream<T>
{
// Make sure that either:
// DOT_READ_DWORDS_PER_LANE is less than sizeof(T), in which case we default to 1 element
// or
// DOT_READ_DWORDS_PER_LANE is divisible by sizeof(T)
static_assert((DOT_READ_DWORDS_PER_LANE * sizeof(unsigned int) < sizeof(T)) ||
(DOT_READ_DWORDS_PER_LANE * sizeof(unsigned int) % sizeof(T) == 0),
"DOT_READ_DWORDS_PER_LANE not divisible by sizeof(element_type)");
// Take into account the datatype size
// That is, for 4 DOT_READ_DWORDS_PER_LANE, this is 2 FP64 elements
// and 4 FP32 elements
static constexpr unsigned int dot_elements_per_lane{
(DOT_READ_DWORDS_PER_LANE * sizeof(unsigned int)) < sizeof(T) ? 1 : (
DOT_READ_DWORDS_PER_LANE * sizeof(unsigned int) / sizeof(T))};
protected:
// Size of arrays
int array_size;
int dot_num_blocks;
// Host array for partial sums for dot kernel
T *sums;
@ -29,7 +47,6 @@ class HIPStream : public Stream<T>
T *d_a;
T *d_b;
T *d_c;
T *d_sum;
public:

View File

@ -2,6 +2,13 @@
register_flag_required(CMAKE_CXX_COMPILER
"Absolute path to the AMD HIP C++ compiler")
register_flag_optional(MEM "Device memory mode:
DEFAULT - allocate host and device memory pointers.
MANAGED - use HIP Managed Memory.
PAGEFAULT - shared memory, only host pointers allocated."
"DEFAULT")
macro(setup)
# nothing to do here as hipcc does everything correctly, what a surprise!
register_definitions(${MEM})
endmacro()

View File

@ -7,12 +7,12 @@
<artifactId>java-stream</artifactId>
<groupId>javastream</groupId>
<version>4.0</version>
<version>5.0</version>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
<junit.version>5.7.2</junit.version>
<junit.version>5.9.2</junit.version>
</properties>
<repositories>
@ -27,19 +27,19 @@
<dependency>
<groupId>com.beust</groupId>
<artifactId>jcommander</artifactId>
<version>1.81</version>
<version>1.82</version>
</dependency>
<dependency>
<groupId>tornado</groupId>
<artifactId>tornado-api</artifactId>
<version>0.9</version>
<version>0.15.1</version>
</dependency>
<dependency>
<groupId>com.aparapi</groupId>
<artifactId>aparapi</artifactId>
<version>2.0.0</version>
<version>3.0.0</version>
<exclusions>
<!-- don't pull in the entire Scala ecosystem! -->
<exclusion>

View File

@ -56,7 +56,7 @@ public abstract class JavaStream<T> {
protected abstract T dot();
protected abstract Data<T> data();
protected abstract Data<T> readArrays();
public static class EnumeratedStream<T> extends JavaStream<T> {
@ -113,8 +113,8 @@ public abstract class JavaStream<T> {
}
@Override
public Data<T> data() {
return actual.data();
public Data<T> readArrays() {
return actual.readArrays();
}
}
@ -140,6 +140,14 @@ public abstract class JavaStream<T> {
return Duration.ofNanos(end - start);
}
final Duration runInitArrays() {
return timed(this::initArrays);
}
final SimpleImmutableEntry<Duration, Data<T>> runReadArrays() {
return timed(this::readArrays);
}
final SimpleImmutableEntry<Timings<Duration>, T> runAll(int times) {
Timings<Duration> timings = new Timings<>();
T lastSum = null;

View File

@ -128,6 +128,40 @@ public class Main {
}
}
@SuppressWarnings("unchecked")
static void showInit(
int totalBytes, double megaScale, Options opt, Duration init, Duration read) {
List<Entry<String, Double>> setup =
Arrays.asList(
new SimpleImmutableEntry<>("Init", durationToSeconds(init)),
new SimpleImmutableEntry<>("Read", durationToSeconds(read)));
if (opt.csv) {
tabulateCsv(
true,
setup.stream()
.map(
x ->
Arrays.asList(
new SimpleImmutableEntry<>("function", x.getKey()),
new SimpleImmutableEntry<>("n_elements", opt.arraysize + ""),
new SimpleImmutableEntry<>("sizeof", totalBytes + ""),
new SimpleImmutableEntry<>(
"max_m" + (opt.mibibytes ? "i" : "") + "bytes_per_sec",
((megaScale * (double) totalBytes / x.getValue())) + ""),
new SimpleImmutableEntry<>("runtime", x.getValue() + "")))
.toArray(List[]::new));
} else {
for (Entry<String, Double> e : setup) {
System.out.printf(
"%s: %.5f s (%.5f M%sBytes/sec)%n",
e.getKey(),
e.getValue(),
megaScale * (double) totalBytes / e.getValue(),
opt.mibibytes ? "i" : "");
}
}
}
static <T extends Number> boolean run(
String name, Config<T> config, Function<Config<T>, JavaStream<T>> mkStream) {
@ -183,35 +217,46 @@ public class Main {
JavaStream<T> stream = mkStream.apply(config);
stream.initArrays();
Duration init = stream.runInitArrays();
final boolean ok;
switch (config.benchmark) {
case ALL:
Entry<Timings<Duration>, T> results = stream.runAll(opt.numtimes);
ok = checkSolutions(stream.data(), config, Optional.of(results.getValue()));
Timings<Duration> timings = results.getKey();
tabulateCsv(
opt.csv,
mkCsvRow(timings.copy, "Copy", 2 * arrayBytes, megaScale, opt),
mkCsvRow(timings.mul, "Mul", 2 * arrayBytes, megaScale, opt),
mkCsvRow(timings.add, "Add", 3 * arrayBytes, megaScale, opt),
mkCsvRow(timings.triad, "Triad", 3 * arrayBytes, megaScale, opt),
mkCsvRow(timings.dot, "Dot", 2 * arrayBytes, megaScale, opt));
break;
{
Entry<Timings<Duration>, T> results = stream.runAll(opt.numtimes);
SimpleImmutableEntry<Duration, Data<T>> read = stream.runReadArrays();
showInit(totalBytes, megaScale, opt, init, read.getKey());
ok = checkSolutions(read.getValue(), config, Optional.of(results.getValue()));
Timings<Duration> timings = results.getKey();
tabulateCsv(
opt.csv,
mkCsvRow(timings.copy, "Copy", 2 * arrayBytes, megaScale, opt),
mkCsvRow(timings.mul, "Mul", 2 * arrayBytes, megaScale, opt),
mkCsvRow(timings.add, "Add", 3 * arrayBytes, megaScale, opt),
mkCsvRow(timings.triad, "Triad", 3 * arrayBytes, megaScale, opt),
mkCsvRow(timings.dot, "Dot", 2 * arrayBytes, megaScale, opt));
break;
}
case NSTREAM:
List<Duration> nstreamResults = stream.runNStream(opt.numtimes);
ok = checkSolutions(stream.data(), config, Optional.empty());
tabulateCsv(opt.csv, mkCsvRow(nstreamResults, "Nstream", 4 * arrayBytes, megaScale, opt));
break;
{
List<Duration> nstreamResults = stream.runNStream(opt.numtimes);
SimpleImmutableEntry<Duration, Data<T>> read = stream.runReadArrays();
showInit(totalBytes, megaScale, opt, init, read.getKey());
ok = checkSolutions(read.getValue(), config, Optional.empty());
tabulateCsv(opt.csv, mkCsvRow(nstreamResults, "Nstream", 4 * arrayBytes, megaScale, opt));
break;
}
case TRIAD:
Duration triadResult = stream.runTriad(opt.numtimes);
ok = checkSolutions(stream.data(), config, Optional.empty());
int triadTotalBytes = 3 * arrayBytes * opt.numtimes;
double bandwidth = megaScale * (triadTotalBytes / durationToSeconds(triadResult));
System.out.printf("Runtime (seconds): %.5f", durationToSeconds(triadResult));
System.out.printf("Bandwidth (%s/s): %.3f ", gigaSuffix, bandwidth);
break;
{
Duration triadResult = stream.runTriad(opt.numtimes);
SimpleImmutableEntry<Duration, Data<T>> read = stream.runReadArrays();
showInit(totalBytes, megaScale, opt, init, read.getKey());
ok = checkSolutions(read.getValue(), config, Optional.empty());
int triadTotalBytes = 3 * arrayBytes * opt.numtimes;
double bandwidth = megaScale * (triadTotalBytes / durationToSeconds(triadResult));
System.out.printf("Runtime (seconds): %.5f", durationToSeconds(triadResult));
System.out.printf("Bandwidth (%s/s): %.3f ", gigaSuffix, bandwidth);
break;
}
default:
throw new AssertionError();
}
@ -337,7 +382,7 @@ public class Main {
}
}
private static final String VERSION = "4.0";
private static final String VERSION = "5.0";
private static final float START_SCALAR = 0.4f;
private static final float START_A = 0.1f;

View File

@ -122,7 +122,7 @@ public final class AparapiStreams {
}
@Override
public Data<T> data() {
public Data<T> readArrays() {
return kernels.syncAndDispose();
}
}

View File

@ -86,7 +86,7 @@ final class GenericPlainStream<T extends Number> extends JavaStream<T> {
}
@Override
public Data<T> data() {
public Data<T> readArrays() {
return new Data<>(a, b, c);
}
}

View File

@ -80,7 +80,7 @@ final class GenericStream<T extends Number> extends JavaStream<T> {
}
@Override
public Data<T> data() {
public Data<T> readArrays() {
return new Data<>(a, b, c);
}
}

View File

@ -78,7 +78,7 @@ final class SpecialisedDoubleStream extends JavaStream<Double> {
}
@Override
public Data<Double> data() {
public Data<Double> readArrays() {
return new Data<>(boxed(a), boxed(b), boxed(c));
}
}

View File

@ -78,7 +78,7 @@ final class SpecialisedFloatStream extends JavaStream<Float> {
}
@Override
public Data<Float> data() {
public Data<Float> readArrays() {
return new Data<>(boxed(a), boxed(b), boxed(c));
}
}

View File

@ -78,7 +78,7 @@ final class SpecialisedPlainDoubleStream extends JavaStream<Double> {
}
@Override
public Data<Double> data() {
public Data<Double> readArrays() {
return new Data<>(boxed(a), boxed(b), boxed(c));
}
}

View File

@ -78,7 +78,7 @@ final class SpecialisedPlainFloatStream extends JavaStream<Float> {
}
@Override
public Data<Float> data() {
public Data<Float> readArrays() {
return new Data<>(boxed(a), boxed(b), boxed(c));
}
}

View File

@ -4,8 +4,8 @@ import java.util.List;
import java.util.stream.Collectors;
import javastream.JavaStream;
import javastream.Main.Config;
import uk.ac.manchester.tornado.api.TaskSchedule;
import uk.ac.manchester.tornado.api.TornadoRuntimeCI;
import uk.ac.manchester.tornado.api.TornadoExecutionPlan;
import uk.ac.manchester.tornado.api.TornadoRuntimeInterface;
import uk.ac.manchester.tornado.api.common.TornadoDevice;
import uk.ac.manchester.tornado.api.runtime.TornadoRuntime;
@ -13,18 +13,18 @@ abstract class GenericTornadoVMStream<T> extends JavaStream<T> {
protected final TornadoDevice device;
protected TaskSchedule copyTask;
protected TaskSchedule mulTask;
protected TaskSchedule addTask;
protected TaskSchedule triadTask;
protected TaskSchedule nstreamTask;
protected TaskSchedule dotTask;
protected TornadoExecutionPlan copyTask;
protected TornadoExecutionPlan mulTask;
protected TornadoExecutionPlan addTask;
protected TornadoExecutionPlan triadTask;
protected TornadoExecutionPlan nstreamTask;
protected TornadoExecutionPlan dotTask;
GenericTornadoVMStream(Config<T> config) {
super(config);
try {
TornadoRuntimeCI runtime = TornadoRuntime.getTornadoRuntime();
TornadoRuntimeInterface runtime = TornadoRuntime.getTornadoRuntime();
List<TornadoDevice> devices = TornadoVMStreams.enumerateDevices(runtime);
device = devices.get(config.options.device);
@ -42,10 +42,6 @@ abstract class GenericTornadoVMStream<T> extends JavaStream<T> {
}
}
protected static TaskSchedule mkSchedule() {
return new TaskSchedule("");
}
@Override
public List<String> listDevices() {
return TornadoVMStreams.enumerateDevices(TornadoRuntime.getTornadoRuntime()).stream()
@ -55,12 +51,12 @@ abstract class GenericTornadoVMStream<T> extends JavaStream<T> {
@Override
public void initArrays() {
this.copyTask.warmup();
this.mulTask.warmup();
this.addTask.warmup();
this.triadTask.warmup();
this.nstreamTask.warmup();
this.dotTask.warmup();
this.copyTask.withWarmUp();
this.mulTask.withWarmUp();
this.addTask.withWarmUp();
this.triadTask.withWarmUp();
this.nstreamTask.withWarmUp();
this.dotTask.withWarmUp();
}
@Override

View File

@ -2,8 +2,11 @@ package javastream.tornadovm;
import java.util.Arrays;
import javastream.Main.Config;
import uk.ac.manchester.tornado.api.TaskGraph;
import uk.ac.manchester.tornado.api.TornadoExecutionPlan;
import uk.ac.manchester.tornado.api.annotations.Parallel;
import uk.ac.manchester.tornado.api.annotations.Reduce;
import uk.ac.manchester.tornado.api.enums.DataTransferMode;
final class SpecialisedDouble extends GenericTornadoVMStream<Double> {
@ -49,7 +52,7 @@ final class SpecialisedDouble extends GenericTornadoVMStream<Double> {
private final double[] a, b, c;
private final double[] dotSum;
@SuppressWarnings({"PrimitiveArrayArgumentToVarargsMethod", "DuplicatedCode"})
@SuppressWarnings({"DuplicatedCode"})
SpecialisedDouble(Config<Double> config) {
super(config);
final int size = config.options.arraysize;
@ -58,12 +61,43 @@ final class SpecialisedDouble extends GenericTornadoVMStream<Double> {
b = new double[size];
c = new double[size];
dotSum = new double[1];
this.copyTask = mkSchedule().task("", SpecialisedDouble::copy, size, a, c);
this.mulTask = mkSchedule().task("", SpecialisedDouble::mul, size, b, c, scalar);
this.addTask = mkSchedule().task("", SpecialisedDouble::add, size, a, b, c);
this.triadTask = mkSchedule().task("", SpecialisedDouble::triad, size, a, b, c, scalar);
this.nstreamTask = mkSchedule().task("", SpecialisedDouble::nstream, size, a, b, c, scalar);
this.dotTask = mkSchedule().task("", SpecialisedDouble::dot_, a, b, dotSum).streamOut(dotSum);
this.copyTask =
new TornadoExecutionPlan(
new TaskGraph("copy")
.task("copy", SpecialisedDouble::copy, size, a, c)
.transferToDevice(DataTransferMode.FIRST_EXECUTION, a, c)
.snapshot());
this.mulTask =
new TornadoExecutionPlan(
new TaskGraph("mul")
.task("mul", SpecialisedDouble::mul, size, b, c, scalar)
.transferToDevice(DataTransferMode.FIRST_EXECUTION, b, c)
.snapshot());
this.addTask =
new TornadoExecutionPlan(
new TaskGraph("add")
.task("add", SpecialisedDouble::add, size, a, b, c)
.transferToDevice(DataTransferMode.FIRST_EXECUTION, a, b, c)
.snapshot());
this.triadTask =
new TornadoExecutionPlan(
new TaskGraph("triad")
.task("triad", SpecialisedDouble::triad, size, a, b, c, scalar)
.transferToDevice(DataTransferMode.FIRST_EXECUTION, a, b, c)
.snapshot());
this.nstreamTask =
new TornadoExecutionPlan(
new TaskGraph("nstream")
.task("nstream", SpecialisedDouble::nstream, size, a, b, c, scalar)
.transferToDevice(DataTransferMode.FIRST_EXECUTION, a, b, c)
.snapshot());
this.dotTask =
new TornadoExecutionPlan(
new TaskGraph("dot")
.task("dot", SpecialisedDouble::dot_, a, b, dotSum)
.transferToDevice(DataTransferMode.FIRST_EXECUTION, a, b)
.transferToHost(DataTransferMode.EVERY_EXECUTION, new Object[] {dotSum})
.snapshot());
}
@Override
@ -72,7 +106,7 @@ final class SpecialisedDouble extends GenericTornadoVMStream<Double> {
Arrays.fill(a, config.initA);
Arrays.fill(b, config.initB);
Arrays.fill(c, config.initC);
TornadoVMStreams.xferToDevice(device, a, b, c);
TornadoVMStreams.allocAndXferToDevice(device, a, b, c);
}
@Override
@ -81,7 +115,7 @@ final class SpecialisedDouble extends GenericTornadoVMStream<Double> {
}
@Override
public Data<Double> data() {
public Data<Double> readArrays() {
TornadoVMStreams.xferFromDevice(device, a, b, c);
return new Data<>(boxed(a), boxed(b), boxed(c));
}

View File

@ -2,8 +2,11 @@ package javastream.tornadovm;
import java.util.Arrays;
import javastream.Main.Config;
import uk.ac.manchester.tornado.api.TaskGraph;
import uk.ac.manchester.tornado.api.TornadoExecutionPlan;
import uk.ac.manchester.tornado.api.annotations.Parallel;
import uk.ac.manchester.tornado.api.annotations.Reduce;
import uk.ac.manchester.tornado.api.enums.DataTransferMode;
final class SpecialisedFloat extends GenericTornadoVMStream<Float> {
@ -49,7 +52,7 @@ final class SpecialisedFloat extends GenericTornadoVMStream<Float> {
private final float[] a, b, c;
private final float[] dotSum;
@SuppressWarnings({"PrimitiveArrayArgumentToVarargsMethod", "DuplicatedCode"})
@SuppressWarnings({"DuplicatedCode"})
SpecialisedFloat(Config<Float> config) {
super(config);
final int size = config.options.arraysize;
@ -58,12 +61,43 @@ final class SpecialisedFloat extends GenericTornadoVMStream<Float> {
b = new float[size];
c = new float[size];
dotSum = new float[1];
this.copyTask = mkSchedule().task("", SpecialisedFloat::copy, size, a, c);
this.mulTask = mkSchedule().task("", SpecialisedFloat::mul, size, b, c, scalar);
this.addTask = mkSchedule().task("", SpecialisedFloat::add, size, a, b, c);
this.triadTask = mkSchedule().task("", SpecialisedFloat::triad, size, a, b, c, scalar);
this.nstreamTask = mkSchedule().task("", SpecialisedFloat::nstream, size, a, b, c, scalar);
this.dotTask = mkSchedule().task("", SpecialisedFloat::dot_, a, b, dotSum).streamOut(dotSum);
this.copyTask =
new TornadoExecutionPlan(
new TaskGraph("copy")
.task("copy", SpecialisedFloat::copy, size, a, c)
.transferToDevice(DataTransferMode.FIRST_EXECUTION, a, c)
.snapshot());
this.mulTask =
new TornadoExecutionPlan(
new TaskGraph("mul")
.task("mul", SpecialisedFloat::mul, size, b, c, scalar)
.transferToDevice(DataTransferMode.FIRST_EXECUTION, b, c)
.snapshot());
this.addTask =
new TornadoExecutionPlan(
new TaskGraph("add")
.task("add", SpecialisedFloat::add, size, a, b, c)
.transferToDevice(DataTransferMode.FIRST_EXECUTION, a, b, c)
.snapshot());
this.triadTask =
new TornadoExecutionPlan(
new TaskGraph("triad")
.task("triad", SpecialisedFloat::triad, size, a, b, c, scalar)
.transferToDevice(DataTransferMode.FIRST_EXECUTION, a, b, c)
.snapshot());
this.nstreamTask =
new TornadoExecutionPlan(
new TaskGraph("nstream")
.task("nstream", SpecialisedFloat::nstream, size, a, b, c, scalar)
.transferToDevice(DataTransferMode.FIRST_EXECUTION, a, b, c)
.snapshot());
this.dotTask =
new TornadoExecutionPlan(
new TaskGraph("dot")
.task("dot", SpecialisedFloat::dot_, a, b, dotSum)
.transferToDevice(DataTransferMode.FIRST_EXECUTION, a, b)
.transferToHost(DataTransferMode.EVERY_EXECUTION, new Object[] {dotSum})
.snapshot());
}
@Override
@ -72,7 +106,7 @@ final class SpecialisedFloat extends GenericTornadoVMStream<Float> {
Arrays.fill(a, config.initA);
Arrays.fill(b, config.initB);
Arrays.fill(c, config.initC);
TornadoVMStreams.xferToDevice(device, a, b, c);
TornadoVMStreams.allocAndXferToDevice(device, a, b, c);
}
@Override
@ -81,7 +115,7 @@ final class SpecialisedFloat extends GenericTornadoVMStream<Float> {
}
@Override
public Data<Float> data() {
public Data<Float> readArrays() {
TornadoVMStreams.xferFromDevice(device, a, b, c);
return new Data<>(boxed(a), boxed(b), boxed(c));
}

View File

@ -1,36 +1,46 @@
package javastream.tornadovm;
import java.util.Arrays;
import java.util.List;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import javastream.JavaStream;
import javastream.Main.Config;
import uk.ac.manchester.tornado.api.TornadoRuntimeCI;
import uk.ac.manchester.tornado.api.TornadoRuntimeInterface;
import uk.ac.manchester.tornado.api.common.Event;
import uk.ac.manchester.tornado.api.common.TornadoDevice;
import uk.ac.manchester.tornado.api.mm.TornadoGlobalObjectState;
import uk.ac.manchester.tornado.api.memory.TornadoDeviceObjectState;
import uk.ac.manchester.tornado.api.memory.TornadoGlobalObjectState;
import uk.ac.manchester.tornado.api.runtime.TornadoRuntime;
public final class TornadoVMStreams {
private TornadoVMStreams() {}
static void xferToDevice(TornadoDevice device, Object... xs) {
static void allocAndXferToDevice(TornadoDevice device, Object... xs) {
for (Object x : xs) {
TornadoGlobalObjectState state = TornadoRuntime.getTornadoRuntime().resolveObject(x);
device.allocateObjects(
new Object[] {x}, 0, new TornadoDeviceObjectState[] {state.getDeviceState(device)});
List<Integer> writeEvent = device.ensurePresent(x, state.getDeviceState(device), null, 0, 0);
if (writeEvent != null) writeEvent.forEach(e -> device.resolveEvent(e).waitOn());
}
}
static void xferFromDevice(TornadoDevice device, Object... xs) {
for (Object x : xs) {
TornadoGlobalObjectState state = TornadoRuntime.getTornadoRuntime().resolveObject(x);
device.resolveEvent(device.streamOut(x, 0, state.getDeviceState(device), null)).waitOn();
}
Arrays.stream(xs)
.map(
x -> {
TornadoGlobalObjectState state = TornadoRuntime.getTornadoRuntime().resolveObject(x);
return device.resolveEvent(
device.streamOut(x, 0, state.getDeviceState(device), null));
})
.collect(Collectors.toList())
.forEach(Event::waitOn);
}
static List<TornadoDevice> enumerateDevices(TornadoRuntimeCI runtime) {
static List<TornadoDevice> enumerateDevices(TornadoRuntimeInterface runtime) {
return IntStream.range(0, runtime.getNumDrivers())
.mapToObj(runtime::getDriver)
.flatMap(d -> IntStream.range(0, d.getDeviceCount()).mapToObj(d::getDevice))

View File

@ -1,415 +1,423 @@
# This file is machine-generated - editing it directly is not advised
[[AMDGPU]]
deps = ["AbstractFFTs", "Adapt", "BinaryProvider", "CEnum", "GPUArrays", "GPUCompiler", "HIP_jll", "LLVM", "Libdl", "LinearAlgebra", "MacroTools", "Pkg", "Printf", "ROCmDeviceLibs_jll", "Random", "Requires", "Setfield", "Statistics", "hsa_rocr_jll"]
git-tree-sha1 = "d59f1cf3f90ae6cf6626e8a21f337850cb3792f7"
julia_version = "1.9.3"
manifest_format = "2.0"
project_hash = "05982ec0602af8ada9509107382dd6c8b21db9b9"
[[deps.AMDGPU]]
deps = ["AbstractFFTs", "Adapt", "CEnum", "ExprTools", "GPUArrays", "GPUCompiler", "KernelAbstractions", "LLD_jll", "LLVM", "LLVM_jll", "Libdl", "LinearAlgebra", "MacroTools", "Pkg", "Preferences", "Printf", "Random", "SparseArrays", "SpecialFunctions", "Statistics", "UnsafeAtomicsLLVM"]
git-tree-sha1 = "95437cf4c0ad651ca8463475de8af6a6935e23bd"
uuid = "21141c5a-9bdb-4563-92ae-f87d6854732e"
version = "0.2.17"
version = "0.6.1"
[[AbstractFFTs]]
[[deps.AbstractFFTs]]
deps = ["LinearAlgebra"]
git-tree-sha1 = "485ee0867925449198280d4af84bdb46a2a404d0"
git-tree-sha1 = "d92ad398961a3ed262d8bf04a1a2b8340f915fef"
uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c"
version = "1.0.1"
version = "1.5.0"
[[Adapt]]
deps = ["LinearAlgebra"]
git-tree-sha1 = "84918055d15b3114ede17ac6a7182f68870c16f7"
[deps.AbstractFFTs.extensions]
AbstractFFTsChainRulesCoreExt = "ChainRulesCore"
AbstractFFTsTestExt = "Test"
[deps.AbstractFFTs.weakdeps]
ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
[[deps.Adapt]]
deps = ["LinearAlgebra", "Requires"]
git-tree-sha1 = "76289dc51920fdc6e0013c872ba9551d54961c24"
uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
version = "3.3.1"
version = "3.6.2"
weakdeps = ["StaticArrays"]
[[ArgParse]]
[deps.Adapt.extensions]
AdaptStaticArraysExt = "StaticArrays"
[[deps.ArgParse]]
deps = ["Logging", "TextWrap"]
git-tree-sha1 = "3102bce13da501c9104df33549f511cd25264d7d"
uuid = "c7e460c6-2fb9-53a9-8c5b-16f535851c63"
version = "1.1.4"
[[ArgTools]]
[[deps.ArgTools]]
uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f"
version = "1.1.1"
[[Artifacts]]
[[deps.Artifacts]]
uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33"
[[Base64]]
[[deps.Atomix]]
deps = ["UnsafeAtomics"]
git-tree-sha1 = "c06a868224ecba914baa6942988e2f2aade419be"
uuid = "a9b6321e-bd34-4604-b9c9-b65b8de01458"
version = "0.1.0"
[[deps.Base64]]
uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f"
[[BinaryProvider]]
deps = ["Libdl", "Logging", "SHA"]
git-tree-sha1 = "ecdec412a9abc8db54c0efc5548c64dfce072058"
uuid = "b99e7846-7c00-51b0-8f62-c81ae34c0232"
version = "0.5.10"
[[Bzip2_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "19a35467a82e236ff51bc17a3a44b69ef35185a2"
uuid = "6e34b625-4abd-537c-b88f-471c36dfa7a0"
version = "1.0.8+0"
[[CEnum]]
git-tree-sha1 = "215a9aa4a1f23fbd05b92769fdd62559488d70e9"
[[deps.CEnum]]
git-tree-sha1 = "eb4cb44a499229b3b8426dcfb5dd85333951ff90"
uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82"
version = "0.4.1"
version = "0.4.2"
[[ConstructionBase]]
deps = ["LinearAlgebra"]
git-tree-sha1 = "f74e9d5388b8620b4cee35d4c5a618dd4dc547f4"
uuid = "187b0558-2788-49d3-abe0-74a17ed4e7c9"
version = "1.3.0"
[[deps.CompilerSupportLibraries_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae"
version = "1.0.5+0"
[[Dates]]
[[deps.Dates]]
deps = ["Printf"]
uuid = "ade2ca70-3891-5945-98fb-dc099432e06a"
[[Downloads]]
deps = ["ArgTools", "LibCURL", "NetworkOptions"]
[[deps.DocStringExtensions]]
deps = ["LibGit2"]
git-tree-sha1 = "2fb1e02f2b635d0845df5d7c167fec4dd739b00d"
uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae"
version = "0.9.3"
[[deps.Downloads]]
deps = ["ArgTools", "FileWatching", "LibCURL", "NetworkOptions"]
uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6"
version = "1.6.0"
[[Elfutils_jll]]
deps = ["Artifacts", "Bzip2_jll", "JLLWrappers", "Libdl", "Pkg", "XZ_jll", "Zlib_jll", "argp_standalone_jll", "fts_jll", "obstack_jll"]
git-tree-sha1 = "8f9fcde6d89b0a3ca51cb2028beab462705c5436"
uuid = "ab5a07f8-06af-567f-a878-e8bb879eba5a"
version = "0.182.0+0"
[[ExprTools]]
git-tree-sha1 = "b7e3d17636b348f005f11040025ae8c6f645fe92"
[[deps.ExprTools]]
git-tree-sha1 = "27415f162e6028e81c72b82ef756bf321213b6ec"
uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04"
version = "0.1.6"
version = "0.1.10"
[[Future]]
deps = ["Random"]
uuid = "9fa8497b-333b-5362-9e8d-4d0656e87820"
[[deps.FileWatching]]
uuid = "7b1f6079-737a-58dc-b8bc-7a2ca5c1b5ee"
[[GPUArrays]]
deps = ["Adapt", "LinearAlgebra", "Printf", "Random", "Serialization", "Statistics"]
git-tree-sha1 = "7772508f17f1d482fe0df72cabc5b55bec06bbe0"
[[deps.GPUArrays]]
deps = ["Adapt", "GPUArraysCore", "LLVM", "LinearAlgebra", "Printf", "Random", "Reexport", "Serialization", "Statistics"]
git-tree-sha1 = "8ad8f375ae365aa1eb2f42e2565a40b55a4b69a8"
uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
version = "8.1.2"
version = "9.0.0"
[[GPUCompiler]]
deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "TimerOutputs", "UUIDs"]
git-tree-sha1 = "4ed2616d5e656c8716736b64da86755467f26cf5"
[[deps.GPUArraysCore]]
deps = ["Adapt"]
git-tree-sha1 = "2d6ca471a6c7b536127afccfa7564b5b39227fe0"
uuid = "46192b85-c4d5-4398-a991-12ede77f4527"
version = "0.1.5"
[[deps.GPUCompiler]]
deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "Scratch", "TimerOutputs", "UUIDs"]
git-tree-sha1 = "5e4487558477f191c043166f8301dd0b4be4e2b2"
uuid = "61eb1bfa-7361-4325-ad38-22787b887f55"
version = "0.12.9"
version = "0.24.5"
[[HIP_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "ROCmCompilerSupport_jll", "ROCmDeviceLibs_jll", "ROCmOpenCLRuntime_jll", "hsa_rocr_jll"]
git-tree-sha1 = "5097d8f7b6842156ab0928371b3d03fefd8decab"
uuid = "2696aab5-0948-5276-aa9a-2a86a37016b8"
version = "4.0.0+1"
[[InteractiveUtils]]
[[deps.InteractiveUtils]]
deps = ["Markdown"]
uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
[[JLLWrappers]]
deps = ["Preferences"]
git-tree-sha1 = "642a199af8b68253517b80bd3bfd17eb4e84df6e"
[[deps.IrrationalConstants]]
git-tree-sha1 = "630b497eafcc20001bba38a4651b327dcfc491d2"
uuid = "92d709cd-6900-40b7-9082-c6be49f344b6"
version = "0.2.2"
[[deps.JLLWrappers]]
deps = ["Artifacts", "Preferences"]
git-tree-sha1 = "7e5d6779a1e09a36db2a7b6cff50942a0a7d0fca"
uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210"
version = "1.3.0"
version = "1.5.0"
[[LLVM]]
[[deps.KernelAbstractions]]
deps = ["Adapt", "Atomix", "InteractiveUtils", "LinearAlgebra", "MacroTools", "PrecompileTools", "Requires", "SparseArrays", "StaticArrays", "UUIDs", "UnsafeAtomics", "UnsafeAtomicsLLVM"]
git-tree-sha1 = "4c5875e4c228247e1c2b087669846941fb6e0118"
uuid = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
version = "0.9.8"
[deps.KernelAbstractions.extensions]
EnzymeExt = "EnzymeCore"
[deps.KernelAbstractions.weakdeps]
EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869"
[[deps.LLD_jll]]
deps = ["Artifacts", "Libdl", "Pkg", "TOML", "Zlib_jll", "libLLVM_jll"]
uuid = "d55e3150-da41-5e91-b323-ecfd1eec6109"
version = "14.0.6+3"
[[deps.LLVM]]
deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Printf", "Unicode"]
git-tree-sha1 = "7cc22e69995e2329cc047a879395b2b74647ab5f"
git-tree-sha1 = "a9d2ce1d5007b1e8f6c5b89c5a31ff8bd146db5c"
uuid = "929cbde3-209d-540e-8aea-75f648917ca0"
version = "4.7.0"
version = "6.2.1"
[[LLVMExtra_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "c5fc4bef251ecd37685bea1c4068a9cfa41e8b9a"
[[deps.LLVMExtra_jll]]
deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML"]
git-tree-sha1 = "7ca6850ae880cc99b59b88517545f91a52020afa"
uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab"
version = "0.0.13+0"
version = "0.0.25+0"
[[LibCURL]]
[[deps.LLVM_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "TOML", "Zlib_jll", "libLLVM_jll"]
git-tree-sha1 = "c5131b433876973cf29a2d9ec426cc099567e68c"
uuid = "86de99a1-58d6-5da7-8064-bd56ce2e322c"
version = "14.0.6+4"
[[deps.LazyArtifacts]]
deps = ["Artifacts", "Pkg"]
uuid = "4af54fe1-eca0-43a8-85a7-787d91b784e3"
[[deps.LibCURL]]
deps = ["LibCURL_jll", "MozillaCACerts_jll"]
uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21"
version = "0.6.3"
[[LibCURL_jll]]
[[deps.LibCURL_jll]]
deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"]
uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0"
version = "7.84.0+0"
[[LibGit2]]
[[deps.LibGit2]]
deps = ["Base64", "NetworkOptions", "Printf", "SHA"]
uuid = "76f85450-5226-5b5a-8eaa-529ad045b433"
[[LibSSH2_jll]]
[[deps.LibSSH2_jll]]
deps = ["Artifacts", "Libdl", "MbedTLS_jll"]
uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8"
version = "1.10.2+0"
[[Libdl]]
[[deps.Libdl]]
uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
[[Libgcrypt_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Libgpg_error_jll", "Pkg"]
git-tree-sha1 = "64613c82a59c120435c067c2b809fc61cf5166ae"
uuid = "d4300ac3-e22c-5743-9152-c294e39db1e4"
version = "1.8.7+0"
[[Libglvnd_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libX11_jll", "Xorg_libXext_jll"]
git-tree-sha1 = "7739f837d6447403596a75d19ed01fd08d6f56bf"
uuid = "7e76a0d4-f3c7-5321-8279-8d96eeed0f29"
version = "1.3.0+3"
[[Libgpg_error_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "c333716e46366857753e273ce6a69ee0945a6db9"
uuid = "7add5ba3-2f88-524e-9cd5-f83b8a55f7b8"
version = "1.42.0+0"
[[Libiconv_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "42b62845d70a619f063a7da093d995ec8e15e778"
uuid = "94ce4f54-9a6c-5748-9c1c-f9c7231a4531"
version = "1.16.1+1"
[[LinearAlgebra]]
deps = ["Libdl"]
[[deps.LinearAlgebra]]
deps = ["Libdl", "OpenBLAS_jll", "libblastrampoline_jll"]
uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
[[Logging]]
[[deps.LogExpFunctions]]
deps = ["DocStringExtensions", "IrrationalConstants", "LinearAlgebra"]
git-tree-sha1 = "7d6dd4e9212aebaeed356de34ccf262a3cd415aa"
uuid = "2ab3a3ac-af41-5b50-aa03-7779005ae688"
version = "0.3.26"
[deps.LogExpFunctions.extensions]
LogExpFunctionsChainRulesCoreExt = "ChainRulesCore"
LogExpFunctionsChangesOfVariablesExt = "ChangesOfVariables"
LogExpFunctionsInverseFunctionsExt = "InverseFunctions"
[deps.LogExpFunctions.weakdeps]
ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
ChangesOfVariables = "9e997f8a-9a97-42d5-a9f1-ce6bfc15e2c0"
InverseFunctions = "3587e190-3f89-42d0-90ee-14403ec27112"
[[deps.Logging]]
uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"
[[MacroTools]]
[[deps.MacroTools]]
deps = ["Markdown", "Random"]
git-tree-sha1 = "3d3e902b31198a27340d0bf00d6ac452866021cf"
git-tree-sha1 = "9ee1618cbf5240e6d4e0371d6f24065083f60c48"
uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
version = "0.5.9"
version = "0.5.11"
[[Markdown]]
[[deps.Markdown]]
deps = ["Base64"]
uuid = "d6f4376e-aef5-505a-96c1-9c027394607a"
[[MbedTLS_jll]]
[[deps.MbedTLS_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1"
version = "2.28.2+0"
[[MozillaCACerts_jll]]
[[deps.MozillaCACerts_jll]]
uuid = "14a3606d-f60d-562e-9121-12d972cd8159"
version = "2022.10.11"
[[NUMA_jll]]
deps = ["Libdl", "Pkg"]
git-tree-sha1 = "778f9bd14400cff2c32ed357e12766ac0e3d766e"
uuid = "7f51dc2b-bb24-59f8-b771-bb1490e4195d"
version = "2.0.13+1"
[[NetworkOptions]]
[[deps.NetworkOptions]]
uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908"
version = "1.2.0"
[[OrderedCollections]]
git-tree-sha1 = "85f8e6578bf1f9ee0d11e7bb1b1456435479d47c"
[[deps.OpenBLAS_jll]]
deps = ["Artifacts", "CompilerSupportLibraries_jll", "Libdl"]
uuid = "4536629a-c528-5b80-bd46-f80d51c5b363"
version = "0.3.21+4"
[[deps.OpenLibm_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "05823500-19ac-5b8b-9628-191a04bc5112"
version = "0.8.1+0"
[[deps.OpenSpecFun_jll]]
deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "13652491f6856acfd2db29360e1bbcd4565d04f1"
uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e"
version = "0.5.5+0"
[[deps.OrderedCollections]]
git-tree-sha1 = "2e73fe17cac3c62ad1aebe70d44c963c3cfdc3e3"
uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
version = "1.4.1"
version = "1.6.2"
[[Parameters]]
[[deps.Parameters]]
deps = ["OrderedCollections", "UnPack"]
git-tree-sha1 = "34c0e9ad262e5f7fc75b10a9952ca7692cfc5fbe"
uuid = "d96e819e-fc66-5662-9728-84c9c7592b0a"
version = "0.12.3"
[[Pkg]]
deps = ["Artifacts", "Dates", "Downloads", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"]
[[deps.Pkg]]
deps = ["Artifacts", "Dates", "Downloads", "FileWatching", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"]
uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
version = "1.9.2"
[[Preferences]]
[[deps.PrecompileTools]]
deps = ["Preferences"]
git-tree-sha1 = "03b4c25b43cb84cee5c90aa9b5ea0a78fd848d2f"
uuid = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
version = "1.2.0"
[[deps.Preferences]]
deps = ["TOML"]
git-tree-sha1 = "00cfd92944ca9c760982747e9a1d0d5d86ab1e5a"
git-tree-sha1 = "00805cd429dcb4870060ff49ef443486c262e38e"
uuid = "21216c6a-2e73-6563-6e65-726566657250"
version = "1.2.2"
version = "1.4.1"
[[Printf]]
[[deps.Printf]]
deps = ["Unicode"]
uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7"
[[REPL]]
[[deps.REPL]]
deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"]
uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb"
[[ROCmCompilerSupport_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "ROCmDeviceLibs_jll", "hsa_rocr_jll"]
git-tree-sha1 = "56ddcfb5d8b60c9f8c1bc619886f8d363fd1926d"
uuid = "8fbdd1d2-db62-5cd0-981e-905da1486e17"
version = "4.0.0+1"
[[ROCmDeviceLibs_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Zlib_jll"]
git-tree-sha1 = "d764f0f28b5af89aa004871a6a38e5d061f77257"
uuid = "873c0968-716b-5aa7-bb8d-d1e2e2aeff2d"
version = "4.0.0+0"
[[ROCmOpenCLRuntime_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Libglvnd_jll", "Pkg", "ROCmCompilerSupport_jll", "ROCmDeviceLibs_jll", "Xorg_libX11_jll", "Xorg_xorgproto_jll", "hsa_rocr_jll"]
git-tree-sha1 = "f9e3e2cb40a7990535efa7da9b9dd0e0b458a973"
uuid = "10ae2a08-2eea-53f8-8c20-eec175020e9f"
version = "4.0.0+1"
[[Random]]
deps = ["Serialization"]
[[deps.Random]]
deps = ["SHA", "Serialization"]
uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
[[Requires]]
[[deps.Reexport]]
git-tree-sha1 = "45e428421666073eab6f2da5c9d310d99bb12f9b"
uuid = "189a3867-3050-52da-a836-e630ba90ab69"
version = "1.2.2"
[[deps.Requires]]
deps = ["UUIDs"]
git-tree-sha1 = "8f82019e525f4d5c669692772a6f4b0a58b06a6a"
git-tree-sha1 = "838a3a4188e2ded87a4f9f184b4b0d78a1e91cb7"
uuid = "ae029012-a4dd-5104-9daa-d747884805df"
version = "1.3.0"
[[deps.SHA]]
uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce"
version = "0.7.0"
[[deps.Scratch]]
deps = ["Dates"]
git-tree-sha1 = "30449ee12237627992a99d5e30ae63e4d78cd24a"
uuid = "6c6a2e73-6563-6170-7368-637461726353"
version = "1.2.0"
[[SHA]]
uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce"
[[Serialization]]
[[deps.Serialization]]
uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
[[Setfield]]
deps = ["ConstructionBase", "Future", "MacroTools", "Requires"]
git-tree-sha1 = "fca29e68c5062722b5b4435594c3d1ba557072a3"
uuid = "efcf1570-3423-57d1-acb7-fd33fddbac46"
version = "0.7.1"
[[Sockets]]
[[deps.Sockets]]
uuid = "6462fe0b-24de-5631-8697-dd941f90decc"
[[SparseArrays]]
deps = ["LinearAlgebra", "Random"]
[[deps.SparseArrays]]
deps = ["Libdl", "LinearAlgebra", "Random", "Serialization", "SuiteSparse_jll"]
uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
[[Statistics]]
[[deps.SpecialFunctions]]
deps = ["IrrationalConstants", "LogExpFunctions", "OpenLibm_jll", "OpenSpecFun_jll"]
git-tree-sha1 = "e2cfc4012a19088254b3950b85c3c1d8882d864d"
uuid = "276daf66-3868-5448-9aa4-cd146d93841b"
version = "2.3.1"
[deps.SpecialFunctions.extensions]
SpecialFunctionsChainRulesCoreExt = "ChainRulesCore"
[deps.SpecialFunctions.weakdeps]
ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
[[deps.StaticArrays]]
deps = ["LinearAlgebra", "Random", "StaticArraysCore"]
git-tree-sha1 = "d5fb407ec3179063214bc6277712928ba78459e2"
uuid = "90137ffa-7385-5640-81b9-e52037218182"
version = "1.6.4"
weakdeps = ["Statistics"]
[deps.StaticArrays.extensions]
StaticArraysStatisticsExt = "Statistics"
[[deps.StaticArraysCore]]
git-tree-sha1 = "36b3d696ce6366023a0ea192b4cd442268995a0d"
uuid = "1e83bf80-4336-4d27-bf5d-d5a4f845583c"
version = "1.4.2"
[[deps.Statistics]]
deps = ["LinearAlgebra", "SparseArrays"]
uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
version = "1.9.0"
[[TOML]]
[[deps.SuiteSparse_jll]]
deps = ["Artifacts", "Libdl", "Pkg", "libblastrampoline_jll"]
uuid = "bea87d4a-7f5b-5778-9afe-8cc45184846c"
version = "5.10.1+6"
[[deps.TOML]]
deps = ["Dates"]
uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76"
version = "1.0.3"
[[Tar]]
[[deps.Tar]]
deps = ["ArgTools", "SHA"]
uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e"
version = "1.10.0"
[[TextWrap]]
[[deps.TextWrap]]
git-tree-sha1 = "9250ef9b01b66667380cf3275b3f7488d0e25faf"
uuid = "b718987f-49a8-5099-9789-dcd902bef87d"
version = "1.0.1"
[[TimerOutputs]]
[[deps.TimerOutputs]]
deps = ["ExprTools", "Printf"]
git-tree-sha1 = "7cb456f358e8f9d102a8b25e8dfedf58fa5689bc"
git-tree-sha1 = "f548a9e9c490030e545f72074a41edfd0e5bcdd7"
uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
version = "0.5.13"
version = "0.5.23"
[[UUIDs]]
[[deps.UUIDs]]
deps = ["Random", "SHA"]
uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
[[UnPack]]
[[deps.UnPack]]
git-tree-sha1 = "387c1f73762231e86e0c9c5443ce3b4a0a9a0c2b"
uuid = "3a884ed6-31ef-47d7-9d2a-63182c4928ed"
version = "1.0.2"
[[Unicode]]
[[deps.Unicode]]
uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
[[XML2_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Libiconv_jll", "Pkg", "Zlib_jll"]
git-tree-sha1 = "1acf5bdf07aa0907e0a37d3718bb88d4b687b74a"
uuid = "02c8fc9c-b97f-50b9-bbe4-9be30ff0a78a"
version = "2.9.12+0"
[[deps.UnsafeAtomics]]
git-tree-sha1 = "6331ac3440856ea1988316b46045303bef658278"
uuid = "013be700-e6cd-48c3-b4a1-df204f14c38f"
version = "0.2.1"
[[XSLT_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Libgcrypt_jll", "Libgpg_error_jll", "Libiconv_jll", "Pkg", "XML2_jll", "Zlib_jll"]
git-tree-sha1 = "91844873c4085240b95e795f692c4cec4d805f8a"
uuid = "aed1982a-8fda-507f-9586-7b0439959a61"
version = "1.1.34+0"
[[deps.UnsafeAtomicsLLVM]]
deps = ["LLVM", "UnsafeAtomics"]
git-tree-sha1 = "323e3d0acf5e78a56dfae7bd8928c989b4f3083e"
uuid = "d80eeb9a-aca5-4d75-85e5-170c8b632249"
version = "0.1.3"
[[XZ_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "a921669cd9a45c23031fd4eb904f5cc3d20de415"
uuid = "ffd25f8a-64ca-5728-b0f7-c24cf3aae800"
version = "5.2.5+2"
[[Xorg_libX11_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libxcb_jll", "Xorg_xtrans_jll"]
git-tree-sha1 = "5be649d550f3f4b95308bf0183b82e2582876527"
uuid = "4f6342f7-b3d2-589e-9d20-edeb45f2b2bc"
version = "1.6.9+4"
[[Xorg_libXau_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "4e490d5c960c314f33885790ed410ff3a94ce67e"
uuid = "0c0b7dd1-d40b-584c-a123-a41640f87eec"
version = "1.0.9+4"
[[Xorg_libXdmcp_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "4fe47bd2247248125c428978740e18a681372dd4"
uuid = "a3789734-cfe1-5b06-b2d0-1dd0d9d62d05"
version = "1.1.3+4"
[[Xorg_libXext_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libX11_jll"]
git-tree-sha1 = "b7c0aa8c376b31e4852b360222848637f481f8c3"
uuid = "1082639a-0dae-5f34-9b06-72781eeb8cb3"
version = "1.3.4+4"
[[Xorg_libpthread_stubs_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "6783737e45d3c59a4a4c4091f5f88cdcf0908cbb"
uuid = "14d82f49-176c-5ed1-bb49-ad3f5cbd8c74"
version = "0.1.0+3"
[[Xorg_libxcb_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "XSLT_jll", "Xorg_libXau_jll", "Xorg_libXdmcp_jll", "Xorg_libpthread_stubs_jll"]
git-tree-sha1 = "daf17f441228e7a3833846cd048892861cff16d6"
uuid = "c7cfdc94-dc32-55de-ac96-5a1b8d977c5b"
version = "1.13.0+3"
[[Xorg_xorgproto_jll]]
deps = ["Libdl", "Pkg"]
git-tree-sha1 = "9a9eb8ce756fe0bca01b4be16da770e18d264972"
uuid = "c4d99508-4286-5418-9131-c86396af500b"
version = "2019.2.0+2"
[[Xorg_xtrans_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "79c31e7844f6ecf779705fbc12146eb190b7d845"
uuid = "c5fb5394-a638-5e4d-96e5-b29de1b5cf10"
version = "1.4.0+3"
[[Zlib_jll]]
[[deps.Zlib_jll]]
deps = ["Libdl"]
uuid = "83775a58-1f1d-513f-b197-d71354ab007a"
version = "1.2.13+0"
[[argp_standalone_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "feaf9f6293003c2bf53056fd6930d677ed340b34"
uuid = "c53206cc-00f7-50bf-ad1e-3ae1f6e49bc3"
version = "1.3.1+0"
[[deps.libLLVM_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "8f36deef-c2a5-5394-99ed-8e07531fb29a"
version = "14.0.6+3"
[[fts_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "78732b942383d2cb521df8a1a0814911144e663d"
uuid = "d65627f6-89bd-53e8-8ab5-8b75ff535eee"
version = "1.2.7+1"
[[deps.libblastrampoline_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "8e850b90-86db-534c-a0d3-1478176c7d93"
version = "5.8.0+0"
[[hsa_rocr_jll]]
deps = ["Artifacts", "Elfutils_jll", "JLLWrappers", "Libdl", "NUMA_jll", "Pkg", "Zlib_jll", "hsakmt_roct_jll"]
git-tree-sha1 = "df8d73efec8b1e53ad527d208f5343c0368f0fcd"
uuid = "dd59ff1a-a01a-568d-8b29-0669330f116a"
version = "4.0.0+0"
[[hsakmt_roct_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "NUMA_jll", "Pkg"]
git-tree-sha1 = "ea54f6be23c6d25613a0872ec23dc5a0b77b4a00"
uuid = "1cecccd7-a9b6-5045-9cdc-a44c19b16d76"
version = "4.2.0+0"
[[nghttp2_jll]]
[[deps.nghttp2_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d"
version = "1.48.0+0"
[[obstack_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "1c4a6b66e934fc6db4649cb2910c72f53bbfea7e"
uuid = "c88a4935-d25e-5644-aacc-5db6f1b8ef79"
version = "1.2.2+0"
[[p7zip_jll]]
[[deps.p7zip_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0"
version = "17.4.0+0"

View File

@ -4,4 +4,4 @@ ArgParse = "c7e460c6-2fb9-53a9-8c5b-16f535851c63"
Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a"
[compat]
julia = "1.6"
julia = "1.9"

View File

@ -1,332 +1,555 @@
# This file is machine-generated - editing it directly is not advised
[[AbstractFFTs]]
julia_version = "1.9.3"
manifest_format = "2.0"
project_hash = "6909ef39c97ad6037791040bed70b7aa111e1f64"
[[deps.AbstractFFTs]]
deps = ["LinearAlgebra"]
git-tree-sha1 = "485ee0867925449198280d4af84bdb46a2a404d0"
git-tree-sha1 = "d92ad398961a3ed262d8bf04a1a2b8340f915fef"
uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c"
version = "1.0.1"
version = "1.5.0"
[[Adapt]]
deps = ["LinearAlgebra"]
git-tree-sha1 = "84918055d15b3114ede17ac6a7182f68870c16f7"
[deps.AbstractFFTs.extensions]
AbstractFFTsChainRulesCoreExt = "ChainRulesCore"
AbstractFFTsTestExt = "Test"
[deps.AbstractFFTs.weakdeps]
ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
[[deps.Adapt]]
deps = ["LinearAlgebra", "Requires"]
git-tree-sha1 = "76289dc51920fdc6e0013c872ba9551d54961c24"
uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
version = "3.3.1"
version = "3.6.2"
weakdeps = ["StaticArrays"]
[[ArgParse]]
[deps.Adapt.extensions]
AdaptStaticArraysExt = "StaticArrays"
[[deps.ArgParse]]
deps = ["Logging", "TextWrap"]
git-tree-sha1 = "3102bce13da501c9104df33549f511cd25264d7d"
uuid = "c7e460c6-2fb9-53a9-8c5b-16f535851c63"
version = "1.1.4"
[[ArgTools]]
[[deps.ArgTools]]
uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f"
version = "1.1.1"
[[Artifacts]]
[[deps.Artifacts]]
uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33"
[[BFloat16s]]
deps = ["LinearAlgebra", "Printf", "Random", "Test"]
git-tree-sha1 = "a598ecb0d717092b5539dbbe890c98bac842b072"
uuid = "ab4f0b2a-ad5b-11e8-123f-65d77653426b"
version = "0.2.0"
[[deps.Atomix]]
deps = ["UnsafeAtomics"]
git-tree-sha1 = "c06a868224ecba914baa6942988e2f2aade419be"
uuid = "a9b6321e-bd34-4604-b9c9-b65b8de01458"
version = "0.1.0"
[[Base64]]
[[deps.BFloat16s]]
deps = ["LinearAlgebra", "Printf", "Random", "Test"]
git-tree-sha1 = "dbf84058d0a8cbbadee18d25cf606934b22d7c66"
uuid = "ab4f0b2a-ad5b-11e8-123f-65d77653426b"
version = "0.4.2"
[[deps.Base64]]
uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f"
[[CEnum]]
git-tree-sha1 = "215a9aa4a1f23fbd05b92769fdd62559488d70e9"
[[deps.CEnum]]
git-tree-sha1 = "eb4cb44a499229b3b8426dcfb5dd85333951ff90"
uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82"
version = "0.4.1"
version = "0.4.2"
[[CUDA]]
deps = ["AbstractFFTs", "Adapt", "BFloat16s", "CEnum", "CompilerSupportLibraries_jll", "ExprTools", "GPUArrays", "GPUCompiler", "LLVM", "LazyArtifacts", "Libdl", "LinearAlgebra", "Logging", "Printf", "Random", "Random123", "RandomNumbers", "Reexport", "Requires", "SparseArrays", "SpecialFunctions", "TimerOutputs"]
git-tree-sha1 = "1f8ebf85abb7d1eff965730e592794a27c1350d8"
[[deps.CUDA]]
deps = ["AbstractFFTs", "Adapt", "BFloat16s", "CEnum", "CUDA_Driver_jll", "CUDA_Runtime_Discovery", "CUDA_Runtime_jll", "Crayons", "DataFrames", "ExprTools", "GPUArrays", "GPUCompiler", "KernelAbstractions", "LLVM", "LazyArtifacts", "Libdl", "LinearAlgebra", "Logging", "NVTX", "Preferences", "PrettyTables", "Printf", "Random", "Random123", "RandomNumbers", "Reexport", "Requires", "SparseArrays", "Statistics", "UnsafeAtomicsLLVM"]
git-tree-sha1 = "f062a48c26ae027f70c44f48f244862aec47bf99"
uuid = "052768ef-5323-5732-b1bb-66c8b64840ba"
version = "3.6.0"
version = "5.0.0"
[[ChainRulesCore]]
deps = ["Compat", "LinearAlgebra", "SparseArrays"]
git-tree-sha1 = "4c26b4e9e91ca528ea212927326ece5918a04b47"
uuid = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
version = "1.11.2"
[deps.CUDA.extensions]
SpecialFunctionsExt = "SpecialFunctions"
[[ChangesOfVariables]]
deps = ["ChainRulesCore", "LinearAlgebra", "Test"]
git-tree-sha1 = "bf98fa45a0a4cee295de98d4c1462be26345b9a1"
uuid = "9e997f8a-9a97-42d5-a9f1-ce6bfc15e2c0"
version = "0.1.2"
[deps.CUDA.weakdeps]
SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"
[[Compat]]
deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "SHA", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"]
git-tree-sha1 = "44c37b4636bc54afac5c574d2d02b625349d6582"
[[deps.CUDA_Driver_jll]]
deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "Pkg"]
git-tree-sha1 = "35a37bb72b35964f2895c12c687ae263b4ac170c"
uuid = "4ee394cb-3365-5eb0-8335-949819d2adfc"
version = "0.6.0+3"
[[deps.CUDA_Runtime_Discovery]]
deps = ["Libdl"]
git-tree-sha1 = "bcc4a23cbbd99c8535a5318455dcf0f2546ec536"
uuid = "1af6417a-86b4-443c-805f-a4643ffb695f"
version = "0.2.2"
[[deps.CUDA_Runtime_jll]]
deps = ["Artifacts", "CUDA_Driver_jll", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML"]
git-tree-sha1 = "bfe5a693a11522d58392f742243f2b50dc27afd6"
uuid = "76a88914-d11a-5bdc-97e0-2f5a05c973a2"
version = "0.9.2+0"
[[deps.ColorTypes]]
deps = ["FixedPointNumbers", "Random"]
git-tree-sha1 = "eb7f0f8307f71fac7c606984ea5fb2817275d6e4"
uuid = "3da002f7-5984-5a60-b8a6-cbb66c0b333f"
version = "0.11.4"
[[deps.Colors]]
deps = ["ColorTypes", "FixedPointNumbers", "Reexport"]
git-tree-sha1 = "fc08e5930ee9a4e03f84bfb5211cb54e7769758a"
uuid = "5ae59095-9a9b-59fe-a467-6f913c188581"
version = "0.12.10"
[[deps.Compat]]
deps = ["UUIDs"]
git-tree-sha1 = "e460f044ca8b99be31d35fe54fc33a5c33dd8ed7"
uuid = "34da2185-b29b-5c13-b0c7-acf172513d20"
version = "3.41.0"
version = "4.9.0"
weakdeps = ["Dates", "LinearAlgebra"]
[[CompilerSupportLibraries_jll]]
[deps.Compat.extensions]
CompatLinearAlgebraExt = "LinearAlgebra"
[[deps.CompilerSupportLibraries_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae"
version = "1.0.5+0"
[[Dates]]
[[deps.Crayons]]
git-tree-sha1 = "249fe38abf76d48563e2f4556bebd215aa317e15"
uuid = "a8cc5b0e-0ffa-5ad4-8c14-923d3ee1735f"
version = "4.1.1"
[[deps.DataAPI]]
git-tree-sha1 = "8da84edb865b0b5b0100c0666a9bc9a0b71c553c"
uuid = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a"
version = "1.15.0"
[[deps.DataFrames]]
deps = ["Compat", "DataAPI", "DataStructures", "Future", "InlineStrings", "InvertedIndices", "IteratorInterfaceExtensions", "LinearAlgebra", "Markdown", "Missings", "PooledArrays", "PrecompileTools", "PrettyTables", "Printf", "REPL", "Random", "Reexport", "SentinelArrays", "SortingAlgorithms", "Statistics", "TableTraits", "Tables", "Unicode"]
git-tree-sha1 = "04c738083f29f86e62c8afc341f0967d8717bdb8"
uuid = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
version = "1.6.1"
[[deps.DataStructures]]
deps = ["Compat", "InteractiveUtils", "OrderedCollections"]
git-tree-sha1 = "3dbd312d370723b6bb43ba9d02fc36abade4518d"
uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
version = "0.18.15"
[[deps.DataValueInterfaces]]
git-tree-sha1 = "bfc1187b79289637fa0ef6d4436ebdfe6905cbd6"
uuid = "e2d170a0-9d28-54be-80f0-106bbe20a464"
version = "1.0.0"
[[deps.Dates]]
deps = ["Printf"]
uuid = "ade2ca70-3891-5945-98fb-dc099432e06a"
[[DelimitedFiles]]
deps = ["Mmap"]
uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab"
[[Distributed]]
deps = ["Random", "Serialization", "Sockets"]
uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b"
[[DocStringExtensions]]
deps = ["LibGit2"]
git-tree-sha1 = "b19534d1895d702889b219c382a6e18010797f0b"
uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae"
version = "0.8.6"
[[Downloads]]
deps = ["ArgTools", "LibCURL", "NetworkOptions"]
[[deps.Downloads]]
deps = ["ArgTools", "FileWatching", "LibCURL", "NetworkOptions"]
uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6"
version = "1.6.0"
[[ExprTools]]
git-tree-sha1 = "b7e3d17636b348f005f11040025ae8c6f645fe92"
[[deps.ExprTools]]
git-tree-sha1 = "27415f162e6028e81c72b82ef756bf321213b6ec"
uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04"
version = "0.1.6"
version = "0.1.10"
[[GPUArrays]]
deps = ["Adapt", "LinearAlgebra", "Printf", "Random", "Serialization", "Statistics"]
git-tree-sha1 = "7772508f17f1d482fe0df72cabc5b55bec06bbe0"
[[deps.FileWatching]]
uuid = "7b1f6079-737a-58dc-b8bc-7a2ca5c1b5ee"
[[deps.FixedPointNumbers]]
deps = ["Statistics"]
git-tree-sha1 = "335bfdceacc84c5cdf16aadc768aa5ddfc5383cc"
uuid = "53c48c17-4a7d-5ca2-90c5-79b7896eea93"
version = "0.8.4"
[[deps.Future]]
deps = ["Random"]
uuid = "9fa8497b-333b-5362-9e8d-4d0656e87820"
[[deps.GPUArrays]]
deps = ["Adapt", "GPUArraysCore", "LLVM", "LinearAlgebra", "Printf", "Random", "Reexport", "Serialization", "Statistics"]
git-tree-sha1 = "8ad8f375ae365aa1eb2f42e2565a40b55a4b69a8"
uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
version = "8.1.2"
version = "9.0.0"
[[GPUCompiler]]
deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "TimerOutputs", "UUIDs"]
git-tree-sha1 = "2cac236070c2c4b36de54ae9146b55ee2c34ac7a"
[[deps.GPUArraysCore]]
deps = ["Adapt"]
git-tree-sha1 = "2d6ca471a6c7b536127afccfa7564b5b39227fe0"
uuid = "46192b85-c4d5-4398-a991-12ede77f4527"
version = "0.1.5"
[[deps.GPUCompiler]]
deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "Scratch", "TimerOutputs", "UUIDs"]
git-tree-sha1 = "5e4487558477f191c043166f8301dd0b4be4e2b2"
uuid = "61eb1bfa-7361-4325-ad38-22787b887f55"
version = "0.13.10"
version = "0.24.5"
[[InteractiveUtils]]
[[deps.InlineStrings]]
deps = ["Parsers"]
git-tree-sha1 = "9cc2baf75c6d09f9da536ddf58eb2f29dedaf461"
uuid = "842dd82b-1e85-43dc-bf29-5d0ee9dffc48"
version = "1.4.0"
[[deps.InteractiveUtils]]
deps = ["Markdown"]
uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
[[InverseFunctions]]
deps = ["Test"]
git-tree-sha1 = "a7254c0acd8e62f1ac75ad24d5db43f5f19f3c65"
uuid = "3587e190-3f89-42d0-90ee-14403ec27112"
version = "0.1.2"
[[IrrationalConstants]]
git-tree-sha1 = "7fd44fd4ff43fc60815f8e764c0f352b83c49151"
uuid = "92d709cd-6900-40b7-9082-c6be49f344b6"
version = "0.1.1"
[[JLLWrappers]]
deps = ["Preferences"]
git-tree-sha1 = "642a199af8b68253517b80bd3bfd17eb4e84df6e"
uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210"
[[deps.InvertedIndices]]
git-tree-sha1 = "0dc7b50b8d436461be01300fd8cd45aa0274b038"
uuid = "41ab1584-1d38-5bbf-9106-f11c6c58b48f"
version = "1.3.0"
[[LLVM]]
deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Printf", "Unicode"]
git-tree-sha1 = "7cc22e69995e2329cc047a879395b2b74647ab5f"
uuid = "929cbde3-209d-540e-8aea-75f648917ca0"
version = "4.7.0"
[[deps.IteratorInterfaceExtensions]]
git-tree-sha1 = "a3f24677c21f5bbe9d2a714f95dcd58337fb2856"
uuid = "82899510-4779-5014-852e-03e436cf321d"
version = "1.0.0"
[[LLVMExtra_jll]]
[[deps.JLLWrappers]]
deps = ["Artifacts", "Preferences"]
git-tree-sha1 = "7e5d6779a1e09a36db2a7b6cff50942a0a7d0fca"
uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210"
version = "1.5.0"
[[deps.JuliaNVTXCallbacks_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "c5fc4bef251ecd37685bea1c4068a9cfa41e8b9a"
uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab"
version = "0.0.13+0"
git-tree-sha1 = "af433a10f3942e882d3c671aacb203e006a5808f"
uuid = "9c1d0b0a-7046-5b2e-a33f-ea22f176ac7e"
version = "0.2.1+0"
[[LazyArtifacts]]
[[deps.KernelAbstractions]]
deps = ["Adapt", "Atomix", "InteractiveUtils", "LinearAlgebra", "MacroTools", "PrecompileTools", "Requires", "SparseArrays", "StaticArrays", "UUIDs", "UnsafeAtomics", "UnsafeAtomicsLLVM"]
git-tree-sha1 = "4c5875e4c228247e1c2b087669846941fb6e0118"
uuid = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
version = "0.9.8"
[deps.KernelAbstractions.extensions]
EnzymeExt = "EnzymeCore"
[deps.KernelAbstractions.weakdeps]
EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869"
[[deps.LLVM]]
deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Printf", "Unicode"]
git-tree-sha1 = "a9d2ce1d5007b1e8f6c5b89c5a31ff8bd146db5c"
uuid = "929cbde3-209d-540e-8aea-75f648917ca0"
version = "6.2.1"
[[deps.LLVMExtra_jll]]
deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML"]
git-tree-sha1 = "7ca6850ae880cc99b59b88517545f91a52020afa"
uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab"
version = "0.0.25+0"
[[deps.LaTeXStrings]]
git-tree-sha1 = "f2355693d6778a178ade15952b7ac47a4ff97996"
uuid = "b964fa9f-0449-5b57-a5c2-d3ea65f4040f"
version = "1.3.0"
[[deps.LazyArtifacts]]
deps = ["Artifacts", "Pkg"]
uuid = "4af54fe1-eca0-43a8-85a7-787d91b784e3"
[[LibCURL]]
[[deps.LibCURL]]
deps = ["LibCURL_jll", "MozillaCACerts_jll"]
uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21"
version = "0.6.3"
[[LibCURL_jll]]
[[deps.LibCURL_jll]]
deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"]
uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0"
version = "7.84.0+0"
[[LibGit2]]
[[deps.LibGit2]]
deps = ["Base64", "NetworkOptions", "Printf", "SHA"]
uuid = "76f85450-5226-5b5a-8eaa-529ad045b433"
[[LibSSH2_jll]]
[[deps.LibSSH2_jll]]
deps = ["Artifacts", "Libdl", "MbedTLS_jll"]
uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8"
version = "1.10.2+0"
[[Libdl]]
[[deps.Libdl]]
uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
[[LinearAlgebra]]
deps = ["Libdl"]
[[deps.LinearAlgebra]]
deps = ["Libdl", "OpenBLAS_jll", "libblastrampoline_jll"]
uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
[[LogExpFunctions]]
deps = ["ChainRulesCore", "ChangesOfVariables", "DocStringExtensions", "InverseFunctions", "IrrationalConstants", "LinearAlgebra"]
git-tree-sha1 = "e5718a00af0ab9756305a0392832c8952c7426c1"
uuid = "2ab3a3ac-af41-5b50-aa03-7779005ae688"
version = "0.3.6"
[[Logging]]
[[deps.Logging]]
uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"
[[Markdown]]
[[deps.MacroTools]]
deps = ["Markdown", "Random"]
git-tree-sha1 = "9ee1618cbf5240e6d4e0371d6f24065083f60c48"
uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
version = "0.5.11"
[[deps.Markdown]]
deps = ["Base64"]
uuid = "d6f4376e-aef5-505a-96c1-9c027394607a"
[[MbedTLS_jll]]
[[deps.MbedTLS_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1"
version = "2.28.2+0"
[[Mmap]]
uuid = "a63ad114-7e13-5084-954f-fe012c677804"
[[deps.Missings]]
deps = ["DataAPI"]
git-tree-sha1 = "f66bdc5de519e8f8ae43bdc598782d35a25b1272"
uuid = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28"
version = "1.1.0"
[[MozillaCACerts_jll]]
[[deps.MozillaCACerts_jll]]
uuid = "14a3606d-f60d-562e-9121-12d972cd8159"
version = "2022.10.11"
[[NetworkOptions]]
[[deps.NVTX]]
deps = ["Colors", "JuliaNVTXCallbacks_jll", "Libdl", "NVTX_jll"]
git-tree-sha1 = "8bc9ce4233be3c63f8dcd78ccaf1b63a9c0baa34"
uuid = "5da4648a-3479-48b8-97b9-01cb529c0a1f"
version = "0.3.3"
[[deps.NVTX_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "ce3269ed42816bf18d500c9f63418d4b0d9f5a3b"
uuid = "e98f9f5b-d649-5603-91fd-7774390e6439"
version = "3.1.0+2"
[[deps.NetworkOptions]]
uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908"
version = "1.2.0"
[[OpenLibm_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "05823500-19ac-5b8b-9628-191a04bc5112"
[[deps.OpenBLAS_jll]]
deps = ["Artifacts", "CompilerSupportLibraries_jll", "Libdl"]
uuid = "4536629a-c528-5b80-bd46-f80d51c5b363"
version = "0.3.21+4"
[[OpenSpecFun_jll]]
deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "13652491f6856acfd2db29360e1bbcd4565d04f1"
uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e"
version = "0.5.5+0"
[[OrderedCollections]]
git-tree-sha1 = "85f8e6578bf1f9ee0d11e7bb1b1456435479d47c"
[[deps.OrderedCollections]]
git-tree-sha1 = "2e73fe17cac3c62ad1aebe70d44c963c3cfdc3e3"
uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
version = "1.4.1"
version = "1.6.2"
[[Parameters]]
[[deps.Parameters]]
deps = ["OrderedCollections", "UnPack"]
git-tree-sha1 = "34c0e9ad262e5f7fc75b10a9952ca7692cfc5fbe"
uuid = "d96e819e-fc66-5662-9728-84c9c7592b0a"
version = "0.12.3"
[[Pkg]]
deps = ["Artifacts", "Dates", "Downloads", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"]
[[deps.Parsers]]
deps = ["Dates", "PrecompileTools", "UUIDs"]
git-tree-sha1 = "716e24b21538abc91f6205fd1d8363f39b442851"
uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0"
version = "2.7.2"
[[deps.Pkg]]
deps = ["Artifacts", "Dates", "Downloads", "FileWatching", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"]
uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
version = "1.9.2"
[[Preferences]]
[[deps.PooledArrays]]
deps = ["DataAPI", "Future"]
git-tree-sha1 = "36d8b4b899628fb92c2749eb488d884a926614d3"
uuid = "2dfb63ee-cc39-5dd5-95bd-886bf059d720"
version = "1.4.3"
[[deps.PrecompileTools]]
deps = ["Preferences"]
git-tree-sha1 = "03b4c25b43cb84cee5c90aa9b5ea0a78fd848d2f"
uuid = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
version = "1.2.0"
[[deps.Preferences]]
deps = ["TOML"]
git-tree-sha1 = "00cfd92944ca9c760982747e9a1d0d5d86ab1e5a"
git-tree-sha1 = "00805cd429dcb4870060ff49ef443486c262e38e"
uuid = "21216c6a-2e73-6563-6e65-726566657250"
version = "1.2.2"
version = "1.4.1"
[[Printf]]
[[deps.PrettyTables]]
deps = ["Crayons", "LaTeXStrings", "Markdown", "Printf", "Reexport", "StringManipulation", "Tables"]
git-tree-sha1 = "ee094908d720185ddbdc58dbe0c1cbe35453ec7a"
uuid = "08abe8d2-0d0c-5749-adfa-8a2ac140af0d"
version = "2.2.7"
[[deps.Printf]]
deps = ["Unicode"]
uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7"
[[REPL]]
[[deps.REPL]]
deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"]
uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb"
[[Random]]
deps = ["Serialization"]
[[deps.Random]]
deps = ["SHA", "Serialization"]
uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
[[Random123]]
deps = ["Libdl", "Random", "RandomNumbers"]
git-tree-sha1 = "0e8b146557ad1c6deb1367655e052276690e71a3"
[[deps.Random123]]
deps = ["Random", "RandomNumbers"]
git-tree-sha1 = "552f30e847641591ba3f39fd1bed559b9deb0ef3"
uuid = "74087812-796a-5b5d-8853-05524746bad3"
version = "1.4.2"
version = "1.6.1"
[[RandomNumbers]]
[[deps.RandomNumbers]]
deps = ["Random", "Requires"]
git-tree-sha1 = "043da614cc7e95c703498a491e2c21f58a2b8111"
uuid = "e6cf234a-135c-5ec9-84dd-332b85af5143"
version = "1.5.3"
[[Reexport]]
[[deps.Reexport]]
git-tree-sha1 = "45e428421666073eab6f2da5c9d310d99bb12f9b"
uuid = "189a3867-3050-52da-a836-e630ba90ab69"
version = "1.2.2"
[[Requires]]
[[deps.Requires]]
deps = ["UUIDs"]
git-tree-sha1 = "8f82019e525f4d5c669692772a6f4b0a58b06a6a"
git-tree-sha1 = "838a3a4188e2ded87a4f9f184b4b0d78a1e91cb7"
uuid = "ae029012-a4dd-5104-9daa-d747884805df"
version = "1.3.0"
[[deps.SHA]]
uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce"
version = "0.7.0"
[[deps.Scratch]]
deps = ["Dates"]
git-tree-sha1 = "30449ee12237627992a99d5e30ae63e4d78cd24a"
uuid = "6c6a2e73-6563-6170-7368-637461726353"
version = "1.2.0"
[[SHA]]
uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce"
[[deps.SentinelArrays]]
deps = ["Dates", "Random"]
git-tree-sha1 = "04bdff0b09c65ff3e06a05e3eb7b120223da3d39"
uuid = "91c51154-3ec4-41a3-a24f-3f23e20d615c"
version = "1.4.0"
[[Serialization]]
[[deps.Serialization]]
uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
[[SharedArrays]]
deps = ["Distributed", "Mmap", "Random", "Serialization"]
uuid = "1a1011a3-84de-559e-8e89-a11a2f7dc383"
[[Sockets]]
[[deps.Sockets]]
uuid = "6462fe0b-24de-5631-8697-dd941f90decc"
[[SparseArrays]]
deps = ["LinearAlgebra", "Random"]
[[deps.SortingAlgorithms]]
deps = ["DataStructures"]
git-tree-sha1 = "c60ec5c62180f27efea3ba2908480f8055e17cee"
uuid = "a2af1166-a08f-5f64-846c-94a0d3cef48c"
version = "1.1.1"
[[deps.SparseArrays]]
deps = ["Libdl", "LinearAlgebra", "Random", "Serialization", "SuiteSparse_jll"]
uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
[[SpecialFunctions]]
deps = ["ChainRulesCore", "IrrationalConstants", "LogExpFunctions", "OpenLibm_jll", "OpenSpecFun_jll"]
git-tree-sha1 = "e08890d19787ec25029113e88c34ec20cac1c91e"
uuid = "276daf66-3868-5448-9aa4-cd146d93841b"
version = "2.0.0"
[[deps.StaticArrays]]
deps = ["LinearAlgebra", "Random", "StaticArraysCore"]
git-tree-sha1 = "d5fb407ec3179063214bc6277712928ba78459e2"
uuid = "90137ffa-7385-5640-81b9-e52037218182"
version = "1.6.4"
weakdeps = ["Statistics"]
[[Statistics]]
[deps.StaticArrays.extensions]
StaticArraysStatisticsExt = "Statistics"
[[deps.StaticArraysCore]]
git-tree-sha1 = "36b3d696ce6366023a0ea192b4cd442268995a0d"
uuid = "1e83bf80-4336-4d27-bf5d-d5a4f845583c"
version = "1.4.2"
[[deps.Statistics]]
deps = ["LinearAlgebra", "SparseArrays"]
uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
version = "1.9.0"
[[TOML]]
[[deps.StringManipulation]]
deps = ["PrecompileTools"]
git-tree-sha1 = "a04cabe79c5f01f4d723cc6704070ada0b9d46d5"
uuid = "892a3eda-7b42-436c-8928-eab12a02cf0e"
version = "0.3.4"
[[deps.SuiteSparse_jll]]
deps = ["Artifacts", "Libdl", "Pkg", "libblastrampoline_jll"]
uuid = "bea87d4a-7f5b-5778-9afe-8cc45184846c"
version = "5.10.1+6"
[[deps.TOML]]
deps = ["Dates"]
uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76"
version = "1.0.3"
[[Tar]]
[[deps.TableTraits]]
deps = ["IteratorInterfaceExtensions"]
git-tree-sha1 = "c06b2f539df1c6efa794486abfb6ed2022561a39"
uuid = "3783bdb8-4a98-5b6b-af9a-565f29a5fe9c"
version = "1.0.1"
[[deps.Tables]]
deps = ["DataAPI", "DataValueInterfaces", "IteratorInterfaceExtensions", "LinearAlgebra", "OrderedCollections", "TableTraits"]
git-tree-sha1 = "a1f34829d5ac0ef499f6d84428bd6b4c71f02ead"
uuid = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
version = "1.11.0"
[[deps.Tar]]
deps = ["ArgTools", "SHA"]
uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e"
version = "1.10.0"
[[Test]]
[[deps.Test]]
deps = ["InteractiveUtils", "Logging", "Random", "Serialization"]
uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
[[TextWrap]]
[[deps.TextWrap]]
git-tree-sha1 = "9250ef9b01b66667380cf3275b3f7488d0e25faf"
uuid = "b718987f-49a8-5099-9789-dcd902bef87d"
version = "1.0.1"
[[TimerOutputs]]
[[deps.TimerOutputs]]
deps = ["ExprTools", "Printf"]
git-tree-sha1 = "7cb456f358e8f9d102a8b25e8dfedf58fa5689bc"
git-tree-sha1 = "f548a9e9c490030e545f72074a41edfd0e5bcdd7"
uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
version = "0.5.13"
version = "0.5.23"
[[UUIDs]]
[[deps.UUIDs]]
deps = ["Random", "SHA"]
uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
[[UnPack]]
[[deps.UnPack]]
git-tree-sha1 = "387c1f73762231e86e0c9c5443ce3b4a0a9a0c2b"
uuid = "3a884ed6-31ef-47d7-9d2a-63182c4928ed"
version = "1.0.2"
[[Unicode]]
[[deps.Unicode]]
uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
[[Zlib_jll]]
[[deps.UnsafeAtomics]]
git-tree-sha1 = "6331ac3440856ea1988316b46045303bef658278"
uuid = "013be700-e6cd-48c3-b4a1-df204f14c38f"
version = "0.2.1"
[[deps.UnsafeAtomicsLLVM]]
deps = ["LLVM", "UnsafeAtomics"]
git-tree-sha1 = "323e3d0acf5e78a56dfae7bd8928c989b4f3083e"
uuid = "d80eeb9a-aca5-4d75-85e5-170c8b632249"
version = "0.1.3"
[[deps.Zlib_jll]]
deps = ["Libdl"]
uuid = "83775a58-1f1d-513f-b197-d71354ab007a"
version = "1.2.13+0"
[[nghttp2_jll]]
[[deps.libblastrampoline_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "8e850b90-86db-534c-a0d3-1478176c7d93"
version = "5.8.0+0"
[[deps.nghttp2_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d"
version = "1.48.0+0"
[[p7zip_jll]]
[[deps.p7zip_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0"
version = "17.4.0+0"

View File

@ -4,4 +4,4 @@ CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a"
[compat]
julia = "1.6"
julia = "1.9"

File diff suppressed because it is too large Load Diff

View File

@ -8,4 +8,4 @@ Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a"
ROCKernels = "7eb9e9f0-4bd3-4c4c-8bef-26bd9629d9b9"
[compat]
julia = "1.6"
julia = "1.9"

File diff suppressed because it is too large Load Diff

View File

@ -16,4 +16,4 @@ ROCKernels = "7eb9e9f0-4bd3-4c4c-8bef-26bd9629d9b9"
oneAPI = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b"
[compat]
julia = "1.6"
julia = "1.9"

View File

@ -1,31 +1,35 @@
# This file is machine-generated - editing it directly is not advised
[[ArgParse]]
julia_version = "1.9.3"
manifest_format = "2.0"
project_hash = "fbff310f722a52622a273a48a8a6b3b64f06b029"
[[deps.ArgParse]]
deps = ["Logging", "TextWrap"]
git-tree-sha1 = "3102bce13da501c9104df33549f511cd25264d7d"
uuid = "c7e460c6-2fb9-53a9-8c5b-16f535851c63"
version = "1.1.4"
[[Logging]]
[[deps.Logging]]
uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"
[[OrderedCollections]]
git-tree-sha1 = "85f8e6578bf1f9ee0d11e7bb1b1456435479d47c"
[[deps.OrderedCollections]]
git-tree-sha1 = "2e73fe17cac3c62ad1aebe70d44c963c3cfdc3e3"
uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
version = "1.4.1"
version = "1.6.2"
[[Parameters]]
[[deps.Parameters]]
deps = ["OrderedCollections", "UnPack"]
git-tree-sha1 = "34c0e9ad262e5f7fc75b10a9952ca7692cfc5fbe"
uuid = "d96e819e-fc66-5662-9728-84c9c7592b0a"
version = "0.12.3"
[[TextWrap]]
[[deps.TextWrap]]
git-tree-sha1 = "9250ef9b01b66667380cf3275b3f7488d0e25faf"
uuid = "b718987f-49a8-5099-9789-dcd902bef87d"
version = "1.0.1"
[[UnPack]]
[[deps.UnPack]]
git-tree-sha1 = "387c1f73762231e86e0c9c5443ce3b4a0a9a0c2b"
uuid = "3a884ed6-31ef-47d7-9d2a-63182c4928ed"
version = "1.0.2"

View File

@ -3,4 +3,4 @@ ArgParse = "c7e460c6-2fb9-53a9-8c5b-16f535851c63"
Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a"
[compat]
julia = "1.6"
julia = "1.9"

View File

@ -1,335 +1,441 @@
# This file is machine-generated - editing it directly is not advised
[[Adapt]]
deps = ["LinearAlgebra"]
git-tree-sha1 = "84918055d15b3114ede17ac6a7182f68870c16f7"
uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
version = "3.3.1"
julia_version = "1.9.3"
manifest_format = "2.0"
project_hash = "01f328e925b86927b3f24c30aee6ecdce5bd28cc"
[[ArgParse]]
[[deps.Adapt]]
deps = ["LinearAlgebra", "Requires"]
git-tree-sha1 = "76289dc51920fdc6e0013c872ba9551d54961c24"
uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
version = "3.6.2"
weakdeps = ["StaticArrays"]
[deps.Adapt.extensions]
AdaptStaticArraysExt = "StaticArrays"
[[deps.ArgParse]]
deps = ["Logging", "TextWrap"]
git-tree-sha1 = "3102bce13da501c9104df33549f511cd25264d7d"
uuid = "c7e460c6-2fb9-53a9-8c5b-16f535851c63"
version = "1.1.4"
[[ArgTools]]
[[deps.ArgTools]]
uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f"
version = "1.1.1"
[[Artifacts]]
[[deps.Artifacts]]
uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33"
[[Base64]]
[[deps.Atomix]]
deps = ["UnsafeAtomics"]
git-tree-sha1 = "c06a868224ecba914baa6942988e2f2aade419be"
uuid = "a9b6321e-bd34-4604-b9c9-b65b8de01458"
version = "0.1.0"
[[deps.Base64]]
uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f"
[[CEnum]]
git-tree-sha1 = "215a9aa4a1f23fbd05b92769fdd62559488d70e9"
[[deps.CEnum]]
git-tree-sha1 = "eb4cb44a499229b3b8426dcfb5dd85333951ff90"
uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82"
version = "0.4.1"
version = "0.4.2"
[[ChainRulesCore]]
deps = ["Compat", "LinearAlgebra", "SparseArrays"]
git-tree-sha1 = "4c26b4e9e91ca528ea212927326ece5918a04b47"
uuid = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
version = "1.11.2"
[[ChangesOfVariables]]
deps = ["ChainRulesCore", "LinearAlgebra", "Test"]
git-tree-sha1 = "bf98fa45a0a4cee295de98d4c1462be26345b9a1"
uuid = "9e997f8a-9a97-42d5-a9f1-ce6bfc15e2c0"
version = "0.1.2"
[[Compat]]
deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "SHA", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"]
git-tree-sha1 = "44c37b4636bc54afac5c574d2d02b625349d6582"
uuid = "34da2185-b29b-5c13-b0c7-acf172513d20"
version = "3.41.0"
[[CompilerSupportLibraries_jll]]
[[deps.CompilerSupportLibraries_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae"
version = "1.0.5+0"
[[Dates]]
[[deps.Dates]]
deps = ["Printf"]
uuid = "ade2ca70-3891-5945-98fb-dc099432e06a"
[[DelimitedFiles]]
deps = ["Mmap"]
uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab"
[[Distributed]]
deps = ["Random", "Serialization", "Sockets"]
uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b"
[[DocStringExtensions]]
[[deps.DocStringExtensions]]
deps = ["LibGit2"]
git-tree-sha1 = "b19534d1895d702889b219c382a6e18010797f0b"
git-tree-sha1 = "2fb1e02f2b635d0845df5d7c167fec4dd739b00d"
uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae"
version = "0.8.6"
version = "0.9.3"
[[Downloads]]
deps = ["ArgTools", "LibCURL", "NetworkOptions"]
[[deps.Downloads]]
deps = ["ArgTools", "FileWatching", "LibCURL", "NetworkOptions"]
uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6"
version = "1.6.0"
[[ExprTools]]
git-tree-sha1 = "b7e3d17636b348f005f11040025ae8c6f645fe92"
[[deps.ExprTools]]
git-tree-sha1 = "27415f162e6028e81c72b82ef756bf321213b6ec"
uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04"
version = "0.1.6"
version = "0.1.10"
[[GPUArrays]]
deps = ["Adapt", "LinearAlgebra", "Printf", "Random", "Serialization", "Statistics"]
git-tree-sha1 = "7772508f17f1d482fe0df72cabc5b55bec06bbe0"
[[deps.FileWatching]]
uuid = "7b1f6079-737a-58dc-b8bc-7a2ca5c1b5ee"
[[deps.GPUArrays]]
deps = ["Adapt", "GPUArraysCore", "LLVM", "LinearAlgebra", "Printf", "Random", "Reexport", "Serialization", "Statistics"]
git-tree-sha1 = "2e57b4a4f9cc15e85a24d603256fe08e527f48d1"
uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
version = "8.1.2"
version = "8.8.1"
[[GPUCompiler]]
deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "TimerOutputs", "UUIDs"]
git-tree-sha1 = "2cac236070c2c4b36de54ae9146b55ee2c34ac7a"
[[deps.GPUArraysCore]]
deps = ["Adapt"]
git-tree-sha1 = "2d6ca471a6c7b536127afccfa7564b5b39227fe0"
uuid = "46192b85-c4d5-4398-a991-12ede77f4527"
version = "0.1.5"
[[deps.GPUCompiler]]
deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "Scratch", "TimerOutputs", "UUIDs"]
git-tree-sha1 = "72b2e3c2ba583d1a7aa35129e56cf92e07c083e3"
uuid = "61eb1bfa-7361-4325-ad38-22787b887f55"
version = "0.13.10"
version = "0.21.4"
[[InteractiveUtils]]
[[deps.InteractiveUtils]]
deps = ["Markdown"]
uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
[[InverseFunctions]]
deps = ["Test"]
git-tree-sha1 = "a7254c0acd8e62f1ac75ad24d5db43f5f19f3c65"
uuid = "3587e190-3f89-42d0-90ee-14403ec27112"
version = "0.1.2"
[[IrrationalConstants]]
git-tree-sha1 = "7fd44fd4ff43fc60815f8e764c0f352b83c49151"
[[deps.IrrationalConstants]]
git-tree-sha1 = "630b497eafcc20001bba38a4651b327dcfc491d2"
uuid = "92d709cd-6900-40b7-9082-c6be49f344b6"
version = "0.1.1"
version = "0.2.2"
[[JLLWrappers]]
deps = ["Preferences"]
git-tree-sha1 = "642a199af8b68253517b80bd3bfd17eb4e84df6e"
[[deps.JLLWrappers]]
deps = ["Artifacts", "Preferences"]
git-tree-sha1 = "7e5d6779a1e09a36db2a7b6cff50942a0a7d0fca"
uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210"
version = "1.3.0"
version = "1.5.0"
[[LLVM]]
[[deps.KernelAbstractions]]
deps = ["Adapt", "Atomix", "InteractiveUtils", "LinearAlgebra", "MacroTools", "PrecompileTools", "Requires", "SparseArrays", "StaticArrays", "UUIDs", "UnsafeAtomics", "UnsafeAtomicsLLVM"]
git-tree-sha1 = "4c5875e4c228247e1c2b087669846941fb6e0118"
uuid = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
version = "0.9.8"
[deps.KernelAbstractions.extensions]
EnzymeExt = "EnzymeCore"
[deps.KernelAbstractions.weakdeps]
EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869"
[[deps.LLVM]]
deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Printf", "Unicode"]
git-tree-sha1 = "7cc22e69995e2329cc047a879395b2b74647ab5f"
git-tree-sha1 = "a9d2ce1d5007b1e8f6c5b89c5a31ff8bd146db5c"
uuid = "929cbde3-209d-540e-8aea-75f648917ca0"
version = "4.7.0"
version = "6.2.1"
[[LLVMExtra_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "c5fc4bef251ecd37685bea1c4068a9cfa41e8b9a"
[[deps.LLVMExtra_jll]]
deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML"]
git-tree-sha1 = "7ca6850ae880cc99b59b88517545f91a52020afa"
uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab"
version = "0.0.13+0"
version = "0.0.25+0"
[[LibCURL]]
[[deps.LazyArtifacts]]
deps = ["Artifacts", "Pkg"]
uuid = "4af54fe1-eca0-43a8-85a7-787d91b784e3"
[[deps.LibCURL]]
deps = ["LibCURL_jll", "MozillaCACerts_jll"]
uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21"
version = "0.6.3"
[[LibCURL_jll]]
[[deps.LibCURL_jll]]
deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"]
uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0"
version = "7.84.0+0"
[[LibGit2]]
[[deps.LibGit2]]
deps = ["Base64", "NetworkOptions", "Printf", "SHA"]
uuid = "76f85450-5226-5b5a-8eaa-529ad045b433"
[[LibSSH2_jll]]
[[deps.LibSSH2_jll]]
deps = ["Artifacts", "Libdl", "MbedTLS_jll"]
uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8"
version = "1.10.2+0"
[[Libdl]]
[[deps.Libdl]]
uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
[[LinearAlgebra]]
deps = ["Libdl"]
[[deps.LinearAlgebra]]
deps = ["Libdl", "OpenBLAS_jll", "libblastrampoline_jll"]
uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
[[LogExpFunctions]]
deps = ["ChainRulesCore", "ChangesOfVariables", "DocStringExtensions", "InverseFunctions", "IrrationalConstants", "LinearAlgebra"]
git-tree-sha1 = "e5718a00af0ab9756305a0392832c8952c7426c1"
[[deps.LogExpFunctions]]
deps = ["DocStringExtensions", "IrrationalConstants", "LinearAlgebra"]
git-tree-sha1 = "7d6dd4e9212aebaeed356de34ccf262a3cd415aa"
uuid = "2ab3a3ac-af41-5b50-aa03-7779005ae688"
version = "0.3.6"
version = "0.3.26"
[[Logging]]
[deps.LogExpFunctions.extensions]
LogExpFunctionsChainRulesCoreExt = "ChainRulesCore"
LogExpFunctionsChangesOfVariablesExt = "ChangesOfVariables"
LogExpFunctionsInverseFunctionsExt = "InverseFunctions"
[deps.LogExpFunctions.weakdeps]
ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
ChangesOfVariables = "9e997f8a-9a97-42d5-a9f1-ce6bfc15e2c0"
InverseFunctions = "3587e190-3f89-42d0-90ee-14403ec27112"
[[deps.Logging]]
uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"
[[Markdown]]
[[deps.MacroTools]]
deps = ["Markdown", "Random"]
git-tree-sha1 = "9ee1618cbf5240e6d4e0371d6f24065083f60c48"
uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
version = "0.5.11"
[[deps.Markdown]]
deps = ["Base64"]
uuid = "d6f4376e-aef5-505a-96c1-9c027394607a"
[[MbedTLS_jll]]
[[deps.MbedTLS_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1"
version = "2.28.2+0"
[[Mmap]]
uuid = "a63ad114-7e13-5084-954f-fe012c677804"
[[MozillaCACerts_jll]]
[[deps.MozillaCACerts_jll]]
uuid = "14a3606d-f60d-562e-9121-12d972cd8159"
version = "2022.10.11"
[[NEO_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "gmmlib_jll", "libigc_jll", "oneAPI_Level_Zero_Headers_jll"]
git-tree-sha1 = "15deea2649d70f1bbaedf0aa87c9fa20fb21f22c"
[[deps.NEO_jll]]
deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML", "gmmlib_jll", "libigc_jll", "oneAPI_Level_Zero_Headers_jll"]
git-tree-sha1 = "9846d87fd254cdaa1879dff93999e1bc32ed2658"
uuid = "700fe977-ac61-5f37-bbc8-c6c4b2b6a9fd"
version = "21.44.21506+0"
version = "23.17.26241+0"
[[NetworkOptions]]
[[deps.NetworkOptions]]
uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908"
version = "1.2.0"
[[OpenLibm_jll]]
[[deps.OpenBLAS_jll]]
deps = ["Artifacts", "CompilerSupportLibraries_jll", "Libdl"]
uuid = "4536629a-c528-5b80-bd46-f80d51c5b363"
version = "0.3.21+4"
[[deps.OpenLibm_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "05823500-19ac-5b8b-9628-191a04bc5112"
version = "0.8.1+0"
[[OpenSpecFun_jll]]
[[deps.OpenSpecFun_jll]]
deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "13652491f6856acfd2db29360e1bbcd4565d04f1"
uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e"
version = "0.5.5+0"
[[OrderedCollections]]
git-tree-sha1 = "85f8e6578bf1f9ee0d11e7bb1b1456435479d47c"
[[deps.OrderedCollections]]
git-tree-sha1 = "2e73fe17cac3c62ad1aebe70d44c963c3cfdc3e3"
uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
version = "1.4.1"
version = "1.6.2"
[[Parameters]]
[[deps.Parameters]]
deps = ["OrderedCollections", "UnPack"]
git-tree-sha1 = "34c0e9ad262e5f7fc75b10a9952ca7692cfc5fbe"
uuid = "d96e819e-fc66-5662-9728-84c9c7592b0a"
version = "0.12.3"
[[Pkg]]
deps = ["Artifacts", "Dates", "Downloads", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"]
[[deps.Pkg]]
deps = ["Artifacts", "Dates", "Downloads", "FileWatching", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"]
uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
version = "1.9.2"
[[Preferences]]
[[deps.PrecompileTools]]
deps = ["Preferences"]
git-tree-sha1 = "03b4c25b43cb84cee5c90aa9b5ea0a78fd848d2f"
uuid = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
version = "1.2.0"
[[deps.Preferences]]
deps = ["TOML"]
git-tree-sha1 = "00cfd92944ca9c760982747e9a1d0d5d86ab1e5a"
git-tree-sha1 = "00805cd429dcb4870060ff49ef443486c262e38e"
uuid = "21216c6a-2e73-6563-6e65-726566657250"
version = "1.2.2"
version = "1.4.1"
[[Printf]]
[[deps.Printf]]
deps = ["Unicode"]
uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7"
[[REPL]]
[[deps.REPL]]
deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"]
uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb"
[[Random]]
deps = ["Serialization"]
[[deps.Random]]
deps = ["SHA", "Serialization"]
uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
[[SHA]]
[[deps.Reexport]]
git-tree-sha1 = "45e428421666073eab6f2da5c9d310d99bb12f9b"
uuid = "189a3867-3050-52da-a836-e630ba90ab69"
version = "1.2.2"
[[deps.Requires]]
deps = ["UUIDs"]
git-tree-sha1 = "838a3a4188e2ded87a4f9f184b4b0d78a1e91cb7"
uuid = "ae029012-a4dd-5104-9daa-d747884805df"
version = "1.3.0"
[[deps.SHA]]
uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce"
version = "0.7.0"
[[SPIRV_LLVM_Translator_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "8cca87d57f6ddf19373cc9791fddc741406c8fbf"
uuid = "4a5d46fc-d8cf-5151-a261-86b458210efb"
version = "11.0.0+2"
[[deps.SPIRV_LLVM_Translator_unified_jll]]
deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML"]
git-tree-sha1 = "fe95f28a96975bd1d473e9273873b36402b79a54"
uuid = "85f0d8ed-5b39-5caa-b1ae-7472de402361"
version = "0.3.0+0"
[[SPIRV_Tools_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "c0324b7e07bc4649f755bfe7e00f7c6ed6aa353f"
[[deps.SPIRV_Tools_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl"]
git-tree-sha1 = "c5ab754aa7d71ea015783a9884a25e196860707c"
uuid = "6ac6d60f-d740-5983-97d7-a4482c0689f4"
version = "2021.2.0+0"
version = "2023.2.0+0"
[[Serialization]]
[[deps.Scratch]]
deps = ["Dates"]
git-tree-sha1 = "30449ee12237627992a99d5e30ae63e4d78cd24a"
uuid = "6c6a2e73-6563-6170-7368-637461726353"
version = "1.2.0"
[[deps.Serialization]]
uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
[[SharedArrays]]
deps = ["Distributed", "Mmap", "Random", "Serialization"]
uuid = "1a1011a3-84de-559e-8e89-a11a2f7dc383"
[[Sockets]]
[[deps.Sockets]]
uuid = "6462fe0b-24de-5631-8697-dd941f90decc"
[[SparseArrays]]
deps = ["LinearAlgebra", "Random"]
[[deps.SparseArrays]]
deps = ["Libdl", "LinearAlgebra", "Random", "Serialization", "SuiteSparse_jll"]
uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
[[SpecialFunctions]]
deps = ["ChainRulesCore", "IrrationalConstants", "LogExpFunctions", "OpenLibm_jll", "OpenSpecFun_jll"]
git-tree-sha1 = "f0bccf98e16759818ffc5d97ac3ebf87eb950150"
[[deps.SpecialFunctions]]
deps = ["IrrationalConstants", "LogExpFunctions", "OpenLibm_jll", "OpenSpecFun_jll"]
git-tree-sha1 = "e2cfc4012a19088254b3950b85c3c1d8882d864d"
uuid = "276daf66-3868-5448-9aa4-cd146d93841b"
version = "1.8.1"
version = "2.3.1"
[[Statistics]]
[deps.SpecialFunctions.extensions]
SpecialFunctionsChainRulesCoreExt = "ChainRulesCore"
[deps.SpecialFunctions.weakdeps]
ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
[[deps.StaticArrays]]
deps = ["LinearAlgebra", "Random", "StaticArraysCore"]
git-tree-sha1 = "d5fb407ec3179063214bc6277712928ba78459e2"
uuid = "90137ffa-7385-5640-81b9-e52037218182"
version = "1.6.4"
weakdeps = ["Statistics"]
[deps.StaticArrays.extensions]
StaticArraysStatisticsExt = "Statistics"
[[deps.StaticArraysCore]]
git-tree-sha1 = "36b3d696ce6366023a0ea192b4cd442268995a0d"
uuid = "1e83bf80-4336-4d27-bf5d-d5a4f845583c"
version = "1.4.2"
[[deps.Statistics]]
deps = ["LinearAlgebra", "SparseArrays"]
uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
version = "1.9.0"
[[TOML]]
[[deps.SuiteSparse_jll]]
deps = ["Artifacts", "Libdl", "Pkg", "libblastrampoline_jll"]
uuid = "bea87d4a-7f5b-5778-9afe-8cc45184846c"
version = "5.10.1+6"
[[deps.TOML]]
deps = ["Dates"]
uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76"
version = "1.0.3"
[[Tar]]
[[deps.Tar]]
deps = ["ArgTools", "SHA"]
uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e"
version = "1.10.0"
[[Test]]
deps = ["InteractiveUtils", "Logging", "Random", "Serialization"]
uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
[[TextWrap]]
[[deps.TextWrap]]
git-tree-sha1 = "9250ef9b01b66667380cf3275b3f7488d0e25faf"
uuid = "b718987f-49a8-5099-9789-dcd902bef87d"
version = "1.0.1"
[[TimerOutputs]]
[[deps.TimerOutputs]]
deps = ["ExprTools", "Printf"]
git-tree-sha1 = "7cb456f358e8f9d102a8b25e8dfedf58fa5689bc"
git-tree-sha1 = "f548a9e9c490030e545f72074a41edfd0e5bcdd7"
uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
version = "0.5.13"
version = "0.5.23"
[[UUIDs]]
[[deps.UUIDs]]
deps = ["Random", "SHA"]
uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
[[UnPack]]
[[deps.UnPack]]
git-tree-sha1 = "387c1f73762231e86e0c9c5443ce3b4a0a9a0c2b"
uuid = "3a884ed6-31ef-47d7-9d2a-63182c4928ed"
version = "1.0.2"
[[Unicode]]
[[deps.Unicode]]
uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
[[Zlib_jll]]
deps = ["Libdl"]
uuid = "83775a58-1f1d-513f-b197-d71354ab007a"
[[gmmlib_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "0d5e5461d21b14853b4c332045c57d2601c403bd"
uuid = "09858cae-167c-5acb-9302-fddc6874d481"
version = "21.2.1+0"
[[libigc_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "4f7a6c63ee113ee6da9a6afd06c77eb44998b1f3"
uuid = "94295238-5935-5bd7-bb0f-b00942e9bdd5"
version = "1.0.8744+0"
[[nghttp2_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d"
[[oneAPI]]
deps = ["Adapt", "CEnum", "ExprTools", "GPUArrays", "GPUCompiler", "LLVM", "LinearAlgebra", "NEO_jll", "Printf", "Random", "SPIRV_LLVM_Translator_jll", "SPIRV_Tools_jll", "SpecialFunctions", "oneAPI_Level_Zero_Headers_jll", "oneAPI_Level_Zero_Loader_jll"]
git-tree-sha1 = "efabcff2a259b0f1b10505db99aa18fc2de181ce"
uuid = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b"
[[deps.UnsafeAtomics]]
git-tree-sha1 = "6331ac3440856ea1988316b46045303bef658278"
uuid = "013be700-e6cd-48c3-b4a1-df204f14c38f"
version = "0.2.1"
[[oneAPI_Level_Zero_Headers_jll]]
[[deps.UnsafeAtomicsLLVM]]
deps = ["LLVM", "UnsafeAtomics"]
git-tree-sha1 = "323e3d0acf5e78a56dfae7bd8928c989b4f3083e"
uuid = "d80eeb9a-aca5-4d75-85e5-170c8b632249"
version = "0.1.3"
[[deps.Zlib_jll]]
deps = ["Libdl"]
uuid = "83775a58-1f1d-513f-b197-d71354ab007a"
version = "1.2.13+0"
[[deps.gmmlib_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "e106a6eed53928cd1864f544562ea991b5f11464"
git-tree-sha1 = "228b09be83d88cc5d2236ef7b516d988d2639dfc"
uuid = "09858cae-167c-5acb-9302-fddc6874d481"
version = "22.3.0+0"
[[deps.libblastrampoline_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "8e850b90-86db-534c-a0d3-1478176c7d93"
version = "5.8.0+0"
[[deps.libigc_jll]]
deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML"]
git-tree-sha1 = "7c0b5fa2ff90d96af106fd4a67ff6923cd3f9cb9"
uuid = "94295238-5935-5bd7-bb0f-b00942e9bdd5"
version = "1.0.13822+0"
[[deps.nghttp2_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d"
version = "1.48.0+0"
[[deps.oneAPI]]
deps = ["Adapt", "CEnum", "ExprTools", "GPUArrays", "GPUCompiler", "KernelAbstractions", "LLVM", "LinearAlgebra", "NEO_jll", "Preferences", "Printf", "Random", "SPIRV_LLVM_Translator_unified_jll", "SPIRV_Tools_jll", "SpecialFunctions", "UnsafeAtomicsLLVM", "oneAPI_Level_Zero_Headers_jll", "oneAPI_Level_Zero_Loader_jll", "oneAPI_Support_jll"]
git-tree-sha1 = "9e6a675faf3ea27d08018c9bd0a03596003ff5cf"
uuid = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b"
version = "1.3.0"
[[deps.oneAPI_Level_Zero_Headers_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "412efcf5d55c65d3352c3915cffec1e53955570f"
uuid = "f4bc562b-d309-54f8-9efb-476e56f0410d"
version = "1.2.43+0"
version = "1.6.3+0"
[[oneAPI_Level_Zero_Loader_jll]]
[[deps.oneAPI_Level_Zero_Loader_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "oneAPI_Level_Zero_Headers_jll"]
git-tree-sha1 = "0f0fd4a92c4785454e4929c2e4db22c3d03d6889"
git-tree-sha1 = "87980483b19f0a00c8d62e8b6682acac1894c638"
uuid = "13eca655-d68d-5b81-8367-6d99d727ab01"
version = "1.5.0+0"
version = "1.11.0+0"
[[p7zip_jll]]
[[deps.oneAPI_Support_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "oneAPI_Level_Zero_Loader_jll"]
git-tree-sha1 = "39a73e1fcd9a33eeadfd69f9027e9c62d3c58219"
uuid = "b049733a-a71d-5ed3-8eba-7d323ac00b36"
version = "0.2.2+0"
[[deps.p7zip_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0"
version = "17.4.0+0"

View File

@ -4,4 +4,4 @@ Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a"
oneAPI = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b"
[compat]
julia = "1.6"
julia = "1.9"

View File

@ -20,6 +20,18 @@ end
@enum Benchmark All Triad Nstream
function run_init_arrays!(data::StreamData{T,C}, context, init::Tuple{T,T,T})::Float64 where {T,C}
return @elapsed init_arrays!(data, context, init)
end
function run_read_data(data::StreamData{T,C}, context)::Tuple{Float64,VectorData{T}} where {T,C}
elapsed = @elapsed begin
result = read_data(data, context)
end
return (elapsed, result)
end
function run_all!(data::StreamData{T,C}, context, times::Int)::Tuple{Timings,T} where {T,C}
timings = Timings(times)
lastSum::T = 0
@ -39,11 +51,7 @@ function run_triad!(data::StreamData{T,C}, context, times::Int)::Float64 where {
end
end
function run_nstream!(
data::StreamData{T,C},
context,
times::Int,
)::Vector{Float64} where {T,C}
function run_nstream!(data::StreamData{T,C}, context, times::Int)::Vector{Float64} where {T,C}
timings::Vector{Float64} = zeros(times)
for i = 1:times
@inbounds timings[i] = @elapsed nstream!(data, context)
@ -93,9 +101,7 @@ function check_solutions(
error = abs((dot - gold_sum) / gold_sum)
failed = error > 1.0e-8
if failed
println(
"Validation failed on sum. Error $error \nSum was $dot but should be $gold_sum",
)
println("Validation failed on sum. Error $error \nSum was $dot but should be $gold_sum")
end
!failed
end : true
@ -158,7 +164,7 @@ end
const DefaultInit = (0.1, 0.2, 0.0)
const DefaultScalar = 0.4
const Version = "4.0"
const Version = "5.0"
function main()
@ -166,7 +172,7 @@ function main()
parse_options(config)
if config.list
for (i, (_,repr, impl)) in enumerate(devices())
for (i, (_, repr, impl)) in enumerate(devices())
println("[$i] ($impl) $repr")
end
exit(0)
@ -175,9 +181,7 @@ function main()
ds = devices()
# TODO implement substring device match
if config.device < 1 || config.device > length(ds)
error(
"Device $(config.device) out of range (1..$(length(ds))), NOTE: Julia is 1-indexed",
)
error("Device $(config.device) out of range (1..$(length(ds))), NOTE: Julia is 1-indexed")
else
device = ds[config.device]
end
@ -220,10 +224,10 @@ function main()
end
function mk_row(xs::Vector{Float64}, name::String, total_bytes::Int)
tail = Base.rest(xs)
min = Iterators.minimum(tail)
max = Iterators.maximum(tail)
avg = Iterators.sum(tail) / Iterators.length(tail)
tail = Iterators.rest(xs)
min = Base.minimum(tail)
max = Base.maximum(tail)
avg = Base.sum(tail) / Base.length(tail)
mbps = mega_scale * total_bytes / min
if config.csv
return [
@ -257,16 +261,42 @@ function main()
end
end
function show_init(init::Float64, read::Float64)
setup = [("Init", init, 3 * array_bytes), ("Read", read, 3 * array_bytes)]
if config.csv
tabulate(
map(
x -> [
("phase", x[1]),
("n_elements", config.arraysize),
("sizeof", x[3]),
("max_m$(config.mibibytes ? "i" : "")bytes_per_sec", mega_scale * total_bytes / x[2]),
("runtime", x[2]),
],
setup,
)...,
)
else
for (name, elapsed, total_bytes) in setup
println(
"$name: $(round(elapsed; digits=5)) s (=$(round(( mega_scale * total_bytes) / elapsed; digits = 5)) M$(config.mibibytes ? "i" : "")Bytes/sec)",
)
end
end
end
init::Tuple{type,type,type} = DefaultInit
scalar::type = DefaultScalar
GC.enable(false)
(data, context) = make_stream(config.arraysize, scalar, device, config.csv)
init_arrays!(data, context, init)
tInit = run_init_arrays!(data, context, init)
if benchmark == All
(timings, sum) = run_all!(data, context, config.numtimes)
valid = check_solutions(read_data(data, context), config.numtimes, init, benchmark, sum)
(tRead, result) = run_read_data(data, context)
show_init(tInit, tRead)
valid = check_solutions(result, config.numtimes, init, benchmark, sum)
tabulate(
mk_row(timings.copy, "Copy", 2 * array_bytes),
mk_row(timings.mul, "Mul", 2 * array_bytes),
@ -276,13 +306,15 @@ function main()
)
elseif benchmark == Nstream
timings = run_nstream!(data, context, config.numtimes)
valid =
check_solutions(read_data(data, context), config.numtimes, init, benchmark, nothing)
(tRead, result) = run_read_data(data, context)
show_init(tInit, tRead)
valid = check_solutions(result, config.numtimes, init, benchmark, nothing)
tabulate(mk_row(timings, "Nstream", 4 * array_bytes))
elseif benchmark == Triad
elapsed = run_triad!(data, context, config.numtimes)
valid =
check_solutions(read_data(data, context), config.numtimes, init, benchmark, nothing)
(tRead, result) = run_read_data(data, context)
show_init(tInit, tRead)
valid = check_solutions(result, config.numtimes, init, benchmark, nothing)
total_bytes = 3 * array_bytes * config.numtimes
bandwidth = mega_scale * (total_bytes / elapsed)
println("Runtime (seconds): $(round(elapsed; digits=5))")
@ -290,7 +322,6 @@ function main()
else
error("Bad benchmark $(benchmark)")
end
GC.enable(true)
if !valid

View File

@ -3,5 +3,6 @@
for BACKEND in "." "AMDGPU" "CUDA" "oneAPI" "Threaded" "KernelAbstractions"
do
julia --project="$BACKEND" -e 'import Pkg; Pkg.resolve(); Pkg.instantiate(); Pkg.update(); Pkg.gc();'
done
echo "Updating subproject $BACKEND"
julia --project="$BACKEND" -e 'import Pkg; Pkg.resolve(); Pkg.instantiate(); Pkg.update(); Pkg.gc();'
done

View File

@ -1,4 +1,4 @@
// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith,
// Copyright (c) 2015-23 Tom Deakin, Simon McIntosh-Smith, Wei-Chen (Tom) Lin
// University of Bristol HPC
//
// For full license terms please see the LICENSE file distributed with this
@ -14,9 +14,9 @@ KokkosStream<T>::KokkosStream(
{
Kokkos::initialize();
d_a = new Kokkos::View<T*>("d_a", ARRAY_SIZE);
d_b = new Kokkos::View<T*>("d_b", ARRAY_SIZE);
d_c = new Kokkos::View<T*>("d_c", ARRAY_SIZE);
d_a = new Kokkos::View<T*>(Kokkos::ViewAllocateWithoutInitializing("d_a"), ARRAY_SIZE);
d_b = new Kokkos::View<T*>(Kokkos::ViewAllocateWithoutInitializing("d_b"), ARRAY_SIZE);
d_c = new Kokkos::View<T*>(Kokkos::ViewAllocateWithoutInitializing("d_c"), ARRAY_SIZE);
hm_a = new typename Kokkos::View<T*>::HostMirror();
hm_b = new typename Kokkos::View<T*>::HostMirror();
hm_c = new typename Kokkos::View<T*>::HostMirror();
@ -140,7 +140,7 @@ T KokkosStream<T>::dot()
Kokkos::View<T*> a(*d_a);
Kokkos::View<T*> b(*d_b);
T sum = 0.0;
T sum{};
Kokkos::parallel_reduce(array_size, KOKKOS_LAMBDA (const long index, T &tmp)
{

View File

@ -10,9 +10,6 @@
#include <stdexcept>
#include <Kokkos_Core.hpp>
#include <Kokkos_Parallel.hpp>
#include <Kokkos_View.hpp>
#include "Stream.h"
#define IMPLEMENTATION_STRING "Kokkos"

View File

@ -1,32 +1,38 @@
register_flag_optional(CMAKE_CXX_COMPILER
"Any CXX compiler that is supported by CMake detection and RAJA.
See https://github.com/kokkos/kokkos#primary-tested-compilers-on-x86-are"
"c++")
register_flag_required(KOKKOS_IN_TREE
register_flag_optional(KOKKOS_IN_TREE
"Absolute path to the *source* distribution directory of Kokkos.
Remember to append Kokkos specific flags as well, for example:
-DKOKKOS_IN_TREE=... -DKokkos_ENABLE_OPENMP=ON -DKokkos_ARCH_ZEN=ON ...
See https://github.com/kokkos/kokkos/blob/master/BUILD.md for all available options" "")
See https://github.com/kokkos/kokkos/blob/master/BUILD.md for all available options")
register_flag_optional(KOKKOS_IN_PACKAGE
"Absolute path to package R-Path containing Kokkos libs.
Use this instead of KOKKOS_IN_TREE if Kokkos is from a package manager like Spack." "")
# compiler vendor and arch specific flags
set(KOKKOS_FLAGS_CPU_INTEL -qopt-streaming-stores=always)
macro(setup)
set(CMAKE_CXX_STANDARD 14)
set(CMAKE_CXX_STANDARD 17) # Kokkos 4+ requires CXX >= 17
cmake_policy(SET CMP0074 NEW) #see https://github.com/kokkos/kokkos/blob/master/BUILD.md
message(STATUS "Building using in-tree Kokkos source at `${KOKKOS_IN_TREE}`")
if (EXISTS "${KOKKOS_IN_TREE}")
message(STATUS "Build using in-tree Kokkos source at `${KOKKOS_IN_TREE}`")
add_subdirectory(${KOKKOS_IN_TREE} ${CMAKE_BINARY_DIR}/kokkos)
register_link_library(Kokkos::kokkos)
else ()
message(FATAL_ERROR "`${KOKKOS_IN_TREE}` does not exist")
elseif (EXISTS "${KOKKOS_IN_PACKAGE}")
message(STATUS "Build using packaged Kokkos at `${KOKKOS_IN_PACKAGE}`")
set (Kokkos_DIR "${KOKKOS_IN_PACKAGE}/lib64/cmake/Kokkos")
find_package(Kokkos REQUIRED)
register_link_library(Kokkos::kokkos)
else()
message(FATAL_ERROR "Neither `KOKKOS_IN_TREE`, or `KOKKOS_IN_PACKAGE` was set!")
endif ()
register_append_compiler_and_arch_specific_cxx_flags(
@ -36,5 +42,3 @@ macro(setup)
)
endmacro()

View File

@ -15,7 +15,7 @@
#include <iomanip>
#include <cstring>
#define VERSION_STRING "4.0"
#define VERSION_STRING "5.0"
#include "Stream.h"
@ -49,6 +49,8 @@
#include "SYCLStream2020.h"
#elif defined(OMP)
#include "OMPStream.h"
#elif defined(FUTHARK)
#include "FutharkStream.h"
#endif
// Default size of 2^25
@ -222,10 +224,10 @@ void run()
{
// MiB = 2^20
std::cout << std::setprecision(1) << std::fixed
<< "Array size: " << ARRAY_SIZE*sizeof(T)*pow(2.0, -20.0) << " MiB"
<< " (=" << ARRAY_SIZE*sizeof(T)*pow(2.0, -30.0) << " GiB)" << std::endl;
std::cout << "Total size: " << 3.0*ARRAY_SIZE*sizeof(T)*pow(2.0, -20.0) << " MiB"
<< " (=" << 3.0*ARRAY_SIZE*sizeof(T)*pow(2.0, -30.0) << " GiB)" << std::endl;
<< "Array size: " << ARRAY_SIZE*sizeof(T)*std::pow(2.0, -20.0) << " MiB"
<< " (=" << ARRAY_SIZE*sizeof(T)*std::pow(2.0, -30.0) << " GiB)" << std::endl;
std::cout << "Total size: " << 3.0*ARRAY_SIZE*sizeof(T)*std::pow(2.0, -20.0) << " MiB"
<< " (=" << 3.0*ARRAY_SIZE*sizeof(T)*std::pow(2.0, -30.0) << " GiB)" << std::endl;
}
else
{
@ -298,12 +300,18 @@ void run()
// Use the OpenMP implementation
stream = new OMPStream<T>(ARRAY_SIZE, deviceIndex);
#elif defined(FUTHARK)
// Use the Futhark implementation
stream = new FutharkStream<T>(ARRAY_SIZE, deviceIndex);
#endif
auto init1 = std::chrono::high_resolution_clock::now();
stream->init_arrays(startA, startB, startC);
auto init2 = std::chrono::high_resolution_clock::now();
// Result of the Dot kernel, if used.
T sum = 0.0;
T sum{};
std::vector<std::vector<double>> timings;
@ -327,7 +335,54 @@ void run()
std::vector<T> c(ARRAY_SIZE);
auto read1 = std::chrono::high_resolution_clock::now();
stream->read_arrays(a, b, c);
auto read2 = std::chrono::high_resolution_clock::now();
auto initElapsedS = std::chrono::duration_cast<std::chrono::duration<double>>(read2 - read1).count();
auto readElapsedS = std::chrono::duration_cast<std::chrono::duration<double>>(init2 - init1).count();
auto initBWps = ((mibibytes ? std::pow(2.0, -20.0) : 1.0E-6) * (3 * sizeof(T) * ARRAY_SIZE)) / initElapsedS;
auto readBWps = ((mibibytes ? std::pow(2.0, -20.0) : 1.0E-6) * (3 * sizeof(T) * ARRAY_SIZE)) / readElapsedS;
if (output_as_csv)
{
std::cout
<< "phase" << csv_separator
<< "n_elements" << csv_separator
<< "sizeof" << csv_separator
<< ((mibibytes) ? "max_mibytes_per_sec" : "max_mbytes_per_sec") << csv_separator
<< "runtime" << std::endl;
std::cout
<< "Init" << csv_separator
<< ARRAY_SIZE << csv_separator
<< sizeof(T) << csv_separator
<< initBWps << csv_separator
<< initElapsedS << std::endl;
std::cout
<< "Read" << csv_separator
<< ARRAY_SIZE << csv_separator
<< sizeof(T) << csv_separator
<< readBWps << csv_separator
<< readElapsedS << std::endl;
}
else
{
std::cout << "Init: "
<< std::setw(7)
<< initElapsedS
<< " s (="
<< initBWps
<< (mibibytes ? " MiBytes/sec" : " MBytes/sec")
<< ")" << std::endl;
std::cout << "Read: "
<< std::setw(7)
<< readElapsedS
<< " s (="
<< readBWps
<< (mibibytes ? " MiBytes/sec" : " MBytes/sec")
<< ")" << std::endl;
}
check_solution<T>(num_times, a, b, c, sum);
// Display timing results
@ -393,7 +448,7 @@ void run()
<< num_times << csv_separator
<< ARRAY_SIZE << csv_separator
<< sizeof(T) << csv_separator
<< ((mibibytes) ? pow(2.0, -20.0) : 1.0E-6) * sizes[i] / (*minmax.first) << csv_separator
<< ((mibibytes) ? std::pow(2.0, -20.0) : 1.0E-6) * sizes[i] / (*minmax.first) << csv_separator
<< *minmax.first << csv_separator
<< *minmax.second << csv_separator
<< average
@ -404,7 +459,7 @@ void run()
std::cout
<< std::left << std::setw(12) << labels[i]
<< std::left << std::setw(12) << std::setprecision(3) <<
((mibibytes) ? pow(2.0, -20.0) : 1.0E-6) * sizes[i] / (*minmax.first)
((mibibytes) ? std::pow(2.0, -20.0) : 1.0E-6) * sizes[i] / (*minmax.first)
<< std::left << std::setw(12) << std::setprecision(5) << *minmax.first
<< std::left << std::setw(12) << std::setprecision(5) << *minmax.second
<< std::left << std::setw(12) << std::setprecision(5) << average
@ -415,7 +470,7 @@ void run()
{
// Display timing results
double total_bytes = 3 * sizeof(T) * ARRAY_SIZE * num_times;
double bandwidth = ((mibibytes) ? pow(2.0, -30.0) : 1.0E-9) * (total_bytes / timings[0][0]);
double bandwidth = ((mibibytes) ? std::pow(2.0, -30.0) : 1.0E-9) * (total_bytes / timings[0][0]);
if (output_as_csv)
{
@ -461,7 +516,7 @@ void check_solution(const unsigned int ntimes, std::vector<T>& a, std::vector<T>
T goldA = startA;
T goldB = startB;
T goldC = startC;
T goldSum = 0.0;
T goldSum{};
const T scalar = startScalar;
@ -487,15 +542,15 @@ void check_solution(const unsigned int ntimes, std::vector<T>& a, std::vector<T>
goldSum = goldA * goldB * ARRAY_SIZE;
// Calculate the average error
double errA = std::accumulate(a.begin(), a.end(), 0.0, [&](double sum, const T val){ return sum + fabs(val - goldA); });
long double errA = std::accumulate(a.begin(), a.end(), T{}, [&](double sum, const T val){ return sum + std::fabs(val - goldA); });
errA /= a.size();
double errB = std::accumulate(b.begin(), b.end(), 0.0, [&](double sum, const T val){ return sum + fabs(val - goldB); });
long double errB = std::accumulate(b.begin(), b.end(), T{}, [&](double sum, const T val){ return sum + std::fabs(val - goldB); });
errB /= b.size();
double errC = std::accumulate(c.begin(), c.end(), 0.0, [&](double sum, const T val){ return sum + fabs(val - goldC); });
long double errC = std::accumulate(c.begin(), c.end(), T{}, [&](double sum, const T val){ return sum + std::fabs(val - goldC); });
errC /= c.size();
double errSum = fabs((sum - goldSum)/goldSum);
long double errSum = std::fabs((sum - goldSum)/goldSum);
double epsi = std::numeric_limits<T>::epsilon() * 100.0;
long double epsi = std::numeric_limits<T>::epsilon() * 100.0;
if (errA > epsi)
std::cerr

View File

@ -260,7 +260,7 @@ T OCLStream<T>::dot()
);
cl::copy(queue, d_sum, sums.begin(), sums.end());
T sum = 0.0;
T sum{};
for (T val : sums)
sum += val;

View File

@ -220,7 +220,7 @@ void OMPStream<T>::nstream()
template <class T>
T OMPStream<T>::dot()
{
T sum = 0.0;
T sum{};
#ifdef OMP_TARGET_GPU
int array_size = this->array_size;

View File

@ -131,7 +131,7 @@ T RAJAStream<T>::dot()
T* RAJA_RESTRICT a = d_a;
T* RAJA_RESTRICT b = d_b;
RAJA::ReduceSum<reduce_policy, T> sum(0.0);
RAJA::ReduceSum<reduce_policy, T> sum(T{});
forall<policy>(range, [=] RAJA_DEVICE (RAJA::Index_type index)
{

View File

@ -1,25 +1,26 @@
register_flag_optional(CMAKE_CXX_COMPILER
"Any CXX compiler that is supported by CMake detection and RAJA.
See https://raja.readthedocs.io/en/main/getting_started.html#build-and-install"
"c++")
register_flag_required(RAJA_IN_TREE
register_flag_optional(RAJA_IN_TREE
"Absolute path to the *source* distribution directory of RAJA.
Make sure to use the release version of RAJA or clone RAJA recursively with submodules.
Remember to append RAJA specific flags as well, for example:
-DRAJA_IN_TREE=... -DENABLE_OPENMP=ON -DENABLE_CUDA=ON ...
See https://github.com/LLNL/RAJA/blob/08cbbafd2d21589ebf341f7275c229412d0fe903/CMakeLists.txt#L44 for all available options
")
" "")
register_flag_optional(RAJA_IN_PACKAGE
"Use if Raja is part of a package dependency:
Path to installation" "")
register_flag_optional(TARGET
"Target offload device, implemented values are CPU, NVIDIA"
CPU)
register_flag_optional(CUDA_TOOLKIT_ROOT_DIR
"[TARGET==NVIDIA only] Path to the CUDA toolkit directory (e.g `/opt/cuda-11.2`) if the ENABLE_CUDA flag is specified for RAJA" "")
"[TARGET==NVIDIA only] Path to the CUDA toolkit directory (e.g `/opt/cuda-11.2`) if the RAJA_ENABLE_CUDA or ENABLE_CUDA flag is specified for RAJA" "")
# XXX CMake 3.18 supports CMAKE_CUDA_ARCHITECTURES/CUDA_ARCHITECTURES but we support older CMakes
register_flag_optional(CUDA_ARCH
@ -57,7 +58,20 @@ macro(setup)
set(ENABLE_BENCHMARKS OFF CACHE BOOL "")
set(ENABLE_CUDA ${ENABLE_CUDA} CACHE BOOL "" FORCE)
if (ENABLE_CUDA)
# RAJA >= v2022.03.0 switched to prefixed variables, we keep the legacy ones for backwards compatibiity
set(RAJA_ENABLE_TESTS OFF CACHE BOOL "")
set(RAJA_ENABLE_EXAMPLES OFF CACHE BOOL "")
set(RAJA_ENABLE_REPRODUCERS OFF CACHE BOOL "")
set(RAJA_ENABLE_EXERCISES OFF CACHE BOOL "")
set(RAJA_ENABLE_DOCUMENTATION OFF CACHE BOOL "")
set(RAJA_ENABLE_BENCHMARKS OFF CACHE BOOL "")
set(RAJA_ENABLE_CUDA ${RAJA_ENABLE_CUDA} CACHE BOOL "" FORCE)
if (ENABLE_CUDA OR RAJA_ENABLE_CUDA)
# RAJA still needs ENABLE_CUDA for internal use, so if either is on, assert both.
set(RAJA_ENABLE_CUDA ON)
set(ENABLE_CUDA ON)
# XXX CMake 3.18 supports CMAKE_CUDA_ARCHITECTURES/CUDA_ARCHITECTURES but we support older CMakes
if(POLICY CMP0104)
@ -69,6 +83,10 @@ macro(setup)
set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "-forward-unknown-to-host-compiler -extended-lambda -arch=${CUDA_ARCH}" ${CUDA_EXTRA_FLAGS})
list(APPEND CMAKE_CUDA_FLAGS)
# See https://github.com/LLNL/RAJA/pull/1302
# And https://github.com/LLNL/RAJA/pull/1339
set(RAJA_ENABLE_VECTORIZATION OFF CACHE BOOL "")
message(STATUS "NVCC flags: ${CMAKE_CUDA_FLAGS}")
endif ()
@ -76,8 +94,14 @@ macro(setup)
register_link_library(RAJA)
# RAJA's cmake screws with where the binary will end up, resetting it here:
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
elseif (EXISTS "${RAJA_IN_PACKAGE}")
message(STATUS "Building using packaged Raja at `${RAJA_IN_PACKAGE}`")
find_package(RAJA REQUIRED)
register_link_library(RAJA)
else ()
message(FATAL_ERROR "`${RAJA_IN_TREE}` does not exist")
message(FATAL_ERROR "Neither `${RAJA_IN_TREE}` or `${RAJA_IN_PACKAGE}` exists")
endif ()

File diff suppressed because it is too large Load Diff

View File

@ -1,25 +1,25 @@
[package]
name = "rust-stream"
version = "4.0.0"
version = "5.0.0"
authors = ["Wei-Chen Lin <wl14928@bristol.ac.uk>"]
edition = "2018"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
num-traits = "0.2.14"
structopt = "0.3.13"
tabular = "0.1.4"
rayon = "1.5.1"
crossbeam = "0.8.1"
num_cpus = "1.13.0"
rustversion = "1.0"
libc = "0.2.97"
num-traits = "0.2.15"
structopt = "0.3.26"
tabular = "0.2.0"
rayon = "1.5.3"
crossbeam = "0.8.2"
num_cpus = "1.13.1"
rustversion = "1.0.9"
libc = "0.2.134"
core_affinity = "0.5.10"
colour = "0.6.0"
[dev-dependencies]
rstest = "0.10.0"
rstest = "0.13.0"
[build-dependencies]
rustversion = "1.0"

View File

@ -54,7 +54,7 @@ use_field_init_shorthand = false
force_explicit_abi = true
condense_wildcard_suffixes = false
color = "Auto"
required_version = "1.4.38"
required_version = "1.6.0"
unstable_features = false
disable_all_formatting = false
skip_children = false

View File

@ -174,7 +174,7 @@ where StreamData<T, D, A>: RustStream<T> {
);
}
stream.init_arrays();
let init = stream.run_init_arrays();
let tabulate = |xs: &Vec<Duration>, name: &str, t_size: usize| -> Vec<(&str, String)> {
let tail = &xs[1..]; // tail only
@ -235,10 +235,47 @@ where StreamData<T, D, A>: RustStream<T> {
};
};
let show_setup = |init: Duration, read: Duration| {
let setup = vec![
("Init", init.as_secs_f64(), 3 * array_bytes),
("Read", read.as_secs_f64(), 3 * array_bytes),
];
if option.csv {
tabulate_all(
setup
.iter()
.map(|(name, elapsed, t_size)| {
vec![
("phase", name.to_string()),
("n_elements", option.arraysize.to_string()),
("sizeof", t_size.to_string()),
(
if option.mibibytes { "max_mibytes_per_sec" } else { "max_mbytes_per_sec" },
(mega_scale * (*t_size as f64) / elapsed).to_string(),
),
("runtime", elapsed.to_string()),
]
})
.collect::<Vec<_>>(),
);
} else {
for (name, elapsed, t_size) in setup {
println!(
"{}: {:.5} s (={:.5} {})",
name,
elapsed,
mega_scale * (t_size as f64) / elapsed,
if option.mibibytes { "MiBytes/sec" } else { "MBytes/sec" }
);
}
}
};
let solutions_correct = match benchmark {
Benchmark::All => {
let (results, sum) = stream.run_all(option.numtimes);
stream.read_arrays();
let read = stream.run_read_arrays();
show_setup(init, read);
let correct = check_solution(benchmark, option.numtimes, &stream, Some(sum));
tabulate_all(vec![
tabulate(&results.copy, "Copy", 2 * array_bytes),
@ -251,14 +288,16 @@ where StreamData<T, D, A>: RustStream<T> {
}
Benchmark::NStream => {
let results = stream.run_nstream(option.numtimes);
stream.read_arrays();
let read = stream.run_read_arrays();
show_setup(init, read);
let correct = check_solution(benchmark, option.numtimes, &stream, None);
tabulate_all(vec![tabulate(&results, "Nstream", 4 * array_bytes)]);
correct
}
Benchmark::Triad => {
let results = stream.run_triad(option.numtimes);
stream.read_arrays();
let read = stream.run_read_arrays();
show_setup(init, read);
let correct = check_solution(benchmark, option.numtimes, &stream, None);
let total_bytes = 3 * array_bytes * option.numtimes;
let bandwidth = giga_scale * (total_bytes as f64 / results.as_secs_f64());

View File

@ -132,6 +132,18 @@ pub trait RustStream<T: Default> {
fn nstream(&mut self);
fn dot(&mut self) -> T;
fn run_init_arrays(&mut self) -> Duration {
timed(|| {
self.init_arrays();
})
}
fn run_read_arrays(&mut self) -> Duration {
timed(|| {
self.read_arrays();
})
}
fn run_all(&mut self, n: usize) -> (AllTiming<Vec<Duration>>, T) {
let mut timings: AllTiming<Vec<Duration>> = AllTiming {
copy: vec![Duration::default(); n],

View File

@ -2,10 +2,10 @@ use rstest::rstest;
#[rstest]
fn test_main(
#[values(0, 1, 2, 3, 4)] device: usize, //
#[values("", "--pin")] pin: &str, //
#[values("", "--malloc")] malloc: &str, //
#[values("", "--init")] init: &str, //
#[values(0, 1, 2, 3, 4)] device: usize, //
#[values("", "--pin")] pin: &str, //
#[values("", "--malloc")] malloc: &str, //
#[values("", "--init")] init: &str, //
#[values("", "--triad-only", "--nstream-only")] option: &str, //
) {
let line = format!(

View File

@ -1 +0,0 @@
{"name":"sbt","version":"1.5.2","bspVersion":"2.0.0-M5","languages":["scala"],"argv":["/usr/lib/jvm/java-11-openjdk-11.0.11.0.9-2.fc33.x86_64/bin/java","-Xms100m","-Xmx100m","-classpath","/home/tom/.local/share/JetBrains/Toolbox/apps/IDEA-U/ch-0/211.7142.45.plugins/Scala/launcher/sbt-launch.jar","xsbt.boot.Boot","-bsp","--sbt-launch-jar=/home/tom/.local/share/JetBrains/Toolbox/apps/IDEA-U/ch-0/211.7142.45.plugins/Scala/launcher/sbt-launch.jar"]}

View File

@ -1 +1,2 @@
target/
.bsp/

View File

@ -1,4 +1,4 @@
version = "3.0.0-RC2"
version = "3.7.14"
runner.dialect = scala3
style = defaultWithAlign

View File

@ -3,14 +3,19 @@ lazy val mainCls = Some("scalastream.App")
lazy val root = (project in file("."))
.enablePlugins(NativeImagePlugin)
.settings(
scalaVersion := "3.0.0",
version := "4.0",
scalaVersion := "3.3.1",
version := "5.0",
organization := "uk.ac.bristol.uob-hpc",
organizationName := "University of Bristol",
Compile / mainClass := mainCls,
assembly / mainClass := mainCls,
scalacOptions ~= filterConsoleScalacOptions,
assembly / assemblyJarName := "scala-stream.jar",
assembly / assemblyMergeStrategy := {
case PathList("module-info.class") => MergeStrategy.discard
case PathList("META-INF", "versions", xs @ _, "module-info.class") => MergeStrategy.discard
case x => (ThisBuild / assemblyMergeStrategy).value(x)
},
nativeImageOptions := Seq(
"--no-fallback",
"-H:ReflectionConfigurationFiles=../../reflect-config.json"
@ -22,8 +27,8 @@ lazy val root = (project in file("."))
// Lazy val implementation in Scala 3 triggers an exception in nativeImage, use 2_13 for arg parsing for now otherwise we can't get to the benchmarking part
("com.github.scopt" %% "scopt" % "4.0.1").cross(CrossVersion.for3Use2_13),
// par also uses lazy val at some point, so it doesn't work in nativeImage
"org.scala-lang.modules" %% "scala-parallel-collections" % "1.0.3",
"net.openhft" % "affinity" % "3.21ea1",
"org.slf4j" % "slf4j-simple" % "1.7.30" // for affinity
"org.scala-lang.modules" %% "scala-parallel-collections" % "1.0.4",
"net.openhft" % "affinity" % "3.23.2",
"org.slf4j" % "slf4j-simple" % "2.0.5" // for affinity
)
)

View File

@ -1 +1 @@
sbt.version=1.5.2
sbt.version=1.9.2

View File

@ -1,6 +1,6 @@
addSbtPlugin("com.timushev.sbt" % "sbt-updates" % "0.5.3")
addSbtPlugin("io.github.davidgregory084" % "sbt-tpolecat" % "0.1.17")
addSbtPlugin("io.github.davidgregory084" % "sbt-tpolecat" % "0.1.20")
addSbtPlugin("org.scalameta" % "sbt-native-image" % "0.3.0")
addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.15.0")
addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "2.1.3")
addSbtPlugin("ch.epfl.scala" % "sbt-scalafix" % "0.9.27")
addSbtPlugin("org.scalameta" % "sbt-scalafmt" % "2.4.2")
addSbtPlugin("org.scalameta" % "sbt-scalafmt" % "2.4.3")

View File

@ -14,6 +14,7 @@ transparent trait ScalaStream[@specialized(Float, Double) A]:
def config: Config[A]
def initArrays(): Unit
def readArrays(): Unit = ()
def copy(): Unit
def mul(): Unit
def add(): Unit
@ -27,6 +28,8 @@ transparent trait ScalaStream[@specialized(Float, Double) A]:
val end = System.nanoTime()
FiniteDuration(end - start, TimeUnit.NANOSECONDS) -> r
inline def runInitArrays(): FiniteDuration = timed(initArrays())._1
inline def runReadArrays(): FiniteDuration = timed(readArrays())._1
inline def runAll(times: Int)(using Fractional[A]): (Timings[Vector[FiniteDuration]], A) =
val copy = ArrayBuffer.fill[FiniteDuration](times)(Duration.Zero)
val mul = ArrayBuffer.fill[FiniteDuration](times)(Duration.Zero)
@ -62,7 +65,6 @@ transparent trait ScalaStream[@specialized(Float, Double) A]:
def data(): Data[A]
trait Fractional[@specialized(Double, Float) A]:
def toFractional(f: Float): A
def toFractional(f: Double): A
@ -77,13 +79,13 @@ trait Fractional[@specialized(Double, Float) A]:
extension (x: Int) inline def fractional = toFractional(x.toFloat)
extension (x: Long) inline def fractional = toFractional(x.toDouble)
extension (x: A)
inline def +(y: A) = add(x, y)
inline def -(y: A) = sub(x, y)
inline def *(y: A) = mul(x, y)
inline def /(y: A) = div(x, y)
inline def >(y: A) = compare(x, y) > 0
inline def <(y: A) = compare(x, y) < 0
inline def abs_ = abs(x)
inline def +(y: A) = add(x, y)
inline def -(y: A) = sub(x, y)
inline def *(y: A) = mul(x, y)
inline def /(y: A) = div(x, y)
inline def >(y: A) = compare(x, y) > 0
inline def <(y: A) = compare(x, y) < 0
inline def abs_ = abs(x)
end Fractional
given FloatFractional: Fractional[Float] with
@ -108,7 +110,7 @@ given DoubleFractional: Fractional[Double] with
object App:
final val Version: String = "4.0"
final val Version: String = "5.0"
case class Config[@specialized(Double, Float) A](
options: Options,
@ -204,7 +206,7 @@ object App:
validateXs("c", vec.c, goldC)
dotSum.foreach { sum =>
val goldSum = (goldA * goldB) * (config.options.arraysize).fractional
val goldSum = (goldA * goldB) * config.options.arraysize.fractional
val error = ((sum - goldSum) / goldSum).abs_
if error > 1.fractional / 100000000.fractional then
Console.err.println(
@ -238,10 +240,10 @@ object App:
)
println(s"Running ${config.benchmark match {
case Benchmark.All => "kernels"
case Benchmark.Triad => "triad"
case Benchmark.NStream => "nstream"
}} ${opt.numtimes} times")
case Benchmark.All => "kernels"
case Benchmark.Triad => "triad"
case Benchmark.NStream => "nstream"
}} ${opt.numtimes} times")
if config.benchmark == Benchmark.Triad then println(s"Number of elements: ${opt.arraysize}")
@ -288,11 +290,38 @@ object App:
println(header.map(_._1.padTo(padding, ' ')).mkString(sep))
println(rows.map(_.map(_._2.padTo(padding, ' ')).mkString(sep)).mkString("\n"))
def showInit(init: FiniteDuration, read: FiniteDuration): Unit = {
val setup =
Vector(("Init", init.seconds, 3 * arrayBytes), ("Read", read.seconds, 3 * arrayBytes))
if opt.csv then
tabulate(
setup.map((name, elapsed, totalBytes) =>
Vector(
"phase" -> name,
"n_elements" -> opt.arraysize.toString,
"sizeof" -> arrayBytes.toString,
s"max_m${if opt.mibibytes then "i" else ""}bytes_per_sec" ->
(megaScale * totalBytes.toDouble / elapsed).toString,
"runtime" -> elapsed.toString
)
): _*
)
else
for (name, elapsed, totalBytes) <- setup do
println(
f"$name: $elapsed%.5f s (=${megaScale * totalBytes.toDouble / elapsed}%.5f M${
if opt.mibibytes then "i" else ""
}Bytes/sec)"
)
}
val stream = mkStream(config)
stream.initArrays()
val init = stream.runInitArrays()
config.benchmark match
case Benchmark.All =>
val (results, sum) = stream.runAll(opt.numtimes)
val read = stream.runReadArrays()
showInit(init, read)
validate(stream.data(), config, Some(sum))
tabulate(
mkRow(results.copy, "Copy", 2 * arrayBytes),
@ -303,10 +332,14 @@ object App:
)
case Benchmark.NStream =>
val result = stream.runNStream(opt.numtimes)
val read = stream.runReadArrays()
showInit(init, read)
validate(stream.data(), config)
tabulate(mkRow(result, "Nstream", 4 * arrayBytes))
case Benchmark.Triad =>
val results = stream.runTriad(opt.numtimes)
val results = stream.runTriad(opt.numtimes)
val read = stream.runReadArrays()
showInit(init, read)
val totalBytes = 3 * arrayBytes * opt.numtimes
val bandwidth = megaScale * (totalBytes / results.seconds)
println(f"Runtime (seconds): ${results.seconds}%.5f")

View File

@ -6,64 +6,76 @@
#include "STDDataStream.h"
#include <algorithm>
#include <execution>
#include <numeric>
// There are three execution policies:
// auto exe_policy = std::execution::seq;
// auto exe_policy = std::execution::par;
auto exe_policy = std::execution::par_unseq;
template <class T>
STDDataStream<T>::STDDataStream(const int ARRAY_SIZE, int device)
noexcept : array_size{ARRAY_SIZE}, a(array_size), b(array_size), c(array_size)
noexcept : array_size{ARRAY_SIZE},
a(alloc_raw<T>(ARRAY_SIZE)), b(alloc_raw<T>(ARRAY_SIZE)), c(alloc_raw<T>(ARRAY_SIZE))
{
std::cout << "Backing storage typeid: " << typeid(a).name() << std::endl;
#ifdef USE_ONEDPL
std::cout << "Using oneDPL backend: ";
#if ONEDPL_USE_DPCPP_BACKEND
std::cout << "SYCL USM (device=" << exe_policy.queue().get_device().get_info<sycl::info::device::name>() << ")";
#elif ONEDPL_USE_TBB_BACKEND
std::cout << "TBB " TBB_VERSION_STRING;
#elif ONEDPL_USE_OPENMP_BACKEND
std::cout << "OpenMP";
#else
std::cout << "Default";
#endif
std::cout << std::endl;
#endif
}
template<class T>
STDDataStream<T>::~STDDataStream() {
dealloc_raw(a);
dealloc_raw(b);
dealloc_raw(c);
}
template <class T>
void STDDataStream<T>::init_arrays(T initA, T initB, T initC)
{
std::fill(exe_policy, a.begin(), a.end(), initA);
std::fill(exe_policy, b.begin(), b.end(), initB);
std::fill(exe_policy, c.begin(), c.end(), initC);
std::fill(exe_policy, a, a + array_size, initA);
std::fill(exe_policy, b, b + array_size, initB);
std::fill(exe_policy, c, c + array_size, initC);
}
template <class T>
void STDDataStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
{
h_a = a;
h_b = b;
h_c = c;
std::copy(a, a + array_size, h_a.begin());
std::copy(b, b + array_size, h_b.begin());
std::copy(c, c + array_size, h_c.begin());
}
template <class T>
void STDDataStream<T>::copy()
{
// c[i] = a[i]
std::copy(exe_policy, a.begin(), a.end(), c.begin());
std::copy(exe_policy, a, a + array_size, c);
}
template <class T>
void STDDataStream<T>::mul()
{
// b[i] = scalar * c[i];
std::transform(exe_policy, c.begin(), c.end(), b.begin(), [scalar = startScalar](T ci){ return scalar*ci; });
std::transform(exe_policy, c, c + array_size, b, [scalar = startScalar](T ci){ return scalar*ci; });
}
template <class T>
void STDDataStream<T>::add()
{
// c[i] = a[i] + b[i];
std::transform(exe_policy, a.begin(), a.end(), b.begin(), c.begin(), std::plus<T>());
std::transform(exe_policy, a, a + array_size, b, c, std::plus<T>());
}
template <class T>
void STDDataStream<T>::triad()
{
// a[i] = b[i] + scalar * c[i];
std::transform(exe_policy, b.begin(), b.end(), c.begin(), a.begin(), [scalar = startScalar](T bi, T ci){ return bi+scalar*ci; });
std::transform(exe_policy, b, b + array_size, c, a, [scalar = startScalar](T bi, T ci){ return bi+scalar*ci; });
}
template <class T>
@ -73,8 +85,8 @@ void STDDataStream<T>::nstream()
// Need to do in two stages with C++11 STL.
// 1: a[i] += b[i]
// 2: a[i] += scalar * c[i];
std::transform(exe_policy, a.begin(), a.end(), b.begin(), a.begin(), [](T ai, T bi){ return ai + bi; });
std::transform(exe_policy, a.begin(), a.end(), c.begin(), a.begin(), [scalar = startScalar](T ai, T ci){ return ai + scalar*ci; });
std::transform(exe_policy, a, a + array_size, b, a, [](T ai, T bi){ return ai + bi; });
std::transform(exe_policy, a, a + array_size, c, a, [scalar = startScalar](T ai, T ci){ return ai + scalar*ci; });
}
@ -82,7 +94,7 @@ template <class T>
T STDDataStream<T>::dot()
{
// sum = 0; sum += a[i]*b[i]; return sum;
return std::transform_reduce(exe_policy, a.begin(), a.end(), b.begin(), 0.0);
return std::transform_reduce(exe_policy, a, a + array_size, b, T{});
}
void listDevices(void)
@ -101,4 +113,3 @@ std::string getDeviceDriver(const int)
}
template class STDDataStream<float>;
template class STDDataStream<double>;

View File

@ -5,6 +5,7 @@
// source code
#pragma once
#include "dpl_shim.h"
#include <iostream>
#include <stdexcept>
@ -21,14 +22,11 @@ class STDDataStream : public Stream<T>
int array_size;
// Device side pointers
std::vector<T> a;
std::vector<T> b;
std::vector<T> c;
T *a, *b, *c;
public:
STDDataStream(const int, int) noexcept;
~STDDataStream() = default;
~STDDataStream();
virtual void copy() override;
virtual void add() override;

View File

@ -19,15 +19,35 @@ register_flag_optional(NVHPC_OFFLOAD
ccall - Compile for all supported compute capabilities"
"")
register_flag_optional(USE_TBB
"No-op if ONE_TBB_DIR is set. Link against an in-tree oneTBB via FetchContent_Declare, see top level CMakeLists.txt for details."
"OFF")
register_flag_optional(USE_ONEDPL
"Link oneDPL which implements C++17 executor policies (via execution_policy_tag) for different backends.
Possible values are:
OPENMP - Implements policies using OpenMP.
CMake will handle any flags needed to enable OpenMP if the compiler supports it.
TBB - Implements policies using TBB.
TBB must be linked via USE_TBB or be available in LD_LIBRARY_PATH.
DPCPP - Implements policies through SYCL2020.
This requires the DPC++ compiler (other SYCL compilers are untested), required SYCL flags are added automatically."
"OFF")
# Build configuration for this model: C++17 baseline, optional NVHPC GPU
# offload, optional TBB and oneDPL linkage.
macro(setup)
set(CMAKE_CXX_STANDARD 17)
if (NVHPC_OFFLOAD)
set(NVHPC_FLAGS -stdpar -gpu=${NVHPC_OFFLOAD})
# propagate flags to linker so that it links with the gpu stuff as well
register_append_cxx_flags(ANY ${NVHPC_FLAGS})
register_append_link_flags(${NVHPC_FLAGS})
endif ()
# Link the oneTBB target (in-tree via FetchContent, or found externally).
if (USE_TBB)
register_link_library(TBB::tbb)
endif ()
# Define USE_ONEDPL for the sources and link the oneDPL target so the
# oneDPL-backed execution policies are used.
if (USE_ONEDPL)
register_definitions(USE_ONEDPL)
register_link_library(oneDPL)
endif ()
endmacro()

View File

@ -6,50 +6,66 @@
#include "STDIndicesStream.h"
#include <algorithm>
#include <execution>
#include <numeric>
// There are three execution policies:
// auto exe_policy = std::execution::seq;
// auto exe_policy = std::execution::par;
auto exe_policy = std::execution::par_unseq;
#ifndef ALIGNMENT
#define ALIGNMENT (2*1024*1024) // 2MB
#endif
// Allocate the three backing buffers with alloc_raw (backend-specific
// allocator from dpl_shim.h — e.g. USM when a SYCL backend is active;
// TODO confirm) and report which storage/backend is in use.
// `device` is accepted for interface parity but unused by this model.
template <class T>
STDIndicesStream<T>::STDIndicesStream(const int ARRAY_SIZE, int device)
noexcept : array_size{ARRAY_SIZE}, range(0, array_size),
  a(alloc_raw<T>(ARRAY_SIZE)), b(alloc_raw<T>(ARRAY_SIZE)), c(alloc_raw<T>(ARRAY_SIZE))
{
  std::cout << "Backing storage typeid: " << typeid(a).name() << std::endl;
#ifdef USE_ONEDPL
  std::cout << "Using oneDPL backend: ";
#if ONEDPL_USE_DPCPP_BACKEND
  std::cout << "SYCL USM (device=" << exe_policy.queue().get_device().get_info<sycl::info::device::name>() << ")";
#elif ONEDPL_USE_TBB_BACKEND
  std::cout << "TBB " TBB_VERSION_STRING;
#elif ONEDPL_USE_OPENMP_BACKEND
  std::cout << "OpenMP";
#else
  std::cout << "Default";
#endif
  std::cout << std::endl;
#endif
}
// Free the raw a/b/c buffers allocated with alloc_raw in the constructor.
template<class T>
STDIndicesStream<T>::~STDIndicesStream() {
dealloc_raw(a);
dealloc_raw(b);
dealloc_raw(c);
}
// Fill a, b and c with their respective starting values using the parallel
// execution policy. a/b/c are raw T*, hence the pointer-range overloads.
template <class T>
void STDIndicesStream<T>::init_arrays(T initA, T initB, T initC)
{
  std::fill(exe_policy, a, a + array_size, initA);
  std::fill(exe_policy, b, b + array_size, initB);
  std::fill(exe_policy, c, c + array_size, initC);
}
// Copy the device-side buffers back into host-side vectors for verification.
// Assumes the h_* vectors are pre-sized to at least array_size elements.
template <class T>
void STDIndicesStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
{
  std::copy(a, a + array_size, h_a.begin());
  std::copy(b, b + array_size, h_b.begin());
  std::copy(c, c + array_size, h_c.begin());
}
// Copy kernel: c[i] = a[i]
template <class T>
void STDIndicesStream<T>::copy()
{
  std::copy(exe_policy, a, a + array_size, c);
}
// Mul kernel, index-wise over the counting range: b[i] = scalar * c[i]
// The raw pointer is init-captured by value (not [&]) so the lambda is
// self-contained and can be handed to an offloading backend.
template <class T>
void STDIndicesStream<T>::mul()
{
  std::transform(exe_policy, range.begin(), range.end(), b, [c = this->c, scalar = startScalar](int i) {
    return scalar * c[i];
  });
}
@ -58,7 +74,7 @@ template <class T>
void STDIndicesStream<T>::add()
{
// c[i] = a[i] + b[i];
std::transform(exe_policy, range.begin(), range.end(), c.begin(), [&](int i) {
std::transform(exe_policy, range.begin(), range.end(), c, [a = this->a, b = this->b](int i) {
return a[i] + b[i];
});
}
@ -67,7 +83,7 @@ template <class T>
void STDIndicesStream<T>::triad()
{
// a[i] = b[i] + scalar * c[i];
std::transform(exe_policy, range.begin(), range.end(), a.begin(), [&, scalar = startScalar](int i) {
std::transform(exe_policy, range.begin(), range.end(), a, [b = this->b, c = this->c, scalar = startScalar](int i) {
return b[i] + scalar * c[i];
});
}
@ -79,7 +95,7 @@ void STDIndicesStream<T>::nstream()
// Need to do in two stages with C++11 STL.
// 1: a[i] += b[i]
// 2: a[i] += scalar * c[i];
std::transform(exe_policy, range.begin(), range.end(), a.begin(), [&, scalar = startScalar](int i) {
std::transform(exe_policy, range.begin(), range.end(), a, [a = this->a, b = this->b, c = this->c, scalar = startScalar](int i) {
return a[i] + b[i] + scalar * c[i];
});
}
@ -89,7 +105,7 @@ template <class T>
T STDIndicesStream<T>::dot()
{
// sum = 0; sum += a[i]*b[i]; return sum;
return std::transform_reduce(exe_policy, a.begin(), a.end(), b.begin(), 0.0);
return std::transform_reduce(exe_policy, a, a + array_size, b, T{});
}
void listDevices(void)
@ -108,4 +124,3 @@ std::string getDeviceDriver(const int)
}
template class STDIndicesStream<float>;
template class STDIndicesStream<double>;

View File

@ -5,6 +5,7 @@
// source code
#pragma once
#include "dpl_shim.h"
#include <iostream>
#include <stdexcept>
@ -12,40 +13,57 @@
#define IMPLEMENTATION_STRING "STD (index-oriented)"
// A lightweight counting iterator which will be used by the STL algorithms
// NB: C++ <= 17 doesn't have this built-in, and it's only added later in ranges-v3 (C++2a) which this
// implementation doesn't target
// A lightweight counting-iterator range [begin, end) used to drive the STL
// algorithms index-wise. C++ <= 17 has no std::ranges::iota_view, so this
// minimal random-access substitute is hand-rolled. Dereferencing yields the
// counter VALUE (reference = N), so the range is read-only.
template <typename N>
class ranged {
public:
  class iterator {
    friend class ranged;
  public:
    using difference_type = N;
    using value_type = N;
    using pointer = const N*;
    using reference = N;
    using iterator_category = std::random_access_iterator_tag;

    // XXX This is not part of the iterator spec, it gets picked up by oneDPL if enabled.
    // Without this, the DPL SYCL backend collects the iterator data on the host and copies to the device.
    // This type is unused for any other STL impl.
    using is_passed_directly = std::true_type;

    reference operator*() const { return i_; }
    iterator &operator++() { ++i_; return *this; }
    iterator operator++(int) { iterator copy(*this); ++i_; return copy; }
    iterator &operator--() { --i_; return *this; }
    iterator operator--(int) { iterator copy(*this); --i_; return copy; }
    iterator &operator+=(N by) { i_ += by; return *this; }

    value_type operator[](const difference_type &i) const { return i_ + i; }
    difference_type operator-(const iterator &it) const { return i_ - it.i_; }
    iterator operator+(const value_type v) const { return iterator(i_ + v); }

    bool operator==(const iterator &other) const { return i_ == other.i_; }
    bool operator!=(const iterator &other) const { return i_ != other.i_; }
    bool operator<(const iterator &other) const { return i_ < other.i_; }

  protected:
    // Only ranged can mint iterators at an arbitrary position.
    explicit iterator(N start) : i_(start) {}

  private:
    N i_;
  };

  [[nodiscard]] iterator begin() const { return begin_; }
  [[nodiscard]] iterator end() const { return end_; }
  ranged(N begin, N end) : begin_(begin), end_(end) {}

private:
  iterator begin_;
  iterator end_;
};
template <class T>
@ -59,14 +77,11 @@ class STDIndicesStream : public Stream<T>
ranged<int> range;
// Device side pointers
std::vector<T> a;
std::vector<T> b;
std::vector<T> c;
T *a, *b, *c;
public:
STDIndicesStream(const int, int) noexcept;
~STDIndicesStream() = default;
~STDIndicesStream();
virtual void copy() override;
virtual void add() override;

View File

@ -19,15 +19,35 @@ register_flag_optional(NVHPC_OFFLOAD
ccall - Compile for all supported compute capabilities"
"")
register_flag_optional(USE_TBB
"Link against an in-tree oneTBB via FetchContent_Declare, see top level CMakeLists.txt for details."
"OFF")
register_flag_optional(USE_ONEDPL
"Link oneDPL which implements C++17 executor policies (via execution_policy_tag) for different backends.
Possible values are:
OPENMP - Implements policies using OpenMP.
CMake will handle any flags needed to enable OpenMP if the compiler supports it.
TBB - Implements policies using TBB.
TBB must be linked via USE_TBB or be available in LD_LIBRARY_PATH.
DPCPP - Implements policies through SYCL2020.
This requires the DPC++ compiler (other SYCL compilers are untested), required SYCL flags are added automatically."
"OFF")
# Build configuration for this model: C++17 baseline, optional NVHPC GPU
# offload, optional TBB and oneDPL linkage.
macro(setup)
set(CMAKE_CXX_STANDARD 17)
if (NVHPC_OFFLOAD)
set(NVHPC_FLAGS -stdpar -gpu=${NVHPC_OFFLOAD})
# propagate flags to linker so that it links with the gpu stuff as well
register_append_cxx_flags(ANY ${NVHPC_FLAGS})
register_append_link_flags(${NVHPC_FLAGS})
endif ()
# Link the oneTBB target (in-tree via FetchContent, or found externally).
if (USE_TBB)
register_link_library(TBB::tbb)
endif ()
# Define USE_ONEDPL for the sources and link the oneDPL target so the
# oneDPL-backed execution policies are used.
if (USE_ONEDPL)
register_definitions(USE_ONEDPL)
register_link_library(oneDPL)
endif ()
endmacro()

View File

@ -5,25 +5,45 @@
// source code
#include "STDRangesStream.hpp"
#include <algorithm>
#include <execution>
#include <ranges>
#ifndef ALIGNMENT
#define ALIGNMENT (2*1024*1024) // 2MB
#endif
// Allocate the three backing buffers with alloc_raw (backend-specific
// allocator from dpl_shim.h — TODO confirm which allocator each backend maps
// to) and report which storage/backend is in use.
// `device` is accepted for interface parity but unused by this model.
template <class T>
STDRangesStream<T>::STDRangesStream(const int ARRAY_SIZE, int device)
noexcept : array_size{ARRAY_SIZE},
  a(alloc_raw<T>(ARRAY_SIZE)), b(alloc_raw<T>(ARRAY_SIZE)), c(alloc_raw<T>(ARRAY_SIZE))
{
  std::cout << "Backing storage typeid: " << typeid(a).name() << std::endl;
#ifdef USE_ONEDPL
  std::cout << "Using oneDPL backend: ";
#if ONEDPL_USE_DPCPP_BACKEND
  std::cout << "SYCL USM (device=" << exe_policy.queue().get_device().get_info<sycl::info::device::name>() << ")";
#elif ONEDPL_USE_TBB_BACKEND
  std::cout << "TBB " TBB_VERSION_STRING;
#elif ONEDPL_USE_OPENMP_BACKEND
  std::cout << "OpenMP";
#else
  std::cout << "Default";
#endif
  std::cout << std::endl;
#endif
}
// Free the raw a/b/c buffers allocated with alloc_raw in the constructor.
template<class T>
STDRangesStream<T>::~STDRangesStream() {
dealloc_raw(a);
dealloc_raw(b);
dealloc_raw(c);
}
template <class T>
void STDRangesStream<T>::init_arrays(T initA, T initB, T initC)
{
std::for_each_n(
std::execution::par_unseq,
exe_policy,
std::views::iota(0).begin(), array_size, // loop range
[&] (int i) {
a[i] = initA;
@ -37,16 +57,16 @@ template <class T>
void STDRangesStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
{
// Element-wise copy.
h_a = a;
h_b = b;
h_c = c;
std::copy(a, a + array_size, h_a.begin());
std::copy(b, b + array_size, h_b.begin());
std::copy(c, c + array_size, h_c.begin());
}
template <class T>
void STDRangesStream<T>::copy()
{
std::for_each_n(
std::execution::par_unseq,
exe_policy,
std::views::iota(0).begin(), array_size,
[&] (int i) {
c[i] = a[i];
@ -60,7 +80,7 @@ void STDRangesStream<T>::mul()
const T scalar = startScalar;
std::for_each_n(
std::execution::par_unseq,
exe_policy,
std::views::iota(0).begin(), array_size,
[&] (int i) {
b[i] = scalar * c[i];
@ -72,7 +92,7 @@ template <class T>
void STDRangesStream<T>::add()
{
std::for_each_n(
std::execution::par_unseq,
exe_policy,
std::views::iota(0).begin(), array_size,
[&] (int i) {
c[i] = a[i] + b[i];
@ -86,7 +106,7 @@ void STDRangesStream<T>::triad()
const T scalar = startScalar;
std::for_each_n(
std::execution::par_unseq,
exe_policy,
std::views::iota(0).begin(), array_size,
[&] (int i) {
a[i] = b[i] + scalar * c[i];
@ -100,7 +120,7 @@ void STDRangesStream<T>::nstream()
const T scalar = startScalar;
std::for_each_n(
std::execution::par_unseq,
exe_policy,
std::views::iota(0).begin(), array_size,
[&] (int i) {
a[i] += b[i] + scalar * c[i];
@ -114,8 +134,8 @@ T STDRangesStream<T>::dot()
// sum += a[i] * b[i];
return
std::transform_reduce(
std::execution::par_unseq,
a.begin(), a.end(), b.begin(), 0.0);
exe_policy,
a, a + array_size, b, T{});
}
void listDevices(void)
@ -135,4 +155,3 @@ std::string getDeviceDriver(const int)
template class STDRangesStream<float>;
template class STDRangesStream<double>;

Some files were not shown because too many files have changed in this diff Show More