Compare commits: 29b2d88aa6...2f00dfb7f8

117 commits. SHA1s, in listed order:

2f00dfb7f8, d7231e226a, ad61a0dba1, 438e210867, 06c3d534dd, 78ba4ff37a, 8389fc43a3, f6ae48de89, 7b6ee78ec4, f3801aeac2,
773814f0f2, f3aaca06dd, 165db1749c, 01ef17e8b4, a27abfe296, 145e2a0649, 3cb01e76a8, e7774c1372, 3de019c156, 971d1e8ac7,
512a6fac0c, f2f7f3a3de, 5f3741e404, ffae3ba83f, e347d2ff6c, 9954b7d38c, 92fed7082b, 2e3ebeecab, 369785c96a, bd6bb09b5d,
29b03be572, def6dadfd7, 42de93076c, e88043a5af, 177416229c, 5c9cb660ec, 717c40cb3d, f47d27980f, 5a1be9399c, 154ad9f297,
aea4e7d2a4, a542f3bf67, 89a84fbbea, 324361aa83, a0c5c00c9c, 72be9f6980, 3dcafd1af1, 87a38e949d, 3f7bb631e1, b25fd755a6,
28dcf6f962, eef3221df6, 09ad102966, 6d11c72382, d12af1075c, 288d0cb189, 1d43fcb3e7, 6e47d341fa, 180bd95ba3, c3346318b9,
50fe7c102a, e81f6c2889, 7643de8d09, 092ee67764, 893af9f5d0, 8b862f09b3, ebb1176a20, 696ff6a817, 6a1122e5a3, 1d49952d47,
7e94495da6, 0df3ae79be, 66491909e4, 85d80915f6, f44cd6fdd2, de93c06e78, f98aedf64d, bcf8708f2c, a075455ad4, 2c5eee4840,
1c46f8efd9, 60c36b68ff, 57c8003621, 407d6701df, 1d8e383a29, 370d378fbc, 80853e66e0, 72335f320e, ed6206b543, aa82e57ba0,
d56dc956e0, 1f4bc3fffc, ecb0464f6c, 5a496a91b2, 14844ceb56, 379bc2032c, 0e8b3b4bce, f77e43c6d5, dfb4eb06b2, 5197a4e561,
f5513cd69e, 193eaa7fe2, 37dcdc224c, a299d613bb, d6413cc627, 5f6e714bdd, 1d9cde42b0, 240962722f, 64dd0d3382, 0f264081d7,
b27def135e, fdb0ef8af8, 6185d3aca6, 7b2bd5427c, e77a34158c, 5645b0290d, a35c7b4bea
.github/workflows/main.yaml (vendored, 95 changes)
@ -12,12 +12,12 @@ on:
|
||||
jobs:
|
||||
|
||||
test-rust:
|
||||
runs-on: ubuntu-18.04
|
||||
runs-on: ubuntu-22.04
|
||||
defaults:
|
||||
run:
|
||||
working-directory: ./src/rust/rust-stream
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
- uses: actions/checkout@v4
|
||||
- name: Setup project
|
||||
run: rustup install nightly
|
||||
- name: Compile project
|
||||
@ -28,12 +28,12 @@ jobs:
|
||||
run: ./target/release/rust-stream --arraysize 2048
|
||||
|
||||
test-java:
|
||||
runs-on: ubuntu-18.04
|
||||
runs-on: ubuntu-22.04
|
||||
defaults:
|
||||
run:
|
||||
working-directory: ./src/java/java-stream
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
- uses: actions/checkout@v4
|
||||
- name: Test build project
|
||||
run: ./mvnw clean package
|
||||
- name: Test run
|
||||
@ -41,12 +41,12 @@ jobs:
|
||||
run: java -jar target/java-stream.jar --arraysize 2048
|
||||
|
||||
test-julia:
|
||||
runs-on: ubuntu-18.04
|
||||
runs-on: ubuntu-22.04
|
||||
defaults:
|
||||
run:
|
||||
working-directory: ./src/julia/JuliaStream.jl
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
- uses: actions/checkout@v4
|
||||
- name: Setup project
|
||||
run: julia --project -e 'import Pkg; Pkg.instantiate()'
|
||||
- name: Test run PlainStream.jl
|
||||
@ -70,14 +70,22 @@ jobs:
|
||||
|
||||
|
||||
test-cpp:
|
||||
runs-on: ubuntu-18.04
|
||||
runs-on: ubuntu-22.04
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
- name: Maximize build space
|
||||
uses: easimon/maximize-build-space@v8
|
||||
with:
|
||||
root-reserve-mb: 8192
|
||||
swap-size-mb: 512
|
||||
remove-android: 'true'
|
||||
remove-codeql: 'true'
|
||||
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Cache compiler
|
||||
if: ${{ !env.ACT }}
|
||||
id: prepare-compilers
|
||||
uses: actions/cache@v2
|
||||
uses: actions/cache@v3
|
||||
with:
|
||||
path: ./compilers
|
||||
key: ${{ runner.os }}-${{ hashFiles('./src/ci-prepare-bionic.sh') }}
|
||||
@ -90,9 +98,9 @@ jobs:
|
||||
run: source ./src/ci-prepare-bionic.sh ./compilers VARS false || true
|
||||
|
||||
# Enable tmate debugging of manually-triggered workflows if the input option was provided
|
||||
- name: Setup tmate session
|
||||
uses: mxschmitt/action-tmate@v3
|
||||
if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.debug_enabled }}
|
||||
# - name: Setup tmate session
|
||||
# uses: mxschmitt/action-tmate@v3
|
||||
# if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.debug_enabled }}
|
||||
|
||||
- name: Test compile gcc @ CMake 3.13
|
||||
if: ${{ ! cancelled() }}
|
||||
@ -167,4 +175,65 @@ jobs:
|
||||
run: ./src/ci-test-compile.sh ./build dpcpp all ${{ env.CMAKE_3_18_BIN }}
|
||||
- name: Test compile hipsycl @ CMake 3.18
|
||||
if: ${{ ! cancelled() }}
|
||||
run: ./src/ci-test-compile.sh ./build hipsycl all ${{ env.CMAKE_3_18_BIN }}
|
||||
run: ./src/ci-test-compile.sh ./build hipsycl all ${{ env.CMAKE_3_18_BIN }}
|
||||
|
||||
- name: Test compile gcc @ CMake 3.20
|
||||
if: ${{ ! cancelled() }}
|
||||
run: ./src/ci-test-compile.sh ./build gcc all ${{ env.CMAKE_3_20_BIN }}
|
||||
- name: Test compile clang @ CMake 3.20
|
||||
if: ${{ ! cancelled() }}
|
||||
run: ./src/ci-test-compile.sh ./build clang all ${{ env.CMAKE_3_20_BIN }}
|
||||
- name: Test compile nvhpc @ CMake 3.20
|
||||
if: ${{ ! cancelled() }}
|
||||
run: ./src/ci-test-compile.sh ./build nvhpc all ${{ env.CMAKE_3_20_BIN }}
|
||||
- name: Test compile aocc @ CMake 3.20
|
||||
if: ${{ ! cancelled() }}
|
||||
run: ./src/ci-test-compile.sh ./build aocc all ${{ env.CMAKE_3_20_BIN }}
|
||||
- name: Test compile aomp @ CMake 3.20
|
||||
if: ${{ ! cancelled() }}
|
||||
run: ./src/ci-test-compile.sh ./build aomp all ${{ env.CMAKE_3_20_BIN }}
|
||||
- name: Test compile hip @ CMake 3.20
|
||||
if: ${{ ! cancelled() }}
|
||||
run: ./src/ci-test-compile.sh ./build hip all ${{ env.CMAKE_3_20_BIN }}
|
||||
- name: Test compile dpcpp @ CMake 3.20
|
||||
if: ${{ ! cancelled() }}
|
||||
run: ./src/ci-test-compile.sh ./build dpcpp all ${{ env.CMAKE_3_20_BIN }}
|
||||
- name: Test compile hipsycl @ CMake 3.20
|
||||
if: ${{ ! cancelled() }}
|
||||
run: ./src/ci-test-compile.sh ./build hipsycl all ${{ env.CMAKE_3_20_BIN }}
|
||||
|
||||
- name: Test compile gcc @ CMake 3.24
|
||||
if: ${{ ! cancelled() }}
|
||||
run: ./src/ci-test-compile.sh ./build gcc all ${{ env.CMAKE_3_24_BIN }}
|
||||
- name: Test compile clang @ CMake 3.24
|
||||
if: ${{ ! cancelled() }}
|
||||
run: ./src/ci-test-compile.sh ./build clang all ${{ env.CMAKE_3_24_BIN }}
|
||||
- name: Test compile nvhpc @ CMake 3.24
|
||||
if: ${{ ! cancelled() }}
|
||||
run: ./src/ci-test-compile.sh ./build nvhpc all ${{ env.CMAKE_3_24_BIN }}
|
||||
- name: Test compile aocc @ CMake 3.24
|
||||
if: ${{ ! cancelled() }}
|
||||
run: ./src/ci-test-compile.sh ./build aocc all ${{ env.CMAKE_3_24_BIN }}
|
||||
- name: Test compile aomp @ CMake 3.24
|
||||
if: ${{ ! cancelled() }}
|
||||
run: ./src/ci-test-compile.sh ./build aomp all ${{ env.CMAKE_3_24_BIN }}
|
||||
- name: Test compile hip @ CMake 3.24
|
||||
if: ${{ ! cancelled() }}
|
||||
run: ./src/ci-test-compile.sh ./build hip all ${{ env.CMAKE_3_24_BIN }}
|
||||
- name: Test compile dpcpp @ CMake 3.24
|
||||
if: ${{ ! cancelled() }}
|
||||
run: ./src/ci-test-compile.sh ./build dpcpp all ${{ env.CMAKE_3_24_BIN }}
|
||||
- name: Test compile hipsycl @ CMake 3.24
|
||||
if: ${{ ! cancelled() }}
|
||||
run: ./src/ci-test-compile.sh ./build hipsycl all ${{ env.CMAKE_3_24_BIN }}
|
||||
|
||||
test-futhark:
|
||||
runs-on: ubuntu-22.04
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- name: Prepare Futhark compiler
|
||||
uses: diku-dk/install-futhark@HEAD
|
||||
with:
|
||||
version: 'latest'
|
||||
- run: cmake -Bbuild -H. -DMODEL=futhark -DFUTHARK_BACKEND=multicore
|
||||
- run: cmake --build build
|
||||
|
||||
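The new test-futhark job can be reproduced outside CI. A minimal sketch, assuming the Futhark compiler is already on PATH (the workflow installs it via diku-dk/install-futhark) and using the same configure and build lines as the job:

```
# Configure and build the Futhark model with the multicore backend, as the test-futhark job does
cmake -Bbuild -H. -DMODEL=futhark -DFUTHARK_BACKEND=multicore
cmake --build build
```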
.gitignore (vendored, 6 changes)

@@ -10,12 +10,18 @@ sycl-stream
hip-stream
tbb-stream

src/fortran/BabelStream
src/fortran/BabelStream.*

*.o
*.bc
*.sycl
*.tar
*.gz
*.a
*.mod
*.cub
*.ptx

KokkosCore_config.*
CHANGELOG.md (26 changes)

@@ -1,9 +1,33 @@
# Changelog
All notable changes to this project will be documented in this file.

## Unreleased
## [v5.0] - 2023-10-12
### Added
- Ability to build Kokkos and RAJA versions against existing packages.
- Thrust managed memory.
- HIP managed memory.
- New implementation using SYCL2020 USM (sycl2020-usm) and renamed original `sycl2020` to `sycl2020-acc`.
- New implementation in Fortran
- New implementation in [Futhark](https://futhark-lang.org/)
- Data initialisation and read-back timing for all models, including Java, Scala, Julia, and Rust
- Add support for the latest Aparapi (3.0.0) and TornadoVM (0.15.x) for Java
- JuliaStream.jl published to registry (pending #113)

### Changed
- Fix std-data/std-indices compatibility with oneDPL, NVHPC, and AdaptiveCpp (a.k.a. hipSYCL).
- RAJA CUDA CMake build issues resolved.
- Kokkos build updates (CXX version upgraded to C++17).
- Fix CUDA memory limit check.
- Fix CUDA CMake options for `-DMEM` and `-DCMAKE_CUDA_FLAGS`.
- Use long double for `check_solution` in case of large problem size.
- OneAPI DPCPP compiler is deprecated in favour of ICPX, so added new build option to SYCL 2020 version.
- Updates to the HIP kernels and API usage.
- Number of thread-blocks in CUDA dot kernel implementation changed to 1024.
- Fix compatibility of `sycl2020` (now `sycl2020-acc`) with AdaptiveCpp.
- Bumped Julia compat to 1.9
- Bumped Scala to 3.3.1
- Bumped Rust to 1.74.0-nightly (13e6f24b9 2023-09-23)
- Upgrade CI to Ubuntu 22.04

## [v4.0] - 2021-12-22
CMakeLists.txt

@@ -1,6 +1,10 @@
|
||||
cmake_minimum_required(VERSION 3.13 FATAL_ERROR)
|
||||
|
||||
project(BabelStream VERSION 4.0 LANGUAGES CXX)
|
||||
if (CMAKE_VERSION VERSION_GREATER_EQUAL "3.24.0")
|
||||
cmake_policy(SET CMP0135 NEW)
|
||||
endif ()
|
||||
|
||||
project(BabelStream VERSION 5.0 LANGUAGES CXX C)
|
||||
|
||||
# uncomment for debugging build issues:
|
||||
#set(CMAKE_VERBOSE_MAKEFILE ON)
|
||||
@ -27,8 +31,6 @@ endmacro()
|
||||
# the final executable name
|
||||
set(EXE_NAME babelstream)
|
||||
|
||||
# select default build type
|
||||
set(CMAKE_BUILD_TYPE "Release")
|
||||
# for chrono and some basic CXX features, models can overwrite this if required
|
||||
set(CMAKE_CXX_STANDARD 11)
|
||||
|
||||
@ -71,6 +73,75 @@ hint_flag(CXX_EXTRA_LINKER_FLAGS "
|
||||
# Honor user's CXX_EXTRA_LINK_FLAGS
|
||||
set(CXX_EXTRA_LINK_FLAGS ${CXX_EXTRA_FLAGS} ${CXX_EXTRA_LINK_FLAGS})
|
||||
|
||||
option(USE_TBB "Enable the oneTBB library for *supported* models. Enabling this on models that
|
||||
don't explicitly link against TBB is a no-op, see description of your selected
|
||||
model on how this is used." OFF)
|
||||
|
||||
option(FETCH_TBB "Fetch (download) the oneTBB library for *supported* models. This uses CMake's
|
||||
FetchContent feature. Specify version by setting FETCH_TBB_VERSION" OFF)
|
||||
set(FETCH_TBB_VERSION "v2021.10.0" CACHE STRING "Specify version of oneTBB to use if FETCH_TBB is ON")
|
||||
|
||||
if (FETCH_TBB)
|
||||
FetchContent_Declare(
|
||||
TBB
|
||||
GIT_REPOSITORY https://github.com/oneapi-src/oneTBB.git
|
||||
GIT_TAG "${FETCH_TBB_VERSION}"
|
||||
)
|
||||
# Don't fail builds on warnings (TBB has -Wall while not being free of warnings from unused symbols...)
|
||||
set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)
|
||||
set(TBB_STRICT OFF)
|
||||
# Not using FetchContent_MakeAvailable (CMake>= 3.14) because we need EXCLUDE_FROM_ALL
|
||||
FetchContent_GetProperties(TBB)
|
||||
if (NOT TBB_POPULATED)
|
||||
FetchContent_Populate(TBB)
|
||||
add_subdirectory(${tbb_SOURCE_DIR} ${tbb_BINARY_DIR} EXCLUDE_FROM_ALL)
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
option(USE_ONEDPL "Enable the oneDPL library for *supported* models. Enabling this on models that
|
||||
don't explicitly link against DPL is a no-op, see description of your selected
|
||||
model on how this is used." OFF)
|
||||
|
||||
option(FETCH_ONEDPL "Fetch (download) the oneDPL library for *supported* models. This uses CMake's
|
||||
FetchContent feature. Specify version by setting FETCH_ONEDPL_VERSION" OFF)
|
||||
set(FETCH_ONEDPL_VERSION "oneDPL-2022.2.0-rc1" CACHE STRING "Specify version of oneDPL to use if FETCH_ONEDPL is ON")
|
||||
|
||||
if (FETCH_ONEDPL)
|
||||
FetchContent_Declare(
|
||||
oneDPL
|
||||
GIT_REPOSITORY https://github.com/oneapi-src/oneDPL.git
|
||||
GIT_TAG "${FETCH_ONEDPL_VERSION}"
|
||||
)
|
||||
string(TOLOWER ${USE_ONEDPL} ONEDPL_BACKEND)
|
||||
# XXX oneDPL looks for omp instead of openmp, which mismatches(!) with ONEDPL_PAR_BACKEND if using find_package
|
||||
if (ONEDPL_BACKEND STREQUAL "openmp")
|
||||
set(ONEDPL_BACKEND omp)
|
||||
endif ()
|
||||
# Not using FetchContent_MakeAvailable (CMake>= 3.14) because we need EXCLUDE_FROM_ALL
|
||||
FetchContent_GetProperties(oneDPL)
|
||||
if (NOT oneDPL_POPULATED)
|
||||
FetchContent_Populate(oneDPL)
|
||||
if (USE_TBB)
|
||||
macro(find_package NAME)
|
||||
if ("${NAME}" STREQUAL "TBB")
|
||||
message(STATUS "Discarding oneDPL's call to find_package(${NAME} ${ARGN})")
|
||||
else ()
|
||||
_find_package(${NAME} ${ARGN})
|
||||
endif ()
|
||||
endmacro()
|
||||
endif ()
|
||||
add_subdirectory(${onedpl_SOURCE_DIR} ${onedpl_BINARY_DIR} EXCLUDE_FROM_ALL)
|
||||
|
||||
# Fixup oneDPL's omission on setting DPCPP definitions.
|
||||
# We do this after the creation of the oneDPL target.
|
||||
if (ONEDPL_BACKEND MATCHES "^(dpcpp|dpcpp_only)$")
|
||||
target_compile_definitions(oneDPL INTERFACE ONEDPL_USE_DPCPP_BACKEND=1)
|
||||
endif ()
|
||||
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
|
||||
# include our macros
|
||||
include(cmake/register_models.cmake)
|
||||
|
||||
@ -84,12 +155,14 @@ register_model(hip HIP HIPStream.cpp)
|
||||
register_model(cuda CUDA CUDAStream.cu)
|
||||
register_model(kokkos KOKKOS KokkosStream.cpp)
|
||||
register_model(sycl SYCL SYCLStream.cpp)
|
||||
register_model(sycl2020 SYCL2020 SYCLStream2020.cpp)
|
||||
register_model(sycl2020-acc SYCL2020 SYCLStream2020.cpp)
|
||||
register_model(sycl2020-usm SYCL2020 SYCLStream2020.cpp)
|
||||
register_model(acc ACC ACCStream.cpp)
|
||||
# defining RAJA collides with the RAJA namespace so USE_RAJA
|
||||
register_model(raja USE_RAJA RAJAStream.cpp)
|
||||
register_model(tbb TBB TBBStream.cpp)
|
||||
register_model(thrust THRUST ThrustStream.cu) # Thrust uses cu, even for rocThrust
|
||||
register_model(futhark FUTHARK FutharkStream.cpp)
|
||||
|
||||
|
||||
set(USAGE ON CACHE BOOL "Whether to print all custom flags for the selected model")
|
||||
@ -101,6 +174,12 @@ else ()
|
||||
message(STATUS "Selected model : ${MODEL}")
|
||||
endif ()
|
||||
|
||||
if (MODEL STREQUAL "sycl2020")
|
||||
message(FATAL_ERROR "
|
||||
Model sycl2020 has been renamed to sycl2020-acc, and a new sycl2020-usm model is now available.
|
||||
Please use sycl2020-acc for SYCL2020 style accessors and sycl2020-usm for USM")
|
||||
endif ()
|
||||
|
||||
# load the $MODEL.cmake file and setup the correct IMPL_* based on $MODEL
|
||||
load_model(${MODEL})
|
||||
|
||||
@ -151,6 +230,7 @@ include_directories(src)
|
||||
add_executable(${EXE_NAME} ${IMPL_SOURCES} src/main.cpp)
|
||||
target_link_libraries(${EXE_NAME} PUBLIC ${LINK_LIBRARIES})
|
||||
target_compile_definitions(${EXE_NAME} PUBLIC ${IMPL_DEFINITIONS})
|
||||
target_include_directories(${EXE_NAME} PUBLIC ${IMPL_DIRECTORIES})
|
||||
|
||||
if (CXX_EXTRA_LIBRARIES)
|
||||
target_link_libraries(${EXE_NAME} PUBLIC ${CXX_EXTRA_LIBRARIES})
|
||||
|
||||
README.md

@@ -38,9 +38,10 @@ BabelStream is currently implemented in the following parallel programming model
|
||||
- C++ Parallel STL
|
||||
- Kokkos
|
||||
- RAJA
|
||||
- SYCL and SYCL 2020
|
||||
- SYCL and SYCL2020 (USM and accessors)
|
||||
- TBB
|
||||
- Thrust (via CUDA or HIP)
|
||||
- Futhark
|
||||
|
||||
This project also contains implementations in alternative languages with different build systems:
|
||||
* Julia - [JuliaStream.jl](./src/julia/JuliaStream.jl)
|
||||
@ -101,7 +102,7 @@ The source for each model's implementations are located in `./src/<model>`.
|
||||
|
||||
Currently available models are:
|
||||
```
|
||||
omp;ocl;std;std20;hip;cuda;kokkos;sycl;sycl2020;acc;raja;tbb;thrust
|
||||
omp;ocl;std-data;std-indices;std-ranges;hip;cuda;kokkos;sycl;sycl2020-acc;sycl2020-usm;acc;raja;tbb;thrust;futhark
|
||||
```
|
||||
|
||||
#### Overriding default flags
|
||||
@ -165,7 +166,7 @@ The `MODEL` variant selects one implementation of BabelStream to build.
|
||||
|
||||
Currently available models are:
|
||||
```
|
||||
omp;ocl;std;std20;hip;cuda;kokkos;sycl;sycl2020;acc;raja;tbb;thrust
|
||||
omp;ocl;std-data;std-indices;std-ranges;hip;cuda;kokkos;sycl;sycl2020-acc;sycl2020-usm;acc;raja;tbb;thrust
|
||||
```
|
||||
|
||||
### GNU Make
|
||||
|
||||
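To show how the new FETCH_TBB/FETCH_ONEDPL options above are meant to be driven, here is a sketch of a configure line for the std-indices model; the flag combination mirrors what src/ci-test-compile.sh uses later in this diff, and g++-12 is the compiler that ci-prepare-bionic.sh exports as GCC_CXX.

```
# std-indices via oneDPL with a fetched oneTBB backend (flags as used by the CI script)
cmake -Bbuild -H. -DMODEL=std-indices \
  -DCMAKE_CXX_COMPILER=g++-12 \
  -DUSE_TBB=ON -DFETCH_TBB=ON \
  -DUSE_ONEDPL=TBB -DFETCH_ONEDPL=ON \
  -DCXX_EXTRA_FLAGS=-D_GLIBCXX_USE_TBB_PAR_BACKEND=0
cmake --build build
```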
src/.gitignore (vendored, 4 changes)

@@ -16,6 +16,8 @@
**/*.gz
**/*.a

**/*.swp

**/KokkosCore_Config_*

**/.DS_Store
@@ -26,4 +28,4 @@ cmake-build-*/
CMakeFiles/
.idea/
.vscode/
.directory
.directory
src/acc/ACCStream.cpp

@@ -149,7 +149,7 @@ void ACCStream<T>::nstream()
template <class T>
T ACCStream<T>::dot()
{
T sum = 0.0;
T sum{};

int array_size = this->array_size;
T * restrict a = this->a;
src/ci-prepare-bionic.sh

@@ -83,6 +83,8 @@ get() {
|
||||
if [ ! -f "$name" ] || [ "$FORCE_DOWNLOAD" = true ]; then
|
||||
echo "$name not found, downloading..."
|
||||
wget -q --show-progress --progress=bar:force:noscroll "$pkg_url" -O "$name"
|
||||
else
|
||||
echo "$name found, skipping download..."
|
||||
fi
|
||||
fi
|
||||
}
|
||||
@ -92,13 +94,15 @@ get_and_untar() {
|
||||
local pkg_url="$2"
|
||||
if [ "$SETUP" = true ]; then
|
||||
if [ ! -f "$name" ] || [ "$FORCE_DOWNLOAD" = true ]; then
|
||||
echo "$name not found, downloading..."
|
||||
echo "$name not found, downloading ($pkg_url)..."
|
||||
wget -q --show-progress --progress=bar:force:noscroll "$pkg_url" -O "$name"
|
||||
fi
|
||||
echo "Preparing to extract $name ..."
|
||||
tar -xf "$name"
|
||||
echo "$name extracted, deleting archive ..."
|
||||
rm -f "$name" # delete for space
|
||||
else
|
||||
echo "Skipping setup for $name ($pkg_url)..."
|
||||
fi
|
||||
}
|
||||
|
||||
@ -119,10 +123,10 @@ verify_dir_exists() {
|
||||
setup_aocc() {
|
||||
echo "Preparing AOCC"
|
||||
|
||||
local aocc_ver="2.3.0"
|
||||
local aocc_ver="4.0.0"
|
||||
local tarball="aocc-$aocc_ver.tar.xz"
|
||||
# XXX it's actually XZ compressed, so it should be tar.xz
|
||||
local AOCC_URL="http://developer.amd.com/wordpress/media/files/aocc-compiler-2.3.0.tar"
|
||||
local AOCC_URL="https://download.amd.com/developer/eula/aocc-compiler/aocc-compiler-${aocc_ver}.tar"
|
||||
# local AOCC_URL="http://localhost:8000/aocc-compiler-2.3.0.tar"
|
||||
|
||||
get_and_untar "$tarball" "$AOCC_URL"
|
||||
@ -134,20 +138,26 @@ setup_aocc() {
|
||||
|
||||
setup_nvhpc() {
|
||||
echo "Preparing Nvidia HPC SDK"
|
||||
local tarball="nvhpc.tar.gz"
|
||||
# local url="http://localhost:8000/nvhpc_2021_219_Linux_x86_64_cuda_11.4.tar.gz"
|
||||
local url="https://developer.download.nvidia.com/hpc-sdk/21.9/nvhpc_2021_219_Linux_x86_64_cuda_11.4.tar.gz"
|
||||
local nvhpc_ver="23.1" # TODO FIXME > 23.1 has a bug with -A
|
||||
local nvhpc_release="2023_231"
|
||||
local cuda_ver="12.0"
|
||||
|
||||
local tarball="nvhpc_$nvhpc_ver.tar.gz"
|
||||
|
||||
local url="https://developer.download.nvidia.com/hpc-sdk/$nvhpc_ver/nvhpc_${nvhpc_release}_Linux_x86_64_cuda_$cuda_ver.tar.gz"
|
||||
get_and_untar "$tarball" "$url"
|
||||
|
||||
local sdk_dir="$PWD/nvhpc_2021_219_Linux_x86_64_cuda_11.4/install_components/Linux_x86_64/21.9"
|
||||
local sdk_dir="$PWD/nvhpc_${nvhpc_release}_Linux_x86_64_cuda_$cuda_ver/install_components/Linux_x86_64/$nvhpc_ver"
|
||||
local bin_dir="$sdk_dir/compilers/bin"
|
||||
"$bin_dir/makelocalrc" "$bin_dir" -x
|
||||
"$bin_dir/makelocalrc" -d "$bin_dir" -x -gpp g++-12 -gcc gcc-12 -g77 gfortran-12
|
||||
|
||||
export_var NVHPC_SDK_DIR "$sdk_dir"
|
||||
export_var NVHPC_CUDA_DIR "$sdk_dir/cuda/11.4"
|
||||
export_var NVHPC_CUDA_DIR "$sdk_dir/cuda/$cuda_ver"
|
||||
|
||||
export_var NVHPC_NVCXX "$bin_dir/nvc++"
|
||||
export_var NVHPC_NVCC "$sdk_dir/cuda/11.4/bin/nvcc"
|
||||
export_var NVHPC_NVCC "$bin_dir/nvcc"
|
||||
export_var NVHPC_CUDA_VER "$cuda_ver"
|
||||
# export_var NVHPC_NVCC "$sdk_dir/cuda/$cuda_ver/bin/nvcc"
|
||||
|
||||
echo "Installed CUDA versions:"
|
||||
ls "$sdk_dir/cuda"
|
||||
@ -160,7 +170,8 @@ setup_nvhpc() {
|
||||
|
||||
setup_aomp() {
|
||||
echo "Preparing AOMP"
|
||||
local AOMP_URL="https://github.com/ROCm-Developer-Tools/aomp/releases/download/rel_11.12-0/aomp_Ubuntu1804_11.12-0_amd64.deb"
|
||||
local aomp_ver="18.0-0"
|
||||
local AOMP_URL="https://github.com/ROCm-Developer-Tools/aomp/releases/download/rel_${aomp_ver}/aomp_Ubuntu2204_${aomp_ver}_amd64.deb"
|
||||
# local AOMP_URL="http://0.0.0.0:8000/aomp_Ubuntu1804_11.12-0_amd64.deb"
|
||||
get_and_install_deb "aomp" "aomp" "$AOMP_URL"
|
||||
|
||||
@ -183,9 +194,10 @@ setup_oclcpu() {
|
||||
|
||||
setup_kokkos() {
|
||||
echo "Preparing Kokkos"
|
||||
local kokkos_ver="3.3.01"
|
||||
local kokkos_ver="4.1.00"
|
||||
local tarball="kokkos-$kokkos_ver.tar.gz"
|
||||
|
||||
|
||||
local url="https://github.com/kokkos/kokkos/archive/$kokkos_ver.tar.gz"
|
||||
# local url="http://localhost:8000/$kokkos_ver.tar.gz"
|
||||
|
||||
@ -197,10 +209,10 @@ setup_kokkos() {
|
||||
|
||||
setup_raja() {
|
||||
echo "Preparing RAJA"
|
||||
local raja_ver="0.13.0"
|
||||
local raja_ver="2023.06.1"
|
||||
local tarball="raja-$raja_ver.tar.gz"
|
||||
|
||||
local url="https://github.com/LLNL/RAJA/releases/download/v0.13.0/RAJA-v$raja_ver.tar.gz"
|
||||
local url="https://github.com/LLNL/RAJA/releases/download/v$raja_ver/RAJA-v$raja_ver.tar.gz"
|
||||
# local url="http://localhost:8000/RAJA-v$raja_ver.tar.gz"
|
||||
|
||||
get_and_untar "$tarball" "$url"
|
||||
@ -211,7 +223,7 @@ setup_raja() {
|
||||
|
||||
setup_tbb() {
|
||||
echo "Preparing TBB"
|
||||
local tbb_ver="2021.2.0"
|
||||
local tbb_ver="2021.9.0"
|
||||
local tarball="oneapi-tbb-$tbb_ver-lin.tgz"
|
||||
|
||||
local url="https://github.com/oneapi-src/oneTBB/releases/download/v$tbb_ver/oneapi-tbb-$tbb_ver-lin.tgz"
|
||||
@ -225,9 +237,9 @@ setup_tbb() {
|
||||
|
||||
setup_clang_gcc() {
|
||||
|
||||
sudo apt-get install -y -qq gcc-10-offload-nvptx gcc-10-offload-amdgcn libtbb2 libtbb-dev g++-10 clang libomp-dev
|
||||
sudo apt-get install -y -qq gcc-12-offload-nvptx gcc-12-offload-amdgcn libtbb2 libtbb-dev g++-12 clang libomp-dev libc6
|
||||
|
||||
export_var GCC_CXX "$(which g++-10)"
|
||||
export_var GCC_CXX "$(which g++-12)"
|
||||
verify_bin_exists "$GCC_CXX"
|
||||
"$GCC_CXX" --version
|
||||
|
||||
@ -248,7 +260,11 @@ setup_clang_gcc() {
|
||||
}
|
||||
|
||||
setup_rocm() {
|
||||
sudo apt-get install -y -qq rocm-dev rocthrust-dev
|
||||
if [ "$SETUP" = true ]; then
|
||||
sudo apt-get install -y rocm-dev rocthrust-dev
|
||||
else
|
||||
echo "Skipping apt setup for ROCm"
|
||||
fi
|
||||
export_var ROCM_PATH "/opt/rocm"
|
||||
export_var PATH "$ROCM_PATH/bin:$PATH" # ROCm needs this for many of their libraries' CMake build to work
|
||||
export_var HIP_CXX "$ROCM_PATH/bin/hipcc"
|
||||
@ -259,7 +275,7 @@ setup_rocm() {
|
||||
|
||||
setup_dpcpp() {
|
||||
|
||||
local nightly="20210106"
|
||||
local nightly="20230615"
|
||||
local tarball="dpcpp-$nightly.tar.gz"
|
||||
|
||||
local url="https://github.com/intel/llvm/releases/download/sycl-nightly/$nightly/dpcpp-compiler.tar.gz"
|
||||
@ -276,22 +292,22 @@ setup_dpcpp() {
|
||||
setup_hipsycl() {
|
||||
|
||||
sudo apt-get install -y -qq libboost-fiber-dev libboost-context-dev
|
||||
local hipsycl_ver="0.9.0"
|
||||
local hipsycl_ver="0.9.1"
|
||||
local tarball="v$hipsycl_ver.tar.gz"
|
||||
local install_dir="$PWD/hipsycl_dist_$hipsycl_ver"
|
||||
|
||||
local url="https://github.com/illuhad/hipSYCL/archive/v$hipsycl_ver.tar.gz"
|
||||
# local url="http://localhost:8000/hipSYCL-$hipsycl_ver.tar.gz"
|
||||
local url="https://github.com/AdaptiveCpp/AdaptiveCpp/archive/v$hipsycl_ver.tar.gz"
|
||||
# local url="http://localhost:8000/AdaptiveCpp-$hipsycl_ver.tar.gz"
|
||||
|
||||
get_and_untar "$tarball" "$url"
|
||||
|
||||
if [ "$SETUP" = true ]; then
|
||||
local src="$PWD/hipSYCL-$hipsycl_ver"
|
||||
local src="$PWD/AdaptiveCpp-$hipsycl_ver"
|
||||
rm -rf "$src/build"
|
||||
rm -rf "$install_dir"
|
||||
cmake "-B$src/build" "-H$src" \
|
||||
-DCMAKE_C_COMPILER="$(which gcc-10)" \
|
||||
-DCMAKE_CXX_COMPILER="$(which g++-10)" \
|
||||
-DCMAKE_C_COMPILER="$(which gcc-12)" \
|
||||
-DCMAKE_CXX_COMPILER="$(which g++-12)" \
|
||||
-DCMAKE_INSTALL_PREFIX="$install_dir" \
|
||||
-DWITH_ROCM_BACKEND=OFF \
|
||||
-DWITH_CUDA_BACKEND=OFF \
|
||||
@ -306,25 +322,20 @@ setup_hipsycl() {
|
||||
check_size
|
||||
}
|
||||
|
||||
setup_computecpp() {
|
||||
echo "TODO ComputeCpp requires registration+login to download"
|
||||
}
|
||||
|
||||
if [ "${GITHUB_ACTIONS:-false}" = true ]; then
|
||||
echo "Running in GitHub Actions, defaulting to special export"
|
||||
TERM=xterm
|
||||
export TERM=xterm
|
||||
|
||||
# drop the lock in case we got one from a failed run
|
||||
rm /var/lib/dpkg/lock-frontend || true
|
||||
rm /var/cache/apt/archives/lock || true
|
||||
|
||||
wget -q -O - "https://repo.radeon.com/rocm/rocm.gpg.key" | sudo apt-key add -
|
||||
echo "deb http://archive.ubuntu.com/ubuntu focal main universe" | sudo tee -a /etc/apt/sources.list
|
||||
echo 'deb [arch=amd64] https://repo.radeon.com/rocm/apt/4.5 ubuntu main' | sudo tee /etc/apt/sources.list.d/rocm.list
|
||||
rm -rf /var/lib/dpkg/lock-frontend || true
|
||||
rm -rf /var/cache/apt/archives/lock || true
|
||||
|
||||
mkdir --parents --mode=0755 /etc/apt/keyrings
|
||||
wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | gpg --dearmor | sudo tee /etc/apt/keyrings/rocm.gpg > /dev/null
|
||||
echo 'deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/5.7 jammy main' | sudo tee /etc/apt/sources.list.d/rocm.list
|
||||
echo -e 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600' | sudo tee /etc/apt/preferences.d/rocm-pin-600
|
||||
sudo apt-get update -qq
|
||||
sudo apt-get install -y -qq cmake
|
||||
sudo apt-get install cmake gcc g++ libelf-dev libdrm-amdgpu1 libnuma-dev
|
||||
|
||||
if [ "$SETUP" = true ]; then
|
||||
echo "Deleting extra packages for space in 2 seconds..."
|
||||
@ -334,6 +345,7 @@ if [ "${GITHUB_ACTIONS:-false}" = true ]; then
|
||||
sudo apt-get autoremove -y
|
||||
check_size
|
||||
fi
|
||||
sudo apt-get upgrade -qq
|
||||
else
|
||||
echo "Running locally, defaulting to standard export"
|
||||
fi
|
||||
@ -362,6 +374,18 @@ setup_cmake() {
|
||||
verify_bin_exists "$CMAKE_3_18_BIN"
|
||||
"$CMAKE_3_18_BIN" --version
|
||||
|
||||
get "cmake-3.20.sh" "$cmake_release/v3.20.4/cmake-3.20.4-linux-x86_64.sh"
|
||||
chmod +x "./cmake-3.20.sh" && "./cmake-3.20.sh" --skip-license --include-subdir
|
||||
export_var CMAKE_3_20_BIN "$PWD/cmake-3.20.4-linux-x86_64/bin/cmake"
|
||||
verify_bin_exists "$CMAKE_3_20_BIN"
|
||||
"$CMAKE_3_20_BIN" --version
|
||||
|
||||
get "cmake-3.24.sh" "$cmake_release/v3.24.4/cmake-3.24.4-linux-x86_64.sh"
|
||||
chmod +x "./cmake-3.24.sh" && "./cmake-3.24.sh" --skip-license --include-subdir
|
||||
export_var CMAKE_3_24_BIN "$PWD/cmake-3.24.4-linux-x86_64/bin/cmake"
|
||||
verify_bin_exists "$CMAKE_3_24_BIN"
|
||||
"$CMAKE_3_24_BIN" --version
|
||||
|
||||
check_size
|
||||
|
||||
}
|
||||
@ -379,6 +403,10 @@ if [ "$PARALLEL" = true ]; then
|
||||
setup_tbb &
|
||||
wait
|
||||
else
|
||||
# these need apt
|
||||
setup_clang_gcc
|
||||
setup_rocm
|
||||
setup_hipsycl
|
||||
setup_cmake
|
||||
setup_aocc
|
||||
setup_oclcpu
|
||||
@ -388,10 +416,6 @@ else
|
||||
setup_kokkos
|
||||
setup_raja
|
||||
setup_tbb
|
||||
# these need apt
|
||||
setup_clang_gcc
|
||||
setup_rocm
|
||||
setup_hipsycl
|
||||
fi
|
||||
|
||||
echo "Done!"
|
||||
|
||||
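For context, the workflow earlier in this diff consumes this script with the line below; the argument meanings (cache directory, action, parallel flag) are inferred from how the script is written, so treat this as a sketch rather than documented usage.

```
# Populate ./compilers and export the *_BIN / *_CXX variables, exactly as the CI step does
source ./src/ci-prepare-bionic.sh ./compilers VARS false || true
```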
src/ci-test-compile.sh

@@ -120,9 +120,20 @@ run_build() {
|
||||
# CLANG_OMP_OFFLOAD_NVIDIA=false
|
||||
###
|
||||
|
||||
NV_ARCH_CC="70"
|
||||
AMD_ARCH="gfx_903"
|
||||
NV_ARCH="sm_70"
|
||||
NV_ARCH_CCXY="cuda11.4,cc80"
|
||||
NV_ARCH="sm_${NV_ARCH_CC}"
|
||||
NV_ARCH_CCXY="cuda${NVHPC_CUDA_VER:?},cc80"
|
||||
|
||||
check_cmake_ver(){
|
||||
local current=$("$CMAKE_BIN" --version | head -n 1 | cut -d ' ' -f3)
|
||||
local required=$1
|
||||
if [ "$(printf '%s\n' "$required" "$current" | sort -V | head -n1)" = "$required" ]; then
|
||||
return 0
|
||||
else
|
||||
return 1
|
||||
fi
|
||||
}
|
||||
|
||||
build_gcc() {
|
||||
local name="gcc_build"
|
||||
@ -135,49 +146,61 @@ build_gcc() {
|
||||
"./$BUILD_DIR/omp_$name/omp-stream" -s 1048576 -n 10
|
||||
fi
|
||||
|
||||
# some distributions like Ubuntu bionic implements std par with TBB, so conditionally link it here
|
||||
run_build $name "${GCC_CXX:?}" std-data "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}"
|
||||
run_build $name "${GCC_CXX:?}" std-indices "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}"
|
||||
run_build $name "${GCC_CXX:?}" std-ranges "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}"
|
||||
for use_onedpl in OFF OPENMP TBB; do
|
||||
case "$use_onedpl" in
|
||||
OFF) dpl_conditional_flags="-DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}" ;;
|
||||
*) dpl_conditional_flags="-DFETCH_ONEDPL=ON -DFETCH_TBB=ON -DUSE_TBB=ON -DCXX_EXTRA_FLAGS=-D_GLIBCXX_USE_TBB_PAR_BACKEND=0" ;;
|
||||
esac
|
||||
# some distributions like Ubuntu bionic implements std par with TBB, so conditionally link it here
|
||||
run_build $name "${GCC_CXX:?}" std-data "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl"
|
||||
run_build $name "${GCC_CXX:?}" std-indices "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl"
|
||||
run_build $name "${GCC_CXX:?}" std-ranges "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl"
|
||||
done
|
||||
|
||||
run_build $name "${GCC_CXX:?}" tbb "$cxx -DONE_TBB_DIR=$TBB_LIB"
|
||||
run_build $name "${GCC_CXX:?}" tbb "$cxx" # build TBB again with the system TBB
|
||||
run_build $name "${GCC_CXX:?}" tbb "$cxx -DUSE_VECTOR=ON" # build with vectors
|
||||
|
||||
if [ "${GCC_OMP_OFFLOAD_AMD:-false}" != "false" ]; then
|
||||
run_build "amd_$name" "${GCC_CXX:?}" acc "$cxx -DCXX_EXTRA_FLAGS=-foffload=amdgcn-amdhsa"
|
||||
run_build "amd_$name" "${GCC_CXX:?}" acc "$cxx -DCXX_EXTRA_FLAGS=-foffload=amdgcn-amdhsa;-fno-stack-protector;-fcf-protection=none"
|
||||
run_build "amd_$name" "${GCC_CXX:?}" omp "$cxx -DOFFLOAD=AMD:$AMD_ARCH"
|
||||
fi
|
||||
|
||||
if [ "${GCC_OMP_OFFLOAD_NVIDIA:-false}" != "false" ]; then
|
||||
run_build "nvidia_$name" "${GCC_CXX:?}" acc "$cxx -DCXX_EXTRA_FLAGS=-foffload=nvptx-none"
|
||||
run_build "nvidia_$name" "${GCC_CXX:?}" acc "$cxx -DCXX_EXTRA_FLAGS=-foffload=nvptx-none;-fno-stack-protector;-fcf-protection=none"
|
||||
run_build "nvidia_$name" "${GCC_CXX:?}" omp "$cxx -DOFFLOAD=NVIDIA:$NV_ARCH"
|
||||
fi
|
||||
|
||||
run_build $name "${GCC_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH"
|
||||
run_build $name "${GCC_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DMEM=MANAGED"
|
||||
run_build $name "${GCC_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DMEM=PAGEFAULT"
|
||||
# run_build $name "${CC_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_CUDA=ON"
|
||||
run_build "cuda_$name" "${GCC_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON"
|
||||
if check_cmake_ver "3.16.0"; then
|
||||
# run_build $name "${CC_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_CUDA=ON"
|
||||
run_build "cuda_$name" "${GCC_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON"
|
||||
else
|
||||
echo "Skipping Kokkos models due to CMake version requirement"
|
||||
fi
|
||||
run_build $name "${GCC_CXX:?}" ocl "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}"
|
||||
run_build $name "${GCC_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?}"
|
||||
if check_cmake_ver "3.20.0"; then
|
||||
run_build $name "${GCC_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?} -DENABLE_OPENMP=ON"
|
||||
else
|
||||
echo "Skipping RAJA models due to CMake version requirement"
|
||||
fi
|
||||
|
||||
# FIXME fails due to https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100102
|
||||
# FIXME we also got https://github.com/NVIDIA/nccl/issues/494
|
||||
if check_cmake_ver "3.20.0"; then
|
||||
run_build "cuda_$name" "${GCC_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?} \
|
||||
-DENABLE_CUDA=ON \
|
||||
-DTARGET=NVIDIA \
|
||||
-DCUDA_TOOLKIT_ROOT_DIR=${NVHPC_CUDA_DIR:?} \
|
||||
-DCUDA_ARCH=$NV_ARCH"
|
||||
else
|
||||
echo "Skipping RAJA models due to CMake version requirement"
|
||||
fi
|
||||
|
||||
# run_build "cuda_$name" "${GCC_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?} \
|
||||
# -DENABLE_CUDA=ON \
|
||||
# -DTARGET=NVIDIA \
|
||||
# -DCUDA_TOOLKIT_ROOT_DIR=${NVHPC_CUDA_DIR:?} \
|
||||
# -DCUDA_ARCH=$NV_ARCH"
|
||||
|
||||
|
||||
# CMake >= 3.15 only due to Nvidia's Thrust CMake requirements
|
||||
local current=$("$CMAKE_BIN" --version | head -n 1 | cut -d ' ' -f3)
|
||||
local required="3.15.0"
|
||||
if [ "$(printf '%s\n' "$required" "$current" | sort -V | head -n1)" = "$required" ]; then
|
||||
run_build $name "${GCC_CXX:?}" thrust "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/include -DTHRUST_IMPL=CUDA -DBACKEND=CUDA"
|
||||
run_build $name "${GCC_CXX:?}" thrust "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/include -DTHRUST_IMPL=CUDA -DBACKEND=OMP"
|
||||
run_build $name "${GCC_CXX:?}" thrust "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/include -DTHRUST_IMPL=CUDA -DBACKEND=CPP"
|
||||
if check_cmake_ver "3.18.0"; then # CMake >= 3.15 only due to Nvidia's Thrust CMake requirements
|
||||
run_build $name "${GCC_CXX:?}" thrust "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH_CC -DSDK_DIR=$NVHPC_CUDA_DIR/lib64/cmake -DTHRUST_IMPL=CUDA -DBACKEND=CUDA"
|
||||
# run_build $name "${GCC_CXX:?}" thrust "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/lib64/cmake -DTHRUST_IMPL=CUDA -DBACKEND=OMP" # FIXME
|
||||
run_build $name "${GCC_CXX:?}" thrust "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH_CC -DSDK_DIR=$NVHPC_CUDA_DIR/lib64/cmake -DTHRUST_IMPL=CUDA -DBACKEND=CPP"
|
||||
|
||||
# FIXME CUDA Thrust + TBB throws the following error:
|
||||
# /usr/lib/gcc/x86_64-linux-gnu/9/include/avx512fintrin.h(9146): error: identifier "__builtin_ia32_rndscaless_round" is undefined
|
||||
@ -187,9 +210,9 @@ build_gcc() {
|
||||
# /usr/lib/gcc/x86_64-linux-gnu/9/include/avx512dqintrin.h(1365): error: identifier "__builtin_ia32_fpclassss" is undefined
|
||||
# /usr/lib/gcc/x86_64-linux-gnu/9/include/avx512dqintrin.h(1372): error: identifier "__builtin_ia32_fpclasssd" is undefined
|
||||
|
||||
# run_build $name "${GCC_CXX:?}" THRUST "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/include -DTHRUST_IMPL=CUDA -DBACKEND=TBB"
|
||||
# run_build $name "${GCC_CXX:?}" THRUST "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/lib64/cmake -DTHRUST_IMPL=CUDA -DBACKEND=TBB"
|
||||
else
|
||||
echo "CMake version ${current} < ${required}, skipping Thrust models"
|
||||
echo "Skipping Thrust models due to CMake version requirement"
|
||||
fi
|
||||
|
||||
}
|
||||
@ -207,28 +230,39 @@ build_clang() {
|
||||
run_build "nvidia_$name" "${GCC_CXX:?}" omp "$cxx -DOFFLOAD=NVIDIA:$NV_ARCH"
|
||||
fi
|
||||
|
||||
if check_cmake_ver "3.20.0"; then
|
||||
run_build $name "${CLANG_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?} -DENABLE_OPENMP=ON"
|
||||
else
|
||||
echo "Skipping RAJA models due to CMake version requirement"
|
||||
fi
|
||||
run_build $name "${CLANG_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH"
|
||||
run_build $name "${CLANG_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DMEM=MANAGED"
|
||||
run_build $name "${CLANG_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DMEM=PAGEFAULT"
|
||||
run_build $name "${CLANG_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON"
|
||||
if check_cmake_ver "3.16.0"; then
|
||||
run_build $name "${CLANG_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON"
|
||||
else
|
||||
echo "Skipping Kokkos models due to CMake version requirement"
|
||||
fi
|
||||
run_build $name "${CLANG_CXX:?}" ocl "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}"
|
||||
run_build $name "${CLANG_CXX:?}" std-data "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}"
|
||||
run_build $name "${CLANG_CXX:?}" std-indices "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}"
|
||||
# run_build $name "${LANG_CXX:?}" std20 "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" # not yet supported
|
||||
run_build $name "${CLANG_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?}"
|
||||
run_build $name "${CLANG_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH"
|
||||
run_build $name "${CLANG_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DMEM=MANAGED"
|
||||
run_build $name "${CLANG_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DMEM=PAGEFAULT"
|
||||
run_build $name "${CLANG_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON"
|
||||
run_build $name "${CLANG_CXX:?}" ocl "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}"
|
||||
run_build $name "${CLANG_CXX:?}" std-data "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}"
|
||||
run_build $name "${CLANG_CXX:?}" std-indices "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}"
|
||||
# run_build $name "${LANG_CXX:?}" std20 "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" # not yet supported
|
||||
|
||||
for use_onedpl in OFF OPENMP TBB; do
|
||||
case "$use_onedpl" in
|
||||
OFF) dpl_conditional_flags="-DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" ;;
|
||||
*) dpl_conditional_flags="-DFETCH_ONEDPL=ON -DFETCH_TBB=ON -DUSE_TBB=ON -DCXX_EXTRA_FLAGS=-D_GLIBCXX_USE_TBB_PAR_BACKEND=0" ;;
|
||||
esac
|
||||
run_build $name "${CLANG_CXX:?}" std-data "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl"
|
||||
run_build $name "${CLANG_CXX:?}" std-indices "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl"
|
||||
# run_build $name "${CLANG_CXX:?}" std-ranges "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl" # not yet supported
|
||||
done
|
||||
|
||||
run_build $name "${CLANG_CXX:?}" tbb "$cxx -DONE_TBB_DIR=$TBB_LIB"
|
||||
run_build $name "${CLANG_CXX:?}" tbb "$cxx" # build TBB again with the system TBB
|
||||
|
||||
run_build $name "${CLANG_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?}"
|
||||
run_build $name "${CLANG_CXX:?}" tbb "$cxx -DUSE_VECTOR=ON" # build with vectors
|
||||
if check_cmake_ver "3.20.0"; then
|
||||
run_build $name "${CLANG_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?} -DENABLE_OPENMP=ON"
|
||||
else
|
||||
echo "Skipping RAJA models due to CMake version requirement"
|
||||
fi
|
||||
# no clang /w RAJA+cuda because it needs nvcc which needs gcc
|
||||
}
|
||||
|
||||
@ -237,6 +271,7 @@ build_nvhpc() {
|
||||
local cxx="-DCMAKE_CXX_COMPILER=${NVHPC_NVCXX:?}"
|
||||
run_build $name "${NVHPC_NVCXX:?}" std-data "$cxx -DNVHPC_OFFLOAD=$NV_ARCH_CCXY"
|
||||
run_build $name "${NVHPC_NVCXX:?}" std-indices "$cxx -DNVHPC_OFFLOAD=$NV_ARCH_CCXY"
|
||||
|
||||
run_build $name "${NVHPC_NVCXX:?}" acc "$cxx -DTARGET_DEVICE=gpu -DTARGET_PROCESSOR=px -DCUDA_ARCH=$NV_ARCH_CCXY"
|
||||
run_build $name "${NVHPC_NVCXX:?}" acc "$cxx -DTARGET_DEVICE=multicore -DTARGET_PROCESSOR=zen"
|
||||
}
|
||||
@ -254,6 +289,8 @@ build_hip() {
|
||||
local name="hip_build"
|
||||
|
||||
run_build $name "${HIP_CXX:?}" hip "-DCMAKE_CXX_COMPILER=${HIP_CXX:?}"
|
||||
run_build $name "${HIP_CXX:?}" hip "-DCMAKE_CXX_COMPILER=${HIP_CXX:?} -DMEM=MANAGED"
|
||||
run_build $name "${HIP_CXX:?}" hip "-DCMAKE_CXX_COMPILER=${HIP_CXX:?} -DMEM=PAGEFAULT"
|
||||
|
||||
run_build $name "${GCC_CXX:?}" thrust "-DCMAKE_CXX_COMPILER=${HIP_CXX:?} -DSDK_DIR=$ROCM_PATH -DTHRUST_IMPL=ROCM"
|
||||
}
|
||||
@ -275,15 +312,18 @@ build_icpc() {
|
||||
local cxx="-DCMAKE_CXX_COMPILER=${ICPC_CXX:?}"
|
||||
run_build $name "${ICPC_CXX:?}" omp "$cxx"
|
||||
run_build $name "${ICPC_CXX:?}" ocl "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}"
|
||||
run_build $name "${ICPC_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?}"
|
||||
run_build $name "${ICPC_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON"
|
||||
}
|
||||
if check_cmake_ver "3.20.0"; then
|
||||
run_build $name "${ICPC_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?} -DENABLE_OPENMP=ON"
|
||||
else
|
||||
echo "Skipping RAJA models due to CMake version requirement"
|
||||
fi
|
||||
|
||||
if check_cmake_ver "3.16.0"; then
|
||||
run_build $name "${ICPC_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON"
|
||||
else
|
||||
echo "Skipping Kokkos models due to CMake version requirement"
|
||||
fi
|
||||
|
||||
build_computecpp() {
|
||||
run_build computecpp_build "compute++" sycl "-DCMAKE_CXX_COMPILER=${GCC_CXX:?} \
|
||||
-DSYCL_COMPILER=COMPUTECPP \
|
||||
-DSYCL_COMPILER_DIR=${COMPUTECPP_DIR:?} \
|
||||
-DOpenCL_LIBRARY=${OCL_LIB:?}"
|
||||
}
|
||||
|
||||
build_dpcpp() {
|
||||
|
||||
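The check_cmake_ver helper added above is what lets one script cover the CMake 3.13/3.18/3.20/3.24 matrix. A minimal sketch of the guard pattern, reusing the function and one of the CMAKE_*_BIN variables exported by ci-prepare-bionic.sh:

```
# Gate version-sensitive models on the currently selected CMake binary
CMAKE_BIN="${CMAKE_3_20_BIN:?}"
if check_cmake_ver "3.16.0"; then
  echo "CMake >= 3.16: Kokkos in-tree builds enabled"
else
  echo "Skipping Kokkos models due to CMake version requirement"
fi
```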
src/cuda/CUDAStream.cu

@@ -42,41 +42,57 @@ CUDAStream<T>::CUDAStream(const int ARRAY_SIZE, const int device_index)
|
||||
// Print out device information
|
||||
std::cout << "Using CUDA device " << getDeviceName(device_index) << std::endl;
|
||||
std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl;
|
||||
|
||||
#if defined(MANAGED)
|
||||
std::cout << "Memory: MANAGED" << std::endl;
|
||||
#elif defined(PAGEFAULT)
|
||||
std::cout << "Memory: PAGEFAULT" << std::endl;
|
||||
#else
|
||||
std::cout << "Memory: DEFAULT" << std::endl;
|
||||
#endif
|
||||
array_size = ARRAY_SIZE;
|
||||
|
||||
|
||||
// Query device for sensible dot kernel block count
|
||||
cudaDeviceProp props;
|
||||
cudaGetDeviceProperties(&props, device_index);
|
||||
check_error();
|
||||
dot_num_blocks = props.multiProcessorCount * 4;
|
||||
|
||||
// Allocate the host array for partial sums for dot kernels
|
||||
sums = (T*)malloc(sizeof(T) * DOT_NUM_BLOCKS);
|
||||
sums = (T*)malloc(sizeof(T) * dot_num_blocks);
|
||||
|
||||
size_t array_bytes = sizeof(T);
|
||||
array_bytes *= ARRAY_SIZE;
|
||||
size_t total_bytes = array_bytes * 4;
|
||||
std::cout << "Reduction kernel config: " << dot_num_blocks << " groups of (fixed) size " << TBSIZE << std::endl;
|
||||
|
||||
// Check buffers fit on the device
|
||||
cudaDeviceProp props;
|
||||
cudaGetDeviceProperties(&props, 0);
|
||||
if (props.totalGlobalMem < 3*ARRAY_SIZE*sizeof(T))
|
||||
if (props.totalGlobalMem < total_bytes)
|
||||
throw std::runtime_error("Device does not have enough memory for all 3 buffers");
|
||||
|
||||
// Create device buffers
|
||||
#if defined(MANAGED)
|
||||
cudaMallocManaged(&d_a, ARRAY_SIZE*sizeof(T));
|
||||
cudaMallocManaged(&d_a, array_bytes);
|
||||
check_error();
|
||||
cudaMallocManaged(&d_b, ARRAY_SIZE*sizeof(T));
|
||||
cudaMallocManaged(&d_b, array_bytes);
|
||||
check_error();
|
||||
cudaMallocManaged(&d_c, ARRAY_SIZE*sizeof(T));
|
||||
cudaMallocManaged(&d_c, array_bytes);
|
||||
check_error();
|
||||
cudaMallocManaged(&d_sum, DOT_NUM_BLOCKS*sizeof(T));
|
||||
cudaMallocManaged(&d_sum, dot_num_blocks*sizeof(T));
|
||||
check_error();
|
||||
#elif defined(PAGEFAULT)
|
||||
d_a = (T*)malloc(sizeof(T)*ARRAY_SIZE);
|
||||
d_b = (T*)malloc(sizeof(T)*ARRAY_SIZE);
|
||||
d_c = (T*)malloc(sizeof(T)*ARRAY_SIZE);
|
||||
d_sum = (T*)malloc(sizeof(T)*DOT_NUM_BLOCKS);
|
||||
d_a = (T*)malloc(array_bytes);
|
||||
d_b = (T*)malloc(array_bytes);
|
||||
d_c = (T*)malloc(array_bytes);
|
||||
d_sum = (T*)malloc(sizeof(T)*dot_num_blocks);
|
||||
#else
|
||||
cudaMalloc(&d_a, ARRAY_SIZE*sizeof(T));
|
||||
cudaMalloc(&d_a, array_bytes);
|
||||
check_error();
|
||||
cudaMalloc(&d_b, ARRAY_SIZE*sizeof(T));
|
||||
cudaMalloc(&d_b, array_bytes);
|
||||
check_error();
|
||||
cudaMalloc(&d_c, ARRAY_SIZE*sizeof(T));
|
||||
cudaMalloc(&d_c, array_bytes);
|
||||
check_error();
|
||||
cudaMalloc(&d_sum, DOT_NUM_BLOCKS*sizeof(T));
|
||||
cudaMalloc(&d_sum, dot_num_blocks*sizeof(T));
|
||||
check_error();
|
||||
#endif
|
||||
}
|
||||
@ -237,7 +253,7 @@ __global__ void dot_kernel(const T * a, const T * b, T * sum, int array_size)
|
||||
int i = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
const size_t local_i = threadIdx.x;
|
||||
|
||||
tb_sum[local_i] = 0.0;
|
||||
tb_sum[local_i] = {};
|
||||
for (; i < array_size; i += blockDim.x*gridDim.x)
|
||||
tb_sum[local_i] += a[i] * b[i];
|
||||
|
||||
@ -257,19 +273,19 @@ __global__ void dot_kernel(const T * a, const T * b, T * sum, int array_size)
|
||||
template <class T>
|
||||
T CUDAStream<T>::dot()
|
||||
{
|
||||
dot_kernel<<<DOT_NUM_BLOCKS, TBSIZE>>>(d_a, d_b, d_sum, array_size);
|
||||
dot_kernel<<<dot_num_blocks, TBSIZE>>>(d_a, d_b, d_sum, array_size);
|
||||
check_error();
|
||||
|
||||
#if defined(MANAGED) || defined(PAGEFAULT)
|
||||
cudaDeviceSynchronize();
|
||||
check_error();
|
||||
#else
|
||||
cudaMemcpy(sums, d_sum, DOT_NUM_BLOCKS*sizeof(T), cudaMemcpyDeviceToHost);
|
||||
cudaMemcpy(sums, d_sum, dot_num_blocks*sizeof(T), cudaMemcpyDeviceToHost);
|
||||
check_error();
|
||||
#endif
|
||||
|
||||
T sum = 0.0;
|
||||
for (int i = 0; i < DOT_NUM_BLOCKS; i++)
|
||||
for (int i = 0; i < dot_num_blocks; i++)
|
||||
{
|
||||
#if defined(MANAGED) || defined(PAGEFAULT)
|
||||
sum += d_sum[i];
|
||||
|
||||
src/cuda/CUDAStream.h

@@ -13,16 +13,9 @@

#include "Stream.h"

#if defined(PAGEFAULT)
#define IMPLEMENTATION_STRING "CUDA - Page Fault"
#elif defined(MANAGED)
#define IMPLEMENTATION_STRING "CUDA - Managed Memory"
#else
#define IMPLEMENTATION_STRING "CUDA"
#endif
#define IMPLEMENTATION_STRING "CUDA"

#define TBSIZE 1024
#define DOT_NUM_BLOCKS 256

template <class T>
class CUDAStream : public Stream<T>
@@ -40,6 +33,8 @@ class CUDAStream : public Stream<T>
T *d_c;
T *d_sum;

// Number of blocks for dot kernel
int dot_num_blocks;

public:
src/cuda/model.cmake

@@ -29,10 +29,11 @@ macro(setup)
endif()

enable_language(CUDA)
register_definitions(MEM=${MEM})
register_definitions(${MEM})

# add -forward-unknown-to-host-compiler for compatibility reasons
set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "-forward-unknown-to-host-compiler -arch=${CUDA_ARCH}" ${CUDA_EXTRA_FLAGS})
set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "-forward-unknown-to-host-compiler" "-arch=${CUDA_ARCH}" ${CUDA_EXTRA_FLAGS})
string(REPLACE ";" " " CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS}")

# CMake defaults to -O2 for CUDA at Release, let's wipe that and use the global RELEASE_FLAG
# appended later
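The -DMEM plumbing and the CMAKE_CUDA_FLAGS quoting fixed above are exercised by configure lines such as the following sketch, lifted from the CUDA builds in src/ci-test-compile.sh; NVHPC_NVCC comes from ci-prepare-bionic.sh and sm_70 is the CI's illustrative architecture, not a requirement.

```
# CUDA model with managed memory; compiler and architecture values are taken from the CI scripts
cmake -Bbuild -H. -DMODEL=cuda \
  -DCMAKE_CXX_COMPILER=g++-12 \
  -DCMAKE_CUDA_COMPILER="${NVHPC_NVCC:?}" \
  -DCUDA_ARCH=sm_70 -DMEM=MANAGED
cmake --build build
```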
src/dpl_shim.h (new file, 76 lines)

@@ -0,0 +1,76 @@
#pragma once

#include <cstdlib>
#include <cstddef>

#ifndef ALIGNMENT
#define ALIGNMENT (2*1024*1024) // 2MB
#endif

#ifdef USE_ONEDPL

// oneDPL C++17 PSTL

#include <oneapi/dpl/execution>
#include <oneapi/dpl/algorithm>
#include <oneapi/dpl/numeric>

#if ONEDPL_USE_DPCPP_BACKEND

#include <CL/sycl.hpp>

const static auto exe_policy = oneapi::dpl::execution::device_policy<>{
    oneapi::dpl::execution::make_device_policy(cl::sycl::default_selector{})
};

template<typename T>
T *alloc_raw(size_t size) { return sycl::malloc_shared<T>(size, exe_policy.queue()); }

template<typename T>
void dealloc_raw(T *ptr) { sycl::free(ptr, exe_policy.queue()); }

#else

// auto exe_policy = dpl::execution::seq;
// auto exe_policy = dpl::execution::par;
static constexpr auto exe_policy = dpl::execution::par_unseq;
#define USE_STD_PTR_ALLOC_DEALLOC

#endif

#else

// Normal C++17 PSTL

#include <algorithm>
#include <execution>
#include <numeric>

// auto exe_policy = std::execution::seq;
// auto exe_policy = std::execution::par;
static constexpr auto exe_policy = std::execution::par_unseq;
#define USE_STD_PTR_ALLOC_DEALLOC


#endif

#ifdef USE_STD_PTR_ALLOC_DEALLOC

#if defined(__HIPSYCL__) || defined(__OPENSYCL__)
#include <CL/sycl.hpp>

// TODO We temporarily use malloc_shared/free here for hipSYCL stdpar because there's a linking issue if we let it hijack new/delete
// for this to work, we compile with --hipsycl-stdpar-system-usm so that hijacking is disabled
static cl::sycl::queue queue{cl::sycl::default_selector_v};
template <typename T> T *alloc_raw(size_t size) { return cl::sycl::malloc_shared<T>(size, queue); }
template <typename T> void dealloc_raw(T *ptr) { cl::sycl::free(ptr, queue); }

#else
template<typename T>
T *alloc_raw(size_t size) { return (T *) aligned_alloc(ALIGNMENT, sizeof(T) * size); }

template<typename T>
void dealloc_raw(T *ptr) { free(ptr); }
#endif

#endif
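dpl_shim.h selects between the plain C++17 PSTL, oneDPL host backends, and the oneDPL DPC++ backend. The CI only exercises the OFF/OPENMP/TBB values of USE_ONEDPL, so the following DPC++-backend configure line is an untested sketch whose values (icpx, DPCPP) are inferred from the CMake logic earlier in this diff:

```
# Hypothetical: std-indices through oneDPL's DPC++ backend (not part of the CI matrix in this diff)
cmake -Bbuild -H. -DMODEL=std-indices \
  -DCMAKE_CXX_COMPILER=icpx \
  -DUSE_ONEDPL=DPCPP -DFETCH_ONEDPL=ON
cmake --build build
```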
src/fortran/ArrayStream.F90 (new file, 105 lines)
@ -0,0 +1,105 @@
|
||||
module ArrayStream
|
||||
use, intrinsic :: ISO_Fortran_env
|
||||
use BabelStreamTypes
|
||||
|
||||
implicit none
|
||||
|
||||
character(len=5), parameter :: implementation_name = "Array"
|
||||
|
||||
integer(kind=StreamIntKind) :: N
|
||||
|
||||
real(kind=REAL64), allocatable :: A(:), B(:), C(:)
|
||||
|
||||
contains
|
||||
|
||||
subroutine list_devices()
|
||||
implicit none
|
||||
integer :: num
|
||||
write(*,'(a36,a5)') "Listing devices is not supported by ", implementation_name
|
||||
end subroutine list_devices
|
||||
|
||||
subroutine set_device(dev)
|
||||
implicit none
|
||||
integer, intent(in) :: dev
|
||||
write(*,'(a32,a5)') "Device != 0 is not supported by ", implementation_name
|
||||
end subroutine set_device
|
||||
|
||||
subroutine alloc(array_size)
|
||||
implicit none
|
||||
integer(kind=StreamIntKind) :: array_size
|
||||
integer :: err
|
||||
N = array_size
|
||||
allocate( A(1:N), B(1:N), C(1:N), stat=err)
|
||||
if (err .ne. 0) then
|
||||
write(*,'(a20,i3)') 'allocate returned ',err
|
||||
stop 1
|
||||
endif
|
||||
end subroutine alloc
|
||||
|
||||
subroutine dealloc()
|
||||
implicit none
|
||||
integer :: err
|
||||
deallocate( A, B, C, stat=err)
|
||||
if (err .ne. 0) then
|
||||
write(*,'(a20,i3)') 'deallocate returned ',err
|
||||
stop 1
|
||||
endif
|
||||
end subroutine dealloc
|
||||
|
||||
subroutine init_arrays(initA, initB, initC)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: initA, initB, initC
|
||||
A = initA
|
||||
B = initB
|
||||
C = initC
|
||||
end subroutine init_arrays
|
||||
|
||||
subroutine read_arrays(h_A, h_B, h_C)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
|
||||
h_A = A
|
||||
h_B = B
|
||||
h_C = C
|
||||
end subroutine read_arrays
|
||||
|
||||
subroutine copy()
|
||||
implicit none
|
||||
C = A
|
||||
end subroutine copy
|
||||
|
||||
subroutine add()
|
||||
implicit none
|
||||
C = A + B
|
||||
end subroutine add
|
||||
|
||||
subroutine mul(startScalar)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: startScalar
|
||||
real(kind=REAL64) :: scalar
|
||||
scalar = startScalar
|
||||
B = scalar * C
|
||||
end subroutine mul
|
||||
|
||||
subroutine triad(startScalar)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: startScalar
|
||||
real(kind=REAL64) :: scalar
|
||||
scalar = startScalar
|
||||
A = B + scalar * C
|
||||
end subroutine triad
|
||||
|
||||
subroutine nstream(startScalar)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: startScalar
|
||||
real(kind=REAL64) :: scalar
|
||||
scalar = startScalar
|
||||
A = A + B + scalar * C
|
||||
end subroutine nstream
|
||||
|
||||
function dot() result(s)
|
||||
implicit none
|
||||
real(kind=REAL64) :: s
|
||||
s = dot_product(A,B)
|
||||
end function dot
|
||||
|
||||
end module ArrayStream
|
||||
src/fortran/BabelStreamTypes.F90 (new file, 21 lines)

@@ -0,0 +1,21 @@
module BabelStreamTypes
use, intrinsic :: ISO_Fortran_env, only: REAL64,REAL32,INT64,INT32

implicit none

#ifdef USE_FLOAT
integer, parameter :: StreamRealKind = REAL32
character(len=6) :: StreamRealName = "REAL32"
#else
integer, parameter :: StreamRealKind = REAL64
character(len=6) :: StreamRealName = "REAL64"
#endif

#ifdef USE_INT32
#warning There is no checking for overflowing INT32, so be careful.
integer, parameter :: StreamIntKind = INT32
#else
integer, parameter :: StreamIntKind = INT64
#endif

end module BabelStreamTypes
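The Fortran implementation has its own build setup that is not part of this diff; purely to illustrate the USE_FLOAT/USE_INT32 preprocessor switches above and the module dependency order, here is a hypothetical manual compile of the two modules shown so far:

```
# Illustrative only: BabelStreamTypes must be compiled before modules that `use` it, e.g. ArrayStream
gfortran -O3 -DUSE_FLOAT -c BabelStreamTypes.F90
gfortran -O3 -c ArrayStream.F90
```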
src/fortran/CUDAKernelStream.F90 (new file, 230 lines)
@ -0,0 +1,230 @@
|
||||
module CUDAKernelStream
|
||||
use, intrinsic :: ISO_Fortran_env
|
||||
use BabelStreamTypes
|
||||
|
||||
implicit none
|
||||
|
||||
character(len=10), parameter :: implementation_name = "CUDAKernel"
|
||||
|
||||
integer(kind=StreamIntKind) :: N
|
||||
|
||||
#ifdef USE_MANAGED
|
||||
real(kind=REAL64), allocatable, managed :: A(:), B(:), C(:)
|
||||
#else
|
||||
real(kind=REAL64), allocatable, device :: A(:), B(:), C(:)
|
||||
#endif
|
||||
|
||||
contains
|
||||
|
||||
subroutine list_devices()
|
||||
use cudafor
|
||||
implicit none
|
||||
integer :: num, err
|
||||
err = cudaGetDeviceCount(num)
|
||||
if (err.ne.0) then
|
||||
write(*,'(a)') "cudaGetDeviceCount failed"
|
||||
write(*,'(a)') cudaGetErrorString(err)
|
||||
stop
|
||||
else if (num.eq.0) then
|
||||
write(*,'(a17)') "No devices found."
|
||||
else
|
||||
write(*,'(a10,i1,a8)') "There are ",num," devices."
|
||||
end if
|
||||
end subroutine list_devices
|
||||
|
||||
subroutine set_device(dev)
|
||||
use cudafor
|
||||
implicit none
|
||||
integer, intent(in) :: dev
|
||||
integer :: num, err
|
||||
err = cudaGetDeviceCount(num)
|
||||
if (err.ne.0) then
|
||||
write(*,'(a)') "cudaGetDeviceCount failed"
|
||||
write(*,'(a)') cudaGetErrorString(err)
|
||||
stop
|
||||
else if (num.eq.0) then
|
||||
write(*,'(a17)') "No devices found."
|
||||
stop
|
||||
else if (dev.ge.num) then
|
||||
write(*,'(a21)') "Invalid device index."
|
||||
stop
|
||||
else
|
||||
err = cudaSetDevice(dev)
|
||||
if (err.ne.0) then
|
||||
write(*,'(a)') "cudaSetDevice failed"
|
||||
write(*,'(a)') cudaGetErrorString(err)
|
||||
stop
|
||||
end if
|
||||
end if
|
||||
end subroutine set_device
|
||||
|
||||
subroutine alloc(array_size)
|
||||
implicit none
|
||||
integer(kind=StreamIntKind) :: array_size
|
||||
integer :: err
|
||||
N = array_size
|
||||
allocate( A(1:N), B(1:N), C(1:N), stat=err)
|
||||
if (err .ne. 0) then
|
||||
write(*,'(a20,i3)') 'allocate returned ',err
|
||||
stop 1
|
||||
endif
|
||||
end subroutine alloc
|
||||
|
||||
subroutine dealloc()
|
||||
implicit none
|
||||
integer :: err
|
||||
deallocate( A, B, C, stat=err)
|
||||
if (err .ne. 0) then
|
||||
write(*,'(a20,i3)') 'deallocate returned ',err
|
||||
stop 1
|
||||
endif
|
||||
end subroutine dealloc
|
||||
|
||||
subroutine init_arrays(initA, initB, initC)
|
||||
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: initA, initB, initC
|
||||
integer(kind=StreamIntKind) :: i
|
||||
integer :: err
|
||||
A = initA
|
||||
B = initB
|
||||
C = initC
|
||||
err = cudaDeviceSynchronize()
|
||||
if (err.ne.0) then
|
||||
write(*,'(a)') "cudaDeviceSynchronize failed"
|
||||
write(*,'(a)') cudaGetErrorString(err)
|
||||
stop
|
||||
endif
|
||||
end subroutine init_arrays
|
||||
|
||||
subroutine read_arrays(h_A, h_B, h_C)
|
||||
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
|
||||
implicit none
|
||||
real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
|
||||
integer(kind=StreamIntKind) :: i
|
||||
integer :: err
|
||||
h_A = A
|
||||
h_B = B
|
||||
h_C = C
|
||||
err = cudaDeviceSynchronize()
|
||||
if (err.ne.0) then
|
||||
write(*,'(a)') "cudaDeviceSynchronize failed"
|
||||
write(*,'(a)') cudaGetErrorString(err)
|
||||
stop
|
||||
endif
|
||||
end subroutine read_arrays
|
||||
|
||||
subroutine copy()
|
||||
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
|
||||
implicit none
|
||||
integer(kind=StreamIntKind) :: i
|
||||
integer :: err
|
||||
!$cuf kernel do <<< *, * >>>
|
||||
do i=1,N
|
||||
C(i) = A(i)
|
||||
end do
|
||||
err = cudaDeviceSynchronize()
|
||||
if (err.ne.0) then
|
||||
write(*,'(a)') "cudaDeviceSynchronize failed"
|
||||
write(*,'(a)') cudaGetErrorString(err)
|
||||
stop
|
||||
endif
|
||||
end subroutine copy
|
||||
|
||||
subroutine add()
|
||||
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
|
||||
implicit none
|
||||
integer(kind=StreamIntKind) :: i
|
||||
integer :: err
|
||||
!$cuf kernel do <<< *, * >>>
|
||||
do i=1,N
|
||||
C(i) = A(i) + B(i)
|
||||
end do
|
||||
err = cudaDeviceSynchronize()
|
||||
if (err.ne.0) then
|
||||
write(*,'(a)') "cudaDeviceSynchronize failed"
|
||||
write(*,'(a)') cudaGetErrorString(err)
|
||||
stop
|
||||
endif
|
||||
end subroutine add
|
||||
|
||||
subroutine mul(startScalar)
|
||||
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: startScalar
|
||||
real(kind=REAL64) :: scalar
|
||||
integer(kind=StreamIntKind) :: i
|
||||
integer :: err
|
||||
scalar = startScalar
|
||||
!$cuf kernel do <<< *, * >>>
|
||||
do i=1,N
|
||||
B(i) = scalar * C(i)
|
||||
end do
|
||||
err = cudaDeviceSynchronize()
|
||||
if (err.ne.0) then
|
||||
write(*,'(a)') "cudaDeviceSynchronize failed"
|
||||
write(*,'(a)') cudaGetErrorString(err)
|
||||
stop
|
||||
endif
|
||||
end subroutine mul
|
||||
|
||||
subroutine triad(startScalar)
|
||||
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: startScalar
|
||||
real(kind=REAL64) :: scalar
|
||||
integer(kind=StreamIntKind) :: i
|
||||
integer :: err
|
||||
scalar = startScalar
|
||||
!$cuf kernel do <<< *, * >>>
|
||||
do i=1,N
|
||||
A(i) = B(i) + scalar * C(i)
|
||||
end do
|
||||
err = cudaDeviceSynchronize()
|
||||
if (err.ne.0) then
|
||||
write(*,'(a)') "cudaDeviceSynchronize failed"
|
||||
write(*,'(a)') cudaGetErrorString(err)
|
||||
stop
|
||||
endif
|
||||
end subroutine triad
|
||||
|
||||
subroutine nstream(startScalar)
|
||||
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: startScalar
|
||||
real(kind=REAL64) :: scalar
|
||||
integer(kind=StreamIntKind) :: i
|
||||
integer :: err
|
||||
scalar = startScalar
|
||||
!$cuf kernel do <<< *, * >>>
|
||||
do i=1,N
|
||||
A(i) = A(i) + B(i) + scalar * C(i)
|
||||
end do
|
||||
err = cudaDeviceSynchronize()
|
||||
if (err.ne.0) then
|
||||
write(*,'(a)') "cudaDeviceSynchronize failed"
|
||||
write(*,'(a)') cudaGetErrorString(err)
|
||||
stop
|
||||
endif
|
||||
end subroutine nstream
|
||||
|
||||
function dot() result(r)
|
||||
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
|
||||
implicit none
|
||||
real(kind=REAL64) :: r
|
||||
integer(kind=StreamIntKind) :: i
|
||||
integer :: err
|
||||
r = real(0,kind=REAL64)
|
||||
!$cuf kernel do <<< *, * >>>
|
||||
do i=1,N
|
||||
r = r + A(i) * B(i)
|
||||
end do
|
||||
err = cudaDeviceSynchronize()
|
||||
if (err.ne.0) then
|
||||
write(*,'(a)') "cudaDeviceSynchronize failed"
|
||||
write(*,'(a)') cudaGetErrorString(err)
|
||||
stop
|
||||
endif
|
||||
end function dot
|
||||
|
||||
end module CUDAKernelStream
|
||||
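(Note on the two CUDA variants in this patch: CUDAKernelStream above relies on !$cuf kernel do directives, so the compiler generates the device loops, while CUDAStream below defines explicit attributes(global) kernels in CUDAFortranKernels and launches them with <<<grid, tblock>>> using the launch configuration computed in alloc(); its do_dot kernel is disabled and the dot function falls back to a CUF kernel reduction.)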
309 src/fortran/CUDAStream.F90 Normal file
@@ -0,0 +1,309 @@
|
||||
module CUDAFortranKernels
|
||||
use, intrinsic :: ISO_Fortran_env
|
||||
use BabelStreamTypes
|
||||
|
||||
contains
|
||||
|
||||
attributes(global) subroutine do_copy(n,A,C)
|
||||
implicit none
|
||||
integer(kind=StreamIntKind), intent(in), value :: n
|
||||
real(kind=REAL64), intent(in) :: A(n)
|
||||
real(kind=REAL64), intent(out) :: C(n)
|
||||
integer(kind=StreamIntKind) :: i
|
||||
i = blockDim%x * (blockIdx%x - 1) + threadIdx%x
|
||||
if (i <= N) then
|
||||
C(i) = A(i)
|
||||
endif
|
||||
end subroutine do_copy
|
||||
|
||||
attributes(global) subroutine do_add(n,A,B,C)
|
||||
implicit none
|
||||
integer(kind=StreamIntKind), intent(in), value :: n
|
||||
real(kind=REAL64), intent(in) :: A(n), B(n)
|
||||
real(kind=REAL64), intent(out) :: C(n)
|
||||
integer(kind=StreamIntKind) :: i
|
||||
i = blockDim%x * (blockIdx%x - 1) + threadIdx%x
|
||||
if (i <= N) then
|
||||
C(i) = A(i) + B(i)
|
||||
endif
|
||||
end subroutine do_add
|
||||
|
||||
attributes(global) subroutine do_mul(n,scalar,B,C)
|
||||
implicit none
|
||||
integer(kind=StreamIntKind), intent(in), value :: n
|
||||
real(kind=REAL64), intent(in), value :: scalar
|
||||
real(kind=REAL64), intent(out) :: B(n)
|
||||
real(kind=REAL64), intent(in) :: C(n)
|
||||
integer(kind=StreamIntKind) :: i
|
||||
i = blockDim%x * (blockIdx%x - 1) + threadIdx%x
|
||||
if (i <= N) then
|
||||
B(i) = scalar * C(i)
|
||||
endif
|
||||
end subroutine do_mul
|
||||
|
||||
attributes(global) subroutine do_triad(n,scalar,A,B,C)
|
||||
implicit none
|
||||
integer(kind=StreamIntKind), intent(in), value :: n
|
||||
real(kind=REAL64), intent(in), value :: scalar
|
||||
real(kind=REAL64), intent(out) :: A(n)
|
||||
real(kind=REAL64), intent(in) :: B(n), C(n)
|
||||
integer(kind=StreamIntKind) :: i
|
||||
i = blockDim%x * (blockIdx%x - 1) + threadIdx%x
|
||||
if (i <= N) then
|
||||
A(i) = B(i) + scalar * C(i)
|
||||
endif
|
||||
end subroutine do_triad
|
||||
|
||||
attributes(global) subroutine do_nstream(n,scalar,A,B,C)
|
||||
implicit none
|
||||
integer(kind=StreamIntKind), intent(in), value :: n
|
||||
real(kind=REAL64), intent(in), value :: scalar
|
||||
real(kind=REAL64), intent(inout) :: A(n)
|
||||
real(kind=REAL64), intent(in) :: B(n), C(n)
|
||||
integer(kind=StreamIntKind) :: i
|
||||
i = blockDim%x * (blockIdx%x - 1) + threadIdx%x
|
||||
if (i <= N) then
|
||||
A(i) = A(i) + B(i) + scalar * C(i)
|
||||
endif
|
||||
end subroutine do_nstream
|
||||
|
||||
#if 0
|
||||
attributes(global) subroutine do_dot(n,A,B,r)
|
||||
implicit none
|
||||
integer(kind=StreamIntKind), intent(in), value :: n
|
||||
real(kind=REAL64), intent(in) :: A(n), B(n)
|
||||
real(kind=REAL64), intent(out) :: r
|
||||
integer(kind=StreamIntKind) :: i
|
||||
r = real(0,kind=REAL64)
|
||||
!$cuf kernel do <<< *, * >>>
|
||||
do i=1,N
|
||||
r = r + A(i) * B(i)
|
||||
end do
|
||||
end subroutine do_dot
|
||||
#endif
|
||||
|
||||
end module CUDAFortranKernels
|
||||
|
||||
module CUDAStream
|
||||
use, intrinsic :: ISO_Fortran_env
|
||||
use BabelStreamTypes
|
||||
use cudafor, only: dim3
|
||||
|
||||
implicit none
|
||||
|
||||
character(len=4), parameter :: implementation_name = "CUDA"
|
||||
|
||||
integer(kind=StreamIntKind) :: N
|
||||
|
||||
#ifdef USE_MANAGED
|
||||
real(kind=REAL64), allocatable, managed :: A(:), B(:), C(:)
|
||||
#else
|
||||
real(kind=REAL64), allocatable, device :: A(:), B(:), C(:)
|
||||
#endif
|
||||
|
||||
type(dim3) :: grid, tblock
|
||||
|
||||
contains
|
||||
|
||||
subroutine list_devices()
|
||||
use cudafor
|
||||
implicit none
|
||||
integer :: num, err
|
||||
err = cudaGetDeviceCount(num)
|
||||
if (err.ne.0) then
|
||||
write(*,'(a)') "cudaGetDeviceCount failed"
|
||||
write(*,'(a)') cudaGetErrorString(err)
|
||||
stop
|
||||
else if (num.eq.0) then
|
||||
write(*,'(a17)') "No devices found."
|
||||
else
|
||||
write(*,'(a10,i1,a8)') "There are ",num," devices."
|
||||
end if
|
||||
end subroutine list_devices
|
||||
|
||||
subroutine set_device(dev)
|
||||
use cudafor
|
||||
implicit none
|
||||
integer, intent(in) :: dev
|
||||
integer :: num, err
|
||||
err = cudaGetDeviceCount(num)
|
||||
if (err.ne.0) then
|
||||
write(*,'(a)') "cudaGetDeviceCount failed"
|
||||
write(*,'(a)') cudaGetErrorString(err)
|
||||
stop
|
||||
else if (num.eq.0) then
|
||||
write(*,'(a17)') "No devices found."
|
||||
stop
|
||||
else if (dev.ge.num) then
|
||||
write(*,'(a21)') "Invalid device index."
|
||||
stop
|
||||
else
|
||||
err = cudaSetDevice(dev)
|
||||
if (err.ne.0) then
|
||||
write(*,'(a)') "cudaSetDevice failed"
|
||||
write(*,'(a)') cudaGetErrorString(err)
|
||||
stop
|
||||
end if
|
||||
end if
|
||||
end subroutine set_device
|
||||
|
||||
subroutine alloc(array_size)
|
||||
implicit none
|
||||
integer(kind=StreamIntKind) :: array_size
|
||||
integer :: err
|
||||
N = array_size
|
||||
allocate( A(1:N), B(1:N), C(1:N), stat=err)
|
||||
if (err .ne. 0) then
|
||||
write(*,'(a20,i3)') 'allocate returned ',err
|
||||
stop 1
|
||||
endif
|
||||
! move to separate subroutine later
|
||||
tblock = dim3(128,1,1)
|
||||
grid = dim3(ceiling(real(N)/tblock%x),1,1)
|
||||
end subroutine alloc
|
||||
|
||||
subroutine dealloc()
|
||||
implicit none
|
||||
integer :: err
|
||||
deallocate( A, B, C, stat=err)
|
||||
if (err .ne. 0) then
|
||||
write(*,'(a20,i3)') 'deallocate returned ',err
|
||||
stop 1
|
||||
endif
|
||||
end subroutine dealloc
|
||||
|
||||
subroutine init_arrays(initA, initB, initC)
|
||||
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: initA, initB, initC
|
||||
integer(kind=StreamIntKind) :: i
|
||||
integer :: err
|
||||
A = initA
|
||||
B = initB
|
||||
C = initC
|
||||
err = cudaDeviceSynchronize()
|
||||
if (err.ne.0) then
|
||||
write(*,'(a)') "cudaDeviceSynchronize failed"
|
||||
write(*,'(a)') cudaGetErrorString(err)
|
||||
stop
|
||||
endif
|
||||
end subroutine init_arrays
|
||||
|
||||
subroutine read_arrays(h_A, h_B, h_C)
|
||||
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
|
||||
implicit none
|
||||
real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
|
||||
integer(kind=StreamIntKind) :: i
|
||||
integer :: err
|
||||
h_A = A
|
||||
h_B = B
|
||||
h_C = C
|
||||
err = cudaDeviceSynchronize()
|
||||
if (err.ne.0) then
|
||||
write(*,'(a)') "cudaDeviceSynchronize failed"
|
||||
write(*,'(a)') cudaGetErrorString(err)
|
||||
stop
|
||||
endif
|
||||
end subroutine read_arrays
|
||||
|
||||
subroutine copy()
|
||||
use CUDAFortranKernels, only: do_copy
|
||||
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
|
||||
implicit none
|
||||
integer :: err
|
||||
call do_copy<<<grid, tblock>>>(N, A, C)
|
||||
err = cudaDeviceSynchronize()
|
||||
if (err.ne.0) then
|
||||
write(*,'(a)') "cudaDeviceSynchronize failed"
|
||||
write(*,'(a)') cudaGetErrorString(err)
|
||||
stop
|
||||
endif
|
||||
end subroutine copy
|
||||
|
||||
subroutine add()
|
||||
use CUDAFortranKernels, only: do_add
|
||||
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
|
||||
implicit none
|
||||
integer :: err
|
||||
call do_add<<<grid, tblock>>>(N, A, B, C)
|
||||
err = cudaDeviceSynchronize()
|
||||
if (err.ne.0) then
|
||||
write(*,'(a)') "cudaDeviceSynchronize failed"
|
||||
write(*,'(a)') cudaGetErrorString(err)
|
||||
stop
|
||||
endif
|
||||
end subroutine add
|
||||
|
||||
subroutine mul(startScalar)
|
||||
use CUDAFortranKernels, only: do_mul
|
||||
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: startScalar
|
||||
real(kind=REAL64) :: scalar
|
||||
integer :: err
|
||||
scalar = startScalar
|
||||
call do_mul<<<grid, tblock>>>(N, scalar, B, C)
|
||||
err = cudaDeviceSynchronize()
|
||||
if (err.ne.0) then
|
||||
write(*,'(a)') "cudaDeviceSynchronize failed"
|
||||
write(*,'(a)') cudaGetErrorString(err)
|
||||
stop
|
||||
endif
|
||||
end subroutine mul
|
||||
|
||||
subroutine triad(startScalar)
|
||||
use CUDAFortranKernels, only: do_triad
|
||||
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: startScalar
|
||||
real(kind=REAL64) :: scalar
|
||||
integer :: err
|
||||
scalar = startScalar
|
||||
call do_triad<<<grid, tblock>>>(N, scalar, A, B, C)
|
||||
err = cudaDeviceSynchronize()
|
||||
if (err.ne.0) then
|
||||
write(*,'(a)') "cudaDeviceSynchronize failed"
|
||||
write(*,'(a)') cudaGetErrorString(err)
|
||||
stop
|
||||
endif
|
||||
end subroutine triad
|
||||
|
||||
subroutine nstream(startScalar)
|
||||
use CUDAFortranKernels, only: do_nstream
|
||||
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: startScalar
|
||||
real(kind=REAL64) :: scalar
|
||||
integer :: err
|
||||
scalar = startScalar
|
||||
call do_nstream<<<grid, tblock>>>(N, scalar, A, B, C)
|
||||
err = cudaDeviceSynchronize()
|
||||
if (err.ne.0) then
|
||||
write(*,'(a)') "cudaDeviceSynchronize failed"
|
||||
write(*,'(a)') cudaGetErrorString(err)
|
||||
stop
|
||||
endif
|
||||
end subroutine nstream
|
||||
|
||||
function dot() result(r)
|
||||
!use CUDAFortranKernels, only: do_dot
|
||||
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
|
||||
implicit none
|
||||
real(kind=REAL64) :: r
|
||||
integer :: err
|
||||
integer(kind=StreamIntKind) :: i
|
||||
!call do_dot<<<grid, tblock>>>(N, B, C, r)
|
||||
r = real(0,kind=REAL64)
|
||||
!$cuf kernel do <<< *, * >>>
|
||||
do i=1,N
|
||||
r = r + A(i) * B(i)
|
||||
end do
|
||||
err = cudaDeviceSynchronize()
|
||||
if (err.ne.0) then
|
||||
write(*,'(a)') "cudaDeviceSynchronize failed"
|
||||
write(*,'(a)') cudaGetErrorString(err)
|
||||
stop
|
||||
endif
|
||||
end function dot
|
||||
|
||||
end module CUDAStream
|
||||
139 src/fortran/DoConcurrentStream.F90 Normal file
@@ -0,0 +1,139 @@
|
||||
module DoConcurrentStream
|
||||
use, intrinsic :: ISO_Fortran_env
|
||||
use BabelStreamTypes
|
||||
|
||||
implicit none
|
||||
|
||||
character(len=12), parameter :: implementation_name = "DoConcurrent"
|
||||
|
||||
integer(kind=StreamIntKind) :: N
|
||||
|
||||
#ifdef USE_DEVICE
|
||||
real(kind=REAL64), allocatable, device :: A(:), B(:), C(:)
|
||||
#else
|
||||
real(kind=REAL64), allocatable :: A(:), B(:), C(:)
|
||||
#endif
|
||||
|
||||
contains
|
||||
|
||||
subroutine list_devices()
|
||||
implicit none
|
||||
integer :: num
|
||||
write(*,'(a36,a12)') "Listing devices is not supported by ", implementation_name
|
||||
end subroutine list_devices
|
||||
|
||||
subroutine set_device(dev)
|
||||
implicit none
|
||||
integer, intent(in) :: dev
|
||||
write(*,'(a32,a12)') "Device != 0 is not supported by ", implementation_name
|
||||
end subroutine set_device
|
||||
|
||||
subroutine alloc(array_size)
|
||||
implicit none
|
||||
integer(kind=StreamIntKind) :: array_size
|
||||
integer :: err
|
||||
N = array_size
|
||||
allocate( A(1:N), B(1:N), C(1:N), stat=err)
|
||||
if (err .ne. 0) then
|
||||
write(*,'(a20,i3)') 'allocate returned ',err
|
||||
stop 1
|
||||
endif
|
||||
end subroutine alloc
|
||||
|
||||
subroutine dealloc()
|
||||
implicit none
|
||||
integer :: err
|
||||
deallocate( A, B, C, stat=err)
|
||||
if (err .ne. 0) then
|
||||
write(*,'(a20,i3)') 'deallocate returned ',err
|
||||
stop 1
|
||||
endif
|
||||
end subroutine dealloc
|
||||
|
||||
subroutine init_arrays(initA, initB, initC)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: initA, initB, initC
|
||||
integer(kind=StreamIntKind) :: i
|
||||
do concurrent (i=1:N)
|
||||
A(i) = initA
|
||||
B(i) = initB
|
||||
C(i) = initC
|
||||
end do
|
||||
end subroutine init_arrays
|
||||
|
||||
subroutine read_arrays(h_A, h_B, h_C)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
|
||||
integer(kind=StreamIntKind) :: i
|
||||
do concurrent (i=1:N) !shared(A,B,C)
|
||||
h_A(i) = A(i)
|
||||
h_B(i) = B(i)
|
||||
h_C(i) = C(i)
|
||||
end do
|
||||
end subroutine read_arrays
|
||||
|
||||
subroutine copy()
|
||||
implicit none
|
||||
integer(kind=StreamIntKind) :: i
|
||||
do concurrent (i=1:N) !shared(A,C)
|
||||
C(i) = A(i)
|
||||
end do
|
||||
end subroutine copy
|
||||
|
||||
subroutine add()
|
||||
implicit none
|
||||
integer(kind=StreamIntKind) :: i
|
||||
do concurrent (i=1:N) !shared(A,B,C)
|
||||
C(i) = A(i) + B(i)
|
||||
end do
|
||||
end subroutine add
|
||||
|
||||
subroutine mul(startScalar)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: startScalar
|
||||
real(kind=REAL64) :: scalar
|
||||
integer(kind=StreamIntKind) :: i
|
||||
scalar = startScalar
|
||||
do concurrent (i=1:N) !shared(B,C)
|
||||
B(i) = scalar * C(i)
|
||||
end do
|
||||
end subroutine mul
|
||||
|
||||
subroutine triad(startScalar)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: startScalar
|
||||
real(kind=REAL64) :: scalar
|
||||
integer(kind=StreamIntKind) :: i
|
||||
scalar = startScalar
|
||||
do concurrent (i=1:N) !shared(A,B,C)
|
||||
A(i) = B(i) + scalar * C(i)
|
||||
end do
|
||||
end subroutine triad
|
||||
|
||||
subroutine nstream(startScalar)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: startScalar
|
||||
real(kind=REAL64) :: scalar
|
||||
integer(kind=StreamIntKind) :: i
|
||||
scalar = startScalar
|
||||
do concurrent (i=1:N) !shared(A,B,C)
|
||||
A(i) = A(i) + B(i) + scalar * C(i)
|
||||
end do
|
||||
end subroutine nstream
|
||||
|
||||
function dot() result(s)
|
||||
implicit none
|
||||
real(kind=REAL64) :: s
|
||||
integer(kind=StreamIntKind) :: i
|
||||
! reduction omitted because NVF infers it and other compilers do not support
|
||||
s = real(0,kind=REAL64)
|
||||
#ifdef CRAY_THREAD_DOCONCURRENT
|
||||
do i=1,N
|
||||
#else
|
||||
do concurrent (i=1:N) !shared(A,B)
|
||||
#endif
|
||||
s = s + A(i) * B(i)
|
||||
end do
|
||||
end function dot
|
||||
|
||||
end module DoConcurrentStream
|
||||
109 src/fortran/Makefile Normal file
@@ -0,0 +1,109 @@
ifeq ($(COMPILER),nvhpc)
include make.inc.nvhpc
else ifeq ($(COMPILER),oneapi)
include make.inc.oneapi
else ifeq ($(COMPILER),gcc)
include make.inc.gcc
else ifeq ($(COMPILER),amd)
include make.inc.amd
else ifeq ($(COMPILER),arm)
include make.inc.arm
else ifeq ($(COMPILER),cray)
include make.inc.cray
else ifeq ($(COMPILER),fj)
include make.inc.fj
else
$(info Set COMPILER={nvhpc,oneapi,amd,arm,cray,fj,gcc}. Default is gcc.)
include make.inc.gcc
COMPILER=gcc
endif

FCFLAGS += -DVERSION_STRING="5.0"
#FCFLAGS += -DUSE_INT32

ifeq ($(IMPLEMENTATION),DoConcurrent)
FCFLAGS += -DUSE_DOCONCURRENT $(DOCONCURRENT_FLAG)
IMPLEMENTATION_OBJECT = DoConcurrentStream.o

else ifeq ($(IMPLEMENTATION),Array)
FCFLAGS += -DUSE_ARRAY $(ARRAY_FLAG)
IMPLEMENTATION_OBJECT = ArrayStream.o

else ifeq ($(IMPLEMENTATION),OpenMP)
FCFLAGS += -DUSE_OPENMP $(OPENMP_FLAG)
IMPLEMENTATION_OBJECT = OpenMPStream.o

else ifeq ($(IMPLEMENTATION),OpenMPWorkshare)
FCFLAGS += -DUSE_OPENMPWORKSHARE $(OPENMP_FLAG)
IMPLEMENTATION_OBJECT = OpenMPWorkshareStream.o

else ifeq ($(IMPLEMENTATION),OpenMPTarget)
FCFLAGS += -DUSE_OPENMPTARGET $(OPENMP_FLAG)
IMPLEMENTATION_OBJECT = OpenMPTargetStream.o

else ifeq ($(IMPLEMENTATION),OpenMPTargetLoop)
FCFLAGS += -DUSE_OPENMPTARGETLOOP $(OPENMP_FLAG)
IMPLEMENTATION_OBJECT = OpenMPTargetLoopStream.o

else ifeq ($(IMPLEMENTATION),OpenMPTaskloop)
FCFLAGS += -DUSE_OPENMPTASKLOOP $(OPENMP_FLAG)
IMPLEMENTATION_OBJECT = OpenMPTaskloopStream.o

else ifeq ($(IMPLEMENTATION),OpenACC)
FCFLAGS += -DUSE_OPENACC $(OPENACC_FLAG)
IMPLEMENTATION_OBJECT = OpenACCStream.o

else ifeq ($(IMPLEMENTATION),OpenACCArray)
FCFLAGS += -DUSE_OPENACCARRAY $(OPENACC_FLAG)
IMPLEMENTATION_OBJECT = OpenACCArrayStream.o

else ifeq ($(IMPLEMENTATION),CUDA)
FCFLAGS += -DUSE_CUDA $(CUDA_FLAG)
IMPLEMENTATION_OBJECT = CUDAStream.o

else ifeq ($(IMPLEMENTATION),CUDAKernel)
FCFLAGS += -DUSE_CUDAKERNEL $(CUDA_FLAG)
IMPLEMENTATION_OBJECT = CUDAKernelStream.o

else ifeq ($(IMPLEMENTATION),Sequential)
FCFLAGS += -DUSE_SEQUENTIAL $(SEQUENTIAL_FLAG)
IMPLEMENTATION_OBJECT = SequentialStream.o

else
$(info Set IMPLEMENTATION={DoConcurrent,Array,OpenMP,OpenMPWorkshare,OpenMPTarget,OpenMPTargetLoop,OpenMPTaskloop,OpenACC,OpenACCArray,CUDA,CUDAKernel}.)
FCFLAGS += -DUSE_SEQUENTIAL $(SEQUENTIAL_FLAG)
IMPLEMENTATION=Sequential
IMPLEMENTATION_OBJECT = SequentialStream.o

endif

all: BabelStream.$(COMPILER).$(IMPLEMENTATION)

BabelStream.$(COMPILER).$(IMPLEMENTATION): main.F90 $(IMPLEMENTATION_OBJECT)
	$(FC) $(FCFLAGS) $^ BabelStreamTypes.o -o $@

BabelStreamTypes.o BabelStreamTypes.mod: BabelStreamTypes.F90
	$(FC) $(FCFLAGS) -c $<

%.o: %.F90 BabelStreamTypes.mod
	$(FC) $(FCFLAGS) -c $<

clean:
	-rm -f main.o BabelStreamUtil.mod babelstreamutil.mod
	-rm -f BabelStreamTypes.o BabelStreamTypes.mod babelstreamtypes.mod
	-rm -f DoConcurrentStream.o DoConcurrentStream.mod doconcurrentstream.mod
	-rm -f ArrayStream.o ArrayStream.mod arraystream.mod
	-rm -f SequentialStream.o SequentialStream.mod sequentialstream.mod
	-rm -f OpenMPStream.o OpenMPStream.mod openmpstream.mod
	-rm -f OpenMPWorkshareStream.o OpenMPWorkshareStream.mod openmpworksharestream.mod
	-rm -f OpenMPTaskloopStream.o OpenMPTaskloopStream.mod openmptaskloopstream.mod
	-rm -f OpenMPTargetStream.o OpenMPTargetStream.mod openmptargetstream.mod
	-rm -f OpenMPTargetLoopStream.o OpenMPTargetLoopStream.mod openmptargetloopstream.mod
	-rm -f OpenACCStream.o OpenACCStream.mod openaccstream.mod
	-rm -f OpenACCArrayStream.o OpenACCArrayStream.mod openaccarraystream.mod
	-rm -f CUDAStream.o CUDAStream.mod cudastream.mod CUDAFortranKernels.mod cudafortrankernels.mod
	-rm -f CUDAKernelStream.o CUDAKernelStream.mod cudakernelstream.mod
	-rm -f *.modmic *.mod *.o *.cub *.ptx

realclean: clean
	-rm -f BabelStream.*
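(Usage sketch, assuming a supported compiler and its matching make.inc.* file are present: build.sh further below drives the Makefile with exactly this pattern, and the binary name follows the BabelStream.$(COMPILER).$(IMPLEMENTATION) rule above.)

make COMPILER=gcc IMPLEMENTATION=OpenMP     # produces BabelStream.gcc.OpenMP
make COMPILER=nvhpc IMPLEMENTATION=CUDA     # produces BabelStream.nvhpc.CUDA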
144 src/fortran/OpenACCArrayStream.F90 Normal file
@@ -0,0 +1,144 @@
|
||||
module OpenACCArrayStream
|
||||
use, intrinsic :: ISO_Fortran_env
|
||||
use BabelStreamTypes
|
||||
|
||||
implicit none
|
||||
|
||||
character(len=12), parameter :: implementation_name = "OpenACCArray"
|
||||
|
||||
integer(kind=StreamIntKind) :: N
|
||||
|
||||
real(kind=REAL64), allocatable :: A(:), B(:), C(:)
|
||||
|
||||
contains
|
||||
|
||||
subroutine list_devices()
|
||||
use openacc
|
||||
implicit none
|
||||
integer :: num
|
||||
num = acc_get_num_devices(acc_get_device_type())
|
||||
if (num.eq.0) then
|
||||
write(*,'(a17)') "No devices found."
|
||||
else
|
||||
write(*,'(a10,i1,a8)') "There are ",num," devices."
|
||||
end if
|
||||
end subroutine list_devices
|
||||
|
||||
subroutine set_device(dev)
|
||||
use openacc
|
||||
implicit none
|
||||
integer, intent(in) :: dev
|
||||
integer :: num
|
||||
num = acc_get_num_devices(acc_get_device_type())
|
||||
if (num.eq.0) then
|
||||
write(*,'(a17)') "No devices found."
|
||||
stop
|
||||
else if (dev.gt.num) then
|
||||
write(*,'(a21)') "Invalid device index."
|
||||
stop
|
||||
else
|
||||
call acc_set_device_num(dev, acc_get_device_type())
|
||||
end if
|
||||
end subroutine set_device
|
||||
|
||||
subroutine alloc(array_size)
|
||||
implicit none
|
||||
integer(kind=StreamIntKind) :: array_size
|
||||
integer :: err
|
||||
N = array_size
|
||||
allocate( A(1:N), B(1:N), C(1:N), stat=err)
|
||||
#ifndef USE_MANAGED
|
||||
!$acc enter data create(A,B,C)
|
||||
#endif
|
||||
if (err .ne. 0) then
|
||||
write(*,'(a20,i3)') 'allocate returned ',err
|
||||
stop 1
|
||||
endif
|
||||
end subroutine alloc
|
||||
|
||||
subroutine dealloc()
|
||||
implicit none
|
||||
integer :: err
|
||||
#ifndef USE_MANAGED
|
||||
!$acc exit data delete(A,B,C)
|
||||
#endif
|
||||
deallocate( A, B, C, stat=err)
|
||||
if (err .ne. 0) then
|
||||
write(*,'(a20,i3)') 'deallocate returned ',err
|
||||
stop 1
|
||||
endif
|
||||
end subroutine dealloc
|
||||
|
||||
subroutine init_arrays(initA, initB, initC)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: initA, initB, initC
|
||||
!$acc kernels
|
||||
A = initA
|
||||
B = initB
|
||||
C = initC
|
||||
!$acc end kernels
|
||||
end subroutine init_arrays
|
||||
|
||||
subroutine read_arrays(h_A, h_B, h_C)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
|
||||
!$acc kernels
|
||||
h_A = A
|
||||
h_B = B
|
||||
h_C = C
|
||||
!$acc end kernels
|
||||
end subroutine read_arrays
|
||||
|
||||
subroutine copy()
|
||||
implicit none
|
||||
!$acc kernels
|
||||
C = A
|
||||
!$acc end kernels
|
||||
end subroutine copy
|
||||
|
||||
subroutine add()
|
||||
implicit none
|
||||
!$acc kernels
|
||||
C = A + B
|
||||
!$acc end kernels
|
||||
end subroutine add
|
||||
|
||||
subroutine mul(startScalar)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: startScalar
|
||||
real(kind=REAL64) :: scalar
|
||||
scalar = startScalar
|
||||
!$acc kernels
|
||||
B = scalar * C
|
||||
!$acc end kernels
|
||||
end subroutine mul
|
||||
|
||||
subroutine triad(startScalar)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: startScalar
|
||||
real(kind=REAL64) :: scalar
|
||||
scalar = startScalar
|
||||
!$acc kernels
|
||||
A = B + scalar * C
|
||||
!$acc end kernels
|
||||
end subroutine triad
|
||||
|
||||
subroutine nstream(startScalar)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: startScalar
|
||||
real(kind=REAL64) :: scalar
|
||||
scalar = startScalar
|
||||
!$acc kernels
|
||||
A = A + B + scalar * C
|
||||
!$acc end kernels
|
||||
end subroutine nstream
|
||||
|
||||
function dot() result(s)
|
||||
implicit none
|
||||
real(kind=REAL64) :: s
|
||||
!$acc kernels
|
||||
s = dot_product(A,B)
|
||||
!$acc end kernels
|
||||
end function dot
|
||||
|
||||
end module OpenACCArrayStream
|
||||
161 src/fortran/OpenACCStream.F90 Normal file
@@ -0,0 +1,161 @@
|
||||
module OpenACCStream
|
||||
use, intrinsic :: ISO_Fortran_env
|
||||
use BabelStreamTypes
|
||||
|
||||
implicit none
|
||||
|
||||
character(len=7), parameter :: implementation_name = "OpenACC"
|
||||
|
||||
integer(kind=StreamIntKind) :: N
|
||||
|
||||
real(kind=REAL64), allocatable :: A(:), B(:), C(:)
|
||||
|
||||
contains
|
||||
|
||||
subroutine list_devices()
|
||||
use openacc
|
||||
implicit none
|
||||
integer :: num
|
||||
num = acc_get_num_devices(acc_get_device_type())
|
||||
if (num.eq.0) then
|
||||
write(*,'(a17)') "No devices found."
|
||||
else
|
||||
write(*,'(a10,i1,a8)') "There are ",num," devices."
|
||||
end if
|
||||
end subroutine list_devices
|
||||
|
||||
subroutine set_device(dev)
|
||||
use openacc
|
||||
implicit none
|
||||
integer, intent(in) :: dev
|
||||
integer :: num
|
||||
num = acc_get_num_devices(acc_get_device_type())
|
||||
if (num.eq.0) then
|
||||
write(*,'(a17)') "No devices found."
|
||||
stop
|
||||
else if (dev.gt.num) then
|
||||
write(*,'(a21)') "Invalid device index."
|
||||
stop
|
||||
else
|
||||
call acc_set_device_num(dev, acc_get_device_type())
|
||||
end if
|
||||
end subroutine set_device
|
||||
|
||||
subroutine alloc(array_size)
|
||||
implicit none
|
||||
integer(kind=StreamIntKind) :: array_size
|
||||
integer :: err
|
||||
N = array_size
|
||||
allocate( A(1:N), B(1:N), C(1:N), stat=err)
|
||||
#ifndef USE_MANAGED
|
||||
!$acc enter data create(A,B,C)
|
||||
#endif
|
||||
if (err .ne. 0) then
|
||||
write(*,'(a20,i3)') 'allocate returned ',err
|
||||
stop 1
|
||||
endif
|
||||
end subroutine alloc
|
||||
|
||||
subroutine dealloc()
|
||||
implicit none
|
||||
integer :: err
|
||||
#ifndef USE_MANAGED
|
||||
!$acc exit data delete(A,B,C)
|
||||
#endif
|
||||
deallocate( A, B, C, stat=err)
|
||||
if (err .ne. 0) then
|
||||
write(*,'(a20,i3)') 'deallocate returned ',err
|
||||
stop 1
|
||||
endif
|
||||
end subroutine dealloc
|
||||
|
||||
subroutine init_arrays(initA, initB, initC)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: initA, initB, initC
|
||||
integer(kind=StreamIntKind) :: i
|
||||
!$acc parallel loop
|
||||
do i=1,N
|
||||
A(i) = initA
|
||||
B(i) = initB
|
||||
C(i) = initC
|
||||
end do
|
||||
end subroutine init_arrays
|
||||
|
||||
subroutine read_arrays(h_A, h_B, h_C)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
|
||||
integer(kind=StreamIntKind) :: i
|
||||
!$acc parallel loop
|
||||
do i=1,N
|
||||
h_A(i) = A(i)
|
||||
h_B(i) = B(i)
|
||||
h_C(i) = C(i)
|
||||
end do
|
||||
end subroutine read_arrays
|
||||
|
||||
subroutine copy()
|
||||
implicit none
|
||||
integer(kind=StreamIntKind) :: i
|
||||
!$acc parallel loop
|
||||
do i=1,N
|
||||
C(i) = A(i)
|
||||
end do
|
||||
end subroutine copy
|
||||
|
||||
subroutine add()
|
||||
implicit none
|
||||
integer(kind=StreamIntKind) :: i
|
||||
!$acc parallel loop
|
||||
do i=1,N
|
||||
C(i) = A(i) + B(i)
|
||||
end do
|
||||
end subroutine add
|
||||
|
||||
subroutine mul(startScalar)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: startScalar
|
||||
real(kind=REAL64) :: scalar
|
||||
integer(kind=StreamIntKind) :: i
|
||||
scalar = startScalar
|
||||
!$acc parallel loop
|
||||
do i=1,N
|
||||
B(i) = scalar * C(i)
|
||||
end do
|
||||
end subroutine mul
|
||||
|
||||
subroutine triad(startScalar)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: startScalar
|
||||
real(kind=REAL64) :: scalar
|
||||
integer(kind=StreamIntKind) :: i
|
||||
scalar = startScalar
|
||||
!$acc parallel loop
|
||||
do i=1,N
|
||||
A(i) = B(i) + scalar * C(i)
|
||||
end do
|
||||
end subroutine triad
|
||||
|
||||
subroutine nstream(startScalar)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: startScalar
|
||||
real(kind=REAL64) :: scalar
|
||||
integer(kind=StreamIntKind) :: i
|
||||
scalar = startScalar
|
||||
!$acc parallel loop
|
||||
do i=1,N
|
||||
A(i) = A(i) + B(i) + scalar * C(i)
|
||||
end do
|
||||
end subroutine nstream
|
||||
|
||||
function dot() result(s)
|
||||
implicit none
|
||||
real(kind=REAL64) :: s
|
||||
integer(kind=StreamIntKind) :: i
|
||||
s = real(0,kind=REAL64)
|
||||
!$acc parallel loop reduction(+:s)
|
||||
do i=1,N
|
||||
s = s + A(i) * B(i)
|
||||
end do
|
||||
end function dot
|
||||
|
||||
end module OpenACCStream
|
||||
137 src/fortran/OpenMPStream.F90 Normal file
@@ -0,0 +1,137 @@
|
||||
module OpenMPStream
|
||||
use, intrinsic :: ISO_Fortran_env
|
||||
use BabelStreamTypes
|
||||
|
||||
implicit none
|
||||
|
||||
character(len=6), parameter :: implementation_name = "OpenMP"
|
||||
|
||||
integer(kind=StreamIntKind) :: N
|
||||
|
||||
real(kind=REAL64), allocatable :: A(:), B(:), C(:)
|
||||
|
||||
contains
|
||||
|
||||
subroutine list_devices()
|
||||
implicit none
|
||||
write(*,'(a36,a12)') "Listing devices is not supported by ", implementation_name
|
||||
end subroutine list_devices
|
||||
|
||||
subroutine set_device(dev)
|
||||
implicit none
|
||||
integer, intent(in) :: dev
|
||||
write(*,'(a32,a12)') "Device != 0 is not supported by ", implementation_name
|
||||
end subroutine set_device
|
||||
|
||||
subroutine alloc(array_size)
|
||||
implicit none
|
||||
integer(kind=StreamIntKind) :: array_size
|
||||
integer :: err
|
||||
N = array_size
|
||||
allocate( A(1:N), B(1:N), C(1:N), stat=err)
|
||||
if (err .ne. 0) then
|
||||
write(*,'(a20,i3)') 'allocate returned ',err
|
||||
stop 1
|
||||
endif
|
||||
end subroutine alloc
|
||||
|
||||
subroutine dealloc()
|
||||
implicit none
|
||||
integer :: err
|
||||
deallocate( A, B, C, stat=err)
|
||||
if (err .ne. 0) then
|
||||
write(*,'(a20,i3)') 'deallocate returned ',err
|
||||
stop 1
|
||||
endif
|
||||
end subroutine dealloc
|
||||
|
||||
subroutine init_arrays(initA, initB, initC)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: initA, initB, initC
|
||||
integer(kind=StreamIntKind) :: i
|
||||
!$omp parallel do simd
|
||||
do i=1,N
|
||||
A(i) = initA
|
||||
B(i) = initB
|
||||
C(i) = initC
|
||||
end do
|
||||
end subroutine init_arrays
|
||||
|
||||
subroutine read_arrays(h_A, h_B, h_C)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
|
||||
integer(kind=StreamIntKind) :: i
|
||||
!$omp parallel do simd
|
||||
do i=1,N
|
||||
h_A(i) = A(i)
|
||||
h_B(i) = B(i)
|
||||
h_C(i) = C(i)
|
||||
end do
|
||||
end subroutine read_arrays
|
||||
|
||||
subroutine copy()
|
||||
implicit none
|
||||
integer(kind=StreamIntKind) :: i
|
||||
!$omp parallel do simd
|
||||
do i=1,N
|
||||
C(i) = A(i)
|
||||
end do
|
||||
end subroutine copy
|
||||
|
||||
subroutine add()
|
||||
implicit none
|
||||
integer(kind=StreamIntKind) :: i
|
||||
!$omp parallel do simd
|
||||
do i=1,N
|
||||
C(i) = A(i) + B(i)
|
||||
end do
|
||||
end subroutine add
|
||||
|
||||
subroutine mul(startScalar)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: startScalar
|
||||
real(kind=REAL64) :: scalar
|
||||
integer(kind=StreamIntKind) :: i
|
||||
scalar = startScalar
|
||||
!$omp parallel do simd
|
||||
do i=1,N
|
||||
B(i) = scalar * C(i)
|
||||
end do
|
||||
end subroutine mul
|
||||
|
||||
subroutine triad(startScalar)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: startScalar
|
||||
real(kind=REAL64) :: scalar
|
||||
integer(kind=StreamIntKind) :: i
|
||||
scalar = startScalar
|
||||
!$omp parallel do simd
|
||||
do i=1,N
|
||||
A(i) = B(i) + scalar * C(i)
|
||||
end do
|
||||
end subroutine triad
|
||||
|
||||
subroutine nstream(startScalar)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: startScalar
|
||||
real(kind=REAL64) :: scalar
|
||||
integer(kind=StreamIntKind) :: i
|
||||
scalar = startScalar
|
||||
!$omp parallel do simd
|
||||
do i=1,N
|
||||
A(i) = A(i) + B(i) + scalar * C(i)
|
||||
end do
|
||||
end subroutine nstream
|
||||
|
||||
function dot() result(s)
|
||||
implicit none
|
||||
real(kind=REAL64) :: s
|
||||
integer(kind=StreamIntKind) :: i
|
||||
s = real(0,kind=REAL64)
|
||||
!$omp parallel do simd reduction(+:s)
|
||||
do i=1,N
|
||||
s = s + A(i) * B(i)
|
||||
end do
|
||||
end function dot
|
||||
|
||||
end module OpenMPStream
|
||||
162 src/fortran/OpenMPTargetLoopStream.F90 Normal file
@@ -0,0 +1,162 @@
|
||||
module OpenMPTargetLoopStream
|
||||
use, intrinsic :: ISO_Fortran_env
|
||||
use BabelStreamTypes
|
||||
|
||||
implicit none
|
||||
|
||||
character(len=16), parameter :: implementation_name = "OpenMPTargetLoop"
|
||||
|
||||
integer(kind=StreamIntKind) :: N
|
||||
|
||||
real(kind=REAL64), allocatable :: A(:), B(:), C(:)
|
||||
|
||||
contains
|
||||
|
||||
subroutine list_devices()
|
||||
use omp_lib
|
||||
implicit none
|
||||
integer :: num
|
||||
num = omp_get_num_devices()
|
||||
if (num.eq.0) then
|
||||
write(*,'(a17)') "No devices found."
|
||||
else
|
||||
write(*,'(a10,i1,a8)') "There are ",num," devices."
|
||||
end if
|
||||
end subroutine list_devices
|
||||
|
||||
subroutine set_device(dev)
|
||||
use omp_lib
|
||||
implicit none
|
||||
integer, intent(in) :: dev
|
||||
integer :: num
|
||||
num = omp_get_num_devices()
|
||||
if (num.eq.0) then
|
||||
write(*,'(a17)') "No devices found."
|
||||
stop
|
||||
else if (dev.gt.num) then
|
||||
write(*,'(a21)') "Invalid device index."
|
||||
stop
|
||||
else
|
||||
call omp_set_default_device(dev)
|
||||
end if
|
||||
end subroutine set_device
|
||||
|
||||
subroutine alloc(array_size)
|
||||
implicit none
|
||||
integer(kind=StreamIntKind) :: array_size
|
||||
integer :: err
|
||||
N = array_size
|
||||
allocate( A(1:N), B(1:N), C(1:N), stat=err)
|
||||
#ifndef USE_MANAGED
|
||||
!$omp target enter data map(alloc: A,B,C)
|
||||
#endif
|
||||
if (err .ne. 0) then
|
||||
write(*,'(a20,i3)') 'allocate returned ',err
|
||||
stop 1
|
||||
endif
|
||||
end subroutine alloc
|
||||
|
||||
subroutine dealloc()
|
||||
implicit none
|
||||
integer :: err
|
||||
#ifndef USE_MANAGED
|
||||
!$omp target exit data map(delete: A,B,C)
|
||||
#endif
|
||||
deallocate( A, B, C, stat=err)
|
||||
if (err .ne. 0) then
|
||||
write(*,'(a20,i3)') 'deallocate returned ',err
|
||||
stop 1
|
||||
endif
|
||||
end subroutine dealloc
|
||||
|
||||
subroutine init_arrays(initA, initB, initC)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: initA, initB, initC
|
||||
integer(kind=StreamIntKind) :: i
|
||||
!$omp target teams loop
|
||||
do i=1,N
|
||||
A(i) = initA
|
||||
B(i) = initB
|
||||
C(i) = initC
|
||||
end do
|
||||
end subroutine init_arrays
|
||||
|
||||
subroutine read_arrays(h_A, h_B, h_C)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
|
||||
integer(kind=StreamIntKind) :: i
|
||||
! this might need to use a copy API instead...
|
||||
!$omp target teams loop
|
||||
do i=1,N
|
||||
h_A(i) = A(i)
|
||||
h_B(i) = B(i)
|
||||
h_C(i) = C(i)
|
||||
end do
|
||||
end subroutine read_arrays
|
||||
|
||||
subroutine copy()
|
||||
implicit none
|
||||
integer(kind=StreamIntKind) :: i
|
||||
!$omp target teams loop
|
||||
do i=1,N
|
||||
C(i) = A(i)
|
||||
end do
|
||||
end subroutine copy
|
||||
|
||||
subroutine add()
|
||||
implicit none
|
||||
integer(kind=StreamIntKind) :: i
|
||||
!$omp target teams loop
|
||||
do i=1,N
|
||||
C(i) = A(i) + B(i)
|
||||
end do
|
||||
end subroutine add
|
||||
|
||||
subroutine mul(startScalar)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: startScalar
|
||||
real(kind=REAL64) :: scalar
|
||||
integer(kind=StreamIntKind) :: i
|
||||
scalar = startScalar
|
||||
!$omp target teams loop
|
||||
do i=1,N
|
||||
B(i) = scalar * C(i)
|
||||
end do
|
||||
end subroutine mul
|
||||
|
||||
subroutine triad(startScalar)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: startScalar
|
||||
real(kind=REAL64) :: scalar
|
||||
integer(kind=StreamIntKind) :: i
|
||||
scalar = startScalar
|
||||
!$omp target teams loop
|
||||
do i=1,N
|
||||
A(i) = B(i) + scalar * C(i)
|
||||
end do
|
||||
end subroutine triad
|
||||
|
||||
subroutine nstream(startScalar)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: startScalar
|
||||
real(kind=REAL64) :: scalar
|
||||
integer(kind=StreamIntKind) :: i
|
||||
scalar = startScalar
|
||||
!$omp target teams loop
|
||||
do i=1,N
|
||||
A(i) = A(i) + B(i) + scalar * C(i)
|
||||
end do
|
||||
end subroutine nstream
|
||||
|
||||
function dot() result(s)
|
||||
implicit none
|
||||
real(kind=REAL64) :: s
|
||||
integer(kind=StreamIntKind) :: i
|
||||
s = real(0,kind=REAL64)
|
||||
!$omp target teams loop reduction(+:s)
|
||||
do i=1,N
|
||||
s = s + A(i) * B(i)
|
||||
end do
|
||||
end function dot
|
||||
|
||||
end module OpenMPTargetLoopStream
|
||||
163 src/fortran/OpenMPTargetStream.F90 Normal file
@@ -0,0 +1,163 @@
|
||||
module OpenMPTargetStream
|
||||
use, intrinsic :: ISO_Fortran_env
|
||||
use BabelStreamTypes
|
||||
|
||||
implicit none
|
||||
|
||||
character(len=12), parameter :: implementation_name = "OpenMPTarget"
|
||||
|
||||
integer(kind=StreamIntKind) :: N
|
||||
|
||||
real(kind=REAL64), allocatable :: A(:), B(:), C(:)
|
||||
|
||||
contains
|
||||
|
||||
subroutine list_devices()
|
||||
use omp_lib
|
||||
implicit none
|
||||
integer :: num
|
||||
num = omp_get_num_devices()
|
||||
if (num.eq.0) then
|
||||
write(*,'(a17)') "No devices found."
|
||||
else
|
||||
write(*,'(a10,i1,a8)') "There are ",num," devices."
|
||||
end if
|
||||
end subroutine list_devices
|
||||
|
||||
subroutine set_device(dev)
|
||||
use omp_lib
|
||||
implicit none
|
||||
integer, intent(in) :: dev
|
||||
integer :: num
|
||||
num = omp_get_num_devices()
|
||||
if (num.eq.0) then
|
||||
write(*,'(a17)') "No devices found."
|
||||
stop
|
||||
else if (dev.gt.num) then
|
||||
write(*,'(a21)') "Invalid device index."
|
||||
stop
|
||||
else
|
||||
call omp_set_default_device(dev)
|
||||
end if
|
||||
end subroutine set_device
|
||||
|
||||
subroutine alloc(array_size)
|
||||
implicit none
|
||||
integer(kind=StreamIntKind) :: array_size
|
||||
integer :: err
|
||||
N = array_size
|
||||
allocate( A(1:N), B(1:N), C(1:N), stat=err)
|
||||
#ifndef USE_MANAGED
|
||||
!$omp target enter data map(alloc: A,B,C)
|
||||
#endif
|
||||
if (err .ne. 0) then
|
||||
write(*,'(a20,i3)') 'allocate returned ',err
|
||||
stop 1
|
||||
endif
|
||||
end subroutine alloc
|
||||
|
||||
subroutine dealloc()
|
||||
implicit none
|
||||
integer :: err
|
||||
#ifndef USE_MANAGED
|
||||
!$omp target exit data map(delete: A,B,C)
|
||||
#endif
|
||||
deallocate( A, B, C, stat=err)
|
||||
if (err .ne. 0) then
|
||||
write(*,'(a20,i3)') 'deallocate returned ',err
|
||||
stop 1
|
||||
endif
|
||||
end subroutine dealloc
|
||||
|
||||
subroutine init_arrays(initA, initB, initC)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: initA, initB, initC
|
||||
integer(kind=StreamIntKind) :: i
|
||||
!$omp target teams distribute parallel do simd
|
||||
do i=1,N
|
||||
A(i) = initA
|
||||
B(i) = initB
|
||||
C(i) = initC
|
||||
end do
|
||||
end subroutine init_arrays
|
||||
|
||||
subroutine read_arrays(h_A, h_B, h_C)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
|
||||
integer(kind=StreamIntKind) :: i
|
||||
! this might need to use a copy API instead...
|
||||
!$omp target teams distribute parallel do simd
|
||||
do i=1,N
|
||||
h_A(i) = A(i)
|
||||
h_B(i) = B(i)
|
||||
h_C(i) = C(i)
|
||||
end do
|
||||
end subroutine read_arrays
|
||||
|
||||
subroutine copy()
|
||||
implicit none
|
||||
integer(kind=StreamIntKind) :: i
|
||||
!$omp target teams distribute parallel do simd
|
||||
do i=1,N
|
||||
C(i) = A(i)
|
||||
end do
|
||||
!$omp barrier
|
||||
end subroutine copy
|
||||
|
||||
subroutine add()
|
||||
implicit none
|
||||
integer(kind=StreamIntKind) :: i
|
||||
!$omp target teams distribute parallel do simd
|
||||
do i=1,N
|
||||
C(i) = A(i) + B(i)
|
||||
end do
|
||||
end subroutine add
|
||||
|
||||
subroutine mul(startScalar)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: startScalar
|
||||
real(kind=REAL64) :: scalar
|
||||
integer(kind=StreamIntKind) :: i
|
||||
scalar = startScalar
|
||||
!$omp target teams distribute parallel do simd
|
||||
do i=1,N
|
||||
B(i) = scalar * C(i)
|
||||
end do
|
||||
end subroutine mul
|
||||
|
||||
subroutine triad(startScalar)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: startScalar
|
||||
real(kind=REAL64) :: scalar
|
||||
integer(kind=StreamIntKind) :: i
|
||||
scalar = startScalar
|
||||
!$omp target teams distribute parallel do simd
|
||||
do i=1,N
|
||||
A(i) = B(i) + scalar * C(i)
|
||||
end do
|
||||
end subroutine triad
|
||||
|
||||
subroutine nstream(startScalar)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: startScalar
|
||||
real(kind=REAL64) :: scalar
|
||||
integer(kind=StreamIntKind) :: i
|
||||
scalar = startScalar
|
||||
!$omp target teams distribute parallel do simd
|
||||
do i=1,N
|
||||
A(i) = A(i) + B(i) + scalar * C(i)
|
||||
end do
|
||||
end subroutine nstream
|
||||
|
||||
function dot() result(s)
|
||||
implicit none
|
||||
real(kind=REAL64) :: s
|
||||
integer(kind=StreamIntKind) :: i
|
||||
s = real(0,kind=REAL64)
|
||||
!$omp target teams distribute parallel do simd reduction(+:s)
|
||||
do i=1,N
|
||||
s = s + A(i) * B(i)
|
||||
end do
|
||||
end function dot
|
||||
|
||||
end module OpenMPTargetStream
|
||||
169 src/fortran/OpenMPTaskloopStream.F90 Normal file
@@ -0,0 +1,169 @@
|
||||
module OpenMPTaskloopStream
|
||||
use, intrinsic :: ISO_Fortran_env
|
||||
use BabelStreamTypes
|
||||
|
||||
implicit none
|
||||
|
||||
character(len=14), parameter :: implementation_name = "OpenMPTaskloop"
|
||||
|
||||
integer(kind=StreamIntKind) :: N
|
||||
|
||||
real(kind=REAL64), allocatable :: A(:), B(:), C(:)
|
||||
|
||||
contains
|
||||
|
||||
subroutine list_devices()
|
||||
implicit none
|
||||
write(*,'(a36,a12)') "Listing devices is not supported by ", implementation_name
|
||||
end subroutine list_devices
|
||||
|
||||
subroutine set_device(dev)
|
||||
implicit none
|
||||
integer, intent(in) :: dev
|
||||
write(*,'(a32,a12)') "Device != 0 is not supported by ", implementation_name
|
||||
end subroutine set_device
|
||||
|
||||
subroutine alloc(array_size)
|
||||
implicit none
|
||||
integer(kind=StreamIntKind) :: array_size
|
||||
integer :: err
|
||||
N = array_size
|
||||
allocate( A(1:N), B(1:N), C(1:N), stat=err)
|
||||
if (err .ne. 0) then
|
||||
write(*,'(a20,i3)') 'allocate returned ',err
|
||||
stop 1
|
||||
endif
|
||||
end subroutine alloc
|
||||
|
||||
subroutine dealloc()
|
||||
implicit none
|
||||
integer :: err
|
||||
deallocate( A, B, C, stat=err)
|
||||
if (err .ne. 0) then
|
||||
write(*,'(a20,i3)') 'deallocate returned ',err
|
||||
stop 1
|
||||
endif
|
||||
end subroutine dealloc
|
||||
|
||||
subroutine init_arrays(initA, initB, initC)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: initA, initB, initC
|
||||
integer(kind=StreamIntKind) :: i
|
||||
!$omp parallel
|
||||
!$omp master
|
||||
!$omp taskloop
|
||||
do i=1,N
|
||||
A(i) = initA
|
||||
B(i) = initB
|
||||
C(i) = initC
|
||||
end do
|
||||
!$omp end master
|
||||
!$omp end parallel
|
||||
end subroutine init_arrays
|
||||
|
||||
subroutine read_arrays(h_A, h_B, h_C)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
|
||||
integer(kind=StreamIntKind) :: i
|
||||
!$omp parallel
|
||||
!$omp master
|
||||
!$omp taskloop
|
||||
do i=1,N
|
||||
h_A(i) = A(i)
|
||||
h_B(i) = B(i)
|
||||
h_C(i) = C(i)
|
||||
end do
|
||||
!$omp end master
|
||||
!$omp end parallel
|
||||
end subroutine read_arrays
|
||||
|
||||
subroutine copy()
|
||||
implicit none
|
||||
integer(kind=StreamIntKind) :: i
|
||||
!$omp parallel
|
||||
!$omp master
|
||||
!$omp taskloop
|
||||
do i=1,N
|
||||
C(i) = A(i)
|
||||
end do
|
||||
!$omp end master
|
||||
!$omp end parallel
|
||||
end subroutine copy
|
||||
|
||||
subroutine add()
|
||||
implicit none
|
||||
integer(kind=StreamIntKind) :: i
|
||||
!$omp parallel
|
||||
!$omp master
|
||||
!$omp taskloop
|
||||
do i=1,N
|
||||
C(i) = A(i) + B(i)
|
||||
end do
|
||||
!$omp end master
|
||||
!$omp end parallel
|
||||
end subroutine add
|
||||
|
||||
subroutine mul(startScalar)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: startScalar
|
||||
real(kind=REAL64) :: scalar
|
||||
integer(kind=StreamIntKind) :: i
|
||||
scalar = startScalar
|
||||
!$omp parallel
|
||||
!$omp master
|
||||
!$omp taskloop
|
||||
do i=1,N
|
||||
B(i) = scalar * C(i)
|
||||
end do
|
||||
!$omp end master
|
||||
!$omp end parallel
|
||||
end subroutine mul
|
||||
|
||||
subroutine triad(startScalar)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: startScalar
|
||||
real(kind=REAL64) :: scalar
|
||||
integer(kind=StreamIntKind) :: i
|
||||
scalar = startScalar
|
||||
!$omp parallel
|
||||
!$omp master
|
||||
!$omp taskloop
|
||||
do i=1,N
|
||||
A(i) = B(i) + scalar * C(i)
|
||||
end do
|
||||
!$omp end master
|
||||
!$omp end parallel
|
||||
end subroutine triad
|
||||
|
||||
subroutine nstream(startScalar)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: startScalar
|
||||
real(kind=REAL64) :: scalar
|
||||
integer(kind=StreamIntKind) :: i
|
||||
scalar = startScalar
|
||||
!$omp parallel
|
||||
!$omp master
|
||||
!$omp taskloop
|
||||
do i=1,N
|
||||
A(i) = A(i) + B(i) + scalar * C(i)
|
||||
end do
|
||||
!$omp end master
|
||||
!$omp end parallel
|
||||
end subroutine nstream
|
||||
|
||||
function dot() result(s)
|
||||
implicit none
|
||||
real(kind=REAL64) :: s
|
||||
integer(kind=StreamIntKind) :: i
|
||||
s = real(0,kind=REAL64)
|
||||
!$omp parallel
|
||||
!$omp master
|
||||
!$omp taskloop reduction(+:s)
|
||||
do i=1,N
|
||||
s = s + A(i) * B(i)
|
||||
end do
|
||||
!$omp end master
|
||||
!$omp end parallel
|
||||
end function dot
|
||||
|
||||
end module OpenMPTaskloopStream
|
||||
120 src/fortran/OpenMPWorkshareStream.F90 Normal file
@@ -0,0 +1,120 @@
|
||||
module OpenMPWorkshareStream
|
||||
use, intrinsic :: ISO_Fortran_env
|
||||
use BabelStreamTypes
|
||||
|
||||
implicit none
|
||||
|
||||
character(len=15), parameter :: implementation_name = "OpenMPWorkshare"
|
||||
|
||||
integer(kind=StreamIntKind) :: N
|
||||
|
||||
real(kind=REAL64), allocatable :: A(:), B(:), C(:)
|
||||
|
||||
contains
|
||||
|
||||
subroutine list_devices()
|
||||
implicit none
|
||||
write(*,'(a36,a12)') "Listing devices is not supported by ", implementation_name
|
||||
end subroutine list_devices
|
||||
|
||||
subroutine set_device(dev)
|
||||
implicit none
|
||||
integer, intent(in) :: dev
|
||||
write(*,'(a32,a12)') "Device != 0 is not supported by ", implementation_name
|
||||
end subroutine set_device
|
||||
|
||||
subroutine alloc(array_size)
|
||||
implicit none
|
||||
integer(kind=StreamIntKind) :: array_size
|
||||
integer :: err
|
||||
N = array_size
|
||||
allocate( A(1:N), B(1:N), C(1:N), stat=err)
|
||||
if (err .ne. 0) then
|
||||
write(*,'(a20,i3)') 'allocate returned ',err
|
||||
stop 1
|
||||
endif
|
||||
end subroutine alloc
|
||||
|
||||
subroutine dealloc()
|
||||
implicit none
|
||||
integer :: err
|
||||
deallocate( A, B, C, stat=err)
|
||||
if (err .ne. 0) then
|
||||
write(*,'(a20,i3)') 'deallocate returned ',err
|
||||
stop 1
|
||||
endif
|
||||
end subroutine dealloc
|
||||
|
||||
subroutine init_arrays(initA, initB, initC)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: initA, initB, initC
|
||||
!$omp parallel workshare
|
||||
A = initA
|
||||
B = initB
|
||||
C = initC
|
||||
!$omp end parallel workshare
|
||||
end subroutine init_arrays
|
||||
|
||||
subroutine read_arrays(h_A, h_B, h_C)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
|
||||
!$omp parallel workshare
|
||||
h_A = A
|
||||
h_B = B
|
||||
h_C = C
|
||||
!$omp end parallel workshare
|
||||
end subroutine read_arrays
|
||||
|
||||
subroutine copy()
|
||||
implicit none
|
||||
!$omp parallel workshare
|
||||
C = A
|
||||
!$omp end parallel workshare
|
||||
end subroutine copy
|
||||
|
||||
subroutine add()
|
||||
implicit none
|
||||
!$omp parallel workshare
|
||||
C = A + B
|
||||
!$omp end parallel workshare
|
||||
end subroutine add
|
||||
|
||||
subroutine mul(startScalar)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: startScalar
|
||||
real(kind=REAL64) :: scalar
|
||||
scalar = startScalar
|
||||
!$omp parallel workshare
|
||||
B = scalar * C
|
||||
!$omp end parallel workshare
|
||||
end subroutine mul
|
||||
|
||||
subroutine triad(startScalar)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: startScalar
|
||||
real(kind=REAL64) :: scalar
|
||||
scalar = startScalar
|
||||
!$omp parallel workshare
|
||||
A = B + scalar * C
|
||||
!$omp end parallel workshare
|
||||
end subroutine triad
|
||||
|
||||
subroutine nstream(startScalar)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: startScalar
|
||||
real(kind=REAL64) :: scalar
|
||||
scalar = startScalar
|
||||
!$omp parallel workshare
|
||||
A = A + B + scalar * C
|
||||
!$omp end parallel workshare
|
||||
end subroutine nstream
|
||||
|
||||
function dot() result(s)
|
||||
implicit none
|
||||
real(kind=REAL64) :: s
|
||||
!$omp parallel workshare
|
||||
s = dot_product(A,B)
|
||||
!$omp end parallel workshare
|
||||
end function dot
|
||||
|
||||
end module OpenMPWorkshareStream
|
||||
130 src/fortran/SequentialStream.F90 Normal file
@@ -0,0 +1,130 @@
|
||||
module SequentialStream
|
||||
use, intrinsic :: ISO_Fortran_env
|
||||
use BabelStreamTypes
|
||||
|
||||
implicit none
|
||||
|
||||
character(len=10), parameter :: implementation_name = "Sequential"
|
||||
|
||||
integer(kind=StreamIntKind) :: N
|
||||
|
||||
real(kind=REAL64), allocatable :: A(:), B(:), C(:)
|
||||
|
||||
contains
|
||||
|
||||
subroutine list_devices()
|
||||
implicit none
|
||||
integer :: num
|
||||
write(*,'(a36,a10)') "Listing devices is not supported by ", implementation_name
|
||||
end subroutine list_devices
|
||||
|
||||
subroutine set_device(dev)
|
||||
implicit none
|
||||
integer, intent(in) :: dev
|
||||
write(*,'(a32,a10)') "Device != 0 is not supported by ", implementation_name
|
||||
end subroutine set_device
|
||||
|
||||
subroutine alloc(array_size)
|
||||
implicit none
|
||||
integer(kind=StreamIntKind) :: array_size
|
||||
integer :: err
|
||||
N = array_size
|
||||
allocate( A(1:N), B(1:N), C(1:N), stat=err)
|
||||
if (err .ne. 0) then
|
||||
write(*,'(a20,i3)') 'allocate returned ',err
|
||||
stop 1
|
||||
endif
|
||||
end subroutine alloc
|
||||
|
||||
subroutine dealloc()
|
||||
implicit none
|
||||
integer :: err
|
||||
deallocate( A, B, C, stat=err)
|
||||
if (err .ne. 0) then
|
||||
write(*,'(a20,i3)') 'deallocate returned ',err
|
||||
stop 1
|
||||
endif
|
||||
end subroutine dealloc
|
||||
|
||||
subroutine init_arrays(initA, initB, initC)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: initA, initB, initC
|
||||
integer(kind=StreamIntKind) :: i
|
||||
do i=1,N
|
||||
A(i) = initA
|
||||
B(i) = initB
|
||||
C(i) = initC
|
||||
end do
|
||||
end subroutine init_arrays
|
||||
|
||||
subroutine read_arrays(h_A, h_B, h_C)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
|
||||
integer(kind=StreamIntKind) :: i
|
||||
do i=1,N
|
||||
h_A(i) = A(i)
|
||||
h_B(i) = B(i)
|
||||
h_C(i) = C(i)
|
||||
end do
|
||||
end subroutine read_arrays
|
||||
|
||||
subroutine copy()
|
||||
implicit none
|
||||
integer(kind=StreamIntKind) :: i
|
||||
do i=1,N
|
||||
C(i) = A(i)
|
||||
end do
|
||||
end subroutine copy
|
||||
|
||||
subroutine add()
|
||||
implicit none
|
||||
integer(kind=StreamIntKind) :: i
|
||||
do i=1,N
|
||||
C(i) = A(i) + B(i)
|
||||
end do
|
||||
end subroutine add
|
||||
|
||||
subroutine mul(startScalar)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: startScalar
|
||||
real(kind=REAL64) :: scalar
|
||||
integer(kind=StreamIntKind) :: i
|
||||
scalar = startScalar
|
||||
do i=1,N
|
||||
B(i) = scalar * C(i)
|
||||
end do
|
||||
end subroutine mul
|
||||
|
||||
subroutine triad(startScalar)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: startScalar
|
||||
real(kind=REAL64) :: scalar
|
||||
integer(kind=StreamIntKind) :: i
|
||||
scalar = startScalar
|
||||
do i=1,N
|
||||
A(i) = B(i) + scalar * C(i)
|
||||
end do
|
||||
end subroutine triad
|
||||
|
||||
subroutine nstream(startScalar)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: startScalar
|
||||
real(kind=REAL64) :: scalar
|
||||
integer(kind=StreamIntKind) :: i
|
||||
scalar = startScalar
|
||||
do i=1,N
|
||||
A(i) = A(i) + B(i) + scalar * C(i)
|
||||
end do
|
||||
end subroutine nstream
|
||||
|
||||
function dot() result(s)
|
||||
implicit none
|
||||
real(kind=REAL64) :: s
|
||||
integer(kind=StreamIntKind) :: i
|
||||
s = real(0,kind=REAL64)
|
||||
do i=1,N
|
||||
s = s + A(i) * B(i)
|
||||
end do
|
||||
end function dot
|
||||
|
||||
end module SequentialStream
|
||||
54
src/fortran/build.sh
Executable file
54
src/fortran/build.sh
Executable file
@ -0,0 +1,54 @@
|
||||
#!/bin/bash
|
||||
|
||||
# uncomment to disable GPU targets
|
||||
#HAS_GPU=0
|
||||
|
||||
# Orin
|
||||
#if [ "x${compiler}" == "xgcc" ] ; then
|
||||
# export MCPU=cortex-a78ae
|
||||
#fi
|
||||
#if [ "x${compiler}" == "xarm" ] ; then
|
||||
# export MCPU=cortex-a78
|
||||
#fi
|
||||
|
||||
COMPILERS="gcc"
|
||||
if [ $(which nvfortran) ] ; then
|
||||
COMPILERS="${COMPILERS} nvhpc"
|
||||
fi
|
||||
if [ $(which crayftn) ] ; then
|
||||
COMPILERS="${COMPILERS} cray"
|
||||
fi
|
||||
if [ $(uname -m) == "aarch64" ] ; then
|
||||
if [ $(which armflang) ] ; then
|
||||
COMPILERS="${COMPILERS} arm"
|
||||
fi
|
||||
if [ $(which frt) ] ; then
|
||||
COMPILERS="${COMPILERS} fj"
|
||||
fi
|
||||
elif [ $(uname -m) == "x86_64" ] ; then
|
||||
if [ "$(which lscpu >& /dev/null && lscpu | grep GenuineIntel | awk '{print $3}')" == "GenuineIntel" ] ; then
|
||||
COMPILERS="${COMPILERS} oneapi"
|
||||
if [ -f /opt/intel/oneapi/setvars.sh ] ; then
|
||||
. /opt/intel/oneapi/setvars.sh >& /dev/null
|
||||
fi
|
||||
else
|
||||
# ^ this detection can be improved
|
||||
COMPILERS="${COMPILERS} amd"
|
||||
fi
|
||||
fi
|
||||
|
||||
for compiler in ${COMPILERS} ; do
|
||||
TARGETS="DoConcurrent Array OpenMP OpenMPTaskloop OpenMPWorkshare"
|
||||
if [ "${HAS_GPU}" != "0" ] ; then
|
||||
TARGETS="${TARGETS} OpenMPTarget OpenMPTargetLoop"
|
||||
if [ "x${compiler}" == "xnvhpc" ] ; then
|
||||
TARGETS="${TARGETS} CUDA CUDAKernel"
|
||||
fi
|
||||
fi
|
||||
if [ "x${compiler}" == "xnvhpc" ] || [ "x${compiler}" == "xgcc" ] || [ "x${compiler}" == "xcray" ] ; then
|
||||
TARGETS="${TARGETS} OpenACC OpenACCArray"
|
||||
fi
|
||||
for implementation in ${TARGETS} ; do
|
||||
make COMPILER=${compiler} IMPLEMENTATION=${implementation}
|
||||
done
|
||||
done
|
||||
src/fortran/main.F90 (new file, 683 lines) @ -0,0 +1,683 @@
|
||||
module BabelStreamUtil
|
||||
use, intrinsic :: ISO_Fortran_env, only: REAL64,INT64
|
||||
use BabelStreamTypes
|
||||
|
||||
implicit none
|
||||
|
||||
integer(kind=StreamIntKind) :: array_size = 33554432
|
||||
integer(kind=StreamIntKind) :: num_times = 100
|
||||
logical :: mibibytes = .false.
|
||||
logical :: use_gigs = .false.
|
||||
logical :: csv = .false.
|
||||
character(len=1), parameter :: csv_sep = ","
|
||||
|
||||
! 1 = All
|
||||
! 2 = Triad
|
||||
! 3 = Nstream
|
||||
integer :: selection = 1
|
||||
|
||||
real(kind=REAL64), parameter :: startA = real(0.1d0,kind=REAL64)
|
||||
real(kind=REAL64), parameter :: startB = real(0.2d0,kind=REAL64)
|
||||
real(kind=REAL64), parameter :: startC = real(0.0d0,kind=REAL64)
|
||||
real(kind=REAL64), parameter :: startScalar = real(0.4d0,kind=REAL64)
|
||||
|
||||
contains
|
||||
|
||||
function get_wtime() result(t)
|
||||
#if defined(USE_OMP_GET_WTIME)
|
||||
use omp_lib
|
||||
implicit none
|
||||
real(kind=REAL64) :: t
|
||||
t = omp_get_wtime()
|
||||
#elif defined(USE_CPU_TIME)
|
||||
implicit none
|
||||
real(kind=REAL64) :: t
|
||||
real :: r
|
||||
call cpu_time(r)
|
||||
t = r
|
||||
#else
|
||||
implicit none
|
||||
real(kind=REAL64) :: t
|
||||
integer(kind=INT64) :: c, r
|
||||
call system_clock(count = c, count_rate = r)
|
||||
t = real(c,REAL64) / real(r,REAL64)
|
||||
#endif
|
||||
end function get_wtime
|
||||
|
||||
subroutine parseArguments()
|
||||
use, intrinsic :: ISO_Fortran_env, only: compiler_version, compiler_options
|
||||
#if defined(USE_DOCONCURRENT)
|
||||
use DoConcurrentStream, only: list_devices, set_device
|
||||
#elif defined(USE_ARRAY)
|
||||
use ArrayStream, only: list_devices, set_device
|
||||
#elif defined(USE_OPENMP)
|
||||
use OpenMPStream, only: list_devices, set_device
|
||||
#elif defined(USE_OPENMPWORKSHARE)
|
||||
use OpenMPWorkshareStream, only: list_devices, set_device
|
||||
#elif defined(USE_OPENMPTARGET)
|
||||
use OpenMPTargetStream, only: list_devices, set_device
|
||||
#elif defined(USE_OPENMPTARGETLOOP)
|
||||
use OpenMPTargetLoopStream, only: list_devices, set_device
|
||||
#elif defined(USE_OPENMPTASKLOOP)
|
||||
use OpenMPTaskloopStream, only: list_devices, set_device
|
||||
#elif defined(USE_OPENACC)
|
||||
use OpenACCStream, only: list_devices, set_device
|
||||
#elif defined(USE_OPENACCARRAY)
|
||||
use OpenACCArrayStream, only: list_devices, set_device
|
||||
#elif defined(USE_CUDA)
|
||||
use CUDAStream, only: list_devices, set_device
|
||||
#elif defined(USE_CUDAKERNEL)
|
||||
use CUDAKernelStream, only: list_devices, set_device
|
||||
#elif defined(USE_SEQUENTIAL)
|
||||
use SequentialStream, only: list_devices, set_device
|
||||
#endif
|
||||
implicit none
|
||||
integer :: i, argc
|
||||
integer :: arglen,err,pos(2)
|
||||
character(len=64) :: argtmp
|
||||
argc = command_argument_count()
|
||||
do i=1,argc
|
||||
call get_command_argument(i,argtmp,arglen,err)
|
||||
if (err.eq.0) then
|
||||
!
|
||||
! list devices
|
||||
!
|
||||
pos(1) = index(argtmp,"--list")
|
||||
if (pos(1).eq.1) then
|
||||
call list_devices()
|
||||
stop
|
||||
endif
|
||||
!
|
||||
! set device number
|
||||
!
|
||||
pos(1) = index(argtmp,"--device")
|
||||
if (pos(1).eq.1) then
|
||||
if (i+1.gt.argc) then
|
||||
print*,'You failed to provide a value for ',argtmp
|
||||
stop
|
||||
else
|
||||
call get_command_argument(i+1,argtmp,arglen,err)
|
||||
block
|
||||
integer :: dev
|
||||
read(argtmp,'(i15)') dev
|
||||
call set_device(dev)
|
||||
end block
|
||||
endif
|
||||
cycle
|
||||
endif
|
||||
!
|
||||
! array size
|
||||
!
|
||||
pos(1) = index(argtmp,"--arraysize")
|
||||
pos(2) = index(argtmp,"-s")
|
||||
if (any(pos(:).eq.1) ) then
|
||||
if (i+1.gt.argc) then
|
||||
print*,'You failed to provide a value for ',argtmp
|
||||
else
|
||||
call get_command_argument(i+1,argtmp,arglen,err)
|
||||
block
|
||||
integer(kind=INT64) :: big_size
|
||||
read(argtmp,'(i15)') big_size
|
||||
if (big_size .gt. HUGE(array_size)) then
|
||||
print*,'Array size does not fit into integer:'
|
||||
print*,big_size,'>',HUGE(array_size)
|
||||
print*,'Recompile without USE_INT32 to support this array size.'
|
||||
stop
|
||||
else
|
||||
array_size = INT(big_size,kind=StreamIntKind)
|
||||
endif
|
||||
end block
|
||||
endif
|
||||
cycle
|
||||
endif
|
||||
!
|
||||
! number of iterations
|
||||
!
|
||||
pos(1) = index(argtmp,"--numtimes")
|
||||
pos(2) = index(argtmp,"-n")
|
||||
if (any(pos(:).eq.1) ) then
|
||||
if (i+1.gt.argc) then
|
||||
print*,'You failed to provide a value for ',argtmp
|
||||
else
|
||||
call get_command_argument(i+1,argtmp,arglen,err)
|
||||
read(argtmp,'(i15)') num_times
|
||||
if (num_times.lt.2) then
|
||||
write(*,'(a)') "Number of times must be 2 or more"
|
||||
stop
|
||||
end if
|
||||
endif
|
||||
cycle
|
||||
endif
|
||||
!
|
||||
! precision
|
||||
!
|
||||
pos(1) = index(argtmp,"--float")
|
||||
if (pos(1).eq.1) then
|
||||
write(*,'(a46,a39)') "Sorry, you have to recompile with -DUSE_FLOAT ", &
|
||||
"to run BabelStream in single precision."
|
||||
stop
|
||||
endif
|
||||
!
|
||||
! selection (All, Triad, Nstream)
|
||||
!
|
||||
pos(1) = index(argtmp,"--triad-only")
|
||||
if (pos(1).eq.1) then
|
||||
selection = 2
|
||||
cycle
|
||||
endif
|
||||
pos(1) = index(argtmp,"--nstream-only")
|
||||
if (pos(1).eq.1) then
|
||||
selection = 3
|
||||
cycle
|
||||
endif
|
||||
!
|
||||
! CSV
|
||||
!
|
||||
pos(1) = index(argtmp,"--csv")
|
||||
if (pos(1).eq.1) then
|
||||
csv = .true.
|
||||
!write(*,'(a39)') "Sorry, CSV support isn't available yet."
|
||||
!stop
|
||||
endif
|
||||
!
|
||||
! units
|
||||
!
|
||||
pos(1) = index(argtmp,"--mibibytes")
|
||||
if (pos(1).eq.1) then
|
||||
mibibytes = .true.
|
||||
cycle
|
||||
endif
|
||||
!
|
||||
! giga/gibi instead of mega/mebi
|
||||
!
|
||||
pos(1) = index(argtmp,"--gigs")
|
||||
if (pos(1).eq.1) then
|
||||
use_gigs = .true.
|
||||
cycle
|
||||
endif
|
||||
!
|
||||
!
|
||||
!
|
||||
pos(1) = index(argtmp,"--compiler-info")
|
||||
if (pos(1).eq.1) then
|
||||
write(*,'(a,a)') 'Compiler version: ',compiler_version()
|
||||
write(*,'(a,a)') 'Compiler options: ',compiler_options()
|
||||
stop
|
||||
endif
|
||||
!
|
||||
! help
|
||||
!
|
||||
pos(1) = index(argtmp,"--help")
|
||||
pos(2) = index(argtmp,"-h")
|
||||
if (any(pos(:).eq.1) ) then
|
||||
call get_command_argument(0,argtmp,arglen,err)
|
||||
write(*,'(a7,a,a10)') "Usage: ", trim(argtmp), " [OPTIONS]"
|
||||
write(*,'(a)') "Options:"
|
||||
write(*,'(a)') " -h --help Print the message"
|
||||
write(*,'(a)') " --list List available devices"
|
||||
write(*,'(a)') " --device INDEX Select device at INDEX"
|
||||
write(*,'(a)') " -s --arraysize SIZE Use SIZE elements in the array"
|
||||
write(*,'(a)') " -n --numtimes NUM Run the test NUM times (NUM >= 2)"
|
||||
!write(*,'(a)') " --float Use floats (rather than doubles)"
|
||||
write(*,'(a)') " --triad-only Only run triad"
|
||||
write(*,'(a)') " --nstream-only Only run nstream"
|
||||
write(*,'(a)') " --csv Output as csv table"
|
||||
write(*,'(a)') " --mibibytes Use MiB=2^20 for bandwidth calculation (default MB=10^6)"
|
||||
write(*,'(a)') " --gigs Use GiB=2^30 or GB=10^9 instead of MiB/MB"
|
||||
write(*,'(a)') " --compiler-info Print information about compiler and flags, then exit."
|
||||
stop
|
||||
endif
|
||||
end if
|
||||
end do
|
||||
end subroutine parseArguments
|
||||
|
||||
subroutine run_all(timings, summ)
|
||||
#if defined(USE_DOCONCURRENT)
|
||||
use DoConcurrentStream
|
||||
#elif defined(USE_ARRAY)
|
||||
use ArrayStream
|
||||
#elif defined(USE_OPENMP)
|
||||
use OpenMPStream
|
||||
#elif defined(USE_OPENMPWORKSHARE)
|
||||
use OpenMPWorkshareStream
|
||||
#elif defined(USE_OPENMPTARGET)
|
||||
use OpenMPTargetStream
|
||||
#elif defined(USE_OPENMPTARGETLOOP)
|
||||
use OpenMPTargetLoopStream
|
||||
#elif defined(USE_OPENMPTASKLOOP)
|
||||
use OpenMPTaskloopStream
|
||||
#elif defined(USE_OPENACC)
|
||||
use OpenACCStream
|
||||
#elif defined(USE_OPENACCARRAY)
|
||||
use OpenACCArrayStream
|
||||
#elif defined(USE_CUDA)
|
||||
use CUDAStream
|
||||
#elif defined(USE_CUDAKERNEL)
|
||||
use CUDAKernelStream
|
||||
#elif defined(USE_SEQUENTIAL)
|
||||
use SequentialStream
|
||||
#endif
|
||||
implicit none
|
||||
real(kind=REAL64), intent(inout) :: timings(:,:)
|
||||
real(kind=REAL64), intent(out) :: summ
|
||||
real(kind=REAL64) :: t1, t2
|
||||
integer(kind=StreamIntKind) :: i
|
||||
|
||||
do i=1,num_times
|
||||
|
||||
t1 = get_wtime()
|
||||
call copy()
|
||||
t2 = get_wtime()
|
||||
timings(1,i) = t2-t1
|
||||
|
||||
t1 = get_wtime()
|
||||
call mul(startScalar)
|
||||
t2 = get_wtime()
|
||||
timings(2,i) = t2-t1
|
||||
|
||||
t1 = get_wtime()
|
||||
call add()
|
||||
t2 = get_wtime()
|
||||
timings(3,i) = t2-t1
|
||||
|
||||
t1 = get_wtime()
|
||||
call triad(startScalar)
|
||||
t2 = get_wtime()
|
||||
timings(4,i) = t2-t1
|
||||
|
||||
t1 = get_wtime()
|
||||
summ = dot()
|
||||
t2 = get_wtime()
|
||||
timings(5,i) = t2-t1
|
||||
|
||||
end do
|
||||
|
||||
end subroutine run_all
|
||||
|
||||
subroutine run_triad(timings)
|
||||
#if defined(USE_DOCONCURRENT)
|
||||
use DoConcurrentStream
|
||||
#elif defined(USE_ARRAY)
|
||||
use ArrayStream
|
||||
#elif defined(USE_OPENMP)
|
||||
use OpenMPStream
|
||||
#elif defined(USE_OPENMPWORKSHARE)
|
||||
use OpenMPWorkshareStream
|
||||
#elif defined(USE_OPENMPTARGET)
|
||||
use OpenMPTargetStream
|
||||
#elif defined(USE_OPENMPTARGETLOOP)
|
||||
use OpenMPTargetLoopStream
|
||||
#elif defined(USE_OPENMPTASKLOOP)
|
||||
use OpenMPTaskloopStream
|
||||
#elif defined(USE_OPENACC)
|
||||
use OpenACCStream
|
||||
#elif defined(USE_OPENACCARRAY)
|
||||
use OpenACCArrayStream
|
||||
#elif defined(USE_CUDA)
|
||||
use CUDAStream
|
||||
#elif defined(USE_CUDAKERNEL)
|
||||
use CUDAKernelStream
|
||||
#elif defined(USE_SEQUENTIAL)
|
||||
use SequentialStream
|
||||
#endif
|
||||
implicit none
|
||||
real(kind=REAL64), intent(inout) :: timings(:,:)
|
||||
real(kind=REAL64) :: t1, t2
|
||||
integer(kind=StreamIntKind) :: i
|
||||
|
||||
do i=1,num_times
|
||||
|
||||
t1 = get_wtime()
|
||||
call triad(startScalar)
|
||||
t2 = get_wtime()
|
||||
timings(1,i) = t2-t1
|
||||
|
||||
end do
|
||||
|
||||
end subroutine run_triad
|
||||
|
||||
subroutine run_nstream(timings)
|
||||
#if defined(USE_DOCONCURRENT)
|
||||
use DoConcurrentStream
|
||||
#elif defined(USE_ARRAY)
|
||||
use ArrayStream
|
||||
#elif defined(USE_OPENMP)
|
||||
use OpenMPStream
|
||||
#elif defined(USE_OPENMPWORKSHARE)
|
||||
use OpenMPWorkshareStream
|
||||
#elif defined(USE_OPENMPTARGET)
|
||||
use OpenMPTargetStream
|
||||
#elif defined(USE_OPENMPTARGETLOOP)
|
||||
use OpenMPTargetLoopStream
|
||||
#elif defined(USE_OPENMPTASKLOOP)
|
||||
use OpenMPTaskloopStream
|
||||
#elif defined(USE_OPENACC)
|
||||
use OpenACCStream
|
||||
#elif defined(USE_OPENACCARRAY)
|
||||
use OpenACCArrayStream
|
||||
#elif defined(USE_CUDA)
|
||||
use CUDAStream
|
||||
#elif defined(USE_CUDAKERNEL)
|
||||
use CUDAKernelStream
|
||||
#elif defined(USE_SEQUENTIAL)
|
||||
use SequentialStream
|
||||
#endif
|
||||
implicit none
|
||||
real(kind=REAL64), intent(inout) :: timings(:,:)
|
||||
real(kind=REAL64) :: t1, t2
|
||||
integer(kind=StreamIntKind) :: i
|
||||
|
||||
do i=1,num_times
|
||||
|
||||
t1 = get_wtime()
|
||||
call nstream(startScalar)
|
||||
t2 = get_wtime()
|
||||
timings(1,i) = t2-t1
|
||||
|
||||
end do
|
||||
|
||||
end subroutine run_nstream
|
||||
|
||||
subroutine check_solution(A, B, C, summ)
|
||||
use, intrinsic :: IEEE_Arithmetic, only: IEEE_Is_Normal
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: A(:), B(:), C(:)
|
||||
real(kind=REAL64), intent(in) :: summ
|
||||
|
||||
integer(kind=StreamIntKind) :: i
|
||||
real(kind=REAL64) :: goldA, goldB, goldC, goldSum
|
||||
real(kind=REAL64) :: scalar
|
||||
|
||||
! always use double because of accumulation error
|
||||
real(kind=REAL64) :: errA, errB, errC, errSum, epsi
|
||||
logical :: cleanA, cleanB, cleanC, cleanSum
|
||||
|
||||
goldA = startA
|
||||
goldB = startB
|
||||
goldC = startC
|
||||
goldSum = 0.0d0
|
||||
|
||||
scalar = startScalar
|
||||
|
||||
do i=1,num_times
|
||||
|
||||
if (selection.eq.1) then
|
||||
goldC = goldA
|
||||
goldB = scalar * goldC
|
||||
goldC = goldA + goldB
|
||||
goldA = goldB + scalar * goldC
|
||||
else if (selection.eq.2) then
|
||||
goldA = goldB + scalar * goldC
|
||||
else if (selection.eq.3) then
|
||||
goldA = goldA + goldB + scalar * goldC
|
||||
endif
|
||||
|
||||
end do
|
||||
|
||||
goldSum = goldA * goldB * array_size
|
||||
|
||||
cleanA = ALL(IEEE_Is_Normal(A))
|
||||
cleanB = ALL(IEEE_Is_Normal(B))
|
||||
cleanC = ALL(IEEE_Is_Normal(C))
|
||||
cleanSum = IEEE_Is_Normal(summ)
|
||||
|
||||
if (.not. cleanA) then
|
||||
write(*,'(a51)') "Validation failed on A. Contains NaA/Inf/Subnormal."
|
||||
end if
|
||||
if (.not. cleanB) then
|
||||
write(*,'(a51)') "Validation failed on B. Contains NaA/Inf/Subnormal."
|
||||
end if
|
||||
if (.not. cleanC) then
|
||||
write(*,'(a51)') "Validation failed on C. Contains NaA/Inf/Subnormal."
|
||||
end if
|
||||
if (.not. cleanSum) then
|
||||
write(*,'(a54,e20.12)') "Validation failed on Sum. Contains NaA/Inf/Subnormal: ",summ
|
||||
end if
|
||||
|
||||
errA = SUM( ABS( A - goldA ) ) / array_size
|
||||
errB = SUM( ABS( B - goldB ) ) / array_size
|
||||
errC = SUM( ABS( C - goldC ) ) / array_size
|
||||
errSum = ABS( (summ - goldSum) / goldSum)
|
||||
|
||||
epsi = epsilon(real(0,kind=StreamRealKind)) * 100.0d0
|
||||
|
||||
if (errA .gt. epsi) then
|
||||
write(*,'(a38,e20.12)') "Validation failed on A. Average error ", errA
|
||||
end if
|
||||
if (errB .gt. epsi) then
|
||||
write(*,'(a38,e20.12)') "Validation failed on B. Average error ", errB
|
||||
end if
|
||||
if (errC .gt. epsi) then
|
||||
write(*,'(a38,e20.12)') "Validation failed on C. Average error ", errC
|
||||
end if
|
||||
|
||||
if (selection.eq.1) then
|
||||
if (errSum .gt. 1.0e-8) then
|
||||
write(*,'(a38,e20.12)') "Validation failed on Sum. Error ", errSum
|
||||
write(*,'(a8,e20.12,a15,e20.12)') "Sum was ",summ, " but should be ", goldSum
|
||||
end if
|
||||
endif
|
||||
|
||||
end subroutine check_solution
|
||||
|
||||
end module BabelStreamUtil
|
||||
|
||||
program BabelStream
|
||||
use BabelStreamUtil
|
||||
#if defined(USE_DOCONCURRENT)
|
||||
use DoConcurrentStream
|
||||
#elif defined(USE_ARRAY)
|
||||
use ArrayStream
|
||||
#elif defined(USE_OPENMP)
|
||||
use OpenMPStream
|
||||
#elif defined(USE_OPENMPWORKSHARE)
|
||||
use OpenMPWorkshareStream
|
||||
#elif defined(USE_OPENMPTARGET)
|
||||
use OpenMPTargetStream
|
||||
#elif defined(USE_OPENMPTARGETLOOP)
|
||||
use OpenMPTargetLoopStream
|
||||
#elif defined(USE_OPENMPTASKLOOP)
|
||||
use OpenMPTaskloopStream
|
||||
#elif defined(USE_OPENACC)
|
||||
use OpenACCStream
|
||||
#elif defined(USE_OPENACCARRAY)
|
||||
use OpenACCArrayStream
|
||||
#elif defined(USE_CUDA)
|
||||
use CUDAStream
|
||||
#elif defined(USE_CUDAKERNEL)
|
||||
use CUDAKernelStream
|
||||
#elif defined(USE_SEQUENTIAL)
|
||||
use SequentialStream
|
||||
#endif
|
||||
implicit none
|
||||
integer :: element_size, err
|
||||
real(kind=REAL64) :: scaling
|
||||
character(len=3) :: label
|
||||
real(kind=REAL64), allocatable :: timings(:,:)
|
||||
real(kind=REAL64), allocatable :: h_A(:), h_B(:), h_C(:)
|
||||
real(kind=REAL64) :: summ
|
||||
real(kind=REAL64) :: init_tic, init_toc, read_tic, read_toc
|
||||
|
||||
call parseArguments()
|
||||
|
||||
element_size = storage_size(real(0,kind=StreamRealKind)) / 8
|
||||
|
||||
if (mibibytes) then
|
||||
if (use_gigs) then
|
||||
scaling = 2.0d0**(-30)
|
||||
label = "GiB"
|
||||
else
|
||||
scaling = 2.0d0**(-20)
|
||||
label = "MiB"
|
||||
endif
|
||||
else
|
||||
if (use_gigs) then
|
||||
scaling = 1.0d-9
|
||||
label = "GB"
|
||||
else
|
||||
scaling = 1.0d-6
|
||||
label = "MB"
|
||||
endif
|
||||
endif
|
||||
|
||||
if (.not.csv) then
|
||||
|
||||
write(*,'(a)') "BabelStream Fortran"
|
||||
write(*,'(a9,f4.1)') "Version: ", VERSION_STRING
|
||||
write(*,'(a16,a)') "Implementation: ", implementation_name
|
||||
|
||||
block
|
||||
character(len=32) :: printout
|
||||
write(printout,'(i9,1x,a5)') num_times,'times'
|
||||
write(*,'(a16,a)') 'Running kernels ',ADJUSTL(printout)
|
||||
end block
|
||||
write(*,'(a11,a6)') 'Precision: ',ADJUSTL(StreamRealName)
|
||||
|
||||
write(*,'(a12,f9.1,a3)') 'Array size: ',1.0d0 * element_size * (array_size * scaling), label
|
||||
write(*,'(a12,f9.1,a3)') 'Total size: ',3.0d0 * element_size * (array_size * scaling), label
|
||||
|
||||
endif ! csv
|
||||
|
||||
allocate( timings(5,num_times) )
|
||||
|
||||
call alloc(array_size)
|
||||
|
||||
init_tic = get_wtime()
|
||||
call init_arrays(startA, startB, startC)
|
||||
init_toc = get_wtime()
|
||||
summ = 0.0d0
|
||||
|
||||
if (.not.csv) then
|
||||
write(*,'(a6,f9.1,a4,f9.1,a3,a9)') 'Init: ',init_toc-init_tic, 's (=', &
|
||||
(3.0d0 * element_size * array_size * scaling) / (init_toc-init_tic), TRIM(label), 'ytes/sec)'
|
||||
end if
|
||||
|
||||
|
||||
timings = -1.0d0
|
||||
if (selection.eq.1) then
|
||||
call run_all(timings, summ)
|
||||
else if (selection.eq.2) then
|
||||
call run_triad(timings)
|
||||
else if (selection.eq.3) then
|
||||
call run_nstream(timings)
|
||||
endif
|
||||
|
||||
allocate( h_A(1:array_size), h_B(1:array_size), h_C(1:array_size), stat=err)
|
||||
if (err .ne. 0) then
|
||||
write(*,'(a20,i3)') 'allocate returned ',err
|
||||
stop 1
|
||||
endif
|
||||
|
||||
read_tic = get_wtime()
|
||||
call read_arrays(h_A, h_B, h_C)
|
||||
read_toc = get_wtime()
|
||||
|
||||
if (.not.csv) then
|
||||
write(*,'(a6,f9.1,a4,f9.1,a3,a9)') 'Read: ',read_toc-read_tic, 's (=', &
|
||||
(3.0d0 * element_size * array_size * scaling) / (read_toc-read_tic), TRIM(label), 'ytes/sec)'
|
||||
end if
|
||||
|
||||
call check_solution(h_A, h_B, h_C, summ)
|
||||
|
||||
block
|
||||
character(len=20) :: printout(8)
|
||||
real(kind=REAL64) :: tmin,tmax,tavg,nbytes
|
||||
|
||||
if (csv) then
|
||||
write(*,'(a,a1)',advance='no') 'function', csv_sep
|
||||
write(*,'(a,a1)',advance='no') 'num_times', csv_sep
|
||||
write(*,'(a,a1)',advance='no') 'n_elements',csv_sep
|
||||
write(*,'(a,a1)',advance='no') 'sizeof', csv_sep
|
||||
if (mibibytes) then
|
||||
write(*,'(a,a1)',advance='no') 'max_mibytes_per_sec',csv_sep
|
||||
else
|
||||
write(*,'(a,a1)',advance='no') 'max_mbytes_per_sec', csv_sep
|
||||
endif
|
||||
write(*,'(a,a1)',advance='no') 'min_runtime',csv_sep
|
||||
write(*,'(a,a1)',advance='no') 'max_runtime',csv_sep
|
||||
write(*,'(a,a1)',advance='yes') 'avg_runtime'
|
||||
else
|
||||
write(printout(1),'(a8)') 'Function'
|
||||
write(printout(2),'(a3,a8)') TRIM(label),'ytes/sec'
|
||||
write(printout(3),'(a9)') 'Min (sec)'
|
||||
write(printout(4),'(a3)') 'Max'
|
||||
write(printout(5),'(a7)') 'Average'
|
||||
write(*,'(5a12)') ADJUSTL(printout(1:5))
|
||||
endif ! csv
|
||||
|
||||
if (selection.eq.1) then
|
||||
block
|
||||
integer, parameter :: sizes(5) = [2,2,3,3,2]
|
||||
character(len=5), parameter :: labels(5) = ["Copy ", "Mul ", "Add ", "Triad", "Dot "]
|
||||
integer :: i
|
||||
do i=1,5
|
||||
tmin = MINVAL(timings(i,2:num_times))
|
||||
tmax = MAXVAL(timings(i,2:num_times))
|
||||
tavg = SUM(timings(i,2:num_times)) / (num_times-1)
|
||||
nbytes = element_size * REAL(array_size,kind=REAL64) * sizes(i)
|
||||
write(printout(1),'(a)') labels(i)
|
||||
if (csv) then
|
||||
write(printout(2),'(i20)') num_times
|
||||
write(printout(3),'(i20)') array_size
|
||||
write(printout(4),'(i20)') element_size
|
||||
write(printout(5),'(i20)') INT(scaling*nbytes/tmin)
|
||||
write(printout(6),'(f20.8)') tmin
|
||||
write(printout(7),'(f20.8)') tmax
|
||||
write(printout(8),'(f20.8)') tavg
|
||||
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(1))),csv_sep
|
||||
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(2))),csv_sep
|
||||
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(3))),csv_sep
|
||||
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(4))),csv_sep
|
||||
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(5))),csv_sep
|
||||
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(6))),csv_sep
|
||||
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(7))),csv_sep
|
||||
write(*,'(a,a1)',advance='yes') TRIM(ADJUSTL(printout(8)))
|
||||
else
|
||||
write(printout(2),'(f12.3)') scaling*nbytes/tmin
|
||||
write(printout(3),'(f12.5)') tmin
|
||||
write(printout(4),'(f12.5)') tmax
|
||||
write(printout(5),'(f12.5)') tavg
|
||||
write(*,'(5a12)') ADJUSTL(printout(1:5))
|
||||
endif
|
||||
enddo
|
||||
end block
|
||||
else if ((selection.eq.2).or.(selection.eq.3)) then
|
||||
tmin = MINVAL(timings(1,2:num_times))
|
||||
tmax = MAXVAL(timings(1,2:num_times))
|
||||
tavg = SUM(timings(1,2:num_times)) / (num_times-1)
|
||||
if (selection.eq.2) then
|
||||
nbytes = element_size * REAL(array_size,kind=REAL64) * 3
|
||||
write(printout(1),'(a12)') "Triad"
|
||||
else if (selection.eq.3) then
|
||||
nbytes = element_size * REAL(array_size,kind=REAL64) * 4
|
||||
write(printout(1),'(a12)') "Nstream"
|
||||
endif
|
||||
if (csv) then
|
||||
write(printout(2),'(i20)') num_times
|
||||
write(printout(3),'(i20)') array_size
|
||||
write(printout(4),'(i20)') element_size
|
||||
write(printout(5),'(i20)') INT(scaling*nbytes/tmin)
|
||||
write(printout(6),'(f20.8)') tmin
|
||||
write(printout(7),'(f20.8)') tmax
|
||||
write(printout(8),'(f20.8)') tavg
|
||||
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(1))),csv_sep
|
||||
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(2))),csv_sep
|
||||
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(3))),csv_sep
|
||||
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(4))),csv_sep
|
||||
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(5))),csv_sep
|
||||
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(6))),csv_sep
|
||||
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(7))),csv_sep
|
||||
write(*,'(a,a1)',advance='yes') TRIM(ADJUSTL(printout(8)))
|
||||
else
|
||||
write(printout(2),'(f12.3)') scaling*nbytes/tmin
|
||||
write(printout(3),'(f12.5)') tmin
|
||||
write(printout(4),'(f12.5)') tmax
|
||||
write(printout(5),'(f12.5)') tavg
|
||||
write(*,'(5a12)') ADJUSTL(printout(1:5))
|
||||
endif
|
||||
endif
|
||||
end block
|
||||
|
||||
call dealloc()
|
||||
|
||||
end program BabelStream
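A note on validation: check_solution above never stores per-iteration results; it replays the Copy/Mul/Add/Triad sequence on three scalars (goldA, goldB, goldC) and compares per-element average errors against 100 * epsilon. A minimal C++ sketch of the same recurrence, useful for computing the expected values off-line; the constants mirror startA, startB, startC, startScalar and the default num_times and array_size from the listing, everything else is illustrative:

    #include <cstdio>

    int main() {
      // Defaults from BabelStreamUtil: startA/B/C, startScalar, num_times, array_size.
      double goldA = 0.1, goldB = 0.2, goldC = 0.0;
      const double scalar = 0.4;
      const int num_times = 100;
      const double array_size = 33554432.0;

      // Replay one full iteration of the 'All' benchmark per loop trip,
      // exactly as the gold loop in check_solution does.
      for (int i = 0; i < num_times; ++i) {
        goldC = goldA;                    // copy
        goldB = scalar * goldC;           // mul
        goldC = goldA + goldB;            // add
        goldA = goldB + scalar * goldC;   // triad
      }
      const double goldSum = goldA * goldB * array_size;
      std::printf("goldA=%.15g goldB=%.15g goldC=%.15g goldSum=%.15g\n",
                  goldA, goldB, goldC, goldSum);
      return 0;
    }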
|
||||
src/fortran/make.inc.amd (new file, 25 lines) @ -0,0 +1,25 @@
|
||||
FC := /opt/rocm/llvm/bin/flang
|
||||
#FC := /global/u1/j/jhammond/AMD/aocc-compiler-3.2.0/bin/flang
|
||||
FCFLAGS := -std=f2018 -O3
|
||||
FCFLAGS += -Wall -Wno-unused-variable
|
||||
|
||||
ifdef MARCH
|
||||
FCFLAGS += -march=$(MARCH)
|
||||
else
|
||||
FCFLAGS += -march=native
|
||||
endif
|
||||
|
||||
DOCONCURRENT_FLAG = -fopenmp # libomp.so required
|
||||
ARRAY_FLAG = -fopenmp # libomp.so required
|
||||
OPENMP_FLAG = -fopenmp
|
||||
#OPENMP_FLAG += -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx908
|
||||
OPENACC_FLAG = -fopenacc
|
||||
CUDA_FLAG =
|
||||
SEQUENTIAL_FLAG =
|
||||
|
||||
ifeq ($(IMPLEMENTATION),CUDA)
|
||||
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
|
||||
endif
|
||||
ifeq ($(IMPLEMENTATION),CUDAKernels)
|
||||
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
|
||||
endif
|
||||
src/fortran/make.inc.arm (new file, 39 lines) @ -0,0 +1,39 @@
|
||||
FC = armflang
|
||||
FCFLAGS = -std=f2018 -O3
|
||||
FCFLAGS += -Wall -Wno-unused-variable
|
||||
|
||||
# MARCH=neoverse-v1,neoverse-n1,icelake-server,znver3,cortex-a78
|
||||
ARCH=$(shell uname -m)
|
||||
ifeq ($(ARCH),aarch64)
|
||||
ifdef MCPU
|
||||
FCFLAGS += -mcpu=$(MCPU)
|
||||
else
|
||||
FCFLAGS += -mcpu=native
|
||||
endif
|
||||
else
|
||||
ifdef MARCH
|
||||
FCFLAGS += -march=$(MARCH)
|
||||
else
|
||||
FCFLAGS += -march=native
|
||||
endif
|
||||
endif
|
||||
|
||||
DOCONCURRENT_FLAG = -fopenmp
|
||||
ARRAY_FLAG = -fopenmp
|
||||
OPENMP_FLAG = -fopenmp
|
||||
OPENACC_FLAG = -fopenacc
|
||||
CUDA_FLAG =
|
||||
SEQUENTIAL_FLAG =
|
||||
|
||||
ifeq ($(IMPLEMENTATION),OpenACC)
|
||||
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
|
||||
endif
|
||||
ifeq ($(IMPLEMENTATION),OpenACCArray)
|
||||
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
|
||||
endif
|
||||
ifeq ($(IMPLEMENTATION),CUDA)
|
||||
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
|
||||
endif
|
||||
ifeq ($(IMPLEMENTATION),CUDAKernels)
|
||||
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
|
||||
endif
|
||||
src/fortran/make.inc.cray (new file, 18 lines) @ -0,0 +1,18 @@
|
||||
FC := ftn
|
||||
FCFLAGS = -e F -O3
|
||||
|
||||
DOCONCURRENT_FLAG = -h thread_do_concurrent -DCRAY_THREAD_DOCONCURRENT
|
||||
ARRAY_FLAG = -h autothread
|
||||
OPENMP_FLAG = -h omp
|
||||
OPENACC_FLAG = -h acc
|
||||
# CPU only
|
||||
OPENACC_FLAG += -h omp
|
||||
CUDA_FLAG =
|
||||
SEQUENTIAL_FLAG =
|
||||
|
||||
ifeq ($(IMPLEMENTATION),CUDA)
|
||||
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
|
||||
endif
|
||||
ifeq ($(IMPLEMENTATION),CUDAKernels)
|
||||
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
|
||||
endif
|
||||
src/fortran/make.inc.fj (new file, 21 lines) @ -0,0 +1,21 @@
|
||||
FC := frt
|
||||
FCFLAGS = -X08 -Kfast -KA64FX -KSVE -KARMV8_3_A -Kzfill=100 -Kprefetch_sequential=soft -Kprefetch_line=8 -Kprefetch_line_L2=16 -Koptmsg=2 -Keval -DUSE_OMP_GET_WTIME=1 # FJ Fortran system_clock is low resolution
|
||||
|
||||
DOCONCURRENT_FLAG = -Kparallel,reduction -DNOTSHARED
|
||||
ARRAY_FLAG = -Kparallel,reduction
|
||||
OPENMP_FLAG = -fopenmp
|
||||
OPENACC_FLAG =
|
||||
# CPU only
|
||||
OPENACC_FLAG +=
|
||||
CUDA_FLAG =
|
||||
SEQUENTIAL_FLAG =
|
||||
|
||||
ifeq ($(IMPLEMENTATION),OpenACC)
|
||||
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
|
||||
endif
|
||||
ifeq ($(IMPLEMENTATION),CUDA)
|
||||
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
|
||||
endif
|
||||
ifeq ($(IMPLEMENTATION),CUDAKernels)
|
||||
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
|
||||
endif
|
||||
src/fortran/make.inc.gcc (new file, 33 lines) @ -0,0 +1,33 @@
|
||||
FC = gfortran
|
||||
FCFLAGS = -std=f2018 -O3
|
||||
FCFLAGS += -Wall -Wno-unused-dummy-argument -Wno-unused-variable
|
||||
|
||||
# MARCH=neoverse-v1,neoverse-n1,icelake-server,znver3,cortex-a78ae
|
||||
ARCH=$(shell uname -m)
|
||||
ifeq ($(ARCH),aarch64)
|
||||
ifdef MCPU
|
||||
FCFLAGS += -mcpu=$(MCPU)
|
||||
else
|
||||
FCFLAGS += -mcpu=native
|
||||
endif
|
||||
else
|
||||
ifdef MARCH
|
||||
FCFLAGS += -march=$(MARCH)
|
||||
else
|
||||
FCFLAGS += -march=native
|
||||
endif
|
||||
endif
|
||||
|
||||
DOCONCURRENT_FLAG = -ftree-parallelize-loops=4
|
||||
ARRAY_FLAG =
|
||||
OPENMP_FLAG = -fopenmp
|
||||
OPENACC_FLAG = -fopenacc
|
||||
CUDA_FLAG =
|
||||
SEQUENTIAL_FLAG =
|
||||
|
||||
ifeq ($(IMPLEMENTATION),CUDA)
|
||||
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
|
||||
endif
|
||||
ifeq ($(IMPLEMENTATION),CUDAKernels)
|
||||
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
|
||||
endif
|
||||
src/fortran/make.inc.nvhpc (new file, 70 lines) @ -0,0 +1,70 @@
|
||||
FC := nvfortran
|
||||
#FCFLAGS := -O3 -Minform=inform -Minfo=all
|
||||
FCFLAGS := -O3 -Minform=warn
|
||||
|
||||
#TARGET=gpu
|
||||
TARGET=multicore
|
||||
|
||||
NVARCH=$(shell which nvidia-smi > /dev/null && nvidia-smi -q | grep "Product Architecture")
|
||||
ifeq ($(findstring Ampere,$(NVARCH)),Ampere)
|
||||
$(info Ampere detected)
|
||||
GPU = cc80
|
||||
endif
|
||||
ifeq ($(findstring Turing,$(NVARCH)),Turing)
|
||||
$(info Turing detected)
|
||||
GPU = cc75
|
||||
endif
|
||||
ifeq ($(findstring Volta,$(NVARCH)),Volta)
|
||||
$(info Volta detected)
|
||||
GPU = cc70
|
||||
endif
|
||||
ifeq ($(findstring Pascal,$(NVARCH)),Pascal)
|
||||
$(info Pascal detected)
|
||||
GPU = cc60,cc61
|
||||
endif
|
||||
ifeq ($(shell which jetson_clocks > /dev/null && echo 1),1)
|
||||
$(info Jetson AGX Orin detected)
|
||||
GPU = cc87,cc86
|
||||
# figure out Xavier later
|
||||
#GPU = cc72
|
||||
endif
|
||||
ifeq ($(GPU),)
|
||||
$(error Your GPU architecture could not be detected. Set it manually.)
|
||||
endif
|
||||
GPUFLAG = -gpu=$(GPU)
|
||||
|
||||
# MARCH=neoverse-v1,neoverse-n1,zen3
|
||||
ARCH=$(shell uname -m)
|
||||
ifdef MARCH
|
||||
ifeq ($(ARCH),aarch64)
|
||||
ifeq ($(MARCH),neoverse-n1)
|
||||
FCFLAGS += -tp=$(MARCH)
|
||||
else
|
||||
ifeq ($(MARCH),neoverse-v1)
|
||||
FCFLAGS += -tp=$(MARCH)
|
||||
else
|
||||
FCFLAGS += -tp=native
|
||||
endif
|
||||
endif
|
||||
else
|
||||
FCFLAGS += -tp=$(MARCH)
|
||||
endif
|
||||
else
|
||||
FCFLAGS += -tp=native
|
||||
endif
|
||||
|
||||
# this is to allow apples-to-apples comparison with DC in non-DC GPU impls
|
||||
# set exactly one of these!
|
||||
#MANAGED = -DUSE_MANAGED -gpu=managed
|
||||
#DEVICE = -DUSE_DEVICE -cuda -gpu=nomanaged
|
||||
|
||||
DOCONCURRENT_FLAG = $(GPUFLAG) -stdpar=$(TARGET) $(DEVICE)
|
||||
ARRAY_FLAG = $(GPUFLAG) -stdpar=$(TARGET) $(MANAGED)
|
||||
OPENMP_FLAG = $(GPUFLAG) -mp=$(TARGET) $(MANAGED)
|
||||
OPENACC_FLAG = $(GPUFLAG) -acc=$(TARGET) $(MANAGED)
|
||||
CUDA_FLAG = $(GPUFLAG) -cuda -acc=gpu $(MANAGED)
|
||||
SEQUENTIAL_FLAG =
|
||||
|
||||
ifeq ($(IMPLEMENTATION),OpenMPTaskloop)
|
||||
$(error IMPLEMENTATION=OpenMPTaskloop is not supported by this compiler.)
|
||||
endif
|
||||
src/fortran/make.inc.oneapi (new file, 32 lines) @ -0,0 +1,32 @@
|
||||
FC := ifx
|
||||
FCFLAGS = -std18
|
||||
FCFLAGS += -Ofast -xHOST
|
||||
FCFLAGS += -qopt-zmm-usage=low
|
||||
|
||||
ifeq ($(FC),ifort)
|
||||
FCFLAGS += -qopt-streaming-stores=always
|
||||
PARALLEL = -parallel
|
||||
endif
|
||||
|
||||
DOCONCURRENT_FLAG = -qopenmp $(PARALLEL)
|
||||
ARRAY_FLAG = -qopenmp $(PARALLEL)
|
||||
OPENMP_FLAG = -qopenmp
|
||||
ifeq ($(FC),ifx)
|
||||
OPENMP_FLAG += -fopenmp-targets=spir64 -DUSE_FLOAT=1
|
||||
endif
|
||||
OPENACC_FLAG =
|
||||
CUDA_FLAG =
|
||||
SEQUENTIAL_FLAG =
|
||||
|
||||
ifeq ($(IMPLEMENTATION),OpenACC)
|
||||
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
|
||||
endif
|
||||
ifeq ($(IMPLEMENTATION),OpenACCArray)
|
||||
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
|
||||
endif
|
||||
ifeq ($(IMPLEMENTATION),CUDA)
|
||||
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
|
||||
endif
|
||||
ifeq ($(IMPLEMENTATION),CUDAKernels)
|
||||
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
|
||||
endif
|
||||
src/fortran/run.sh (new executable file, 35 lines) @ -0,0 +1,35 @@
|
||||
#!/bin/bash
|
||||
|
||||
cat ./run.sh
|
||||
|
||||
if [ `uname -s` == Darwin ] ; then
|
||||
NUM_HWTHREADS=`sysctl -n hw.ncpu`
|
||||
MEMORY_BYTES=`sysctl -n hw.memsize`
|
||||
else
|
||||
NUM_HWTHREADS=`nproc`
|
||||
MEMORY_KILOS=`grep MemTotal /proc/meminfo | awk '{print $2}'`
|
||||
fi
|
||||
|
||||
M=128
|
||||
|
||||
export OMP_NUM_THREADS=8
|
||||
export OMP_PROC_BIND=close
|
||||
export OMP_PLACES=threads
|
||||
|
||||
export ACC_NUM_CORES=${OMP_NUM_THREADS}
|
||||
|
||||
AFFCONTROL="numactl -N 0 -m 0 -C `seq -s "," 0 $((${OMP_NUM_THREADS}-1))`"
|
||||
|
||||
for compiler in gcc nvhpc cray oneapi arm amd fj ; do
|
||||
#if [ "x$compiler" == "xgcc" ] ; then
|
||||
# export LD_PRELOAD=/usr/lib/gcc/aarch64-linux-gnu/11/libgomp.so
|
||||
#fi
|
||||
for implementation in OpenMP OpenMPTaskloop OpenMPWorkshare DoConcurrent Array OpenACC OpenACCArray CUDA CUDAKernel ; do
|
||||
if [ -f BabelStream.${compiler}.${implementation} ] ; then
|
||||
echo "BabelStream.${compiler}.${implementation}"
|
||||
ldd BabelStream.${compiler}.${implementation}
|
||||
time $AFFCONTROL \
|
||||
./BabelStream.${compiler}.${implementation} -s $((1024*1024*${M}))
|
||||
fi
|
||||
done
|
||||
done
|
||||
src/futhark/FutharkStream.cpp (new file, 212 lines) @ -0,0 +1,212 @@
|
||||
// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith,
|
||||
// University of Bristol HPC
|
||||
// Copyright (c) 2022 Troels Henriksen
|
||||
// University of Copenhagen
|
||||
//
|
||||
// For full license terms please see the LICENSE file distributed with this
|
||||
// source code
|
||||
|
||||
#include <cstdlib> // For aligned_alloc
|
||||
#include <string>
|
||||
#include "FutharkStream.h"
|
||||
|
||||
template <class T>
|
||||
FutharkStream<T>::FutharkStream(const int ARRAY_SIZE, int device)
|
||||
{
|
||||
this->array_size = ARRAY_SIZE;
|
||||
this->cfg = futhark_context_config_new();
|
||||
this->device = "#" + std::to_string(device);
|
||||
#if defined(FUTHARK_BACKEND_cuda) || defined(FUTHARK_BACKEND_opencl)
|
||||
futhark_context_config_set_device(cfg, this->device.c_str());
|
||||
#endif
|
||||
this->ctx = futhark_context_new(cfg);
|
||||
this->a = NULL;
|
||||
this->b = NULL;
|
||||
this->c = NULL;
|
||||
}
|
||||
|
||||
template <>
|
||||
FutharkStream<float>::~FutharkStream()
|
||||
{
|
||||
if (this->a) {
|
||||
futhark_free_f32_1d(this->ctx, (futhark_f32_1d*)this->a);
|
||||
}
|
||||
if (this->b) {
|
||||
futhark_free_f32_1d(this->ctx, (futhark_f32_1d*)this->b);
|
||||
}
|
||||
if (this->c) {
|
||||
futhark_free_f32_1d(this->ctx, (futhark_f32_1d*)this->c);
|
||||
}
|
||||
futhark_context_free(this->ctx);
|
||||
futhark_context_config_free(this->cfg);
|
||||
}
|
||||
|
||||
template <>
|
||||
FutharkStream<double>::~FutharkStream()
|
||||
{
|
||||
if (this->a) {
|
||||
futhark_free_f64_1d(this->ctx, (futhark_f64_1d*)this->a);
|
||||
}
|
||||
if (this->b) {
|
||||
futhark_free_f64_1d(this->ctx, (futhark_f64_1d*)this->b);
|
||||
}
|
||||
if (this->c) {
|
||||
futhark_free_f64_1d(this->ctx, (futhark_f64_1d*)this->c);
|
||||
}
|
||||
futhark_context_free(this->ctx);
|
||||
futhark_context_config_free(this->cfg);
|
||||
}
|
||||
|
||||
template <>
|
||||
void FutharkStream<float>::init_arrays(float initA, float initB, float initC) {
|
||||
int array_size = this->array_size;
|
||||
float *a = new float[array_size];
|
||||
float *b = new float[array_size];
|
||||
float *c = new float[array_size];
|
||||
for (int i = 0; i < array_size; i++) {
|
||||
a[i] = initA;
|
||||
b[i] = initB;
|
||||
c[i] = initC;
|
||||
}
|
||||
this->a = (futhark_f32_1d*)futhark_new_f32_1d(this->ctx, a, array_size);
|
||||
this->b = (futhark_f32_1d*)futhark_new_f32_1d(this->ctx, b, array_size);
|
||||
this->c = (futhark_f32_1d*)futhark_new_f32_1d(this->ctx, c, array_size);
|
||||
futhark_context_sync(this->ctx);
|
||||
delete[] a;
|
||||
delete[] b;
|
||||
delete[] c;
|
||||
}
|
||||
|
||||
template <>
|
||||
void FutharkStream<double>::init_arrays(double initA, double initB, double initC) {
|
||||
int array_size = this->array_size;
|
||||
double *a = new double[array_size];
|
||||
double *b = new double[array_size];
|
||||
double *c = new double[array_size];
|
||||
for (int i = 0; i < array_size; i++) {
|
||||
a[i] = initA;
|
||||
b[i] = initB;
|
||||
c[i] = initC;
|
||||
}
|
||||
this->a = (futhark_f64_1d*)futhark_new_f64_1d(this->ctx, a, array_size);
|
||||
this->b = (futhark_f64_1d*)futhark_new_f64_1d(this->ctx, b, array_size);
|
||||
this->c = (futhark_f64_1d*)futhark_new_f64_1d(this->ctx, c, array_size);
|
||||
futhark_context_sync(this->ctx);
|
||||
delete[] a;
|
||||
delete[] b;
|
||||
delete[] c;
|
||||
}
|
||||
|
||||
template <>
|
||||
void FutharkStream<float>::read_arrays(std::vector<float>& h_a, std::vector<float>& h_b, std::vector<float>& h_c) {
|
||||
futhark_values_f32_1d(this->ctx, (futhark_f32_1d*)this->a, h_a.data());
|
||||
futhark_values_f32_1d(this->ctx, (futhark_f32_1d*)this->b, h_b.data());
|
||||
futhark_values_f32_1d(this->ctx, (futhark_f32_1d*)this->c, h_c.data());
|
||||
futhark_context_sync(this->ctx);
|
||||
}
|
||||
|
||||
template <>
|
||||
void FutharkStream<double>::read_arrays(std::vector<double>& h_a, std::vector<double>& h_b, std::vector<double>& h_c) {
|
||||
futhark_values_f64_1d(this->ctx, (futhark_f64_1d*)this->a, h_a.data());
|
||||
futhark_values_f64_1d(this->ctx, (futhark_f64_1d*)this->b, h_b.data());
|
||||
futhark_values_f64_1d(this->ctx, (futhark_f64_1d*)this->c, h_c.data());
|
||||
futhark_context_sync(this->ctx);
|
||||
}
|
||||
|
||||
template <>
|
||||
void FutharkStream<float>::copy() {
|
||||
futhark_free_f32_1d(this->ctx, (futhark_f32_1d*)this->c);
|
||||
futhark_entry_f32_copy(this->ctx, (futhark_f32_1d**)&this->c, (futhark_f32_1d*)this->a);
|
||||
futhark_context_sync(this->ctx);
|
||||
}
|
||||
|
||||
template <>
|
||||
void FutharkStream<double>::copy() {
|
||||
futhark_free_f64_1d(this->ctx, (futhark_f64_1d*)this->c);
|
||||
futhark_entry_f64_copy(this->ctx, (futhark_f64_1d**)&this->c, (futhark_f64_1d*)this->a);
|
||||
futhark_context_sync(this->ctx);
|
||||
}
|
||||
|
||||
template <>
|
||||
void FutharkStream<float>::mul() {
|
||||
futhark_free_f32_1d(this->ctx, (futhark_f32_1d*)this->b);
|
||||
futhark_entry_f32_mul(this->ctx, (futhark_f32_1d**)&this->b, (futhark_f32_1d*)this->c);
|
||||
futhark_context_sync(this->ctx);
|
||||
}
|
||||
|
||||
template <>
|
||||
void FutharkStream<double>::mul() {
|
||||
futhark_free_f64_1d(this->ctx, (futhark_f64_1d*)this->b);
|
||||
futhark_entry_f64_mul(this->ctx, (futhark_f64_1d**)&this->b, (futhark_f64_1d*)this->c);
|
||||
futhark_context_sync(this->ctx);
|
||||
}
|
||||
|
||||
template <>
|
||||
void FutharkStream<float>::add() {
|
||||
futhark_free_f32_1d(this->ctx, (futhark_f32_1d*)this->c);
|
||||
futhark_entry_f32_add(this->ctx, (futhark_f32_1d**)&this->c, (futhark_f32_1d*)this->a, (futhark_f32_1d*)this->b);
|
||||
futhark_context_sync(this->ctx);
|
||||
}
|
||||
|
||||
template <>
|
||||
void FutharkStream<double>::add() {
|
||||
futhark_free_f64_1d(this->ctx, (futhark_f64_1d*)this->c);
|
||||
futhark_entry_f64_add(this->ctx, (futhark_f64_1d**)&this->c, (futhark_f64_1d*)this->a, (futhark_f64_1d*)this->b);
|
||||
futhark_context_sync(this->ctx);
|
||||
}
|
||||
|
||||
template <>
|
||||
void FutharkStream<float>::triad() {
|
||||
// triad writes a = b + scalar*c, mirroring the f64 specialisation below
futhark_free_f32_1d(this->ctx, (futhark_f32_1d*)this->a);
futhark_entry_f32_triad(this->ctx, (futhark_f32_1d**)&this->a, (futhark_f32_1d*)this->b, (futhark_f32_1d*)this->c);
|
||||
futhark_context_sync(this->ctx);
|
||||
}
|
||||
|
||||
template <>
|
||||
void FutharkStream<double>::triad() {
|
||||
futhark_free_f64_1d(this->ctx, (futhark_f64_1d*)this->a);
|
||||
futhark_entry_f64_triad(this->ctx, (futhark_f64_1d**)&this->a, (futhark_f64_1d*)this->b, (futhark_f64_1d*)this->c);
|
||||
futhark_context_sync(this->ctx);
|
||||
}
|
||||
|
||||
template <>
|
||||
void FutharkStream<float>::nstream() {
|
||||
futhark_f32_1d* d;
|
||||
// nstream updates 'a' (a += b + scalar*c); call the f32_nstream entry point.
// The entry point consumes 'a' (declared unique in babelstream.fut), so the
// old handle is replaced rather than freed here.
futhark_entry_f32_nstream(this->ctx, &d, (futhark_f32_1d*)this->a, (futhark_f32_1d*)this->b, (futhark_f32_1d*)this->c);
this->a = d;
|
||||
futhark_context_sync(this->ctx);
|
||||
}
|
||||
|
||||
template <>
|
||||
void FutharkStream<double>::nstream() {
|
||||
futhark_f64_1d* d;
|
||||
// nstream updates 'a' (a += b + scalar*c); call the f64_nstream entry point.
// The entry point consumes 'a' (declared unique in babelstream.fut), so the
// old handle is replaced rather than freed here.
futhark_entry_f64_nstream(this->ctx, &d, (futhark_f64_1d*)this->a, (futhark_f64_1d*)this->b, (futhark_f64_1d*)this->c);
this->a = d;
|
||||
futhark_context_sync(this->ctx);
|
||||
}
|
||||
|
||||
template <>
|
||||
float FutharkStream<float>::dot() {
|
||||
float res;
|
||||
futhark_entry_f32_dot(this->ctx, &res, (futhark_f32_1d*)this->a, (futhark_f32_1d*)this->b);
|
||||
futhark_context_sync(this->ctx);
|
||||
return res;
|
||||
}
|
||||
|
||||
template <>
|
||||
double FutharkStream<double>::dot() {
|
||||
double res;
|
||||
futhark_entry_f64_dot(this->ctx, &res, (futhark_f64_1d*)this->a, (futhark_f64_1d*)this->b);
|
||||
futhark_context_sync(this->ctx);
|
||||
return res;
|
||||
}
|
||||
|
||||
void listDevices(void)
|
||||
{
|
||||
std::cout << "Device selection not supported." << std::endl;
|
||||
}
|
||||
|
||||
template class FutharkStream<float>;
|
||||
template class FutharkStream<double>;
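The calls above all go through babelstream.h, which model.cmake (below) generates from babelstream.fut with the Futhark compiler. For orientation, a minimal standalone sketch of that API's lifecycle for the f32 dot entry point, using only functions that already appear in FutharkStream.cpp; the array size and contents are arbitrary, and the status codes returned by the API calls are ignored for brevity:

    #include <cstdio>
    #include <vector>
    #include "babelstream.h"  // generated by: futhark <backend> --library babelstream.fut

    int main() {
      const int n = 1024;
      std::vector<float> h_a(n, 0.1f), h_b(n, 0.2f);

      struct futhark_context_config *cfg = futhark_context_config_new();
      struct futhark_context *ctx = futhark_context_new(cfg);

      // Upload host buffers into Futhark-managed arrays.
      struct futhark_f32_1d *a = futhark_new_f32_1d(ctx, h_a.data(), n);
      struct futhark_f32_1d *b = futhark_new_f32_1d(ctx, h_b.data(), n);

      // Run the dot entry point and wait for the result to be valid.
      float sum = 0.0f;
      futhark_entry_f32_dot(ctx, &sum, a, b);
      futhark_context_sync(ctx);
      std::printf("dot = %f (expected %f)\n", sum, n * 0.1f * 0.2f);

      futhark_free_f32_1d(ctx, a);
      futhark_free_f32_1d(ctx, b);
      futhark_context_free(ctx);
      futhark_context_config_free(cfg);
      return 0;
    }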
|
||||
src/futhark/FutharkStream.h (new file, 60 lines) @ -0,0 +1,60 @@
|
||||
// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith,
|
||||
// University of Bristol HPC
|
||||
// Copyright (c) 2022 Troels Henriksen
|
||||
// University of Copenhagen
|
||||
//
|
||||
// For full license terms please see the LICENSE file distributed with this
|
||||
// source code
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <iostream>
|
||||
#include <stdexcept>
|
||||
|
||||
#include "Stream.h"
|
||||
#include "babelstream.h"
|
||||
|
||||
#if defined(FUTHARK_BACKEND_c)
|
||||
#define IMPLEMENTATION_STRING "Futhark (sequential)"
|
||||
#elif defined(FUTHARK_BACKEND_multicore)
|
||||
#define IMPLEMENTATION_STRING "Futhark (parallel CPU)"
|
||||
#elif defined(FUTHARK_BACKEND_opencl)
|
||||
#define IMPLEMENTATION_STRING "Futhark (OpencL)"
|
||||
#elif defined(FUTHARK_BACKEND_cuda)
|
||||
#define IMPLEMENTATION_STRING "Futhark (CUDA)"
|
||||
#else
|
||||
#define IMPLEMENTATION_STRING "Futhark (unknown backend)"
|
||||
#endif
|
||||
|
||||
template <class T>
|
||||
class FutharkStream : public Stream<T>
|
||||
{
|
||||
protected:
|
||||
// Size of arrays
|
||||
int array_size;
|
||||
// For device selection.
|
||||
std::string device;
|
||||
|
||||
// Futhark stuff
|
||||
struct futhark_context_config *cfg;
|
||||
struct futhark_context *ctx;
|
||||
|
||||
// Device side arrays
|
||||
void* a;
|
||||
void* b;
|
||||
void* c;
|
||||
|
||||
public:
|
||||
FutharkStream(const int, int);
|
||||
~FutharkStream();
|
||||
|
||||
virtual void copy() override;
|
||||
virtual void add() override;
|
||||
virtual void mul() override;
|
||||
virtual void triad() override;
|
||||
virtual void nstream() override;
|
||||
virtual T dot() override;
|
||||
|
||||
virtual void init_arrays(T initA, T initB, T initC) override;
|
||||
virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
|
||||
};
|
||||
src/futhark/babelstream.fut (new file, 62 lines) @ -0,0 +1,62 @@
|
||||
module type kernels = {
|
||||
type t
|
||||
val copy [n] : [n]t -> *[n]t
|
||||
val mul [n] : t -> [n]t -> [n]t
|
||||
val add [n] : [n]t -> [n]t -> [n]t
|
||||
val triad [n] : t -> [n]t -> [n]t -> [n]t
|
||||
val dot [n] : [n]t -> [n]t -> t
|
||||
-- Uniqueness allows nstream to mutate the 'a' array.
|
||||
val nstream [n] : t -> *[n]t -> [n]t -> [n]t -> [n]t
|
||||
}
|
||||
|
||||
module kernels (P: real) : kernels with t = P.t = {
|
||||
type t = P.t
|
||||
def copy = copy
|
||||
def mul scalar c = map (P.*scalar) c
|
||||
def add = map2 (P.+)
|
||||
def triad scalar b c = map2 (P.+) b (map (P.* scalar) c)
|
||||
def dot a b = reduce (P.+) (P.i32 0) (map2 (P.*) a b)
|
||||
def nstream scalar a b c = map2 (P.+) a (map2 (P.+) b (map (P.*scalar) c))
|
||||
}
|
||||
|
||||
module f32_kernels = kernels f32
|
||||
def f32_start_scalar : f32 = 0.4
|
||||
entry f32_copy = f32_kernels.copy
|
||||
entry f32_mul = f32_kernels.mul f32_start_scalar
|
||||
entry f32_add = f32_kernels.add
|
||||
entry f32_triad = f32_kernels.triad f32_start_scalar
|
||||
entry f32_nstream = f32_kernels.nstream f32_start_scalar
|
||||
entry f32_dot = f32_kernels.dot
|
||||
|
||||
module f64_kernels = kernels f64
|
||||
def f64_start_scalar : f64 = 0.4
|
||||
entry f64_copy = f64_kernels.copy
|
||||
entry f64_mul = f64_kernels.mul f64_start_scalar
|
||||
entry f64_add = f64_kernels.add
|
||||
entry f64_triad = f64_kernels.triad f64_start_scalar
|
||||
entry f64_nstream = f64_kernels.nstream f64_start_scalar
|
||||
entry f64_dot = f64_kernels.dot
|
||||
|
||||
-- ==
|
||||
-- entry: f32_copy f32_mul
|
||||
-- random input { [33554432]f32 }
|
||||
|
||||
-- ==
|
||||
-- entry: f32_add f32_dot f32_triad
|
||||
-- random input { [33554432]f32 [33554432]f32 }
|
||||
|
||||
-- ==
|
||||
-- entry: f32_nstream
|
||||
-- random input { [33554432]f32 [33554432]f32 [33554432]f32 }
|
||||
|
||||
-- ==
|
||||
-- entry: f64_copy f64_mul
|
||||
-- random input { [33554432]f64 }
|
||||
|
||||
-- ==
|
||||
-- entry: f64_add f64_dot f64_triad
|
||||
-- random input { [33554432]f64 [33554432]f64 }
|
||||
|
||||
-- ==
|
||||
-- entry: f64_nstream
|
||||
-- random input { [33554432]f64 [33554432]f64 [33554432]f64 }
|
||||
src/futhark/model.cmake (new file, 55 lines) @ -0,0 +1,55 @@
|
||||
# Use
|
||||
#
|
||||
# cmake -Bbuild -H. -DMODEL=futhark -DFUTHARK_BACKEND=foo -DFUTHARK_COMPILER=foo/bar/bin/futhark
|
||||
#
|
||||
# to use the Futhark backend, where 'foo' must be one of 'multicore',
|
||||
# 'c', 'opencl', or 'cuda'. Defaults to 'multicore'.
|
||||
#
|
||||
# Use -DFUTHARK_COMPILER to set the path to the Futhark compiler
|
||||
# binary. Defaults to 'futhark' on the PATH.
|
||||
|
||||
register_flag_optional(FUTHARK_BACKEND
|
||||
"Use a specific Futhark backend, possible options are:
|
||||
- c
|
||||
- multicore
|
||||
- opencl
|
||||
- cuda"
|
||||
"multicore")
|
||||
|
||||
register_flag_optional(FUTHARK_COMPILER
|
||||
"Absolute path to the Futhark compiler, defaults to the futhark compiler on PATH"
|
||||
"futhark")
|
||||
|
||||
macro(setup)
|
||||
add_custom_command(
|
||||
OUTPUT
|
||||
${CMAKE_CURRENT_BINARY_DIR}/babelstream.c
|
||||
${CMAKE_CURRENT_BINARY_DIR}/babelstream.h
|
||||
COMMAND ${FUTHARK_COMPILER} ${FUTHARK_BACKEND}
|
||||
--library src/futhark/babelstream.fut
|
||||
-o ${CMAKE_CURRENT_BINARY_DIR}/babelstream
|
||||
DEPENDS src/futhark/babelstream.fut
|
||||
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
|
||||
VERBATIM
|
||||
)
|
||||
if (${FUTHARK_BACKEND} STREQUAL "c")
|
||||
# Nothing to do.
|
||||
elseif (${FUTHARK_BACKEND} STREQUAL "multicore")
|
||||
set(THREADS_PREFER_PTHREAD_FLAG ON)
|
||||
find_package(Threads REQUIRED)
|
||||
register_link_library(Threads::Threads)
|
||||
elseif (${FUTHARK_BACKEND} STREQUAL "opencl")
|
||||
find_package(OpenCL REQUIRED)
|
||||
register_link_library(OpenCL::OpenCL)
|
||||
elseif (${FUTHARK_BACKEND} STREQUAL "cuda")
|
||||
find_package(CUDA REQUIRED)
|
||||
register_link_library("nvrtc" "cuda" "cudart")
|
||||
else ()
|
||||
message(FATAL_ERROR "Unsupported Futhark backend: ${FUTHARK_BACKEND}")
|
||||
endif()
|
||||
endmacro()
|
||||
|
||||
macro(setup_target)
|
||||
target_sources(${EXE_NAME} PUBLIC "${CMAKE_CURRENT_BINARY_DIR}/babelstream.c")
|
||||
include_directories("${CMAKE_CURRENT_BINARY_DIR}")
|
||||
endmacro()
|
||||
@ -9,7 +9,7 @@
|
||||
#include "hip/hip_runtime.h"
|
||||
|
||||
#define TBSIZE 1024
|
||||
#define DOT_NUM_BLOCKS 256
|
||||
|
||||
|
||||
void check_error(void)
|
||||
{
|
||||
@ -45,34 +45,63 @@ HIPStream<T>::HIPStream(const int ARRAY_SIZE, const int device_index)
|
||||
// Print out device information
|
||||
std::cout << "Using HIP device " << getDeviceName(device_index) << std::endl;
|
||||
std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl;
|
||||
#if defined(MANAGED)
|
||||
std::cout << "Memory: MANAGED" << std::endl;
|
||||
#elif defined(PAGEFAULT)
|
||||
std::cout << "Memory: PAGEFAULT" << std::endl;
|
||||
#else
|
||||
std::cout << "Memory: DEFAULT" << std::endl;
|
||||
#endif
|
||||
|
||||
array_size = ARRAY_SIZE;
|
||||
// Round dot_num_blocks up so that TBSIZE * dot_elements_per_lane * dot_num_blocks covers the whole array
|
||||
dot_num_blocks = (array_size + (TBSIZE * dot_elements_per_lane - 1)) / (TBSIZE * dot_elements_per_lane);
|
||||
|
||||
// Allocate the host array for partial sums for dot kernels
|
||||
sums = (T*)malloc(sizeof(T) * DOT_NUM_BLOCKS);
|
||||
size_t array_bytes = sizeof(T);
|
||||
array_bytes *= ARRAY_SIZE;
|
||||
size_t total_bytes = array_bytes * 3;
|
||||
|
||||
// Allocate the host array for partial sums for dot kernels using hipHostMalloc.
|
||||
// This creates an array on the host which is visible to the device. However, it requires
|
||||
// synchronization (e.g. hipDeviceSynchronize) for the result to be available on the host
|
||||
// after it has been passed through to a kernel.
|
||||
hipHostMalloc(&sums, sizeof(T) * dot_num_blocks, hipHostMallocNonCoherent);
|
||||
check_error();
|
||||
|
||||
// Check buffers fit on the device
|
||||
hipDeviceProp_t props;
|
||||
hipGetDeviceProperties(&props, 0);
|
||||
if (props.totalGlobalMem < 3*ARRAY_SIZE*sizeof(T))
|
||||
if (props.totalGlobalMem < std::size_t{3}*ARRAY_SIZE*sizeof(T))
|
||||
throw std::runtime_error("Device does not have enough memory for all 3 buffers");
|
||||
|
||||
// Create device buffers
|
||||
hipMalloc(&d_a, ARRAY_SIZE*sizeof(T));
|
||||
// Create device buffers
|
||||
#if defined(MANAGED)
|
||||
hipMallocManaged(&d_a, array_bytes);
|
||||
check_error();
|
||||
hipMalloc(&d_b, ARRAY_SIZE*sizeof(T));
|
||||
hipMallocManaged(&d_b, array_bytes);
|
||||
check_error();
|
||||
hipMalloc(&d_c, ARRAY_SIZE*sizeof(T));
|
||||
hipMallocManaged(&d_c, array_bytes);
|
||||
check_error();
|
||||
hipMalloc(&d_sum, DOT_NUM_BLOCKS*sizeof(T));
|
||||
#elif defined(PAGEFAULT)
|
||||
d_a = (T*)malloc(array_bytes);
|
||||
d_b = (T*)malloc(array_bytes);
|
||||
d_c = (T*)malloc(array_bytes);
|
||||
#else
|
||||
hipMalloc(&d_a, array_bytes);
|
||||
check_error();
|
||||
hipMalloc(&d_b, array_bytes);
|
||||
check_error();
|
||||
hipMalloc(&d_c, array_bytes);
|
||||
check_error();
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
template <class T>
|
||||
HIPStream<T>::~HIPStream()
|
||||
{
|
||||
free(sums);
|
||||
hipHostFree(sums);
|
||||
check_error();
|
||||
|
||||
hipFree(d_a);
|
||||
check_error();
|
||||
@ -80,15 +109,13 @@ HIPStream<T>::~HIPStream()
|
||||
check_error();
|
||||
hipFree(d_c);
|
||||
check_error();
|
||||
hipFree(d_sum);
|
||||
check_error();
|
||||
}
|
||||
|
||||
|
||||
template <typename T>
|
||||
__global__ void init_kernel(T * a, T * b, T * c, T initA, T initB, T initC)
|
||||
{
|
||||
const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
|
||||
const size_t i = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
a[i] = initA;
|
||||
b[i] = initB;
|
||||
c[i] = initC;
|
||||
@ -97,7 +124,7 @@ __global__ void init_kernel(T * a, T * b, T * c, T initA, T initB, T initC)
|
||||
template <class T>
|
||||
void HIPStream<T>::init_arrays(T initA, T initB, T initC)
|
||||
{
|
||||
hipLaunchKernelGGL(HIP_KERNEL_NAME(init_kernel<T>), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_a, d_b, d_c, initA, initB, initC);
|
||||
init_kernel<T><<<dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0>>>(d_a, d_b, d_c, initA, initB, initC);
|
||||
check_error();
|
||||
hipDeviceSynchronize();
|
||||
check_error();
|
||||
@ -106,27 +133,37 @@ void HIPStream<T>::init_arrays(T initA, T initB, T initC)
|
||||
template <class T>
|
||||
void HIPStream<T>::read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c)
|
||||
{
|
||||
|
||||
// Copy device memory to host
|
||||
#if defined(PAGEFAULT) || defined(MANAGED)
|
||||
hipDeviceSynchronize();
|
||||
for (int i = 0; i < array_size; i++)
|
||||
{
|
||||
a[i] = d_a[i];
|
||||
b[i] = d_b[i];
|
||||
c[i] = d_c[i];
|
||||
}
|
||||
#else
|
||||
hipMemcpy(a.data(), d_a, a.size()*sizeof(T), hipMemcpyDeviceToHost);
|
||||
check_error();
|
||||
hipMemcpy(b.data(), d_b, b.size()*sizeof(T), hipMemcpyDeviceToHost);
|
||||
check_error();
|
||||
hipMemcpy(c.data(), d_c, c.size()*sizeof(T), hipMemcpyDeviceToHost);
|
||||
check_error();
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
template <typename T>
|
||||
__global__ void copy_kernel(const T * a, T * c)
|
||||
{
|
||||
const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
const size_t i = threadIdx.x + blockIdx.x * blockDim.x;
c[i] = a[i];
}

template <class T>
void HIPStream<T>::copy()
{
hipLaunchKernelGGL(HIP_KERNEL_NAME(copy_kernel<T>), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_a, d_c);
copy_kernel<T><<<dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0>>>(d_a, d_c);
check_error();
hipDeviceSynchronize();
check_error();
@@ -136,14 +173,14 @@ template <typename T>
__global__ void mul_kernel(T * b, const T * c)
{
const T scalar = startScalar;
const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
const size_t i = threadIdx.x + blockIdx.x * blockDim.x;
b[i] = scalar * c[i];
}

template <class T>
void HIPStream<T>::mul()
{
hipLaunchKernelGGL(HIP_KERNEL_NAME(mul_kernel<T>), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_b, d_c);
mul_kernel<T><<<dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0>>>(d_b, d_c);
check_error();
hipDeviceSynchronize();
check_error();
@@ -152,14 +189,14 @@ void HIPStream<T>::mul()
template <typename T>
__global__ void add_kernel(const T * a, const T * b, T * c)
{
const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
const size_t i = threadIdx.x + blockIdx.x * blockDim.x;
c[i] = a[i] + b[i];
}

template <class T>
void HIPStream<T>::add()
{
hipLaunchKernelGGL(HIP_KERNEL_NAME(add_kernel<T>), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_a, d_b, d_c);
add_kernel<T><<<dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0>>>(d_a, d_b, d_c);
check_error();
hipDeviceSynchronize();
check_error();
@@ -169,14 +206,14 @@ template <typename T>
__global__ void triad_kernel(T * a, const T * b, const T * c)
{
const T scalar = startScalar;
const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
const size_t i = threadIdx.x + blockIdx.x * blockDim.x;
a[i] = b[i] + scalar * c[i];
}

template <class T>
void HIPStream<T>::triad()
{
hipLaunchKernelGGL(HIP_KERNEL_NAME(triad_kernel<T>), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_a, d_b, d_c);
triad_kernel<T><<<dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0>>>(d_a, d_b, d_c);
check_error();
hipDeviceSynchronize();
check_error();
@@ -186,32 +223,32 @@ template <typename T>
__global__ void nstream_kernel(T * a, const T * b, const T * c)
{
const T scalar = startScalar;
const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
const size_t i = threadIdx.x + blockIdx.x * blockDim.x;
a[i] += b[i] + scalar * c[i];
}

template <class T>
void HIPStream<T>::nstream()
{
hipLaunchKernelGGL(HIP_KERNEL_NAME(nstream_kernel<T>), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_a, d_b, d_c);
nstream_kernel<T><<<dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0>>>(d_a, d_b, d_c);
check_error();
hipDeviceSynchronize();
check_error();
}

template <class T>
template <typename T>
__global__ void dot_kernel(const T * a, const T * b, T * sum, int array_size)
{
__shared__ T tb_sum[TBSIZE];

int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
const size_t local_i = hipThreadIdx_x;
const size_t local_i = threadIdx.x;
size_t i = blockDim.x * blockIdx.x + local_i;

tb_sum[local_i] = 0.0;
for (; i < array_size; i += hipBlockDim_x*hipGridDim_x)
tb_sum[local_i] = {};
for (; i < array_size; i += blockDim.x*gridDim.x)
tb_sum[local_i] += a[i] * b[i];

for (int offset = hipBlockDim_x / 2; offset > 0; offset /= 2)
for (size_t offset = blockDim.x / 2; offset > 0; offset /= 2)
{
__syncthreads();
if (local_i < offset)
@@ -221,20 +258,19 @@ __global__ void dot_kernel(const T * a, const T * b, T * sum, int array_size)
}

if (local_i == 0)
sum[hipBlockIdx_x] = tb_sum[local_i];
sum[blockIdx.x] = tb_sum[local_i];
}

template <class T>
T HIPStream<T>::dot()
{
hipLaunchKernelGGL(HIP_KERNEL_NAME(dot_kernel<T>), dim3(DOT_NUM_BLOCKS), dim3(TBSIZE), 0, 0, d_a, d_b, d_sum, array_size);
dot_kernel<T><<<dim3(dot_num_blocks), dim3(TBSIZE), 0, 0>>>(d_a, d_b, sums, array_size);
check_error();
hipDeviceSynchronize();
check_error();

hipMemcpy(sums, d_sum, DOT_NUM_BLOCKS*sizeof(T), hipMemcpyDeviceToHost);
check_error();

T sum = 0.0;
for (int i = 0; i < DOT_NUM_BLOCKS; i++)
T sum{};
for (int i = 0; i < dot_num_blocks; i++)
sum += sums[i];

return sum;

@@ -14,13 +31,14 @@
#include "Stream.h"

#define IMPLEMENTATION_STRING "HIP"
#define DOT_READ_DWORDS_PER_LANE 4


template <class T>
class HIPStream : public Stream<T>
{
// Make sure that either:
// DOT_READ_DWORDS_PER_LANE is less than sizeof(T), in which case we default to 1 element
// or
// DOT_READ_DWORDS_PER_LANE is divisible by sizeof(T)
static_assert((DOT_READ_DWORDS_PER_LANE * sizeof(unsigned int) < sizeof(T)) ||
(DOT_READ_DWORDS_PER_LANE * sizeof(unsigned int) % sizeof(T) == 0),
"DOT_READ_DWORDS_PER_LANE not divisible by sizeof(element_type)");

// Take into account the datatype size
// That is, for 4 DOT_READ_DWORDS_PER_LANE, this is 2 FP64 elements
// and 4 FP32 elements
static constexpr unsigned int dot_elements_per_lane{
(DOT_READ_DWORDS_PER_LANE * sizeof(unsigned int)) < sizeof(T) ? 1 : (
DOT_READ_DWORDS_PER_LANE * sizeof(unsigned int) / sizeof(T))};

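A hedged restatement of the sizing rule encoded by dot_elements_per_lane above, taking sizeof(unsigned int) = 4 bytes (my assumption; true on the platforms this port targets):

$$ \text{dot\_elements\_per\_lane} = \begin{cases} 1, & 4\cdot\text{DOT\_READ\_DWORDS\_PER\_LANE} < \operatorname{sizeof}(T) \\ \dfrac{4\cdot\text{DOT\_READ\_DWORDS\_PER\_LANE}}{\operatorname{sizeof}(T)}, & \text{otherwise,} \end{cases} $$

so with DOT_READ_DWORDS_PER_LANE = 4 each lane reads 16 bytes, i.e. 16/8 = 2 FP64 or 16/4 = 4 FP32 elements, which is exactly what the comment states.
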
protected:
// Size of arrays
int array_size;
int dot_num_blocks;

// Host array for partial sums for dot kernel
T *sums;
@@ -29,7 +47,6 @@ class HIPStream : public Stream<T>
T *d_a;
T *d_b;
T *d_c;
T *d_sum;


public:

@@ -2,6 +2,13 @@
register_flag_required(CMAKE_CXX_COMPILER
"Absolute path to the AMD HIP C++ compiler")

register_flag_optional(MEM "Device memory mode:
DEFAULT - allocate host and device memory pointers.
MANAGED - use HIP Managed Memory.
PAGEFAULT - shared memory, only host pointers allocated."
"DEFAULT")

macro(setup)
# nothing to do here as hipcc does everything correctly, what a surprise!
register_definitions(${MEM})
endmacro()
@@ -7,12 +7,12 @@

<artifactId>java-stream</artifactId>
<groupId>javastream</groupId>
<version>4.0</version>
<version>5.0</version>

<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
<junit.version>5.7.2</junit.version>
<junit.version>5.9.2</junit.version>
</properties>

<repositories>
@@ -27,19 +27,19 @@
<dependency>
<groupId>com.beust</groupId>
<artifactId>jcommander</artifactId>
<version>1.81</version>
<version>1.82</version>
</dependency>

<dependency>
<groupId>tornado</groupId>
<artifactId>tornado-api</artifactId>
<version>0.9</version>
<version>0.15.1</version>
</dependency>

<dependency>
<groupId>com.aparapi</groupId>
<artifactId>aparapi</artifactId>
<version>2.0.0</version>
<version>3.0.0</version>
<exclusions>
<!-- don't pull in the entire Scala ecosystem! -->
<exclusion>

@@ -56,7 +56,7 @@ public abstract class JavaStream<T> {

protected abstract T dot();

protected abstract Data<T> data();
protected abstract Data<T> readArrays();

public static class EnumeratedStream<T> extends JavaStream<T> {

@@ -113,8 +113,8 @@ public abstract class JavaStream<T> {
}

@Override
public Data<T> data() {
return actual.data();
public Data<T> readArrays() {
return actual.readArrays();
}
}

@@ -140,6 +140,14 @@ public abstract class JavaStream<T> {
return Duration.ofNanos(end - start);
}

final Duration runInitArrays() {
return timed(this::initArrays);
}

final SimpleImmutableEntry<Duration, Data<T>> runReadArrays() {
return timed(this::readArrays);
}

final SimpleImmutableEntry<Timings<Duration>, T> runAll(int times) {
Timings<Duration> timings = new Timings<>();
T lastSum = null;

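Hedged, self-contained sketch (the class name TimedStepsSketch and the toy array are mine, not from the repo) of the split the two new helpers above introduce: initialisation and the final read-back are now timed separately from the kernel loop, so the harness can report them, as showInit does in Main further down.

import java.time.Duration;
import java.util.AbstractMap.SimpleImmutableEntry;
import java.util.Arrays;
import java.util.function.Supplier;

public class TimedStepsSketch {

  // Time a void setup step, mirroring runInitArrays().
  static Duration timedStep(Runnable r) {
    long start = System.nanoTime();
    r.run();
    return Duration.ofNanos(System.nanoTime() - start);
  }

  // Time a step that also yields a result, mirroring runReadArrays().
  static <A> SimpleImmutableEntry<Duration, A> timedResult(Supplier<A> s) {
    long start = System.nanoTime();
    A result = s.get();
    return new SimpleImmutableEntry<>(Duration.ofNanos(System.nanoTime() - start), result);
  }

  public static void main(String[] args) {
    double[] a = new double[1 << 20];
    Duration init = timedStep(() -> Arrays.fill(a, 0.1));                  // "Init" row
    SimpleImmutableEntry<Duration, double[]> read = timedResult(a::clone); // "Read" row
    System.out.printf("Init: %.5f s, Read: %.5f s%n",
        init.toNanos() / 1e9, read.getKey().toNanos() / 1e9);
  }
}
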
@ -128,6 +128,40 @@ public class Main {
|
||||
}
|
||||
}
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
static void showInit(
|
||||
int totalBytes, double megaScale, Options opt, Duration init, Duration read) {
|
||||
List<Entry<String, Double>> setup =
|
||||
Arrays.asList(
|
||||
new SimpleImmutableEntry<>("Init", durationToSeconds(init)),
|
||||
new SimpleImmutableEntry<>("Read", durationToSeconds(read)));
|
||||
if (opt.csv) {
|
||||
tabulateCsv(
|
||||
true,
|
||||
setup.stream()
|
||||
.map(
|
||||
x ->
|
||||
Arrays.asList(
|
||||
new SimpleImmutableEntry<>("function", x.getKey()),
|
||||
new SimpleImmutableEntry<>("n_elements", opt.arraysize + ""),
|
||||
new SimpleImmutableEntry<>("sizeof", totalBytes + ""),
|
||||
new SimpleImmutableEntry<>(
|
||||
"max_m" + (opt.mibibytes ? "i" : "") + "bytes_per_sec",
|
||||
((megaScale * (double) totalBytes / x.getValue())) + ""),
|
||||
new SimpleImmutableEntry<>("runtime", x.getValue() + "")))
|
||||
.toArray(List[]::new));
|
||||
} else {
|
||||
for (Entry<String, Double> e : setup) {
|
||||
System.out.printf(
|
||||
"%s: %.5f s (%.5f M%sBytes/sec)%n",
|
||||
e.getKey(),
|
||||
e.getValue(),
|
||||
megaScale * (double) totalBytes / e.getValue(),
|
||||
opt.mibibytes ? "i" : "");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static <T extends Number> boolean run(
|
||||
String name, Config<T> config, Function<Config<T>, JavaStream<T>> mkStream) {
|
||||
|
||||
@ -183,35 +217,46 @@ public class Main {
|
||||
|
||||
JavaStream<T> stream = mkStream.apply(config);
|
||||
|
||||
stream.initArrays();
|
||||
|
||||
Duration init = stream.runInitArrays();
|
||||
final boolean ok;
|
||||
switch (config.benchmark) {
|
||||
case ALL:
|
||||
Entry<Timings<Duration>, T> results = stream.runAll(opt.numtimes);
|
||||
ok = checkSolutions(stream.data(), config, Optional.of(results.getValue()));
|
||||
Timings<Duration> timings = results.getKey();
|
||||
tabulateCsv(
|
||||
opt.csv,
|
||||
mkCsvRow(timings.copy, "Copy", 2 * arrayBytes, megaScale, opt),
|
||||
mkCsvRow(timings.mul, "Mul", 2 * arrayBytes, megaScale, opt),
|
||||
mkCsvRow(timings.add, "Add", 3 * arrayBytes, megaScale, opt),
|
||||
mkCsvRow(timings.triad, "Triad", 3 * arrayBytes, megaScale, opt),
|
||||
mkCsvRow(timings.dot, "Dot", 2 * arrayBytes, megaScale, opt));
|
||||
break;
|
||||
{
|
||||
Entry<Timings<Duration>, T> results = stream.runAll(opt.numtimes);
|
||||
SimpleImmutableEntry<Duration, Data<T>> read = stream.runReadArrays();
|
||||
showInit(totalBytes, megaScale, opt, init, read.getKey());
|
||||
ok = checkSolutions(read.getValue(), config, Optional.of(results.getValue()));
|
||||
Timings<Duration> timings = results.getKey();
|
||||
tabulateCsv(
|
||||
opt.csv,
|
||||
mkCsvRow(timings.copy, "Copy", 2 * arrayBytes, megaScale, opt),
|
||||
mkCsvRow(timings.mul, "Mul", 2 * arrayBytes, megaScale, opt),
|
||||
mkCsvRow(timings.add, "Add", 3 * arrayBytes, megaScale, opt),
|
||||
mkCsvRow(timings.triad, "Triad", 3 * arrayBytes, megaScale, opt),
|
||||
mkCsvRow(timings.dot, "Dot", 2 * arrayBytes, megaScale, opt));
|
||||
break;
|
||||
}
|
||||
case NSTREAM:
|
||||
List<Duration> nstreamResults = stream.runNStream(opt.numtimes);
|
||||
ok = checkSolutions(stream.data(), config, Optional.empty());
|
||||
tabulateCsv(opt.csv, mkCsvRow(nstreamResults, "Nstream", 4 * arrayBytes, megaScale, opt));
|
||||
break;
|
||||
{
|
||||
List<Duration> nstreamResults = stream.runNStream(opt.numtimes);
|
||||
SimpleImmutableEntry<Duration, Data<T>> read = stream.runReadArrays();
|
||||
showInit(totalBytes, megaScale, opt, init, read.getKey());
|
||||
ok = checkSolutions(read.getValue(), config, Optional.empty());
|
||||
tabulateCsv(opt.csv, mkCsvRow(nstreamResults, "Nstream", 4 * arrayBytes, megaScale, opt));
|
||||
break;
|
||||
}
|
||||
case TRIAD:
|
||||
Duration triadResult = stream.runTriad(opt.numtimes);
|
||||
ok = checkSolutions(stream.data(), config, Optional.empty());
|
||||
int triadTotalBytes = 3 * arrayBytes * opt.numtimes;
|
||||
double bandwidth = megaScale * (triadTotalBytes / durationToSeconds(triadResult));
|
||||
System.out.printf("Runtime (seconds): %.5f", durationToSeconds(triadResult));
|
||||
System.out.printf("Bandwidth (%s/s): %.3f ", gigaSuffix, bandwidth);
|
||||
break;
|
||||
{
|
||||
Duration triadResult = stream.runTriad(opt.numtimes);
|
||||
SimpleImmutableEntry<Duration, Data<T>> read = stream.runReadArrays();
|
||||
showInit(totalBytes, megaScale, opt, init, read.getKey());
|
||||
ok = checkSolutions(read.getValue(), config, Optional.empty());
|
||||
int triadTotalBytes = 3 * arrayBytes * opt.numtimes;
|
||||
double bandwidth = megaScale * (triadTotalBytes / durationToSeconds(triadResult));
|
||||
System.out.printf("Runtime (seconds): %.5f", durationToSeconds(triadResult));
|
||||
System.out.printf("Bandwidth (%s/s): %.3f ", gigaSuffix, bandwidth);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
throw new AssertionError();
|
||||
}
|
||||
@ -337,7 +382,7 @@ public class Main {
|
||||
}
|
||||
}
|
||||
|
||||
private static final String VERSION = "4.0";
|
||||
private static final String VERSION = "5.0";
|
||||
|
||||
private static final float START_SCALAR = 0.4f;
|
||||
private static final float START_A = 0.1f;
|
||||
|
||||
@@ -122,7 +122,7 @@ public final class AparapiStreams {
}

@Override
public Data<T> data() {
public Data<T> readArrays() {
return kernels.syncAndDispose();
}
}

@@ -86,7 +86,7 @@ final class GenericPlainStream<T extends Number> extends JavaStream<T> {
}

@Override
public Data<T> data() {
public Data<T> readArrays() {
return new Data<>(a, b, c);
}
}

@@ -80,7 +80,7 @@ final class GenericStream<T extends Number> extends JavaStream<T> {
}

@Override
public Data<T> data() {
public Data<T> readArrays() {
return new Data<>(a, b, c);
}
}

@@ -78,7 +78,7 @@ final class SpecialisedDoubleStream extends JavaStream<Double> {
}

@Override
public Data<Double> data() {
public Data<Double> readArrays() {
return new Data<>(boxed(a), boxed(b), boxed(c));
}
}

@@ -78,7 +78,7 @@ final class SpecialisedFloatStream extends JavaStream<Float> {
}

@Override
public Data<Float> data() {
public Data<Float> readArrays() {
return new Data<>(boxed(a), boxed(b), boxed(c));
}
}

@@ -78,7 +78,7 @@ final class SpecialisedPlainDoubleStream extends JavaStream<Double> {
}

@Override
public Data<Double> data() {
public Data<Double> readArrays() {
return new Data<>(boxed(a), boxed(b), boxed(c));
}
}

@@ -78,7 +78,7 @@ final class SpecialisedPlainFloatStream extends JavaStream<Float> {
}

@Override
public Data<Float> data() {
public Data<Float> readArrays() {
return new Data<>(boxed(a), boxed(b), boxed(c));
}
}

@@ -4,8 +4,8 @@ import java.util.List;
import java.util.stream.Collectors;
import javastream.JavaStream;
import javastream.Main.Config;
import uk.ac.manchester.tornado.api.TaskSchedule;
import uk.ac.manchester.tornado.api.TornadoRuntimeCI;
import uk.ac.manchester.tornado.api.TornadoExecutionPlan;
import uk.ac.manchester.tornado.api.TornadoRuntimeInterface;
import uk.ac.manchester.tornado.api.common.TornadoDevice;
import uk.ac.manchester.tornado.api.runtime.TornadoRuntime;

@@ -13,18 +13,18 @@ abstract class GenericTornadoVMStream<T> extends JavaStream<T> {

protected final TornadoDevice device;

protected TaskSchedule copyTask;
protected TaskSchedule mulTask;
protected TaskSchedule addTask;
protected TaskSchedule triadTask;
protected TaskSchedule nstreamTask;
protected TaskSchedule dotTask;
protected TornadoExecutionPlan copyTask;
protected TornadoExecutionPlan mulTask;
protected TornadoExecutionPlan addTask;
protected TornadoExecutionPlan triadTask;
protected TornadoExecutionPlan nstreamTask;
protected TornadoExecutionPlan dotTask;

GenericTornadoVMStream(Config<T> config) {
super(config);

try {
TornadoRuntimeCI runtime = TornadoRuntime.getTornadoRuntime();
TornadoRuntimeInterface runtime = TornadoRuntime.getTornadoRuntime();
List<TornadoDevice> devices = TornadoVMStreams.enumerateDevices(runtime);
device = devices.get(config.options.device);

@@ -42,10 +42,6 @@ abstract class GenericTornadoVMStream<T> extends JavaStream<T> {
}
}

protected static TaskSchedule mkSchedule() {
return new TaskSchedule("");
}

@Override
public List<String> listDevices() {
return TornadoVMStreams.enumerateDevices(TornadoRuntime.getTornadoRuntime()).stream()
@@ -55,12 +51,12 @@ abstract class GenericTornadoVMStream<T> extends JavaStream<T> {

@Override
public void initArrays() {
this.copyTask.warmup();
this.mulTask.warmup();
this.addTask.warmup();
this.triadTask.warmup();
this.nstreamTask.warmup();
this.dotTask.warmup();
this.copyTask.withWarmUp();
this.mulTask.withWarmUp();
this.addTask.withWarmUp();
this.triadTask.withWarmUp();
this.nstreamTask.withWarmUp();
this.dotTask.withWarmUp();
}

@Override

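As a hedged, minimal sketch of the pattern the specialised streams below switch to: a named TaskGraph is built, snapshotted, and wrapped in a TornadoExecutionPlan, replacing the old anonymous TaskSchedule. The TaskGraph/TornadoExecutionPlan calls mirror the constructor diffs that follow; the CopyPlanSketch class, its array sizes, and the final execute() call are my own additions, not part of this change.

import java.util.Arrays;
import uk.ac.manchester.tornado.api.TaskGraph;
import uk.ac.manchester.tornado.api.TornadoExecutionPlan;
import uk.ac.manchester.tornado.api.annotations.Parallel;
import uk.ac.manchester.tornado.api.enums.DataTransferMode;

public class CopyPlanSketch {

  // A trivial copy kernel in the same style as SpecialisedDouble::copy below.
  static void copy(int size, double[] a, double[] c) {
    for (@Parallel int i = 0; i < size; i++) {
      c[i] = a[i];
    }
  }

  public static void main(String[] args) {
    final int size = 1 << 20;
    double[] a = new double[size];
    double[] c = new double[size];
    Arrays.fill(a, 0.1);

    TornadoExecutionPlan copyPlan =
        new TornadoExecutionPlan(
            new TaskGraph("copy")
                .task("copy", CopyPlanSketch::copy, size, a, c)
                // inputs move to the device once, on first execution
                .transferToDevice(DataTransferMode.FIRST_EXECUTION, a, c)
                // bring c back after every run so the host can check it
                .transferToHost(DataTransferMode.EVERY_EXECUTION, new Object[] {c})
                .snapshot());

    copyPlan.withWarmUp(); // replaces TaskSchedule#warmup(), as in initArrays() above
    copyPlan.execute();    // assumption: execute() triggers the graph; not shown in these hunks
  }
}
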
@ -2,8 +2,11 @@ package javastream.tornadovm;
|
||||
|
||||
import java.util.Arrays;
|
||||
import javastream.Main.Config;
|
||||
import uk.ac.manchester.tornado.api.TaskGraph;
|
||||
import uk.ac.manchester.tornado.api.TornadoExecutionPlan;
|
||||
import uk.ac.manchester.tornado.api.annotations.Parallel;
|
||||
import uk.ac.manchester.tornado.api.annotations.Reduce;
|
||||
import uk.ac.manchester.tornado.api.enums.DataTransferMode;
|
||||
|
||||
final class SpecialisedDouble extends GenericTornadoVMStream<Double> {
|
||||
|
||||
@ -49,7 +52,7 @@ final class SpecialisedDouble extends GenericTornadoVMStream<Double> {
|
||||
private final double[] a, b, c;
|
||||
private final double[] dotSum;
|
||||
|
||||
@SuppressWarnings({"PrimitiveArrayArgumentToVarargsMethod", "DuplicatedCode"})
|
||||
@SuppressWarnings({"DuplicatedCode"})
|
||||
SpecialisedDouble(Config<Double> config) {
|
||||
super(config);
|
||||
final int size = config.options.arraysize;
|
||||
@ -58,12 +61,43 @@ final class SpecialisedDouble extends GenericTornadoVMStream<Double> {
|
||||
b = new double[size];
|
||||
c = new double[size];
|
||||
dotSum = new double[1];
|
||||
this.copyTask = mkSchedule().task("", SpecialisedDouble::copy, size, a, c);
|
||||
this.mulTask = mkSchedule().task("", SpecialisedDouble::mul, size, b, c, scalar);
|
||||
this.addTask = mkSchedule().task("", SpecialisedDouble::add, size, a, b, c);
|
||||
this.triadTask = mkSchedule().task("", SpecialisedDouble::triad, size, a, b, c, scalar);
|
||||
this.nstreamTask = mkSchedule().task("", SpecialisedDouble::nstream, size, a, b, c, scalar);
|
||||
this.dotTask = mkSchedule().task("", SpecialisedDouble::dot_, a, b, dotSum).streamOut(dotSum);
|
||||
this.copyTask =
|
||||
new TornadoExecutionPlan(
|
||||
new TaskGraph("copy")
|
||||
.task("copy", SpecialisedDouble::copy, size, a, c)
|
||||
.transferToDevice(DataTransferMode.FIRST_EXECUTION, a, c)
|
||||
.snapshot());
|
||||
this.mulTask =
|
||||
new TornadoExecutionPlan(
|
||||
new TaskGraph("mul")
|
||||
.task("mul", SpecialisedDouble::mul, size, b, c, scalar)
|
||||
.transferToDevice(DataTransferMode.FIRST_EXECUTION, b, c)
|
||||
.snapshot());
|
||||
this.addTask =
|
||||
new TornadoExecutionPlan(
|
||||
new TaskGraph("add")
|
||||
.task("add", SpecialisedDouble::add, size, a, b, c)
|
||||
.transferToDevice(DataTransferMode.FIRST_EXECUTION, a, b, c)
|
||||
.snapshot());
|
||||
this.triadTask =
|
||||
new TornadoExecutionPlan(
|
||||
new TaskGraph("triad")
|
||||
.task("triad", SpecialisedDouble::triad, size, a, b, c, scalar)
|
||||
.transferToDevice(DataTransferMode.FIRST_EXECUTION, a, b, c)
|
||||
.snapshot());
|
||||
this.nstreamTask =
|
||||
new TornadoExecutionPlan(
|
||||
new TaskGraph("nstream")
|
||||
.task("nstream", SpecialisedDouble::nstream, size, a, b, c, scalar)
|
||||
.transferToDevice(DataTransferMode.FIRST_EXECUTION, a, b, c)
|
||||
.snapshot());
|
||||
this.dotTask =
|
||||
new TornadoExecutionPlan(
|
||||
new TaskGraph("dot")
|
||||
.task("dot", SpecialisedDouble::dot_, a, b, dotSum)
|
||||
.transferToDevice(DataTransferMode.FIRST_EXECUTION, a, b)
|
||||
.transferToHost(DataTransferMode.EVERY_EXECUTION, new Object[] {dotSum})
|
||||
.snapshot());
|
||||
}
|
||||
|
||||
@Override
|
||||
@ -72,7 +106,7 @@ final class SpecialisedDouble extends GenericTornadoVMStream<Double> {
|
||||
Arrays.fill(a, config.initA);
|
||||
Arrays.fill(b, config.initB);
|
||||
Arrays.fill(c, config.initC);
|
||||
TornadoVMStreams.xferToDevice(device, a, b, c);
|
||||
TornadoVMStreams.allocAndXferToDevice(device, a, b, c);
|
||||
}
|
||||
|
||||
@Override
|
||||
@ -81,7 +115,7 @@ final class SpecialisedDouble extends GenericTornadoVMStream<Double> {
|
||||
}
|
||||
|
||||
@Override
|
||||
public Data<Double> data() {
|
||||
public Data<Double> readArrays() {
|
||||
TornadoVMStreams.xferFromDevice(device, a, b, c);
|
||||
return new Data<>(boxed(a), boxed(b), boxed(c));
|
||||
}
|
||||
|
||||
@ -2,8 +2,11 @@ package javastream.tornadovm;
|
||||
|
||||
import java.util.Arrays;
|
||||
import javastream.Main.Config;
|
||||
import uk.ac.manchester.tornado.api.TaskGraph;
|
||||
import uk.ac.manchester.tornado.api.TornadoExecutionPlan;
|
||||
import uk.ac.manchester.tornado.api.annotations.Parallel;
|
||||
import uk.ac.manchester.tornado.api.annotations.Reduce;
|
||||
import uk.ac.manchester.tornado.api.enums.DataTransferMode;
|
||||
|
||||
final class SpecialisedFloat extends GenericTornadoVMStream<Float> {
|
||||
|
||||
@ -49,7 +52,7 @@ final class SpecialisedFloat extends GenericTornadoVMStream<Float> {
|
||||
private final float[] a, b, c;
|
||||
private final float[] dotSum;
|
||||
|
||||
@SuppressWarnings({"PrimitiveArrayArgumentToVarargsMethod", "DuplicatedCode"})
|
||||
@SuppressWarnings({"DuplicatedCode"})
|
||||
SpecialisedFloat(Config<Float> config) {
|
||||
super(config);
|
||||
final int size = config.options.arraysize;
|
||||
@ -58,12 +61,43 @@ final class SpecialisedFloat extends GenericTornadoVMStream<Float> {
|
||||
b = new float[size];
|
||||
c = new float[size];
|
||||
dotSum = new float[1];
|
||||
this.copyTask = mkSchedule().task("", SpecialisedFloat::copy, size, a, c);
|
||||
this.mulTask = mkSchedule().task("", SpecialisedFloat::mul, size, b, c, scalar);
|
||||
this.addTask = mkSchedule().task("", SpecialisedFloat::add, size, a, b, c);
|
||||
this.triadTask = mkSchedule().task("", SpecialisedFloat::triad, size, a, b, c, scalar);
|
||||
this.nstreamTask = mkSchedule().task("", SpecialisedFloat::nstream, size, a, b, c, scalar);
|
||||
this.dotTask = mkSchedule().task("", SpecialisedFloat::dot_, a, b, dotSum).streamOut(dotSum);
|
||||
this.copyTask =
|
||||
new TornadoExecutionPlan(
|
||||
new TaskGraph("copy")
|
||||
.task("copy", SpecialisedFloat::copy, size, a, c)
|
||||
.transferToDevice(DataTransferMode.FIRST_EXECUTION, a, c)
|
||||
.snapshot());
|
||||
this.mulTask =
|
||||
new TornadoExecutionPlan(
|
||||
new TaskGraph("mul")
|
||||
.task("mul", SpecialisedFloat::mul, size, b, c, scalar)
|
||||
.transferToDevice(DataTransferMode.FIRST_EXECUTION, b, c)
|
||||
.snapshot());
|
||||
this.addTask =
|
||||
new TornadoExecutionPlan(
|
||||
new TaskGraph("add")
|
||||
.task("add", SpecialisedFloat::add, size, a, b, c)
|
||||
.transferToDevice(DataTransferMode.FIRST_EXECUTION, a, b, c)
|
||||
.snapshot());
|
||||
this.triadTask =
|
||||
new TornadoExecutionPlan(
|
||||
new TaskGraph("triad")
|
||||
.task("triad", SpecialisedFloat::triad, size, a, b, c, scalar)
|
||||
.transferToDevice(DataTransferMode.FIRST_EXECUTION, a, b, c)
|
||||
.snapshot());
|
||||
this.nstreamTask =
|
||||
new TornadoExecutionPlan(
|
||||
new TaskGraph("nstream")
|
||||
.task("nstream", SpecialisedFloat::nstream, size, a, b, c, scalar)
|
||||
.transferToDevice(DataTransferMode.FIRST_EXECUTION, a, b, c)
|
||||
.snapshot());
|
||||
this.dotTask =
|
||||
new TornadoExecutionPlan(
|
||||
new TaskGraph("dot")
|
||||
.task("dot", SpecialisedFloat::dot_, a, b, dotSum)
|
||||
.transferToDevice(DataTransferMode.FIRST_EXECUTION, a, b)
|
||||
.transferToHost(DataTransferMode.EVERY_EXECUTION, new Object[] {dotSum})
|
||||
.snapshot());
|
||||
}
|
||||
|
||||
@Override
|
||||
@ -72,7 +106,7 @@ final class SpecialisedFloat extends GenericTornadoVMStream<Float> {
|
||||
Arrays.fill(a, config.initA);
|
||||
Arrays.fill(b, config.initB);
|
||||
Arrays.fill(c, config.initC);
|
||||
TornadoVMStreams.xferToDevice(device, a, b, c);
|
||||
TornadoVMStreams.allocAndXferToDevice(device, a, b, c);
|
||||
}
|
||||
|
||||
@Override
|
||||
@ -81,7 +115,7 @@ final class SpecialisedFloat extends GenericTornadoVMStream<Float> {
|
||||
}
|
||||
|
||||
@Override
|
||||
public Data<Float> data() {
|
||||
public Data<Float> readArrays() {
|
||||
TornadoVMStreams.xferFromDevice(device, a, b, c);
|
||||
return new Data<>(boxed(a), boxed(b), boxed(c));
|
||||
}
|
||||
|
||||
@@ -1,36 +1,46 @@
package javastream.tornadovm;

import java.util.Arrays;
import java.util.List;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import javastream.JavaStream;
import javastream.Main.Config;
import uk.ac.manchester.tornado.api.TornadoRuntimeCI;
import uk.ac.manchester.tornado.api.TornadoRuntimeInterface;
import uk.ac.manchester.tornado.api.common.Event;
import uk.ac.manchester.tornado.api.common.TornadoDevice;
import uk.ac.manchester.tornado.api.mm.TornadoGlobalObjectState;
import uk.ac.manchester.tornado.api.memory.TornadoDeviceObjectState;
import uk.ac.manchester.tornado.api.memory.TornadoGlobalObjectState;
import uk.ac.manchester.tornado.api.runtime.TornadoRuntime;

public final class TornadoVMStreams {

private TornadoVMStreams() {}

static void xferToDevice(TornadoDevice device, Object... xs) {
static void allocAndXferToDevice(TornadoDevice device, Object... xs) {
for (Object x : xs) {
TornadoGlobalObjectState state = TornadoRuntime.getTornadoRuntime().resolveObject(x);
device.allocateObjects(
new Object[] {x}, 0, new TornadoDeviceObjectState[] {state.getDeviceState(device)});
List<Integer> writeEvent = device.ensurePresent(x, state.getDeviceState(device), null, 0, 0);
if (writeEvent != null) writeEvent.forEach(e -> device.resolveEvent(e).waitOn());
}
}

static void xferFromDevice(TornadoDevice device, Object... xs) {
for (Object x : xs) {
TornadoGlobalObjectState state = TornadoRuntime.getTornadoRuntime().resolveObject(x);
device.resolveEvent(device.streamOut(x, 0, state.getDeviceState(device), null)).waitOn();
}
Arrays.stream(xs)
.map(
x -> {
TornadoGlobalObjectState state = TornadoRuntime.getTornadoRuntime().resolveObject(x);
return device.resolveEvent(
device.streamOut(x, 0, state.getDeviceState(device), null));
})
.collect(Collectors.toList())
.forEach(Event::waitOn);
}

static List<TornadoDevice> enumerateDevices(TornadoRuntimeCI runtime) {
static List<TornadoDevice> enumerateDevices(TornadoRuntimeInterface runtime) {
return IntStream.range(0, runtime.getNumDrivers())
.mapToObj(runtime::getDriver)
.flatMap(d -> IntStream.range(0, d.getDeviceCount()).mapToObj(d::getDevice))

@ -1,415 +1,423 @@
|
||||
# This file is machine-generated - editing it directly is not advised
|
||||
|
||||
[[AMDGPU]]
|
||||
deps = ["AbstractFFTs", "Adapt", "BinaryProvider", "CEnum", "GPUArrays", "GPUCompiler", "HIP_jll", "LLVM", "Libdl", "LinearAlgebra", "MacroTools", "Pkg", "Printf", "ROCmDeviceLibs_jll", "Random", "Requires", "Setfield", "Statistics", "hsa_rocr_jll"]
|
||||
git-tree-sha1 = "d59f1cf3f90ae6cf6626e8a21f337850cb3792f7"
|
||||
julia_version = "1.9.3"
|
||||
manifest_format = "2.0"
|
||||
project_hash = "05982ec0602af8ada9509107382dd6c8b21db9b9"
|
||||
|
||||
[[deps.AMDGPU]]
|
||||
deps = ["AbstractFFTs", "Adapt", "CEnum", "ExprTools", "GPUArrays", "GPUCompiler", "KernelAbstractions", "LLD_jll", "LLVM", "LLVM_jll", "Libdl", "LinearAlgebra", "MacroTools", "Pkg", "Preferences", "Printf", "Random", "SparseArrays", "SpecialFunctions", "Statistics", "UnsafeAtomicsLLVM"]
|
||||
git-tree-sha1 = "95437cf4c0ad651ca8463475de8af6a6935e23bd"
|
||||
uuid = "21141c5a-9bdb-4563-92ae-f87d6854732e"
|
||||
version = "0.2.17"
|
||||
version = "0.6.1"
|
||||
|
||||
[[AbstractFFTs]]
|
||||
[[deps.AbstractFFTs]]
|
||||
deps = ["LinearAlgebra"]
|
||||
git-tree-sha1 = "485ee0867925449198280d4af84bdb46a2a404d0"
|
||||
git-tree-sha1 = "d92ad398961a3ed262d8bf04a1a2b8340f915fef"
|
||||
uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c"
|
||||
version = "1.0.1"
|
||||
version = "1.5.0"
|
||||
|
||||
[[Adapt]]
|
||||
deps = ["LinearAlgebra"]
|
||||
git-tree-sha1 = "84918055d15b3114ede17ac6a7182f68870c16f7"
|
||||
[deps.AbstractFFTs.extensions]
|
||||
AbstractFFTsChainRulesCoreExt = "ChainRulesCore"
|
||||
AbstractFFTsTestExt = "Test"
|
||||
|
||||
[deps.AbstractFFTs.weakdeps]
|
||||
ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
|
||||
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
|
||||
|
||||
[[deps.Adapt]]
|
||||
deps = ["LinearAlgebra", "Requires"]
|
||||
git-tree-sha1 = "76289dc51920fdc6e0013c872ba9551d54961c24"
|
||||
uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
|
||||
version = "3.3.1"
|
||||
version = "3.6.2"
|
||||
weakdeps = ["StaticArrays"]
|
||||
|
||||
[[ArgParse]]
|
||||
[deps.Adapt.extensions]
|
||||
AdaptStaticArraysExt = "StaticArrays"
|
||||
|
||||
[[deps.ArgParse]]
|
||||
deps = ["Logging", "TextWrap"]
|
||||
git-tree-sha1 = "3102bce13da501c9104df33549f511cd25264d7d"
|
||||
uuid = "c7e460c6-2fb9-53a9-8c5b-16f535851c63"
|
||||
version = "1.1.4"
|
||||
|
||||
[[ArgTools]]
|
||||
[[deps.ArgTools]]
|
||||
uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f"
|
||||
version = "1.1.1"
|
||||
|
||||
[[Artifacts]]
|
||||
[[deps.Artifacts]]
|
||||
uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33"
|
||||
|
||||
[[Base64]]
|
||||
[[deps.Atomix]]
|
||||
deps = ["UnsafeAtomics"]
|
||||
git-tree-sha1 = "c06a868224ecba914baa6942988e2f2aade419be"
|
||||
uuid = "a9b6321e-bd34-4604-b9c9-b65b8de01458"
|
||||
version = "0.1.0"
|
||||
|
||||
[[deps.Base64]]
|
||||
uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f"
|
||||
|
||||
[[BinaryProvider]]
|
||||
deps = ["Libdl", "Logging", "SHA"]
|
||||
git-tree-sha1 = "ecdec412a9abc8db54c0efc5548c64dfce072058"
|
||||
uuid = "b99e7846-7c00-51b0-8f62-c81ae34c0232"
|
||||
version = "0.5.10"
|
||||
|
||||
[[Bzip2_jll]]
|
||||
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
|
||||
git-tree-sha1 = "19a35467a82e236ff51bc17a3a44b69ef35185a2"
|
||||
uuid = "6e34b625-4abd-537c-b88f-471c36dfa7a0"
|
||||
version = "1.0.8+0"
|
||||
|
||||
[[CEnum]]
|
||||
git-tree-sha1 = "215a9aa4a1f23fbd05b92769fdd62559488d70e9"
|
||||
[[deps.CEnum]]
|
||||
git-tree-sha1 = "eb4cb44a499229b3b8426dcfb5dd85333951ff90"
|
||||
uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82"
|
||||
version = "0.4.1"
|
||||
version = "0.4.2"
|
||||
|
||||
[[ConstructionBase]]
|
||||
deps = ["LinearAlgebra"]
|
||||
git-tree-sha1 = "f74e9d5388b8620b4cee35d4c5a618dd4dc547f4"
|
||||
uuid = "187b0558-2788-49d3-abe0-74a17ed4e7c9"
|
||||
version = "1.3.0"
|
||||
[[deps.CompilerSupportLibraries_jll]]
|
||||
deps = ["Artifacts", "Libdl"]
|
||||
uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae"
|
||||
version = "1.0.5+0"
|
||||
|
||||
[[Dates]]
|
||||
[[deps.Dates]]
|
||||
deps = ["Printf"]
|
||||
uuid = "ade2ca70-3891-5945-98fb-dc099432e06a"
|
||||
|
||||
[[Downloads]]
|
||||
deps = ["ArgTools", "LibCURL", "NetworkOptions"]
|
||||
[[deps.DocStringExtensions]]
|
||||
deps = ["LibGit2"]
|
||||
git-tree-sha1 = "2fb1e02f2b635d0845df5d7c167fec4dd739b00d"
|
||||
uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae"
|
||||
version = "0.9.3"
|
||||
|
||||
[[deps.Downloads]]
|
||||
deps = ["ArgTools", "FileWatching", "LibCURL", "NetworkOptions"]
|
||||
uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6"
|
||||
version = "1.6.0"
|
||||
|
||||
[[Elfutils_jll]]
|
||||
deps = ["Artifacts", "Bzip2_jll", "JLLWrappers", "Libdl", "Pkg", "XZ_jll", "Zlib_jll", "argp_standalone_jll", "fts_jll", "obstack_jll"]
|
||||
git-tree-sha1 = "8f9fcde6d89b0a3ca51cb2028beab462705c5436"
|
||||
uuid = "ab5a07f8-06af-567f-a878-e8bb879eba5a"
|
||||
version = "0.182.0+0"
|
||||
|
||||
[[ExprTools]]
|
||||
git-tree-sha1 = "b7e3d17636b348f005f11040025ae8c6f645fe92"
|
||||
[[deps.ExprTools]]
|
||||
git-tree-sha1 = "27415f162e6028e81c72b82ef756bf321213b6ec"
|
||||
uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04"
|
||||
version = "0.1.6"
|
||||
version = "0.1.10"
|
||||
|
||||
[[Future]]
|
||||
deps = ["Random"]
|
||||
uuid = "9fa8497b-333b-5362-9e8d-4d0656e87820"
|
||||
[[deps.FileWatching]]
|
||||
uuid = "7b1f6079-737a-58dc-b8bc-7a2ca5c1b5ee"
|
||||
|
||||
[[GPUArrays]]
|
||||
deps = ["Adapt", "LinearAlgebra", "Printf", "Random", "Serialization", "Statistics"]
|
||||
git-tree-sha1 = "7772508f17f1d482fe0df72cabc5b55bec06bbe0"
|
||||
[[deps.GPUArrays]]
|
||||
deps = ["Adapt", "GPUArraysCore", "LLVM", "LinearAlgebra", "Printf", "Random", "Reexport", "Serialization", "Statistics"]
|
||||
git-tree-sha1 = "8ad8f375ae365aa1eb2f42e2565a40b55a4b69a8"
|
||||
uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
|
||||
version = "8.1.2"
|
||||
version = "9.0.0"
|
||||
|
||||
[[GPUCompiler]]
|
||||
deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "TimerOutputs", "UUIDs"]
|
||||
git-tree-sha1 = "4ed2616d5e656c8716736b64da86755467f26cf5"
|
||||
[[deps.GPUArraysCore]]
|
||||
deps = ["Adapt"]
|
||||
git-tree-sha1 = "2d6ca471a6c7b536127afccfa7564b5b39227fe0"
|
||||
uuid = "46192b85-c4d5-4398-a991-12ede77f4527"
|
||||
version = "0.1.5"
|
||||
|
||||
[[deps.GPUCompiler]]
|
||||
deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "Scratch", "TimerOutputs", "UUIDs"]
|
||||
git-tree-sha1 = "5e4487558477f191c043166f8301dd0b4be4e2b2"
|
||||
uuid = "61eb1bfa-7361-4325-ad38-22787b887f55"
|
||||
version = "0.12.9"
|
||||
version = "0.24.5"
|
||||
|
||||
[[HIP_jll]]
|
||||
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "ROCmCompilerSupport_jll", "ROCmDeviceLibs_jll", "ROCmOpenCLRuntime_jll", "hsa_rocr_jll"]
|
||||
git-tree-sha1 = "5097d8f7b6842156ab0928371b3d03fefd8decab"
|
||||
uuid = "2696aab5-0948-5276-aa9a-2a86a37016b8"
|
||||
version = "4.0.0+1"
|
||||
|
||||
[[InteractiveUtils]]
|
||||
[[deps.InteractiveUtils]]
|
||||
deps = ["Markdown"]
|
||||
uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
|
||||
|
||||
[[JLLWrappers]]
|
||||
deps = ["Preferences"]
|
||||
git-tree-sha1 = "642a199af8b68253517b80bd3bfd17eb4e84df6e"
|
||||
[[deps.IrrationalConstants]]
|
||||
git-tree-sha1 = "630b497eafcc20001bba38a4651b327dcfc491d2"
|
||||
uuid = "92d709cd-6900-40b7-9082-c6be49f344b6"
|
||||
version = "0.2.2"
|
||||
|
||||
[[deps.JLLWrappers]]
|
||||
deps = ["Artifacts", "Preferences"]
|
||||
git-tree-sha1 = "7e5d6779a1e09a36db2a7b6cff50942a0a7d0fca"
|
||||
uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210"
|
||||
version = "1.3.0"
|
||||
version = "1.5.0"
|
||||
|
||||
[[LLVM]]
|
||||
[[deps.KernelAbstractions]]
|
||||
deps = ["Adapt", "Atomix", "InteractiveUtils", "LinearAlgebra", "MacroTools", "PrecompileTools", "Requires", "SparseArrays", "StaticArrays", "UUIDs", "UnsafeAtomics", "UnsafeAtomicsLLVM"]
|
||||
git-tree-sha1 = "4c5875e4c228247e1c2b087669846941fb6e0118"
|
||||
uuid = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
|
||||
version = "0.9.8"
|
||||
|
||||
[deps.KernelAbstractions.extensions]
|
||||
EnzymeExt = "EnzymeCore"
|
||||
|
||||
[deps.KernelAbstractions.weakdeps]
|
||||
EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869"
|
||||
|
||||
[[deps.LLD_jll]]
|
||||
deps = ["Artifacts", "Libdl", "Pkg", "TOML", "Zlib_jll", "libLLVM_jll"]
|
||||
uuid = "d55e3150-da41-5e91-b323-ecfd1eec6109"
|
||||
version = "14.0.6+3"
|
||||
|
||||
[[deps.LLVM]]
|
||||
deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Printf", "Unicode"]
|
||||
git-tree-sha1 = "7cc22e69995e2329cc047a879395b2b74647ab5f"
|
||||
git-tree-sha1 = "a9d2ce1d5007b1e8f6c5b89c5a31ff8bd146db5c"
|
||||
uuid = "929cbde3-209d-540e-8aea-75f648917ca0"
|
||||
version = "4.7.0"
|
||||
version = "6.2.1"
|
||||
|
||||
[[LLVMExtra_jll]]
|
||||
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
|
||||
git-tree-sha1 = "c5fc4bef251ecd37685bea1c4068a9cfa41e8b9a"
|
||||
[[deps.LLVMExtra_jll]]
|
||||
deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML"]
|
||||
git-tree-sha1 = "7ca6850ae880cc99b59b88517545f91a52020afa"
|
||||
uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab"
|
||||
version = "0.0.13+0"
|
||||
version = "0.0.25+0"
|
||||
|
||||
[[LibCURL]]
|
||||
[[deps.LLVM_jll]]
|
||||
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "TOML", "Zlib_jll", "libLLVM_jll"]
|
||||
git-tree-sha1 = "c5131b433876973cf29a2d9ec426cc099567e68c"
|
||||
uuid = "86de99a1-58d6-5da7-8064-bd56ce2e322c"
|
||||
version = "14.0.6+4"
|
||||
|
||||
[[deps.LazyArtifacts]]
|
||||
deps = ["Artifacts", "Pkg"]
|
||||
uuid = "4af54fe1-eca0-43a8-85a7-787d91b784e3"
|
||||
|
||||
[[deps.LibCURL]]
|
||||
deps = ["LibCURL_jll", "MozillaCACerts_jll"]
|
||||
uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21"
|
||||
version = "0.6.3"
|
||||
|
||||
[[LibCURL_jll]]
|
||||
[[deps.LibCURL_jll]]
|
||||
deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"]
|
||||
uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0"
|
||||
version = "7.84.0+0"
|
||||
|
||||
[[LibGit2]]
|
||||
[[deps.LibGit2]]
|
||||
deps = ["Base64", "NetworkOptions", "Printf", "SHA"]
|
||||
uuid = "76f85450-5226-5b5a-8eaa-529ad045b433"
|
||||
|
||||
[[LibSSH2_jll]]
|
||||
[[deps.LibSSH2_jll]]
|
||||
deps = ["Artifacts", "Libdl", "MbedTLS_jll"]
|
||||
uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8"
|
||||
version = "1.10.2+0"
|
||||
|
||||
[[Libdl]]
|
||||
[[deps.Libdl]]
|
||||
uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
|
||||
|
||||
[[Libgcrypt_jll]]
|
||||
deps = ["Artifacts", "JLLWrappers", "Libdl", "Libgpg_error_jll", "Pkg"]
|
||||
git-tree-sha1 = "64613c82a59c120435c067c2b809fc61cf5166ae"
|
||||
uuid = "d4300ac3-e22c-5743-9152-c294e39db1e4"
|
||||
version = "1.8.7+0"
|
||||
|
||||
[[Libglvnd_jll]]
|
||||
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libX11_jll", "Xorg_libXext_jll"]
|
||||
git-tree-sha1 = "7739f837d6447403596a75d19ed01fd08d6f56bf"
|
||||
uuid = "7e76a0d4-f3c7-5321-8279-8d96eeed0f29"
|
||||
version = "1.3.0+3"
|
||||
|
||||
[[Libgpg_error_jll]]
|
||||
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
|
||||
git-tree-sha1 = "c333716e46366857753e273ce6a69ee0945a6db9"
|
||||
uuid = "7add5ba3-2f88-524e-9cd5-f83b8a55f7b8"
|
||||
version = "1.42.0+0"
|
||||
|
||||
[[Libiconv_jll]]
|
||||
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
|
||||
git-tree-sha1 = "42b62845d70a619f063a7da093d995ec8e15e778"
|
||||
uuid = "94ce4f54-9a6c-5748-9c1c-f9c7231a4531"
|
||||
version = "1.16.1+1"
|
||||
|
||||
[[LinearAlgebra]]
|
||||
deps = ["Libdl"]
|
||||
[[deps.LinearAlgebra]]
|
||||
deps = ["Libdl", "OpenBLAS_jll", "libblastrampoline_jll"]
|
||||
uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
|
||||
|
||||
[[Logging]]
|
||||
[[deps.LogExpFunctions]]
|
||||
deps = ["DocStringExtensions", "IrrationalConstants", "LinearAlgebra"]
|
||||
git-tree-sha1 = "7d6dd4e9212aebaeed356de34ccf262a3cd415aa"
|
||||
uuid = "2ab3a3ac-af41-5b50-aa03-7779005ae688"
|
||||
version = "0.3.26"
|
||||
|
||||
[deps.LogExpFunctions.extensions]
|
||||
LogExpFunctionsChainRulesCoreExt = "ChainRulesCore"
|
||||
LogExpFunctionsChangesOfVariablesExt = "ChangesOfVariables"
|
||||
LogExpFunctionsInverseFunctionsExt = "InverseFunctions"
|
||||
|
||||
[deps.LogExpFunctions.weakdeps]
|
||||
ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
|
||||
ChangesOfVariables = "9e997f8a-9a97-42d5-a9f1-ce6bfc15e2c0"
|
||||
InverseFunctions = "3587e190-3f89-42d0-90ee-14403ec27112"
|
||||
|
||||
[[deps.Logging]]
|
||||
uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"
|
||||
|
||||
[[MacroTools]]
|
||||
[[deps.MacroTools]]
|
||||
deps = ["Markdown", "Random"]
|
||||
git-tree-sha1 = "3d3e902b31198a27340d0bf00d6ac452866021cf"
|
||||
git-tree-sha1 = "9ee1618cbf5240e6d4e0371d6f24065083f60c48"
|
||||
uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
|
||||
version = "0.5.9"
|
||||
version = "0.5.11"
|
||||
|
||||
[[Markdown]]
|
||||
[[deps.Markdown]]
|
||||
deps = ["Base64"]
|
||||
uuid = "d6f4376e-aef5-505a-96c1-9c027394607a"
|
||||
|
||||
[[MbedTLS_jll]]
|
||||
[[deps.MbedTLS_jll]]
|
||||
deps = ["Artifacts", "Libdl"]
|
||||
uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1"
|
||||
version = "2.28.2+0"
|
||||
|
||||
[[MozillaCACerts_jll]]
|
||||
[[deps.MozillaCACerts_jll]]
|
||||
uuid = "14a3606d-f60d-562e-9121-12d972cd8159"
|
||||
version = "2022.10.11"
|
||||
|
||||
[[NUMA_jll]]
|
||||
deps = ["Libdl", "Pkg"]
|
||||
git-tree-sha1 = "778f9bd14400cff2c32ed357e12766ac0e3d766e"
|
||||
uuid = "7f51dc2b-bb24-59f8-b771-bb1490e4195d"
|
||||
version = "2.0.13+1"
|
||||
|
||||
[[NetworkOptions]]
|
||||
[[deps.NetworkOptions]]
|
||||
uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908"
|
||||
version = "1.2.0"
|
||||
|
||||
[[OrderedCollections]]
|
||||
git-tree-sha1 = "85f8e6578bf1f9ee0d11e7bb1b1456435479d47c"
|
||||
[[deps.OpenBLAS_jll]]
|
||||
deps = ["Artifacts", "CompilerSupportLibraries_jll", "Libdl"]
|
||||
uuid = "4536629a-c528-5b80-bd46-f80d51c5b363"
|
||||
version = "0.3.21+4"
|
||||
|
||||
[[deps.OpenLibm_jll]]
|
||||
deps = ["Artifacts", "Libdl"]
|
||||
uuid = "05823500-19ac-5b8b-9628-191a04bc5112"
|
||||
version = "0.8.1+0"
|
||||
|
||||
[[deps.OpenSpecFun_jll]]
|
||||
deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "Libdl", "Pkg"]
|
||||
git-tree-sha1 = "13652491f6856acfd2db29360e1bbcd4565d04f1"
|
||||
uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e"
|
||||
version = "0.5.5+0"
|
||||
|
||||
[[deps.OrderedCollections]]
|
||||
git-tree-sha1 = "2e73fe17cac3c62ad1aebe70d44c963c3cfdc3e3"
|
||||
uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
|
||||
version = "1.4.1"
|
||||
version = "1.6.2"
|
||||
|
||||
[[Parameters]]
|
||||
[[deps.Parameters]]
|
||||
deps = ["OrderedCollections", "UnPack"]
|
||||
git-tree-sha1 = "34c0e9ad262e5f7fc75b10a9952ca7692cfc5fbe"
|
||||
uuid = "d96e819e-fc66-5662-9728-84c9c7592b0a"
|
||||
version = "0.12.3"
|
||||
|
||||
[[Pkg]]
|
||||
deps = ["Artifacts", "Dates", "Downloads", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"]
|
||||
[[deps.Pkg]]
|
||||
deps = ["Artifacts", "Dates", "Downloads", "FileWatching", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"]
|
||||
uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
|
||||
version = "1.9.2"
|
||||
|
||||
[[Preferences]]
|
||||
[[deps.PrecompileTools]]
|
||||
deps = ["Preferences"]
|
||||
git-tree-sha1 = "03b4c25b43cb84cee5c90aa9b5ea0a78fd848d2f"
|
||||
uuid = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
|
||||
version = "1.2.0"
|
||||
|
||||
[[deps.Preferences]]
|
||||
deps = ["TOML"]
|
||||
git-tree-sha1 = "00cfd92944ca9c760982747e9a1d0d5d86ab1e5a"
|
||||
git-tree-sha1 = "00805cd429dcb4870060ff49ef443486c262e38e"
|
||||
uuid = "21216c6a-2e73-6563-6e65-726566657250"
|
||||
version = "1.2.2"
|
||||
version = "1.4.1"
|
||||
|
||||
[[Printf]]
|
||||
[[deps.Printf]]
|
||||
deps = ["Unicode"]
|
||||
uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7"
|
||||
|
||||
[[REPL]]
|
||||
[[deps.REPL]]
|
||||
deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"]
|
||||
uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb"
|
||||
|
||||
[[ROCmCompilerSupport_jll]]
|
||||
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "ROCmDeviceLibs_jll", "hsa_rocr_jll"]
|
||||
git-tree-sha1 = "56ddcfb5d8b60c9f8c1bc619886f8d363fd1926d"
|
||||
uuid = "8fbdd1d2-db62-5cd0-981e-905da1486e17"
|
||||
version = "4.0.0+1"
|
||||
|
||||
[[ROCmDeviceLibs_jll]]
|
||||
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Zlib_jll"]
|
||||
git-tree-sha1 = "d764f0f28b5af89aa004871a6a38e5d061f77257"
|
||||
uuid = "873c0968-716b-5aa7-bb8d-d1e2e2aeff2d"
|
||||
version = "4.0.0+0"
|
||||
|
||||
[[ROCmOpenCLRuntime_jll]]
|
||||
deps = ["Artifacts", "JLLWrappers", "Libdl", "Libglvnd_jll", "Pkg", "ROCmCompilerSupport_jll", "ROCmDeviceLibs_jll", "Xorg_libX11_jll", "Xorg_xorgproto_jll", "hsa_rocr_jll"]
|
||||
git-tree-sha1 = "f9e3e2cb40a7990535efa7da9b9dd0e0b458a973"
|
||||
uuid = "10ae2a08-2eea-53f8-8c20-eec175020e9f"
|
||||
version = "4.0.0+1"
|
||||
|
||||
[[Random]]
|
||||
deps = ["Serialization"]
|
||||
[[deps.Random]]
|
||||
deps = ["SHA", "Serialization"]
|
||||
uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
|
||||
|
||||
[[Requires]]
|
||||
[[deps.Reexport]]
|
||||
git-tree-sha1 = "45e428421666073eab6f2da5c9d310d99bb12f9b"
|
||||
uuid = "189a3867-3050-52da-a836-e630ba90ab69"
|
||||
version = "1.2.2"
|
||||
|
||||
[[deps.Requires]]
|
||||
deps = ["UUIDs"]
|
||||
git-tree-sha1 = "8f82019e525f4d5c669692772a6f4b0a58b06a6a"
|
||||
git-tree-sha1 = "838a3a4188e2ded87a4f9f184b4b0d78a1e91cb7"
|
||||
uuid = "ae029012-a4dd-5104-9daa-d747884805df"
|
||||
version = "1.3.0"
|
||||
|
||||
[[deps.SHA]]
|
||||
uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce"
|
||||
version = "0.7.0"
|
||||
|
||||
[[deps.Scratch]]
|
||||
deps = ["Dates"]
|
||||
git-tree-sha1 = "30449ee12237627992a99d5e30ae63e4d78cd24a"
|
||||
uuid = "6c6a2e73-6563-6170-7368-637461726353"
|
||||
version = "1.2.0"
|
||||
|
||||
[[SHA]]
|
||||
uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce"
|
||||
|
||||
[[Serialization]]
|
||||
[[deps.Serialization]]
|
||||
uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
|
||||
|
||||
[[Setfield]]
|
||||
deps = ["ConstructionBase", "Future", "MacroTools", "Requires"]
|
||||
git-tree-sha1 = "fca29e68c5062722b5b4435594c3d1ba557072a3"
|
||||
uuid = "efcf1570-3423-57d1-acb7-fd33fddbac46"
|
||||
version = "0.7.1"
|
||||
|
||||
[[Sockets]]
|
||||
[[deps.Sockets]]
|
||||
uuid = "6462fe0b-24de-5631-8697-dd941f90decc"
|
||||
|
||||
[[SparseArrays]]
|
||||
deps = ["LinearAlgebra", "Random"]
|
||||
[[deps.SparseArrays]]
|
||||
deps = ["Libdl", "LinearAlgebra", "Random", "Serialization", "SuiteSparse_jll"]
|
||||
uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
|
||||
|
||||
[[Statistics]]
|
||||
[[deps.SpecialFunctions]]
|
||||
deps = ["IrrationalConstants", "LogExpFunctions", "OpenLibm_jll", "OpenSpecFun_jll"]
|
||||
git-tree-sha1 = "e2cfc4012a19088254b3950b85c3c1d8882d864d"
|
||||
uuid = "276daf66-3868-5448-9aa4-cd146d93841b"
|
||||
version = "2.3.1"
|
||||
|
||||
[deps.SpecialFunctions.extensions]
|
||||
SpecialFunctionsChainRulesCoreExt = "ChainRulesCore"
|
||||
|
||||
[deps.SpecialFunctions.weakdeps]
|
||||
ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
|
||||
|
||||
[[deps.StaticArrays]]
|
||||
deps = ["LinearAlgebra", "Random", "StaticArraysCore"]
|
||||
git-tree-sha1 = "d5fb407ec3179063214bc6277712928ba78459e2"
|
||||
uuid = "90137ffa-7385-5640-81b9-e52037218182"
|
||||
version = "1.6.4"
|
||||
weakdeps = ["Statistics"]
|
||||
|
||||
[deps.StaticArrays.extensions]
|
||||
StaticArraysStatisticsExt = "Statistics"
|
||||
|
||||
[[deps.StaticArraysCore]]
|
||||
git-tree-sha1 = "36b3d696ce6366023a0ea192b4cd442268995a0d"
|
||||
uuid = "1e83bf80-4336-4d27-bf5d-d5a4f845583c"
|
||||
version = "1.4.2"
|
||||
|
||||
[[deps.Statistics]]
|
||||
deps = ["LinearAlgebra", "SparseArrays"]
|
||||
uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
|
||||
version = "1.9.0"
|
||||
|
||||
[[TOML]]
|
||||
[[deps.SuiteSparse_jll]]
|
||||
deps = ["Artifacts", "Libdl", "Pkg", "libblastrampoline_jll"]
|
||||
uuid = "bea87d4a-7f5b-5778-9afe-8cc45184846c"
|
||||
version = "5.10.1+6"
|
||||
|
||||
[[deps.TOML]]
|
||||
deps = ["Dates"]
|
||||
uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76"
|
||||
version = "1.0.3"
|
||||
|
||||
[[Tar]]
|
||||
[[deps.Tar]]
|
||||
deps = ["ArgTools", "SHA"]
|
||||
uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e"
|
||||
version = "1.10.0"
|
||||
|
||||
[[TextWrap]]
|
||||
[[deps.TextWrap]]
|
||||
git-tree-sha1 = "9250ef9b01b66667380cf3275b3f7488d0e25faf"
|
||||
uuid = "b718987f-49a8-5099-9789-dcd902bef87d"
|
||||
version = "1.0.1"
|
||||
|
||||
[[TimerOutputs]]
|
||||
[[deps.TimerOutputs]]
|
||||
deps = ["ExprTools", "Printf"]
|
||||
git-tree-sha1 = "7cb456f358e8f9d102a8b25e8dfedf58fa5689bc"
|
||||
git-tree-sha1 = "f548a9e9c490030e545f72074a41edfd0e5bcdd7"
|
||||
uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
|
||||
version = "0.5.13"
|
||||
version = "0.5.23"
|
||||
|
||||
[[UUIDs]]
|
||||
[[deps.UUIDs]]
|
||||
deps = ["Random", "SHA"]
|
||||
uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
|
||||
|
||||
[[UnPack]]
|
||||
[[deps.UnPack]]
|
||||
git-tree-sha1 = "387c1f73762231e86e0c9c5443ce3b4a0a9a0c2b"
|
||||
uuid = "3a884ed6-31ef-47d7-9d2a-63182c4928ed"
|
||||
version = "1.0.2"
|
||||
|
||||
[[Unicode]]
|
||||
[[deps.Unicode]]
|
||||
uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
|
||||
|
||||
[[XML2_jll]]
|
||||
deps = ["Artifacts", "JLLWrappers", "Libdl", "Libiconv_jll", "Pkg", "Zlib_jll"]
|
||||
git-tree-sha1 = "1acf5bdf07aa0907e0a37d3718bb88d4b687b74a"
|
||||
uuid = "02c8fc9c-b97f-50b9-bbe4-9be30ff0a78a"
|
||||
version = "2.9.12+0"
|
||||
[[deps.UnsafeAtomics]]
|
||||
git-tree-sha1 = "6331ac3440856ea1988316b46045303bef658278"
|
||||
uuid = "013be700-e6cd-48c3-b4a1-df204f14c38f"
|
||||
version = "0.2.1"
|
||||
|
||||
[[XSLT_jll]]
|
||||
deps = ["Artifacts", "JLLWrappers", "Libdl", "Libgcrypt_jll", "Libgpg_error_jll", "Libiconv_jll", "Pkg", "XML2_jll", "Zlib_jll"]
|
||||
git-tree-sha1 = "91844873c4085240b95e795f692c4cec4d805f8a"
|
||||
uuid = "aed1982a-8fda-507f-9586-7b0439959a61"
|
||||
version = "1.1.34+0"
|
||||
[[deps.UnsafeAtomicsLLVM]]
|
||||
deps = ["LLVM", "UnsafeAtomics"]
|
||||
git-tree-sha1 = "323e3d0acf5e78a56dfae7bd8928c989b4f3083e"
|
||||
uuid = "d80eeb9a-aca5-4d75-85e5-170c8b632249"
|
||||
version = "0.1.3"
|
||||
|
||||
[[XZ_jll]]
|
||||
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
|
||||
git-tree-sha1 = "a921669cd9a45c23031fd4eb904f5cc3d20de415"
|
||||
uuid = "ffd25f8a-64ca-5728-b0f7-c24cf3aae800"
|
||||
version = "5.2.5+2"
|
||||
|
||||
[[Xorg_libX11_jll]]
|
||||
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libxcb_jll", "Xorg_xtrans_jll"]
|
||||
git-tree-sha1 = "5be649d550f3f4b95308bf0183b82e2582876527"
|
||||
uuid = "4f6342f7-b3d2-589e-9d20-edeb45f2b2bc"
|
||||
version = "1.6.9+4"
|
||||
|
||||
[[Xorg_libXau_jll]]
|
||||
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
|
||||
git-tree-sha1 = "4e490d5c960c314f33885790ed410ff3a94ce67e"
|
||||
uuid = "0c0b7dd1-d40b-584c-a123-a41640f87eec"
|
||||
version = "1.0.9+4"
|
||||
|
||||
[[Xorg_libXdmcp_jll]]
|
||||
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
|
||||
git-tree-sha1 = "4fe47bd2247248125c428978740e18a681372dd4"
|
||||
uuid = "a3789734-cfe1-5b06-b2d0-1dd0d9d62d05"
|
||||
version = "1.1.3+4"
|
||||
|
||||
[[Xorg_libXext_jll]]
|
||||
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libX11_jll"]
|
||||
git-tree-sha1 = "b7c0aa8c376b31e4852b360222848637f481f8c3"
|
||||
uuid = "1082639a-0dae-5f34-9b06-72781eeb8cb3"
|
||||
version = "1.3.4+4"
|
||||
|
||||
[[Xorg_libpthread_stubs_jll]]
|
||||
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
|
||||
git-tree-sha1 = "6783737e45d3c59a4a4c4091f5f88cdcf0908cbb"
|
||||
uuid = "14d82f49-176c-5ed1-bb49-ad3f5cbd8c74"
|
||||
version = "0.1.0+3"
|
||||
|
||||
[[Xorg_libxcb_jll]]
|
||||
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "XSLT_jll", "Xorg_libXau_jll", "Xorg_libXdmcp_jll", "Xorg_libpthread_stubs_jll"]
|
||||
git-tree-sha1 = "daf17f441228e7a3833846cd048892861cff16d6"
|
||||
uuid = "c7cfdc94-dc32-55de-ac96-5a1b8d977c5b"
|
||||
version = "1.13.0+3"
|
||||
|
||||
[[Xorg_xorgproto_jll]]
|
||||
deps = ["Libdl", "Pkg"]
|
||||
git-tree-sha1 = "9a9eb8ce756fe0bca01b4be16da770e18d264972"
|
||||
uuid = "c4d99508-4286-5418-9131-c86396af500b"
|
||||
version = "2019.2.0+2"
|
||||
|
||||
[[Xorg_xtrans_jll]]
|
||||
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
|
||||
git-tree-sha1 = "79c31e7844f6ecf779705fbc12146eb190b7d845"
|
||||
uuid = "c5fb5394-a638-5e4d-96e5-b29de1b5cf10"
|
||||
version = "1.4.0+3"
|
||||
|
||||
[[Zlib_jll]]
|
||||
[[deps.Zlib_jll]]
|
||||
deps = ["Libdl"]
|
||||
uuid = "83775a58-1f1d-513f-b197-d71354ab007a"
|
||||
version = "1.2.13+0"
|
||||
|
||||
[[argp_standalone_jll]]
|
||||
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
|
||||
git-tree-sha1 = "feaf9f6293003c2bf53056fd6930d677ed340b34"
|
||||
uuid = "c53206cc-00f7-50bf-ad1e-3ae1f6e49bc3"
|
||||
version = "1.3.1+0"
|
||||
[[deps.libLLVM_jll]]
|
||||
deps = ["Artifacts", "Libdl"]
|
||||
uuid = "8f36deef-c2a5-5394-99ed-8e07531fb29a"
|
||||
version = "14.0.6+3"
|
||||
|
||||
[[fts_jll]]
|
||||
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
|
||||
git-tree-sha1 = "78732b942383d2cb521df8a1a0814911144e663d"
|
||||
uuid = "d65627f6-89bd-53e8-8ab5-8b75ff535eee"
|
||||
version = "1.2.7+1"
|
||||
[[deps.libblastrampoline_jll]]
|
||||
deps = ["Artifacts", "Libdl"]
|
||||
uuid = "8e850b90-86db-534c-a0d3-1478176c7d93"
|
||||
version = "5.8.0+0"
|
||||
|
||||
[[hsa_rocr_jll]]
|
||||
deps = ["Artifacts", "Elfutils_jll", "JLLWrappers", "Libdl", "NUMA_jll", "Pkg", "Zlib_jll", "hsakmt_roct_jll"]
|
||||
git-tree-sha1 = "df8d73efec8b1e53ad527d208f5343c0368f0fcd"
|
||||
uuid = "dd59ff1a-a01a-568d-8b29-0669330f116a"
|
||||
version = "4.0.0+0"
|
||||
|
||||
[[hsakmt_roct_jll]]
|
||||
deps = ["Artifacts", "JLLWrappers", "Libdl", "NUMA_jll", "Pkg"]
|
||||
git-tree-sha1 = "ea54f6be23c6d25613a0872ec23dc5a0b77b4a00"
|
||||
uuid = "1cecccd7-a9b6-5045-9cdc-a44c19b16d76"
|
||||
version = "4.2.0+0"
|
||||
|
||||
[[nghttp2_jll]]
|
||||
[[deps.nghttp2_jll]]
|
||||
deps = ["Artifacts", "Libdl"]
|
||||
uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d"
|
||||
version = "1.48.0+0"
|
||||
|
||||
[[obstack_jll]]
|
||||
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
|
||||
git-tree-sha1 = "1c4a6b66e934fc6db4649cb2910c72f53bbfea7e"
|
||||
uuid = "c88a4935-d25e-5644-aacc-5db6f1b8ef79"
|
||||
version = "1.2.2+0"
|
||||
|
||||
[[p7zip_jll]]
|
||||
[[deps.p7zip_jll]]
|
||||
deps = ["Artifacts", "Libdl"]
|
||||
uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0"
|
||||
version = "17.4.0+0"
|
||||
|
||||
@ -4,4 +4,4 @@ ArgParse = "c7e460c6-2fb9-53a9-8c5b-16f535851c63"
|
||||
Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a"
|
||||
|
||||
[compat]
|
||||
julia = "1.6"
|
||||
julia = "1.9"
|
||||
|
||||
@ -1,332 +1,555 @@
|
||||
# This file is machine-generated - editing it directly is not advised
|
||||
|
||||
[[AbstractFFTs]]
|
||||
julia_version = "1.9.3"
|
||||
manifest_format = "2.0"
|
||||
project_hash = "6909ef39c97ad6037791040bed70b7aa111e1f64"
|
||||
|
||||
[[deps.AbstractFFTs]]
|
||||
deps = ["LinearAlgebra"]
|
||||
git-tree-sha1 = "485ee0867925449198280d4af84bdb46a2a404d0"
|
||||
git-tree-sha1 = "d92ad398961a3ed262d8bf04a1a2b8340f915fef"
|
||||
uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c"
|
||||
version = "1.0.1"
|
||||
version = "1.5.0"
|
||||
|
||||
[[Adapt]]
|
||||
deps = ["LinearAlgebra"]
|
||||
git-tree-sha1 = "84918055d15b3114ede17ac6a7182f68870c16f7"
|
||||
    [deps.AbstractFFTs.extensions]
    AbstractFFTsChainRulesCoreExt = "ChainRulesCore"
    AbstractFFTsTestExt = "Test"

    [deps.AbstractFFTs.weakdeps]
    ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
    Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

(remaining entries of this machine-generated Manifest.toml diff: every [[Package]] block is renamed to the [[deps.Package]] layout of manifest format 2.0 and re-resolved for Julia 1.9; changes visible in the hunk include Adapt 3.3.1 to 3.6.2, CUDA 3.6.0 to 5.0.0, GPUArrays 8.1.2 to 9.0.0, GPUCompiler 0.13.10 to 0.24.5, LLVM 4.7.0 to 6.2.1, plus newly added entries such as CUDA_Driver_jll, CUDA_Runtime_jll, KernelAbstractions, Atomix, DataFrames, PrettyTables, and NVTX)
@@ -4,4 +4,4 @@ CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a"
 
 [compat]
-julia = "1.6"
+julia = "1.9"
File diff suppressed because it is too large
@@ -8,4 +8,4 @@ Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a"
 ROCKernels = "7eb9e9f0-4bd3-4c4c-8bef-26bd9629d9b9"
 
 [compat]
-julia = "1.6"
+julia = "1.9"
File diff suppressed because it is too large
@@ -16,4 +16,4 @@ ROCKernels = "7eb9e9f0-4bd3-4c4c-8bef-26bd9629d9b9"
 oneAPI = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b"
 
 [compat]
-julia = "1.6"
+julia = "1.9"
@@ -1,31 +1,35 @@
 # This file is machine-generated - editing it directly is not advised
 
-[[ArgParse]]
+julia_version = "1.9.3"
+manifest_format = "2.0"
+project_hash = "fbff310f722a52622a273a48a8a6b3b64f06b029"
+
+[[deps.ArgParse]]
 deps = ["Logging", "TextWrap"]
 git-tree-sha1 = "3102bce13da501c9104df33549f511cd25264d7d"
 uuid = "c7e460c6-2fb9-53a9-8c5b-16f535851c63"
 version = "1.1.4"
 
-[[Logging]]
+[[deps.Logging]]
 uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"
 
-[[OrderedCollections]]
-git-tree-sha1 = "85f8e6578bf1f9ee0d11e7bb1b1456435479d47c"
+[[deps.OrderedCollections]]
+git-tree-sha1 = "2e73fe17cac3c62ad1aebe70d44c963c3cfdc3e3"
 uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
-version = "1.4.1"
+version = "1.6.2"
 
-[[Parameters]]
+[[deps.Parameters]]
 deps = ["OrderedCollections", "UnPack"]
 git-tree-sha1 = "34c0e9ad262e5f7fc75b10a9952ca7692cfc5fbe"
 uuid = "d96e819e-fc66-5662-9728-84c9c7592b0a"
 version = "0.12.3"
 
-[[TextWrap]]
+[[deps.TextWrap]]
 git-tree-sha1 = "9250ef9b01b66667380cf3275b3f7488d0e25faf"
 uuid = "b718987f-49a8-5099-9789-dcd902bef87d"
 version = "1.0.1"
 
-[[UnPack]]
+[[deps.UnPack]]
 git-tree-sha1 = "387c1f73762231e86e0c9c5443ce3b4a0a9a0c2b"
 uuid = "3a884ed6-31ef-47d7-9d2a-63182c4928ed"
 version = "1.0.2"
@@ -3,4 +3,4 @@ ArgParse = "c7e460c6-2fb9-53a9-8c5b-16f535851c63"
 Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a"
 
 [compat]
-julia = "1.6"
+julia = "1.9"
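The regenerated Manifest.toml hunks in this compare are the by-product of re-resolving each backend subproject after the [compat] bound was raised to Julia 1.9; the Pkg calls below mirror the updated update script later in this diff. A minimal sketch, assuming a hypothetical subproject path:

    # Sketch only: regenerate one backend subproject's Manifest.toml under Julia 1.9.
    # "JuliaStream.jl/CUDA" is an illustrative path, not taken from this diff.
    import Pkg
    Pkg.activate("JuliaStream.jl/CUDA")
    Pkg.resolve()      # re-resolve and rewrite Manifest.toml (written as manifest_format 2.0 on Julia 1.9)
    Pkg.instantiate()  # fetch the resolved versions
    Pkg.update()       # move dependencies to the newest versions allowed by [compat]
    Pkg.gc()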
@@ -1,335 +1,441 @@
 # This file is machine-generated - editing it directly is not advised
 
+julia_version = "1.9.3"
+manifest_format = "2.0"
+project_hash = "01f328e925b86927b3f24c30aee6ecdce5bd28cc"

(remaining entries of this machine-generated Manifest.toml diff: every [[Package]] block is renamed to the [[deps.Package]] layout of manifest format 2.0 and re-resolved for Julia 1.9; changes visible in the hunk include Adapt 3.3.1 to 3.6.2, GPUArrays 8.1.2 to 8.8.1, GPUCompiler 0.13.10 to 0.21.4, LLVM 4.7.0 to 6.2.1, SpecialFunctions 1.8.1 to 2.3.1, NEO_jll 21.44.21506 to 23.17.26241, oneAPI at 1.3.0, plus newly added entries such as KernelAbstractions, Atomix, UnsafeAtomics, StaticArrays, Scratch, and oneAPI_Support_jll)
@@ -4,4 +4,4 @@ Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a"
 oneAPI = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b"
 
 [compat]
-julia = "1.6"
+julia = "1.9"
@@ -20,6 +20,18 @@ end
 
 @enum Benchmark All Triad Nstream
 
+
+function run_init_arrays!(data::StreamData{T,C}, context, init::Tuple{T,T,T})::Float64 where {T,C}
+    return @elapsed init_arrays!(data, context, init)
+end
+
+function run_read_data(data::StreamData{T,C}, context)::Tuple{Float64,VectorData{T}} where {T,C}
+    elapsed = @elapsed begin
+        result = read_data(data, context)
+    end
+    return (elapsed, result)
+end
+
 function run_all!(data::StreamData{T,C}, context, times::Int)::Tuple{Timings,T} where {T,C}
     timings = Timings(times)
     lastSum::T = 0
@@ -39,11 +51,7 @@ function run_triad!(data::StreamData{T,C}, context, times::Int)::Float64 where {
     end
 end
 
-function run_nstream!(
-    data::StreamData{T,C},
-    context,
-    times::Int,
-)::Vector{Float64} where {T,C}
+function run_nstream!(data::StreamData{T,C}, context, times::Int)::Vector{Float64} where {T,C}
     timings::Vector{Float64} = zeros(times)
     for i = 1:times
         @inbounds timings[i] = @elapsed nstream!(data, context)
@@ -93,9 +101,7 @@ function check_solutions(
         error = abs((dot - gold_sum) / gold_sum)
         failed = error > 1.0e-8
         if failed
-            println(
-                "Validation failed on sum. Error $error \nSum was $dot but should be $gold_sum",
-            )
+            println("Validation failed on sum. Error $error \nSum was $dot but should be $gold_sum")
         end
         !failed
     end : true
@@ -158,7 +164,7 @@ end
 
 const DefaultInit = (0.1, 0.2, 0.0)
 const DefaultScalar = 0.4
-const Version = "4.0"
+const Version = "5.0"
 
 function main()
 
@@ -166,7 +172,7 @@ function main()
     parse_options(config)
 
     if config.list
-        for (i, (_,repr, impl)) in enumerate(devices())
+        for (i, (_, repr, impl)) in enumerate(devices())
             println("[$i] ($impl) $repr")
         end
         exit(0)
@@ -175,9 +181,7 @@ function main()
     ds = devices()
     # TODO implement substring device match
    if config.device < 1 || config.device > length(ds)
-        error(
-            "Device $(config.device) out of range (1..$(length(ds))), NOTE: Julia is 1-indexed",
-        )
+        error("Device $(config.device) out of range (1..$(length(ds))), NOTE: Julia is 1-indexed")
    else
        device = ds[config.device]
    end
@@ -220,10 +224,10 @@ function main()
     end
 
     function mk_row(xs::Vector{Float64}, name::String, total_bytes::Int)
-        tail = Base.rest(xs)
-        min = Iterators.minimum(tail)
-        max = Iterators.maximum(tail)
-        avg = Iterators.sum(tail) / Iterators.length(tail)
+        tail = Iterators.rest(xs)
+        min = Base.minimum(tail)
+        max = Base.maximum(tail)
+        avg = Base.sum(tail) / Base.length(tail)
         mbps = mega_scale * total_bytes / min
         if config.csv
             return [
@@ -257,16 +261,42 @@ function main()
         end
     end
 
+    function show_init(init::Float64, read::Float64)
+        setup = [("Init", init, 3 * array_bytes), ("Read", read, 3 * array_bytes)]
+        if config.csv
+            tabulate(
+                map(
+                    x -> [
+                        ("phase", x[1]),
+                        ("n_elements", config.arraysize),
+                        ("sizeof", x[3]),
+                        ("max_m$(config.mibibytes ? "i" : "")bytes_per_sec", mega_scale * total_bytes / x[2]),
+                        ("runtime", x[2]),
+                    ],
+                    setup,
+                )...,
+            )
+        else
+            for (name, elapsed, total_bytes) in setup
+                println(
+                    "$name: $(round(elapsed; digits=5)) s (=$(round(( mega_scale * total_bytes) / elapsed; digits = 5)) M$(config.mibibytes ? "i" : "")Bytes/sec)",
+                )
+            end
+        end
+    end
+
     init::Tuple{type,type,type} = DefaultInit
     scalar::type = DefaultScalar
 
     GC.enable(false)
 
     (data, context) = make_stream(config.arraysize, scalar, device, config.csv)
-    init_arrays!(data, context, init)
+    tInit = run_init_arrays!(data, context, init)
     if benchmark == All
         (timings, sum) = run_all!(data, context, config.numtimes)
-        valid = check_solutions(read_data(data, context), config.numtimes, init, benchmark, sum)
+        (tRead, result) = run_read_data(data, context)
+        show_init(tInit, tRead)
+        valid = check_solutions(result, config.numtimes, init, benchmark, sum)
         tabulate(
             mk_row(timings.copy, "Copy", 2 * array_bytes),
             mk_row(timings.mul, "Mul", 2 * array_bytes),
@@ -276,13 +306,15 @@ function main()
         )
     elseif benchmark == Nstream
         timings = run_nstream!(data, context, config.numtimes)
-        valid =
-            check_solutions(read_data(data, context), config.numtimes, init, benchmark, nothing)
+        (tRead, result) = run_read_data(data, context)
+        show_init(tInit, tRead)
+        valid = check_solutions(result, config.numtimes, init, benchmark, nothing)
         tabulate(mk_row(timings, "Nstream", 4 * array_bytes))
     elseif benchmark == Triad
         elapsed = run_triad!(data, context, config.numtimes)
-        valid =
-            check_solutions(read_data(data, context), config.numtimes, init, benchmark, nothing)
+        (tRead, result) = run_read_data(data, context)
+        show_init(tInit, tRead)
+        valid = check_solutions(result, config.numtimes, init, benchmark, nothing)
         total_bytes = 3 * array_bytes * config.numtimes
         bandwidth = mega_scale * (total_bytes / elapsed)
         println("Runtime (seconds): $(round(elapsed; digits=5))")
@@ -290,7 +322,6 @@ function main()
     else
         error("Bad benchmark $(benchmark)")
     end
 
-    GC.enable(true)
 
     if !valid
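The new run_init_arrays!/run_read_data helpers above wrap the Init and Read phases in @elapsed and feed show_init, which turns the elapsed seconds into an M(i)Bytes/sec figure. A minimal standalone sketch of that pattern follows; the names, sizes, and the 1.0e-6 scale here are illustrative, not taken from the driver:

    # Sketch of the @elapsed timing pattern used by the new helpers; illustrative only.
    function time_phase(f, total_bytes::Int)
        elapsed = @elapsed f()                            # seconds spent in the phase
        return (elapsed, 1.0e-6 * total_bytes / elapsed)  # (runtime, MBytes/sec)
    end

    a = zeros(Float64, 2^20)
    (t, bw) = time_phase(() -> fill!(a, 0.1), sizeof(a))
    println("Init: $(round(t; digits=5)) s (=$(round(bw; digits=5)) MBytes/sec)")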
@@ -3,5 +3,6 @@
 
 for BACKEND in "." "AMDGPU" "CUDA" "oneAPI" "Threaded" "KernelAbstractions"
 do
-  julia --project="$BACKEND" -e 'import Pkg; Pkg.resolve(); Pkg.instantiate(); Pkg.update(); Pkg.gc();'
-done
+  echo "Updating subproject $BACKEND"
+  julia --project="$BACKEND" -e 'import Pkg; Pkg.resolve(); Pkg.instantiate(); Pkg.update(); Pkg.gc();'
+done
@ -1,4 +1,4 @@
|
||||
// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith,
|
||||
// Copyright (c) 2015-23 Tom Deakin, Simon McIntosh-Smith, Wei-Chen (Tom) Lin
|
||||
// University of Bristol HPC
|
||||
//
|
||||
// For full license terms please see the LICENSE file distributed with this
|
||||
@@ -14,9 +14,9 @@ KokkosStream<T>::KokkosStream(
{
Kokkos::initialize();

d_a = new Kokkos::View<T*>("d_a", ARRAY_SIZE);
d_b = new Kokkos::View<T*>("d_b", ARRAY_SIZE);
d_c = new Kokkos::View<T*>("d_c", ARRAY_SIZE);
d_a = new Kokkos::View<T*>(Kokkos::ViewAllocateWithoutInitializing("d_a"), ARRAY_SIZE);
d_b = new Kokkos::View<T*>(Kokkos::ViewAllocateWithoutInitializing("d_b"), ARRAY_SIZE);
d_c = new Kokkos::View<T*>(Kokkos::ViewAllocateWithoutInitializing("d_c"), ARRAY_SIZE);
hm_a = new typename Kokkos::View<T*>::HostMirror();
hm_b = new typename Kokkos::View<T*>::HostMirror();
hm_c = new typename Kokkos::View<T*>::HostMirror();
@@ -140,7 +140,7 @@ T KokkosStream<T>::dot()
Kokkos::View<T*> a(*d_a);
Kokkos::View<T*> b(*d_b);

T sum = 0.0;
T sum{};

Kokkos::parallel_reduce(array_size, KOKKOS_LAMBDA (const long index, T &tmp)
{
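The move to Kokkos::ViewAllocateWithoutInitializing above skips the fill kernel that a labelled Kokkos::View normally launches on allocation, so the benchmark's own init_arrays becomes the first touch of the buffers. A minimal stand-alone sketch of the two allocation forms (not part of the diff; size and value are made up):

    #include <Kokkos_Core.hpp>

    int main(int argc, char* argv[]) {
      Kokkos::initialize(argc, argv);
      {
        const int n = 1 << 20;
        // Labelled constructor: Kokkos value-initialises the data (an extra fill kernel).
        Kokkos::View<double*> zeroed("zeroed", n);
        // WithoutInitializing: memory is left untouched until the first kernel writes it.
        Kokkos::View<double*> raw(Kokkos::ViewAllocateWithoutInitializing("raw"), n);
        Kokkos::parallel_for(n, KOKKOS_LAMBDA(const int i) { raw(i) = 0.1; });
        Kokkos::fence();
      }
      Kokkos::finalize();
      return 0;
    }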
@@ -10,9 +10,6 @@
#include <stdexcept>

#include <Kokkos_Core.hpp>
#include <Kokkos_Parallel.hpp>
#include <Kokkos_View.hpp>

#include "Stream.h"

#define IMPLEMENTATION_STRING "Kokkos"
@@ -1,32 +1,38 @@

register_flag_optional(CMAKE_CXX_COMPILER
"Any CXX compiler that is supported by CMake detection and RAJA.
See https://github.com/kokkos/kokkos#primary-tested-compilers-on-x86-are"
"c++")

register_flag_required(KOKKOS_IN_TREE
register_flag_optional(KOKKOS_IN_TREE
"Absolute path to the *source* distribution directory of Kokkos.
Remember to append Kokkos specific flags as well, for example:

-DKOKKOS_IN_TREE=... -DKokkos_ENABLE_OPENMP=ON -DKokkos_ARCH_ZEN=ON ...
See https://github.com/kokkos/kokkos/blob/master/BUILD.md for all available options" "")

See https://github.com/kokkos/kokkos/blob/master/BUILD.md for all available options")
register_flag_optional(KOKKOS_IN_PACKAGE
"Absolute path to package R-Path containing Kokkos libs.
Use this instead of KOKKOS_IN_TREE if Kokkos is from a package manager like Spack." "")

# compiler vendor and arch specific flags
set(KOKKOS_FLAGS_CPU_INTEL -qopt-streaming-stores=always)

macro(setup)

set(CMAKE_CXX_STANDARD 14)
set(CMAKE_CXX_STANDARD 17) # Kokkos 4+ requires CXX >= 17
cmake_policy(SET CMP0074 NEW) #see https://github.com/kokkos/kokkos/blob/master/BUILD.md

message(STATUS "Building using in-tree Kokkos source at `${KOKKOS_IN_TREE}`")

if (EXISTS "${KOKKOS_IN_TREE}")
message(STATUS "Build using in-tree Kokkos source at `${KOKKOS_IN_TREE}`")
add_subdirectory(${KOKKOS_IN_TREE} ${CMAKE_BINARY_DIR}/kokkos)
register_link_library(Kokkos::kokkos)
else ()
message(FATAL_ERROR "`${KOKKOS_IN_TREE}` does not exist")
elseif (EXISTS "${KOKKOS_IN_PACKAGE}")
message(STATUS "Build using packaged Kokkos at `${KOKKOS_IN_PACKAGE}`")
set (Kokkos_DIR "${KOKKOS_IN_PACKAGE}/lib64/cmake/Kokkos")
find_package(Kokkos REQUIRED)
register_link_library(Kokkos::kokkos)
else()
message(FATAL_ERROR "Neither `KOKKOS_IN_TREE`, or `KOKKOS_IN_PACKAGE` was set!")
endif ()

register_append_compiler_and_arch_specific_cxx_flags(
@@ -36,5 +42,3 @@ macro(setup)
)

endmacro()
85 src/main.cpp
@@ -15,7 +15,7 @@
#include <iomanip>
#include <cstring>

#define VERSION_STRING "4.0"
#define VERSION_STRING "5.0"

#include "Stream.h"

@@ -49,6 +49,8 @@
#include "SYCLStream2020.h"
#elif defined(OMP)
#include "OMPStream.h"
#elif defined(FUTHARK)
#include "FutharkStream.h"
#endif

// Default size of 2^25
@@ -222,10 +224,10 @@ void run()
{
// MiB = 2^20
std::cout << std::setprecision(1) << std::fixed
<< "Array size: " << ARRAY_SIZE*sizeof(T)*pow(2.0, -20.0) << " MiB"
<< " (=" << ARRAY_SIZE*sizeof(T)*pow(2.0, -30.0) << " GiB)" << std::endl;
std::cout << "Total size: " << 3.0*ARRAY_SIZE*sizeof(T)*pow(2.0, -20.0) << " MiB"
<< " (=" << 3.0*ARRAY_SIZE*sizeof(T)*pow(2.0, -30.0) << " GiB)" << std::endl;
<< "Array size: " << ARRAY_SIZE*sizeof(T)*std::pow(2.0, -20.0) << " MiB"
<< " (=" << ARRAY_SIZE*sizeof(T)*std::pow(2.0, -30.0) << " GiB)" << std::endl;
std::cout << "Total size: " << 3.0*ARRAY_SIZE*sizeof(T)*std::pow(2.0, -20.0) << " MiB"
<< " (=" << 3.0*ARRAY_SIZE*sizeof(T)*std::pow(2.0, -30.0) << " GiB)" << std::endl;
}
else
{
@@ -298,12 +300,18 @@ void run()
// Use the OpenMP implementation
stream = new OMPStream<T>(ARRAY_SIZE, deviceIndex);

#elif defined(FUTHARK)
// Use the Futhark implementation
stream = new FutharkStream<T>(ARRAY_SIZE, deviceIndex);

#endif

auto init1 = std::chrono::high_resolution_clock::now();
stream->init_arrays(startA, startB, startC);
auto init2 = std::chrono::high_resolution_clock::now();

// Result of the Dot kernel, if used.
T sum = 0.0;
T sum{};

std::vector<std::vector<double>> timings;

@@ -327,7 +335,54 @@ void run()
std::vector<T> c(ARRAY_SIZE);

auto read1 = std::chrono::high_resolution_clock::now();
stream->read_arrays(a, b, c);
auto read2 = std::chrono::high_resolution_clock::now();
auto initElapsedS = std::chrono::duration_cast<std::chrono::duration<double>>(init2 - init1).count();
auto readElapsedS = std::chrono::duration_cast<std::chrono::duration<double>>(read2 - read1).count();
auto initBWps = ((mibibytes ? std::pow(2.0, -20.0) : 1.0E-6) * (3 * sizeof(T) * ARRAY_SIZE)) / initElapsedS;
auto readBWps = ((mibibytes ? std::pow(2.0, -20.0) : 1.0E-6) * (3 * sizeof(T) * ARRAY_SIZE)) / readElapsedS;

if (output_as_csv)
{
std::cout
<< "phase" << csv_separator
<< "n_elements" << csv_separator
<< "sizeof" << csv_separator
<< ((mibibytes) ? "max_mibytes_per_sec" : "max_mbytes_per_sec") << csv_separator
<< "runtime" << std::endl;
std::cout
<< "Init" << csv_separator
<< ARRAY_SIZE << csv_separator
<< sizeof(T) << csv_separator
<< initBWps << csv_separator
<< initElapsedS << std::endl;
std::cout
<< "Read" << csv_separator
<< ARRAY_SIZE << csv_separator
<< sizeof(T) << csv_separator
<< readBWps << csv_separator
<< readElapsedS << std::endl;
}
else
{
std::cout << "Init: "
<< std::setw(7)
<< initElapsedS
<< " s (="
<< initBWps
<< (mibibytes ? " MiBytes/sec" : " MBytes/sec")
<< ")" << std::endl;
std::cout << "Read: "
<< std::setw(7)
<< readElapsedS
<< " s (="
<< readBWps
<< (mibibytes ? " MiBytes/sec" : " MBytes/sec")
<< ")" << std::endl;
}

check_solution<T>(num_times, a, b, c, sum);

// Display timing results
@@ -393,7 +448,7 @@ void run()
<< num_times << csv_separator
<< ARRAY_SIZE << csv_separator
<< sizeof(T) << csv_separator
<< ((mibibytes) ? pow(2.0, -20.0) : 1.0E-6) * sizes[i] / (*minmax.first) << csv_separator
<< ((mibibytes) ? std::pow(2.0, -20.0) : 1.0E-6) * sizes[i] / (*minmax.first) << csv_separator
<< *minmax.first << csv_separator
<< *minmax.second << csv_separator
<< average
@@ -404,7 +459,7 @@ void run()
std::cout
<< std::left << std::setw(12) << labels[i]
<< std::left << std::setw(12) << std::setprecision(3) <<
((mibibytes) ? pow(2.0, -20.0) : 1.0E-6) * sizes[i] / (*minmax.first)
((mibibytes) ? std::pow(2.0, -20.0) : 1.0E-6) * sizes[i] / (*minmax.first)
<< std::left << std::setw(12) << std::setprecision(5) << *minmax.first
<< std::left << std::setw(12) << std::setprecision(5) << *minmax.second
<< std::left << std::setw(12) << std::setprecision(5) << average
@@ -415,7 +470,7 @@ void run()
{
// Display timing results
double total_bytes = 3 * sizeof(T) * ARRAY_SIZE * num_times;
double bandwidth = ((mibibytes) ? pow(2.0, -30.0) : 1.0E-9) * (total_bytes / timings[0][0]);
double bandwidth = ((mibibytes) ? std::pow(2.0, -30.0) : 1.0E-9) * (total_bytes / timings[0][0]);

if (output_as_csv)
{
@@ -461,7 +516,7 @@ void check_solution(const unsigned int ntimes, std::vector<T>& a, std::vector<T>
T goldA = startA;
T goldB = startB;
T goldC = startC;
T goldSum = 0.0;
T goldSum{};

const T scalar = startScalar;

@@ -487,15 +542,15 @@ void check_solution(const unsigned int ntimes, std::vector<T>& a, std::vector<T>
goldSum = goldA * goldB * ARRAY_SIZE;

// Calculate the average error
double errA = std::accumulate(a.begin(), a.end(), 0.0, [&](double sum, const T val){ return sum + fabs(val - goldA); });
long double errA = std::accumulate(a.begin(), a.end(), T{}, [&](double sum, const T val){ return sum + std::fabs(val - goldA); });
errA /= a.size();
double errB = std::accumulate(b.begin(), b.end(), 0.0, [&](double sum, const T val){ return sum + fabs(val - goldB); });
long double errB = std::accumulate(b.begin(), b.end(), T{}, [&](double sum, const T val){ return sum + std::fabs(val - goldB); });
errB /= b.size();
double errC = std::accumulate(c.begin(), c.end(), 0.0, [&](double sum, const T val){ return sum + fabs(val - goldC); });
long double errC = std::accumulate(c.begin(), c.end(), T{}, [&](double sum, const T val){ return sum + std::fabs(val - goldC); });
errC /= c.size();
double errSum = fabs((sum - goldSum)/goldSum);
long double errSum = std::fabs((sum - goldSum)/goldSum);

double epsi = std::numeric_limits<T>::epsilon() * 100.0;
long double epsi = std::numeric_limits<T>::epsilon() * 100.0;

if (errA > epsi)
std::cerr
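The new Init/Read reporting turns the two one-off phases into the same bytes-per-second figure as the kernels: each phase moves all three arrays once, so the rate is 3*sizeof(T)*ARRAY_SIZE divided by the measured time, scaled by 1.0E-6 for decimal MBytes/sec or 2^-20 for binary MiBytes/sec. A small self-contained sketch of that bookkeeping (names are illustrative, not the benchmark's):

    #include <chrono>
    #include <cmath>
    #include <cstdio>
    #include <vector>

    int main() {
      const std::size_t N = 1 << 20;
      std::vector<double> a(N), b(N), c(N);

      auto t0 = std::chrono::high_resolution_clock::now();
      for (std::size_t i = 0; i < N; ++i) { a[i] = 0.1; b[i] = 0.2; c[i] = 0.0; }
      auto t1 = std::chrono::high_resolution_clock::now();

      // Phase time in seconds, then bytes moved (three arrays, touched once each).
      double elapsedS =
          std::chrono::duration_cast<std::chrono::duration<double>>(t1 - t0).count();
      double bytes = 3.0 * sizeof(double) * N;
      double mbytes_per_sec  = 1.0E-6 * bytes / elapsedS;                // decimal MB/s
      double mibytes_per_sec = std::pow(2.0, -20.0) * bytes / elapsedS;  // binary MiB/s
      std::printf("Init: %.5f s (=%.5f MBytes/sec, %.5f MiBytes/sec)\n",
                  elapsedS, mbytes_per_sec, mibytes_per_sec);
      return 0;
    }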
@@ -260,7 +260,7 @@ T OCLStream<T>::dot()
);
cl::copy(queue, d_sum, sums.begin(), sums.end());

T sum = 0.0;
T sum{};
for (T val : sums)
sum += val;
@@ -220,7 +220,7 @@ void OMPStream<T>::nstream()
template <class T>
T OMPStream<T>::dot()
{
T sum = 0.0;
T sum{};

#ifdef OMP_TARGET_GPU
int array_size = this->array_size;
@@ -131,7 +131,7 @@ T RAJAStream<T>::dot()
T* RAJA_RESTRICT a = d_a;
T* RAJA_RESTRICT b = d_b;

RAJA::ReduceSum<reduce_policy, T> sum(0.0);
RAJA::ReduceSum<reduce_policy, T> sum(T{});

forall<policy>(range, [=] RAJA_DEVICE (RAJA::Index_type index)
{
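The same one-line change recurs across the OpenCL, OpenMP and RAJA backends (and in main.cpp): the reduction accumulator is now seeded with T{} instead of the double literal 0.0, so a float build accumulates in float rather than being silently promoted to double. A hedged illustration of the difference using plain std::transform_reduce, not taken from any backend:

    #include <numeric>
    #include <vector>

    // Seeding with T{} keeps the accumulator type equal to T.
    template <class T>
    T dot(const std::vector<T>& a, const std::vector<T>& b) {
      return std::transform_reduce(a.begin(), a.end(), b.begin(), T{});
    }

    // Seeding with 0.0 (a double) makes the whole reduction run in double,
    // even when T = float -- which is exactly what the T{} seeds avoid.
    template <class T>
    double dot_promoted(const std::vector<T>& a, const std::vector<T>& b) {
      return std::transform_reduce(a.begin(), a.end(), b.begin(), 0.0);
    }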
@@ -1,25 +1,26 @@

register_flag_optional(CMAKE_CXX_COMPILER
"Any CXX compiler that is supported by CMake detection and RAJA.
See https://raja.readthedocs.io/en/main/getting_started.html#build-and-install"
"c++")

register_flag_required(RAJA_IN_TREE
register_flag_optional(RAJA_IN_TREE
"Absolute path to the *source* distribution directory of RAJA.
Make sure to use the release version of RAJA or clone RAJA recursively with submodules.
Remember to append RAJA specific flags as well, for example:

-DRAJA_IN_TREE=... -DENABLE_OPENMP=ON -DENABLE_CUDA=ON ...

See https://github.com/LLNL/RAJA/blob/08cbbafd2d21589ebf341f7275c229412d0fe903/CMakeLists.txt#L44 for all available options
")
" "")

register_flag_optional(RAJA_IN_PACKAGE
"Use if Raja is part of a package dependency:
Path to installation" "")

register_flag_optional(TARGET
"Target offload device, implemented values are CPU, NVIDIA"
CPU)

register_flag_optional(CUDA_TOOLKIT_ROOT_DIR
"[TARGET==NVIDIA only] Path to the CUDA toolkit directory (e.g `/opt/cuda-11.2`) if the ENABLE_CUDA flag is specified for RAJA" "")
"[TARGET==NVIDIA only] Path to the CUDA toolkit directory (e.g `/opt/cuda-11.2`) if the RAJA_ENABLE_CUDA or ENABLE_CUDA flag is specified for RAJA" "")

# XXX CMake 3.18 supports CMAKE_CUDA_ARCHITECTURES/CUDA_ARCHITECTURES but we support older CMakes
register_flag_optional(CUDA_ARCH
@@ -57,7 +58,20 @@ macro(setup)
set(ENABLE_BENCHMARKS OFF CACHE BOOL "")
set(ENABLE_CUDA ${ENABLE_CUDA} CACHE BOOL "" FORCE)

if (ENABLE_CUDA)
# RAJA >= v2022.03.0 switched to prefixed variables, we keep the legacy ones for backwards compatibiity
set(RAJA_ENABLE_TESTS OFF CACHE BOOL "")
set(RAJA_ENABLE_EXAMPLES OFF CACHE BOOL "")
set(RAJA_ENABLE_REPRODUCERS OFF CACHE BOOL "")
set(RAJA_ENABLE_EXERCISES OFF CACHE BOOL "")
set(RAJA_ENABLE_DOCUMENTATION OFF CACHE BOOL "")
set(RAJA_ENABLE_BENCHMARKS OFF CACHE BOOL "")
set(RAJA_ENABLE_CUDA ${RAJA_ENABLE_CUDA} CACHE BOOL "" FORCE)

if (ENABLE_CUDA OR RAJA_ENABLE_CUDA)

# RAJA still needs ENABLE_CUDA for internal use, so if either is on, assert both.
set(RAJA_ENABLE_CUDA ON)
set(ENABLE_CUDA ON)

# XXX CMake 3.18 supports CMAKE_CUDA_ARCHITECTURES/CUDA_ARCHITECTURES but we support older CMakes
if(POLICY CMP0104)
@@ -69,6 +83,10 @@ macro(setup)
set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "-forward-unknown-to-host-compiler -extended-lambda -arch=${CUDA_ARCH}" ${CUDA_EXTRA_FLAGS})
list(APPEND CMAKE_CUDA_FLAGS)

# See https://github.com/LLNL/RAJA/pull/1302
# And https://github.com/LLNL/RAJA/pull/1339
set(RAJA_ENABLE_VECTORIZATION OFF CACHE BOOL "")

message(STATUS "NVCC flags: ${CMAKE_CUDA_FLAGS}")
endif ()

@@ -76,8 +94,14 @@ macro(setup)
register_link_library(RAJA)
# RAJA's cmake screws with where the binary will end up, resetting it here:
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})

elseif (EXISTS "${RAJA_IN_PACKAGE}")
message(STATUS "Building using packaged Raja at `${RAJA_IN_PACKAGE}`")
find_package(RAJA REQUIRED)
register_link_library(RAJA)

else ()
message(FATAL_ERROR "`${RAJA_IN_TREE}` does not exist")
message(FATAL_ERROR "Neither `${RAJA_IN_TREE}` or `${RAJA_IN_PACKAGE}` exists")
endif ()
805 src/rust/rust-stream/Cargo.lock generated
File diff suppressed because it is too large
@@ -1,25 +1,25 @@
[package]
name = "rust-stream"
version = "4.0.0"
version = "5.0.0"
authors = ["Wei-Chen Lin <wl14928@bristol.ac.uk>"]
edition = "2018"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
num-traits = "0.2.14"
structopt = "0.3.13"
tabular = "0.1.4"
rayon = "1.5.1"
crossbeam = "0.8.1"
num_cpus = "1.13.0"
rustversion = "1.0"
libc = "0.2.97"
num-traits = "0.2.15"
structopt = "0.3.26"
tabular = "0.2.0"
rayon = "1.5.3"
crossbeam = "0.8.2"
num_cpus = "1.13.1"
rustversion = "1.0.9"
libc = "0.2.134"
core_affinity = "0.5.10"
colour = "0.6.0"

[dev-dependencies]
rstest = "0.10.0"
rstest = "0.13.0"

[build-dependencies]
rustversion = "1.0"
@@ -54,7 +54,7 @@ use_field_init_shorthand = false
force_explicit_abi = true
condense_wildcard_suffixes = false
color = "Auto"
required_version = "1.4.38"
required_version = "1.6.0"
unstable_features = false
disable_all_formatting = false
skip_children = false
@@ -174,7 +174,7 @@ where StreamData<T, D, A>: RustStream<T> {
);
}

stream.init_arrays();
let init = stream.run_init_arrays();

let tabulate = |xs: &Vec<Duration>, name: &str, t_size: usize| -> Vec<(&str, String)> {
let tail = &xs[1..]; // tail only
@@ -235,10 +235,47 @@ where StreamData<T, D, A>: RustStream<T> {
};
};

let show_setup = |init: Duration, read: Duration| {
let setup = vec![
("Init", init.as_secs_f64(), 3 * array_bytes),
("Read", read.as_secs_f64(), 3 * array_bytes),
];
if option.csv {
tabulate_all(
setup
.iter()
.map(|(name, elapsed, t_size)| {
vec![
("phase", name.to_string()),
("n_elements", option.arraysize.to_string()),
("sizeof", t_size.to_string()),
(
if option.mibibytes { "max_mibytes_per_sec" } else { "max_mbytes_per_sec" },
(mega_scale * (*t_size as f64) / elapsed).to_string(),
),
("runtime", elapsed.to_string()),
]
})
.collect::<Vec<_>>(),
);
} else {
for (name, elapsed, t_size) in setup {
println!(
"{}: {:.5} s (={:.5} {})",
name,
elapsed,
mega_scale * (t_size as f64) / elapsed,
if option.mibibytes { "MiBytes/sec" } else { "MBytes/sec" }
);
}
}
};

let solutions_correct = match benchmark {
Benchmark::All => {
let (results, sum) = stream.run_all(option.numtimes);
stream.read_arrays();
let read = stream.run_read_arrays();
show_setup(init, read);
let correct = check_solution(benchmark, option.numtimes, &stream, Some(sum));
tabulate_all(vec![
tabulate(&results.copy, "Copy", 2 * array_bytes),
@@ -251,14 +288,16 @@ where StreamData<T, D, A>: RustStream<T> {
}
Benchmark::NStream => {
let results = stream.run_nstream(option.numtimes);
stream.read_arrays();
let read = stream.run_read_arrays();
show_setup(init, read);
let correct = check_solution(benchmark, option.numtimes, &stream, None);
tabulate_all(vec![tabulate(&results, "Nstream", 4 * array_bytes)]);
correct
}
Benchmark::Triad => {
let results = stream.run_triad(option.numtimes);
stream.read_arrays();
let read = stream.run_read_arrays();
show_setup(init, read);
let correct = check_solution(benchmark, option.numtimes, &stream, None);
let total_bytes = 3 * array_bytes * option.numtimes;
let bandwidth = giga_scale * (total_bytes as f64 / results.as_secs_f64());

@@ -132,6 +132,18 @@ pub trait RustStream<T: Default> {
fn nstream(&mut self);
fn dot(&mut self) -> T;

fn run_init_arrays(&mut self) -> Duration {
timed(|| {
self.init_arrays();
})
}

fn run_read_arrays(&mut self) -> Duration {
timed(|| {
self.read_arrays();
})
}

fn run_all(&mut self, n: usize) -> (AllTiming<Vec<Duration>>, T) {
let mut timings: AllTiming<Vec<Duration>> = AllTiming {
copy: vec![Duration::default(); n],
@@ -2,10 +2,10 @@ use rstest::rstest;

#[rstest]
fn test_main(
#[values(0, 1, 2, 3, 4)] device: usize, //
#[values("", "--pin")] pin: &str, //
#[values("", "--malloc")] malloc: &str, //
#[values("", "--init")] init: &str, //
#[values(0, 1, 2, 3, 4)] device: usize, //
#[values("", "--pin")] pin: &str, //
#[values("", "--malloc")] malloc: &str, //
#[values("", "--init")] init: &str, //
#[values("", "--triad-only", "--nstream-only")] option: &str, //
) {
let line = format!(
@@ -1 +0,0 @@
{"name":"sbt","version":"1.5.2","bspVersion":"2.0.0-M5","languages":["scala"],"argv":["/usr/lib/jvm/java-11-openjdk-11.0.11.0.9-2.fc33.x86_64/bin/java","-Xms100m","-Xmx100m","-classpath","/home/tom/.local/share/JetBrains/Toolbox/apps/IDEA-U/ch-0/211.7142.45.plugins/Scala/launcher/sbt-launch.jar","xsbt.boot.Boot","-bsp","--sbt-launch-jar=/home/tom/.local/share/JetBrains/Toolbox/apps/IDEA-U/ch-0/211.7142.45.plugins/Scala/launcher/sbt-launch.jar"]}
1 src/scala/scala-stream/.gitignore vendored
@@ -1 +1,2 @@
target/
.bsp/
@@ -1,4 +1,4 @@
version = "3.0.0-RC2"
version = "3.7.14"
runner.dialect = scala3

style = defaultWithAlign
@@ -3,14 +3,19 @@ lazy val mainCls = Some("scalastream.App")
lazy val root = (project in file("."))
.enablePlugins(NativeImagePlugin)
.settings(
scalaVersion := "3.0.0",
version := "4.0",
scalaVersion := "3.3.1",
version := "5.0",
organization := "uk.ac.bristol.uob-hpc",
organizationName := "University of Bristol",
Compile / mainClass := mainCls,
assembly / mainClass := mainCls,
scalacOptions ~= filterConsoleScalacOptions,
assembly / assemblyJarName := "scala-stream.jar",
assembly / assemblyMergeStrategy := {
case PathList("module-info.class") => MergeStrategy.discard
case PathList("META-INF", "versions", xs @ _, "module-info.class") => MergeStrategy.discard
case x => (ThisBuild / assemblyMergeStrategy).value(x)
},
nativeImageOptions := Seq(
"--no-fallback",
"-H:ReflectionConfigurationFiles=../../reflect-config.json"
@@ -22,8 +27,8 @@ lazy val root = (project in file("."))
// Lazy val implementation in Scala 3 triggers an exception in nativeImage, use 2_13 for arg parsing for now otherwise we can't get to the benchmarking part
("com.github.scopt" %% "scopt" % "4.0.1").cross(CrossVersion.for3Use2_13),
// par also uses lazy val at some point, so it doesn't work in nativeImage
"org.scala-lang.modules" %% "scala-parallel-collections" % "1.0.3",
"net.openhft" % "affinity" % "3.21ea1",
"org.slf4j" % "slf4j-simple" % "1.7.30" // for affinity
"org.scala-lang.modules" %% "scala-parallel-collections" % "1.0.4",
"net.openhft" % "affinity" % "3.23.2",
"org.slf4j" % "slf4j-simple" % "2.0.5" // for affinity
)
)
@@ -1 +1 @@
sbt.version=1.5.2
sbt.version=1.9.2
@@ -1,6 +1,6 @@
addSbtPlugin("com.timushev.sbt" % "sbt-updates" % "0.5.3")
addSbtPlugin("io.github.davidgregory084" % "sbt-tpolecat" % "0.1.17")
addSbtPlugin("io.github.davidgregory084" % "sbt-tpolecat" % "0.1.20")
addSbtPlugin("org.scalameta" % "sbt-native-image" % "0.3.0")
addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.15.0")
addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "2.1.3")
addSbtPlugin("ch.epfl.scala" % "sbt-scalafix" % "0.9.27")
addSbtPlugin("org.scalameta" % "sbt-scalafmt" % "2.4.2")
addSbtPlugin("org.scalameta" % "sbt-scalafmt" % "2.4.3")
@@ -14,6 +14,7 @@ transparent trait ScalaStream[@specialized(Float, Double) A]:
def config: Config[A]

def initArrays(): Unit
def readArrays(): Unit = ()
def copy(): Unit
def mul(): Unit
def add(): Unit
@@ -27,6 +28,8 @@ transparent trait ScalaStream[@specialized(Float, Double) A]:
val end = System.nanoTime()
FiniteDuration(end - start, TimeUnit.NANOSECONDS) -> r

inline def runInitArrays(): FiniteDuration = timed(initArrays())._1
inline def runReadArrays(): FiniteDuration = timed(readArrays())._1
inline def runAll(times: Int)(using Fractional[A]): (Timings[Vector[FiniteDuration]], A) =
val copy = ArrayBuffer.fill[FiniteDuration](times)(Duration.Zero)
val mul = ArrayBuffer.fill[FiniteDuration](times)(Duration.Zero)
@@ -62,7 +65,6 @@ transparent trait ScalaStream[@specialized(Float, Double) A]:

def data(): Data[A]

trait Fractional[@specialized(Double, Float) A]:
def toFractional(f: Float): A
def toFractional(f: Double): A
@@ -77,13 +79,13 @@ trait Fractional[@specialized(Double, Float) A]:
extension (x: Int) inline def fractional = toFractional(x.toFloat)
extension (x: Long) inline def fractional = toFractional(x.toDouble)
extension (x: A)
inline def +(y: A) = add(x, y)
inline def -(y: A) = sub(x, y)
inline def *(y: A) = mul(x, y)
inline def /(y: A) = div(x, y)
inline def >(y: A) = compare(x, y) > 0
inline def <(y: A) = compare(x, y) < 0
inline def abs_ = abs(x)
inline def +(y: A) = add(x, y)
inline def -(y: A) = sub(x, y)
inline def *(y: A) = mul(x, y)
inline def /(y: A) = div(x, y)
inline def >(y: A) = compare(x, y) > 0
inline def <(y: A) = compare(x, y) < 0
inline def abs_ = abs(x)
end Fractional

given FloatFractional: Fractional[Float] with
@@ -108,7 +110,7 @@ given DoubleFractional: Fractional[Double] with

object App:

final val Version: String = "4.0"
final val Version: String = "5.0"

case class Config[@specialized(Double, Float) A](
options: Options,
@@ -204,7 +206,7 @@ object App:
validateXs("c", vec.c, goldC)

dotSum.foreach { sum =>
val goldSum = (goldA * goldB) * (config.options.arraysize).fractional
val goldSum = (goldA * goldB) * config.options.arraysize.fractional
val error = ((sum - goldSum) / goldSum).abs_
if error > 1.fractional / 100000000.fractional then
Console.err.println(
@@ -238,10 +240,10 @@ object App:
)

println(s"Running ${config.benchmark match {
case Benchmark.All => "kernels"
case Benchmark.Triad => "triad"
case Benchmark.NStream => "nstream"
}} ${opt.numtimes} times")
case Benchmark.All => "kernels"
case Benchmark.Triad => "triad"
case Benchmark.NStream => "nstream"
}} ${opt.numtimes} times")

if config.benchmark == Benchmark.Triad then println(s"Number of elements: ${opt.arraysize}")

@@ -288,11 +290,38 @@ object App:
println(header.map(_._1.padTo(padding, ' ')).mkString(sep))
println(rows.map(_.map(_._2.padTo(padding, ' ')).mkString(sep)).mkString("\n"))

def showInit(init: FiniteDuration, read: FiniteDuration): Unit = {
val setup =
Vector(("Init", init.seconds, 3 * arrayBytes), ("Read", read.seconds, 3 * arrayBytes))
if opt.csv then
tabulate(
setup.map((name, elapsed, totalBytes) =>
Vector(
"phase" -> name,
"n_elements" -> opt.arraysize.toString,
"sizeof" -> arrayBytes.toString,
s"max_m${if opt.mibibytes then "i" else ""}bytes_per_sec" ->
(megaScale * totalBytes.toDouble / elapsed).toString,
"runtime" -> elapsed.toString
)
): _*
)
else
for (name, elapsed, totalBytes) <- setup do
println(
f"$name: $elapsed%.5f s (=${megaScale * totalBytes.toDouble / elapsed}%.5f M${
if opt.mibibytes then "i" else ""
}Bytes/sec)"
)
}

val stream = mkStream(config)
stream.initArrays()
val init = stream.runInitArrays()
config.benchmark match
case Benchmark.All =>
val (results, sum) = stream.runAll(opt.numtimes)
val read = stream.runReadArrays()
showInit(init, read)
validate(stream.data(), config, Some(sum))
tabulate(
mkRow(results.copy, "Copy", 2 * arrayBytes),
@@ -303,10 +332,14 @@ object App:
)
case Benchmark.NStream =>
val result = stream.runNStream(opt.numtimes)
val read = stream.runReadArrays()
showInit(init, read)
validate(stream.data(), config)
tabulate(mkRow(result, "Nstream", 4 * arrayBytes))
case Benchmark.Triad =>
val results = stream.runTriad(opt.numtimes)
val results = stream.runTriad(opt.numtimes)
val read = stream.runReadArrays()
showInit(init, read)
val totalBytes = 3 * arrayBytes * opt.numtimes
val bandwidth = megaScale * (totalBytes / results.seconds)
println(f"Runtime (seconds): ${results.seconds}%.5f")
@@ -6,64 +6,76 @@

#include "STDDataStream.h"

#include <algorithm>
#include <execution>
#include <numeric>

// There are three execution policies:
// auto exe_policy = std::execution::seq;
// auto exe_policy = std::execution::par;
auto exe_policy = std::execution::par_unseq;

template <class T>
STDDataStream<T>::STDDataStream(const int ARRAY_SIZE, int device)
noexcept : array_size{ARRAY_SIZE}, a(array_size), b(array_size), c(array_size)
noexcept : array_size{ARRAY_SIZE},
a(alloc_raw<T>(ARRAY_SIZE)), b(alloc_raw<T>(ARRAY_SIZE)), c(alloc_raw<T>(ARRAY_SIZE))
{
std::cout << "Backing storage typeid: " << typeid(a).name() << std::endl;
#ifdef USE_ONEDPL
std::cout << "Using oneDPL backend: ";
#if ONEDPL_USE_DPCPP_BACKEND
std::cout << "SYCL USM (device=" << exe_policy.queue().get_device().get_info<sycl::info::device::name>() << ")";
#elif ONEDPL_USE_TBB_BACKEND
std::cout << "TBB " TBB_VERSION_STRING;
#elif ONEDPL_USE_OPENMP_BACKEND
std::cout << "OpenMP";
#else
std::cout << "Default";
#endif
std::cout << std::endl;
#endif
}

template<class T>
STDDataStream<T>::~STDDataStream() {
dealloc_raw(a);
dealloc_raw(b);
dealloc_raw(c);
}

template <class T>
void STDDataStream<T>::init_arrays(T initA, T initB, T initC)
{
std::fill(exe_policy, a.begin(), a.end(), initA);
std::fill(exe_policy, b.begin(), b.end(), initB);
std::fill(exe_policy, c.begin(), c.end(), initC);
std::fill(exe_policy, a, a + array_size, initA);
std::fill(exe_policy, b, b + array_size, initB);
std::fill(exe_policy, c, c + array_size, initC);
}

template <class T>
void STDDataStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
{
h_a = a;
h_b = b;
h_c = c;
std::copy(a, a + array_size, h_a.begin());
std::copy(b, b + array_size, h_b.begin());
std::copy(c, c + array_size, h_c.begin());
}

template <class T>
void STDDataStream<T>::copy()
{
// c[i] = a[i]
std::copy(exe_policy, a.begin(), a.end(), c.begin());
std::copy(exe_policy, a, a + array_size, c);
}

template <class T>
void STDDataStream<T>::mul()
{
// b[i] = scalar * c[i];
std::transform(exe_policy, c.begin(), c.end(), b.begin(), [scalar = startScalar](T ci){ return scalar*ci; });
std::transform(exe_policy, c, c + array_size, b, [scalar = startScalar](T ci){ return scalar*ci; });
}

template <class T>
void STDDataStream<T>::add()
{
// c[i] = a[i] + b[i];
std::transform(exe_policy, a.begin(), a.end(), b.begin(), c.begin(), std::plus<T>());
std::transform(exe_policy, a, a + array_size, b, c, std::plus<T>());
}

template <class T>
void STDDataStream<T>::triad()
{
// a[i] = b[i] + scalar * c[i];
std::transform(exe_policy, b.begin(), b.end(), c.begin(), a.begin(), [scalar = startScalar](T bi, T ci){ return bi+scalar*ci; });
std::transform(exe_policy, b, b + array_size, c, a, [scalar = startScalar](T bi, T ci){ return bi+scalar*ci; });
}

template <class T>
@@ -73,8 +85,8 @@ void STDDataStream<T>::nstream()
// Need to do in two stages with C++11 STL.
// 1: a[i] += b[i]
// 2: a[i] += scalar * c[i];
std::transform(exe_policy, a.begin(), a.end(), b.begin(), a.begin(), [](T ai, T bi){ return ai + bi; });
std::transform(exe_policy, a.begin(), a.end(), c.begin(), a.begin(), [scalar = startScalar](T ai, T ci){ return ai + scalar*ci; });
std::transform(exe_policy, a, a + array_size, b, a, [](T ai, T bi){ return ai + bi; });
std::transform(exe_policy, a, a + array_size, c, a, [scalar = startScalar](T ai, T ci){ return ai + scalar*ci; });
}

@@ -82,7 +94,7 @@ template <class T>
T STDDataStream<T>::dot()
{
// sum = 0; sum += a[i]*b[i]; return sum;
return std::transform_reduce(exe_policy, a.begin(), a.end(), b.begin(), 0.0);
return std::transform_reduce(exe_policy, a, a + array_size, b, T{});
}

void listDevices(void)
@@ -101,4 +113,3 @@ std::string getDeviceDriver(const int)
}
template class STDDataStream<float>;
template class STDDataStream<double>;
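With the storage switched from std::vector<T> members to raw device-friendly pointers, every algorithm call in STDDataStream now takes a half-open pointer range (p, p + array_size) and the shared execution policy. A rough sketch of that pattern, with plain new[]/delete[] standing in for the repo's alloc_raw/dealloc_raw helpers from dpl_shim.h:

    #include <algorithm>
    #include <execution>
    #include <numeric>

    int main() {
      const int n = 1 << 20;
      double *a = new double[n], *b = new double[n], *c = new double[n];

      // Fill and transform over pointer ranges instead of vector iterators.
      std::fill(std::execution::par_unseq, a, a + n, 0.1);
      std::fill(std::execution::par_unseq, b, b + n, 0.2);
      std::fill(std::execution::par_unseq, c, c + n, 0.0);

      // triad: a[i] = b[i] + scalar * c[i]
      const double scalar = 0.4;
      std::transform(std::execution::par_unseq, b, b + n, c, a,
                     [scalar](double bi, double ci) { return bi + scalar * ci; });

      // dot: sum += a[i] * b[i]
      double sum = std::transform_reduce(std::execution::par_unseq, a, a + n, b, 0.0);

      delete[] a; delete[] b; delete[] c;
      return sum > 0 ? 0 : 1;
    }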
@@ -5,6 +5,7 @@
// source code

#pragma once
#include "dpl_shim.h"

#include <iostream>
#include <stdexcept>
@@ -21,14 +22,11 @@ class STDDataStream : public Stream<T>
int array_size;

// Device side pointers
std::vector<T> a;
std::vector<T> b;
std::vector<T> c;

T *a, *b, *c;

public:
STDDataStream(const int, int) noexcept;
~STDDataStream() = default;
~STDDataStream();

virtual void copy() override;
virtual void add() override;
@@ -19,15 +19,35 @@ register_flag_optional(NVHPC_OFFLOAD
ccall - Compile for all supported compute capabilities"
"")

register_flag_optional(USE_TBB
"No-op if ONE_TBB_DIR is set. Link against an in-tree oneTBB via FetchContent_Declare, see top level CMakeLists.txt for details."
"OFF")

register_flag_optional(USE_ONEDPL
"Link oneDPL which implements C++17 executor policies (via execution_policy_tag) for different backends.

Possible values are:
OPENMP - Implements policies using OpenMP.
CMake will handle any flags needed to enable OpenMP if the compiler supports it.
TBB - Implements policies using TBB.
TBB must be linked via USE_TBB or be available in LD_LIBRARY_PATH.
DPCPP - Implements policies through SYCL2020.
This requires the DPC++ compiler (other SYCL compilers are untested), required SYCL flags are added automatically."
"OFF")

macro(setup)
set(CMAKE_CXX_STANDARD 17)

if (NVHPC_OFFLOAD)
set(NVHPC_FLAGS -stdpar -gpu=${NVHPC_OFFLOAD})
# propagate flags to linker so that it links with the gpu stuff as well
register_append_cxx_flags(ANY ${NVHPC_FLAGS})
register_append_link_flags(${NVHPC_FLAGS})
endif ()

if (USE_TBB)
register_link_library(TBB::tbb)
endif ()
if (USE_ONEDPL)
register_definitions(USE_ONEDPL)
register_link_library(oneDPL)
endif ()
endmacro()
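The USE_ONEDPL / USE_TBB switches only add a compile definition and a link library; choosing the actual execution policy is left to the C++ side (the repo's dpl_shim.h, which the headers now include). A hedged sketch of how such a shim can select the policy; the names below are assumptions, not the repo's actual dpl_shim.h, and the oneDPL branch assumes the DPCPP backend:

    // Illustrative only; not the repository's dpl_shim.h.
    #ifdef USE_ONEDPL
      #include <oneapi/dpl/execution>
      #include <oneapi/dpl/algorithm>
      // oneDPL's SYCL-backed policy (DPCPP backend assumed here).
      static const auto exe_policy = oneapi::dpl::execution::dpcpp_default;
    #else
      #include <execution>
      // Standard library parallel algorithms.
      static const auto exe_policy = std::execution::par_unseq;
    #endif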
@@ -6,50 +6,66 @@

#include "STDIndicesStream.h"

#include <algorithm>
#include <execution>
#include <numeric>

// There are three execution policies:
// auto exe_policy = std::execution::seq;
// auto exe_policy = std::execution::par;
auto exe_policy = std::execution::par_unseq;

#ifndef ALIGNMENT
#define ALIGNMENT (2*1024*1024) // 2MB
#endif

template <class T>
STDIndicesStream<T>::STDIndicesStream(const int ARRAY_SIZE, int device)
noexcept : array_size{ARRAY_SIZE}, range(0, array_size), a(array_size), b(array_size), c(array_size)
noexcept : array_size{ARRAY_SIZE}, range(0, array_size),
a(alloc_raw<T>(ARRAY_SIZE)), b(alloc_raw<T>(ARRAY_SIZE)), c(alloc_raw<T>(ARRAY_SIZE))
{
std::cout << "Backing storage typeid: " << typeid(a).name() << std::endl;
#ifdef USE_ONEDPL
std::cout << "Using oneDPL backend: ";
#if ONEDPL_USE_DPCPP_BACKEND
std::cout << "SYCL USM (device=" << exe_policy.queue().get_device().get_info<sycl::info::device::name>() << ")";
#elif ONEDPL_USE_TBB_BACKEND
std::cout << "TBB " TBB_VERSION_STRING;
#elif ONEDPL_USE_OPENMP_BACKEND
std::cout << "OpenMP";
#else
std::cout << "Default";
#endif
std::cout << std::endl;
#endif
}

template<class T>
STDIndicesStream<T>::~STDIndicesStream() {
dealloc_raw(a);
dealloc_raw(b);
dealloc_raw(c);
}

template <class T>
void STDIndicesStream<T>::init_arrays(T initA, T initB, T initC)
{
std::fill(exe_policy, a.begin(), a.end(), initA);
std::fill(exe_policy, b.begin(), b.end(), initB);
std::fill(exe_policy, c.begin(), c.end(), initC);
std::fill(exe_policy, a, a + array_size, initA);
std::fill(exe_policy, b, b + array_size, initB);
std::fill(exe_policy, c, c + array_size, initC);
}

template <class T>
void STDIndicesStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
{
h_a = a;
h_b = b;
h_c = c;
std::copy(a, a + array_size, h_a.begin());
std::copy(b, b + array_size, h_b.begin());
std::copy(c, c + array_size, h_c.begin());
}

template <class T>
void STDIndicesStream<T>::copy()
{
// c[i] = a[i]
std::copy(exe_policy, a.begin(), a.end(), c.begin());
std::copy(exe_policy, a, a + array_size, c);
}

template <class T>
void STDIndicesStream<T>::mul()
{
// b[i] = scalar * c[i];
std::transform(exe_policy, range.begin(), range.end(), b.begin(), [&, scalar = startScalar](int i) {
std::transform(exe_policy, range.begin(), range.end(), b, [c = this->c, scalar = startScalar](int i) {
return scalar * c[i];
});
}
@@ -58,7 +74,7 @@ template <class T>
void STDIndicesStream<T>::add()
{
// c[i] = a[i] + b[i];
std::transform(exe_policy, range.begin(), range.end(), c.begin(), [&](int i) {
std::transform(exe_policy, range.begin(), range.end(), c, [a = this->a, b = this->b](int i) {
return a[i] + b[i];
});
}
@@ -67,7 +83,7 @@ template <class T>
void STDIndicesStream<T>::triad()
{
// a[i] = b[i] + scalar * c[i];
std::transform(exe_policy, range.begin(), range.end(), a.begin(), [&, scalar = startScalar](int i) {
std::transform(exe_policy, range.begin(), range.end(), a, [b = this->b, c = this->c, scalar = startScalar](int i) {
return b[i] + scalar * c[i];
});
}
@@ -79,7 +95,7 @@ void STDIndicesStream<T>::nstream()
// Need to do in two stages with C++11 STL.
// 1: a[i] += b[i]
// 2: a[i] += scalar * c[i];
std::transform(exe_policy, range.begin(), range.end(), a.begin(), [&, scalar = startScalar](int i) {
std::transform(exe_policy, range.begin(), range.end(), a, [a = this->a, b = this->b, c = this->c, scalar = startScalar](int i) {
return a[i] + b[i] + scalar * c[i];
});
}
@@ -89,7 +105,7 @@ template <class T>
T STDIndicesStream<T>::dot()
{
// sum = 0; sum += a[i]*b[i]; return sum;
return std::transform_reduce(exe_policy, a.begin(), a.end(), b.begin(), 0.0);
return std::transform_reduce(exe_policy, a, a + array_size, b, T{});
}

void listDevices(void)
@@ -108,4 +124,3 @@ std::string getDeviceDriver(const int)
}
template class STDIndicesStream<float>;
template class STDIndicesStream<double>;
@@ -5,6 +5,7 @@
// source code

#pragma once
#include "dpl_shim.h"

#include <iostream>
#include <stdexcept>
@@ -12,40 +13,57 @@

#define IMPLEMENTATION_STRING "STD (index-oriented)"

// A lightweight counting iterator which will be used by the STL algorithms
// NB: C++ <= 17 doesn't have this built-in, and it's only added later in ranges-v3 (C++2a) which this
// implementation doesn't target
template <typename N>
class ranged {
N from, to;
public:
ranged(N from, N to ): from(from), to(to) {}
class iterator {
N num;
class iterator {
friend class ranged;
public:
using difference_type = N;
using value_type = N;
using pointer = const N*;
using reference = const N&;
using iterator_category = std::random_access_iterator_tag;
explicit iterator(N _num = 0) : num(_num) {}
using difference_type = N;
using value_type = N;
using pointer = const N*;
using reference = N;
using iterator_category = std::random_access_iterator_tag;

iterator& operator++() { num++; return *this; }
iterator operator++(int) { iterator retval = *this; ++(*this); return retval; }
iterator operator+(const value_type v) const { return iterator(num + v); }
// XXX This is not part of the iterator spec, it gets picked up by oneDPL if enabled.
// Without this, the DPL SYCL backend collects the iterator data on the host and copies to the device.
// This type is unused for any nother STL impl.
using is_passed_directly = std::true_type;

bool operator==(iterator other) const { return num == other.num; }
bool operator!=(iterator other) const { return *this != other; }
bool operator<(iterator other) const { return num < other.num; }
reference operator *() const { return i_; }
iterator &operator ++() { ++i_; return *this; }
iterator operator ++(int) { iterator copy(*this); ++i_; return copy; }

reference operator*() const { return num;}
difference_type operator-(const iterator &it) const { return num - it.num; }
value_type operator[](const difference_type &i) const { return num + i; }
iterator &operator --() { --i_; return *this; }
iterator operator --(int) { iterator copy(*this); --i_; return copy; }

};
iterator begin() { return iterator(from); }
iterator end() { return iterator(to >= from? to+1 : to-1); }
iterator &operator +=(N by) { i_+=by; return *this; }

value_type operator[](const difference_type &i) const { return i_ + i; }

difference_type operator-(const iterator &it) const { return i_ - it.i_; }
iterator operator+(const value_type v) const { return iterator(i_ + v); }

bool operator ==(const iterator &other) const { return i_ == other.i_; }
bool operator !=(const iterator &other) const { return i_ != other.i_; }
bool operator < (const iterator &other) const { return i_ < other.i_; }

protected:
explicit iterator(N start) : i_ (start) {}

private:
N i_;
};

[[nodiscard]] iterator begin() const { return begin_; }
[[nodiscard]] iterator end() const { return end_; }
ranged(N begin, N end) : begin_(begin), end_(end) {}
private:
iterator begin_;
iterator end_;
};

template <class T>
@@ -59,14 +77,11 @@ class STDIndicesStream : public Stream<T>
ranged<int> range;

// Device side pointers
std::vector<T> a;
std::vector<T> b;
std::vector<T> c;

T *a, *b, *c;

public:
STDIndicesStream(const int, int) noexcept;
~STDIndicesStream() = default;
~STDIndicesStream();

virtual void copy() override;
virtual void add() override;
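The rewritten ranged<N> class above exists because the index-oriented kernels drive std::transform with indices rather than with the data iterators themselves, and C++17 has no built-in counting iterator. A hedged usage sketch of the same idea; it materialises the indices in a plain vector instead of using ranged<int>, purely to stay self-contained:

    #include <algorithm>
    #include <execution>
    #include <vector>

    int main() {
      const int n = 1024;
      std::vector<double> a(n, 0.1), b(n, 0.2), c(n);
      std::vector<int> idx(n);
      for (int i = 0; i < n; ++i) idx[i] = i; // stand-in for ranged<int>(0, n)

      // Index-oriented add: the lambda captures the arrays it reads,
      // the output iterator decides where each result lands.
      std::transform(std::execution::par_unseq, idx.begin(), idx.end(), c.begin(),
                     [pa = a.data(), pb = b.data()](int i) { return pa[i] + pb[i]; });
      return 0;
    }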
@@ -19,15 +19,35 @@ register_flag_optional(NVHPC_OFFLOAD
ccall - Compile for all supported compute capabilities"
"")

register_flag_optional(USE_TBB
"Link against an in-tree oneTBB via FetchContent_Declare, see top level CMakeLists.txt for details."
"OFF")

register_flag_optional(USE_ONEDPL
"Link oneDPL which implements C++17 executor policies (via execution_policy_tag) for different backends.

Possible values are:
OPENMP - Implements policies using OpenMP.
CMake will handle any flags needed to enable OpenMP if the compiler supports it.
TBB - Implements policies using TBB.
TBB must be linked via USE_TBB or be available in LD_LIBRARY_PATH.
DPCPP - Implements policies through SYCL2020.
This requires the DPC++ compiler (other SYCL compilers are untested), required SYCL flags are added automatically."
"OFF")

macro(setup)
set(CMAKE_CXX_STANDARD 17)

if (NVHPC_OFFLOAD)
set(NVHPC_FLAGS -stdpar -gpu=${NVHPC_OFFLOAD})
# propagate flags to linker so that it links with the gpu stuff as well
register_append_cxx_flags(ANY ${NVHPC_FLAGS})
register_append_link_flags(${NVHPC_FLAGS})
endif ()

if (USE_TBB)
register_link_library(TBB::tbb)
endif ()
if (USE_ONEDPL)
register_definitions(USE_ONEDPL)
register_link_library(oneDPL)
endif ()
endmacro()
@@ -5,25 +5,45 @@
// source code

#include "STDRangesStream.hpp"

#include <algorithm>
#include <execution>
#include <ranges>

#ifndef ALIGNMENT
#define ALIGNMENT (2*1024*1024) // 2MB
#endif

template <class T>
STDRangesStream<T>::STDRangesStream(const int ARRAY_SIZE, int device)
: array_size{ARRAY_SIZE}
noexcept : array_size{ARRAY_SIZE},
a(alloc_raw<T>(ARRAY_SIZE)), b(alloc_raw<T>(ARRAY_SIZE)), c(alloc_raw<T>(ARRAY_SIZE))
{
a = std::vector<T>(array_size);
b = std::vector<T>(array_size);
c = std::vector<T>(array_size);
std::cout << "Backing storage typeid: " << typeid(a).name() << std::endl;
#ifdef USE_ONEDPL
std::cout << "Using oneDPL backend: ";
#if ONEDPL_USE_DPCPP_BACKEND
std::cout << "SYCL USM (device=" << exe_policy.queue().get_device().get_info<sycl::info::device::name>() << ")";
#elif ONEDPL_USE_TBB_BACKEND
std::cout << "TBB " TBB_VERSION_STRING;
#elif ONEDPL_USE_OPENMP_BACKEND
std::cout << "OpenMP";
#else
std::cout << "Default";
#endif
std::cout << std::endl;
#endif
}

template<class T>
STDRangesStream<T>::~STDRangesStream() {
dealloc_raw(a);
dealloc_raw(b);
dealloc_raw(c);
}

template <class T>
void STDRangesStream<T>::init_arrays(T initA, T initB, T initC)
{
std::for_each_n(
std::execution::par_unseq,
exe_policy,
std::views::iota(0).begin(), array_size, // loop range
[&] (int i) {
a[i] = initA;
@@ -37,16 +57,16 @@ template <class T>
void STDRangesStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
{
// Element-wise copy.
h_a = a;
h_b = b;
h_c = c;
std::copy(a, a + array_size, h_a.begin());
std::copy(b, b + array_size, h_b.begin());
std::copy(c, c + array_size, h_c.begin());
}

template <class T>
void STDRangesStream<T>::copy()
{
std::for_each_n(
std::execution::par_unseq,
exe_policy,
std::views::iota(0).begin(), array_size,
[&] (int i) {
c[i] = a[i];
@@ -60,7 +80,7 @@ void STDRangesStream<T>::mul()
const T scalar = startScalar;

std::for_each_n(
std::execution::par_unseq,
exe_policy,
std::views::iota(0).begin(), array_size,
[&] (int i) {
b[i] = scalar * c[i];
@@ -72,7 +92,7 @@ template <class T>
void STDRangesStream<T>::add()
{
std::for_each_n(
std::execution::par_unseq,
exe_policy,
std::views::iota(0).begin(), array_size,
[&] (int i) {
c[i] = a[i] + b[i];
@@ -86,7 +106,7 @@ void STDRangesStream<T>::triad()
const T scalar = startScalar;

std::for_each_n(
std::execution::par_unseq,
exe_policy,
std::views::iota(0).begin(), array_size,
[&] (int i) {
a[i] = b[i] + scalar * c[i];
@@ -100,7 +120,7 @@ void STDRangesStream<T>::nstream()
const T scalar = startScalar;

std::for_each_n(
std::execution::par_unseq,
exe_policy,
std::views::iota(0).begin(), array_size,
[&] (int i) {
a[i] += b[i] + scalar * c[i];
@@ -114,8 +134,8 @@ T STDRangesStream<T>::dot()
// sum += a[i] * b[i];
return
std::transform_reduce(
std::execution::par_unseq,
a.begin(), a.end(), b.begin(), 0.0);
exe_policy,
a, a + array_size, b, T{});
}

void listDevices(void)
@@ -135,4 +155,3 @@ std::string getDeviceDriver(const int)

template class STDRangesStream<float>;
template class STDRangesStream<double>;
Some files were not shown because too many files have changed in this diff.