diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml index 08eed2d..2e54201 100644 --- a/.github/workflows/main.yaml +++ b/.github/workflows/main.yaml @@ -12,12 +12,12 @@ on: jobs: test-rust: - runs-on: ubuntu-18.04 + runs-on: ubuntu-22.04 defaults: run: working-directory: ./src/rust/rust-stream steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Setup project run: rustup install nightly - name: Compile project @@ -28,12 +28,12 @@ jobs: run: ./target/release/rust-stream --arraysize 2048 test-java: - runs-on: ubuntu-18.04 + runs-on: ubuntu-22.04 defaults: run: working-directory: ./src/java/java-stream steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Test build project run: ./mvnw clean package - name: Test run @@ -41,12 +41,12 @@ jobs: run: java -jar target/java-stream.jar --arraysize 2048 test-julia: - runs-on: ubuntu-18.04 + runs-on: ubuntu-22.04 defaults: run: working-directory: ./src/julia/JuliaStream.jl steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Setup project run: julia --project -e 'import Pkg; Pkg.instantiate()' - name: Test run PlainStream.jl @@ -70,14 +70,22 @@ jobs: test-cpp: - runs-on: ubuntu-18.04 + runs-on: ubuntu-22.04 steps: - - uses: actions/checkout@v2 + - name: Maximize build space + uses: easimon/maximize-build-space@v8 + with: + root-reserve-mb: 8192 + swap-size-mb: 512 + remove-android: 'true' + remove-codeql: 'true' + + - uses: actions/checkout@v4 - name: Cache compiler if: ${{ !env.ACT }} id: prepare-compilers - uses: actions/cache@v2 + uses: actions/cache@v3 with: path: ./compilers key: ${{ runner.os }}-${{ hashFiles('./src/ci-prepare-bionic.sh') }} @@ -90,9 +98,9 @@ jobs: run: source ./src/ci-prepare-bionic.sh ./compilers VARS false || true # Enable tmate debugging of manually-triggered workflows if the input option was provided - - name: Setup tmate session - uses: mxschmitt/action-tmate@v3 - if: ${{ github.event_name == 'workflow_dispatch' && 
github.event.inputs.debug_enabled }} + # - name: Setup tmate session + # uses: mxschmitt/action-tmate@v3 + # if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.debug_enabled }} - name: Test compile gcc @ CMake 3.13 if: ${{ ! cancelled() }} @@ -167,4 +175,65 @@ jobs: run: ./src/ci-test-compile.sh ./build dpcpp all ${{ env.CMAKE_3_18_BIN }} - name: Test compile hipsycl @ CMake 3.18 if: ${{ ! cancelled() }} - run: ./src/ci-test-compile.sh ./build hipsycl all ${{ env.CMAKE_3_18_BIN }} \ No newline at end of file + run: ./src/ci-test-compile.sh ./build hipsycl all ${{ env.CMAKE_3_18_BIN }} + + - name: Test compile gcc @ CMake 3.20 + if: ${{ ! cancelled() }} + run: ./src/ci-test-compile.sh ./build gcc all ${{ env.CMAKE_3_20_BIN }} + - name: Test compile clang @ CMake 3.20 + if: ${{ ! cancelled() }} + run: ./src/ci-test-compile.sh ./build clang all ${{ env.CMAKE_3_20_BIN }} + - name: Test compile nvhpc @ CMake 3.20 + if: ${{ ! cancelled() }} + run: ./src/ci-test-compile.sh ./build nvhpc all ${{ env.CMAKE_3_20_BIN }} + - name: Test compile aocc @ CMake 3.20 + if: ${{ ! cancelled() }} + run: ./src/ci-test-compile.sh ./build aocc all ${{ env.CMAKE_3_20_BIN }} + - name: Test compile aomp @ CMake 3.20 + if: ${{ ! cancelled() }} + run: ./src/ci-test-compile.sh ./build aomp all ${{ env.CMAKE_3_20_BIN }} + - name: Test compile hip @ CMake 3.20 + if: ${{ ! cancelled() }} + run: ./src/ci-test-compile.sh ./build hip all ${{ env.CMAKE_3_20_BIN }} + - name: Test compile dpcpp @ CMake 3.20 + if: ${{ ! cancelled() }} + run: ./src/ci-test-compile.sh ./build dpcpp all ${{ env.CMAKE_3_20_BIN }} + - name: Test compile hipsycl @ CMake 3.20 + if: ${{ ! cancelled() }} + run: ./src/ci-test-compile.sh ./build hipsycl all ${{ env.CMAKE_3_20_BIN }} + + - name: Test compile gcc @ CMake 3.24 + if: ${{ ! cancelled() }} + run: ./src/ci-test-compile.sh ./build gcc all ${{ env.CMAKE_3_24_BIN }} + - name: Test compile clang @ CMake 3.24 + if: ${{ ! 
cancelled() }} + run: ./src/ci-test-compile.sh ./build clang all ${{ env.CMAKE_3_24_BIN }} + - name: Test compile nvhpc @ CMake 3.24 + if: ${{ ! cancelled() }} + run: ./src/ci-test-compile.sh ./build nvhpc all ${{ env.CMAKE_3_24_BIN }} + - name: Test compile aocc @ CMake 3.24 + if: ${{ ! cancelled() }} + run: ./src/ci-test-compile.sh ./build aocc all ${{ env.CMAKE_3_24_BIN }} + - name: Test compile aomp @ CMake 3.24 + if: ${{ ! cancelled() }} + run: ./src/ci-test-compile.sh ./build aomp all ${{ env.CMAKE_3_24_BIN }} + - name: Test compile hip @ CMake 3.24 + if: ${{ ! cancelled() }} + run: ./src/ci-test-compile.sh ./build hip all ${{ env.CMAKE_3_24_BIN }} + - name: Test compile dpcpp @ CMake 3.24 + if: ${{ ! cancelled() }} + run: ./src/ci-test-compile.sh ./build dpcpp all ${{ env.CMAKE_3_24_BIN }} + - name: Test compile hipsycl @ CMake 3.24 + if: ${{ ! cancelled() }} + run: ./src/ci-test-compile.sh ./build hipsycl all ${{ env.CMAKE_3_24_BIN }} + + test-futhark: + runs-on: ubuntu-22.04 + steps: + - uses: actions/checkout@v4 + - name: Prepare Futhark compiler + uses: diku-dk/install-futhark@HEAD + with: + version: 'latest' + - run: cmake -Bbuild -H. -DMODEL=futhark -DFUTHARK_BACKEND=multicore + - run: cmake --build build diff --git a/.gitignore b/.gitignore index 012d0e8..59ea5db 100644 --- a/.gitignore +++ b/.gitignore @@ -10,12 +10,18 @@ sycl-stream hip-stream tbb-stream +src/fortran/BabelStream +src/fortran/BabelStream.* + *.o *.bc *.sycl *.tar *.gz *.a +*.mod +*.cub +*.ptx KokkosCore_config.* diff --git a/CHANGELOG.md b/CHANGELOG.md index 854d1f9..76e868b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,8 +2,32 @@ All notable changes to this project will be documented in this file. ## Unreleased +### Added +- Ability to build Kokkos and RAJA versions against existing packages. +- Thrust managed memory. +- HIP managed memory. +- New implementation using SYCL2020 USM (sycl2020-usm) and renamed original `sycl2020` to `sycl2020-acc`.
+- New implementation in Fortran +- New implementation in [Futhark](https://futhark-lang.org/) +- Data initialisation and read-back timing for all models, including Java, Scala, Julia, and Rust +- Add support for the latest Aparapi (3.0.0) and TornadoVM (0.15.x) for Java +- JuliaStream.jl published to registry (pending #113) + ### Changed +- Fix std-data/std-indices compatibility with oneDPL, NVHPC, and AdaptiveCpp (a.k.a. hipSYCL). - RAJA CUDA CMake build issues resolved. +- Kokkos build updates (CXX version upgraded to C++17). +- Fix CUDA memory limit check. +- Fix CUDA CMake options for `-DMEM` and `-DCMAKE_CUDA_FLAGS`. +- Use long double for `check_solution` in case of large problem size. +- OneAPI DPCPP compiler is deprecated in favour of ICPX, so added new build option to SYCL 2020 version. +- Updates to the HIP kernels and API usage. +- Number of thread-blocks in CUDA dot kernel implementation changed to 1024. +- Fix compatibility of `sycl2020` (now `sycl2020-acc`) with AdaptiveCpp. 
+- Bumped Julia compat to 1.9 +- Bumped Scala to 3.3.1 +- Bumped Rust to 1.74.0-nightly (13e6f24b9 2023-09-23) +- Upgrade CI to Ubuntu 22.04 ## [v4.0] - 2021-12-22 diff --git a/CMakeLists.txt b/CMakeLists.txt index 54034ee..27736b6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,6 +1,10 @@ cmake_minimum_required(VERSION 3.13 FATAL_ERROR) -project(BabelStream VERSION 4.0 LANGUAGES CXX) +if (CMAKE_VERSION VERSION_GREATER_EQUAL "3.24.0") + cmake_policy(SET CMP0135 NEW) +endif () + +project(BabelStream VERSION 5.0 LANGUAGES CXX C) # uncomment for debugging build issues: #set(CMAKE_VERBOSE_MAKEFILE ON) @@ -27,8 +31,6 @@ endmacro() # the final executable name set(EXE_NAME babelstream) -# select default build type -set(CMAKE_BUILD_TYPE "Release") # for chrono and some basic CXX features, models can overwrite this if required set(CMAKE_CXX_STANDARD 11) @@ -71,6 +73,75 @@ hint_flag(CXX_EXTRA_LINKER_FLAGS " # Honor user's CXX_EXTRA_LINK_FLAGS set(CXX_EXTRA_LINK_FLAGS ${CXX_EXTRA_FLAGS} ${CXX_EXTRA_LINK_FLAGS}) +option(USE_TBB "Enable the oneTBB library for *supported* models. Enabling this on models that + don't explicitly link against TBB is a no-op, see description of your selected + model on how this is used." OFF) + +option(FETCH_TBB "Fetch (download) the oneTBB library for *supported* models. This uses CMake's + FetchContent feature. Specify version by setting FETCH_TBB_VERSION" OFF) +set(FETCH_TBB_VERSION "v2021.10.0" CACHE STRING "Specify version of oneTBB to use if FETCH_TBB is ON") + +if (FETCH_TBB) + FetchContent_Declare( + TBB + GIT_REPOSITORY https://github.com/oneapi-src/oneTBB.git + GIT_TAG "${FETCH_TBB_VERSION}" + ) + # Don't fail builds on warning (TBB has -Wall while not being free of warnings from unused symbols...)
+ set(CMAKE_POLICY_DEFAULT_CMP0077 NEW) + set(TBB_STRICT OFF) + # Not using FetchContent_MakeAvailable (CMake>= 3.14) because we need EXCLUDE_FROM_ALL + FetchContent_GetProperties(TBB) + if (NOT TBB_POPULATED) + FetchContent_Populate(TBB) + add_subdirectory(${tbb_SOURCE_DIR} ${tbb_BINARY_DIR} EXCLUDE_FROM_ALL) + endif () +endif () + +option(USE_ONEDPL "Enable the oneDPL library for *supported* models. Enabling this on models that + don't explicitly link against DPL is a no-op, see description of your selected + model on how this is used." OFF) + +option(FETCH_ONEDPL "Fetch (download) the oneDPL library for *supported* models. This uses CMake's + FetchContent feature. Specify version by setting FETCH_ONEDPL_VERSION" OFF) +set(FETCH_ONEDPL_VERSION "oneDPL-2022.2.0-rc1" CACHE STRING "Specify version of oneTBB to use if FETCH_ONEDPL is ON") + +if (FETCH_ONEDPL) + FetchContent_Declare( + oneDPL + GIT_REPOSITORY https://github.com/oneapi-src/oneDPL.git + GIT_TAG "${FETCH_ONEDPL_VERSION}" + ) + string(TOLOWER ${USE_ONEDPL} ONEDPL_BACKEND) + # XXX oneDPL looks for omp instead of openmp, which mismatches(!) with ONEDPL_PAR_BACKEND if using find_package + if (ONEDPL_BACKEND STREQUAL "openmp") + set(ONEDPL_BACKEND omp) + endif () + # Not using FetchContent_MakeAvailable (CMake>= 3.14) because we need EXCLUDE_FROM_ALL + FetchContent_GetProperties(oneDPL) + if (NOT oneDPL_POPULATED) + FetchContent_Populate(oneDPL) + if (USE_TBB) + macro(find_package NAME) + if ("${NAME}" STREQUAL "TBB") + message(STATUS "Discarding oneDPL's call to find_package(${NAME} ${ARGN})") + else () + _find_package(${NAME} ${ARGN}) + endif () + endmacro() + endif () + add_subdirectory(${onedpl_SOURCE_DIR} ${onedpl_BINARY_DIR} EXCLUDE_FROM_ALL) + + # Fixup oneDPL's omission on setting DPCPP definitions. + # We do this after the creation of the oneDPL target. 
+ if (ONEDPL_BACKEND MATCHES "^(dpcpp|dpcpp_only)$") + target_compile_definitions(oneDPL INTERFACE ONEDPL_USE_DPCPP_BACKEND=1) + endif () + + endif () +endif () + + # include our macros include(cmake/register_models.cmake) @@ -84,12 +155,14 @@ register_model(hip HIP HIPStream.cpp) register_model(cuda CUDA CUDAStream.cu) register_model(kokkos KOKKOS KokkosStream.cpp) register_model(sycl SYCL SYCLStream.cpp) -register_model(sycl2020 SYCL2020 SYCLStream2020.cpp) +register_model(sycl2020-acc SYCL2020 SYCLStream2020.cpp) +register_model(sycl2020-usm SYCL2020 SYCLStream2020.cpp) register_model(acc ACC ACCStream.cpp) # defining RAJA collides with the RAJA namespace so USE_RAJA register_model(raja USE_RAJA RAJAStream.cpp) register_model(tbb TBB TBBStream.cpp) register_model(thrust THRUST ThrustStream.cu) # Thrust uses cu, even for rocThrust +register_model(futhark FUTHARK FutharkStream.cpp) set(USAGE ON CACHE BOOL "Whether to print all custom flags for the selected model") @@ -101,6 +174,12 @@ else () message(STATUS "Selected model : ${MODEL}") endif () +if (MODEL STREQUAL "sycl2020") + message(FATAL_ERROR " + Model sycl2020 has been renamed to sycl2020-acc, and a new sycl2020-usm model is now available. 
+ Please use sycl2020-acc for SYCL2020 style accessors and sycl2020-usm for USM") +endif () + # load the $MODEL.cmake file and setup the correct IMPL_* based on $MODEL load_model(${MODEL}) @@ -151,6 +230,7 @@ include_directories(src) add_executable(${EXE_NAME} ${IMPL_SOURCES} src/main.cpp) target_link_libraries(${EXE_NAME} PUBLIC ${LINK_LIBRARIES}) target_compile_definitions(${EXE_NAME} PUBLIC ${IMPL_DEFINITIONS}) +target_include_directories(${EXE_NAME} PUBLIC ${IMPL_DIRECTORIES}) if (CXX_EXTRA_LIBRARIES) target_link_libraries(${EXE_NAME} PUBLIC ${CXX_EXTRA_LIBRARIES}) diff --git a/README.md b/README.md index 5791641..487f8e9 100644 --- a/README.md +++ b/README.md @@ -38,9 +38,10 @@ BabelStream is currently implemented in the following parallel programming model - C++ Parallel STL - Kokkos - RAJA -- SYCL and SYCL 2020 +- SYCL and SYCL2020 (USM and accessors) - TBB - Thrust (via CUDA or HIP) +- Futhark This project also contains implementations in alternative languages with different build systems: * Julia - [JuliaStream.jl](./src/julia/JuliaStream.jl) @@ -101,7 +102,7 @@ The source for each model's implementations are located in `./src/`. Currently available models are: ``` -omp;ocl;std;std20;hip;cuda;kokkos;sycl;sycl2020;acc;raja;tbb;thrust +omp;ocl;std-data;std-indices;std-ranges;hip;cuda;kokkos;sycl;sycl2020-acc;sycl2020-usm;acc;raja;tbb;thrust;futhark ``` #### Overriding default flags @@ -165,7 +166,7 @@ The `MODEL` variant selects one implementation of BabelStream to build.
Currently available models are: ``` -omp;ocl;std;std20;hip;cuda;kokkos;sycl;sycl2020;acc;raja;tbb;thrust +omp;ocl;std-data;std-indices;std-ranges;hip;cuda;kokkos;sycl;sycl2020-acc;sycl2020-usm;acc;raja;tbb;thrust;futhark ``` ### GNU Make diff --git a/src/.gitignore b/src/.gitignore index 568a953..9d8b17b 100644 --- a/src/.gitignore +++ b/src/.gitignore @@ -16,6 +16,8 @@ **/*.gz **/*.a +**/*.swp + **/KokkosCore_Config_* **/.DS_Store @@ -26,4 +28,4 @@ cmake-build-*/ CMakeFiles/ .idea/ .vscode/ -.directory \ No newline at end of file +.directory diff --git a/src/acc/ACCStream.cpp b/src/acc/ACCStream.cpp index 1e38c8b..48b9f2d 100644 --- a/src/acc/ACCStream.cpp +++ b/src/acc/ACCStream.cpp @@ -149,7 +149,7 @@ void ACCStream::nstream() template T ACCStream::dot() { - T sum = 0.0; + T sum{}; int array_size = this->array_size; T * restrict a = this->a; diff --git a/src/ci-prepare-bionic.sh b/src/ci-prepare-bionic.sh index 656d338..f5c1a70 100755 --- a/src/ci-prepare-bionic.sh +++ b/src/ci-prepare-bionic.sh @@ -83,6 +83,8 @@ get() { if [ ! -f "$name" ] || [ "$FORCE_DOWNLOAD" = true ]; then echo "$name not found, downloading..." wget -q --show-progress --progress=bar:force:noscroll "$pkg_url" -O "$name" + else + echo "$name found, skipping download..." fi fi } @@ -92,13 +94,15 @@ get_and_untar() { local pkg_url="$2" if [ "$SETUP" = true ]; then if [ ! -f "$name" ] || [ "$FORCE_DOWNLOAD" = true ]; then - echo "$name not found, downloading..." + echo "$name not found, downloading ($pkg_url)..." wget -q --show-progress --progress=bar:force:noscroll "$pkg_url" -O "$name" fi echo "Preparing to extract $name ..." tar -xf "$name" echo "$name extracted, deleting archive ..." rm -f "$name" # delete for space + else + echo "Skipping setup for $name ($pkg_url)..."
fi } @@ -119,10 +123,10 @@ verify_dir_exists() { setup_aocc() { echo "Preparing AOCC" - local aocc_ver="2.3.0" + local aocc_ver="4.0.0" local tarball="aocc-$aocc_ver.tar.xz" # XXX it's actually XZ compressed, so it should be tar.xz - local AOCC_URL="http://developer.amd.com/wordpress/media/files/aocc-compiler-2.3.0.tar" + local AOCC_URL="https://download.amd.com/developer/eula/aocc-compiler/aocc-compiler-${aocc_ver}.tar" # local AOCC_URL="http://localhost:8000/aocc-compiler-2.3.0.tar" get_and_untar "$tarball" "$AOCC_URL" @@ -134,20 +138,26 @@ setup_aocc() { setup_nvhpc() { echo "Preparing Nvidia HPC SDK" - local tarball="nvhpc.tar.gz" -# local url="http://localhost:8000/nvhpc_2021_219_Linux_x86_64_cuda_11.4.tar.gz" - local url="https://developer.download.nvidia.com/hpc-sdk/21.9/nvhpc_2021_219_Linux_x86_64_cuda_11.4.tar.gz" + local nvhpc_ver="23.1" # TODO FIXME > 23.1 has a bug with -A + local nvhpc_release="2023_231" + local cuda_ver="12.0" + + local tarball="nvhpc_$nvhpc_ver.tar.gz" + + local url="https://developer.download.nvidia.com/hpc-sdk/$nvhpc_ver/nvhpc_${nvhpc_release}_Linux_x86_64_cuda_$cuda_ver.tar.gz" get_and_untar "$tarball" "$url" - local sdk_dir="$PWD/nvhpc_2021_219_Linux_x86_64_cuda_11.4/install_components/Linux_x86_64/21.9" + local sdk_dir="$PWD/nvhpc_${nvhpc_release}_Linux_x86_64_cuda_$cuda_ver/install_components/Linux_x86_64/$nvhpc_ver" local bin_dir="$sdk_dir/compilers/bin" - "$bin_dir/makelocalrc" "$bin_dir" -x + "$bin_dir/makelocalrc" -d "$bin_dir" -x -gpp g++-12 -gcc gcc-12 -g77 gfortran-12 export_var NVHPC_SDK_DIR "$sdk_dir" - export_var NVHPC_CUDA_DIR "$sdk_dir/cuda/11.4" + export_var NVHPC_CUDA_DIR "$sdk_dir/cuda/$cuda_ver" export_var NVHPC_NVCXX "$bin_dir/nvc++" - export_var NVHPC_NVCC "$sdk_dir/cuda/11.4/bin/nvcc" + export_var NVHPC_NVCC "$bin_dir/nvcc" + export_var NVHPC_CUDA_VER "$cuda_ver" +# export_var NVHPC_NVCC "$sdk_dir/cuda/$cuda_ver/bin/nvcc" echo "Installed CUDA versions:" ls "$sdk_dir/cuda" @@ -160,7 +170,8 @@ setup_nvhpc() { 
setup_aomp() { echo "Preparing AOMP" - local AOMP_URL="https://github.com/ROCm-Developer-Tools/aomp/releases/download/rel_11.12-0/aomp_Ubuntu1804_11.12-0_amd64.deb" + local aomp_ver="18.0-0" + local AOMP_URL="https://github.com/ROCm-Developer-Tools/aomp/releases/download/rel_${aomp_ver}/aomp_Ubuntu2204_${aomp_ver}_amd64.deb" # local AOMP_URL="http://0.0.0.0:8000/aomp_Ubuntu1804_11.12-0_amd64.deb" get_and_install_deb "aomp" "aomp" "$AOMP_URL" @@ -183,9 +194,10 @@ setup_oclcpu() { setup_kokkos() { echo "Preparing Kokkos" - local kokkos_ver="3.3.01" + local kokkos_ver="4.1.00" local tarball="kokkos-$kokkos_ver.tar.gz" + local url="https://github.com/kokkos/kokkos/archive/$kokkos_ver.tar.gz" # local url="http://localhost:8000/$kokkos_ver.tar.gz" @@ -197,10 +209,10 @@ setup_kokkos() { setup_raja() { echo "Preparing RAJA" - local raja_ver="0.13.0" + local raja_ver="2023.06.1" local tarball="raja-$raja_ver.tar.gz" - local url="https://github.com/LLNL/RAJA/releases/download/v0.13.0/RAJA-v$raja_ver.tar.gz" + local url="https://github.com/LLNL/RAJA/releases/download/v$raja_ver/RAJA-v$raja_ver.tar.gz" # local url="http://localhost:8000/RAJA-v$raja_ver.tar.gz" get_and_untar "$tarball" "$url" @@ -211,7 +223,7 @@ setup_raja() { setup_tbb() { echo "Preparing TBB" - local tbb_ver="2021.2.0" + local tbb_ver="2021.9.0" local tarball="oneapi-tbb-$tbb_ver-lin.tgz" local url="https://github.com/oneapi-src/oneTBB/releases/download/v$tbb_ver/oneapi-tbb-$tbb_ver-lin.tgz" @@ -225,9 +237,9 @@ setup_tbb() { setup_clang_gcc() { - sudo apt-get install -y -qq gcc-10-offload-nvptx gcc-10-offload-amdgcn libtbb2 libtbb-dev g++-10 clang libomp-dev + sudo apt-get install -y -qq gcc-12-offload-nvptx gcc-12-offload-amdgcn libtbb2 libtbb-dev g++-12 clang libomp-dev libc6 - export_var GCC_CXX "$(which g++-10)" + export_var GCC_CXX "$(which g++-12)" verify_bin_exists "$GCC_CXX" "$GCC_CXX" --version @@ -248,7 +260,11 @@ setup_clang_gcc() { } setup_rocm() { - sudo apt-get install -y -qq rocm-dev 
rocthrust-dev + if [ "$SETUP" = true ]; then + sudo apt-get install -y rocm-dev rocthrust-dev + else + echo "Skipping apt setup for ROCm" + fi export_var ROCM_PATH "/opt/rocm" export_var PATH "$ROCM_PATH/bin:$PATH" # ROCm needs this for many of their libraries' CMake build to work export_var HIP_CXX "$ROCM_PATH/bin/hipcc" @@ -259,7 +275,7 @@ setup_rocm() { setup_dpcpp() { - local nightly="20210106" + local nightly="20230615" local tarball="dpcpp-$nightly.tar.gz" local url="https://github.com/intel/llvm/releases/download/sycl-nightly/$nightly/dpcpp-compiler.tar.gz" @@ -276,22 +292,22 @@ setup_dpcpp() { setup_hipsycl() { sudo apt-get install -y -qq libboost-fiber-dev libboost-context-dev - local hipsycl_ver="0.9.0" + local hipsycl_ver="0.9.1" local tarball="v$hipsycl_ver.tar.gz" local install_dir="$PWD/hipsycl_dist_$hipsycl_ver" - local url="https://github.com/illuhad/hipSYCL/archive/v$hipsycl_ver.tar.gz" - # local url="http://localhost:8000/hipSYCL-$hipsycl_ver.tar.gz" + local url="https://github.com/AdaptiveCpp/AdaptiveCpp/archive/v$hipsycl_ver.tar.gz" + # local url="http://localhost:8000/AdaptiveCpp-$hipsycl_ver.tar.gz" get_and_untar "$tarball" "$url" if [ "$SETUP" = true ]; then - local src="$PWD/hipSYCL-$hipsycl_ver" + local src="$PWD/AdaptiveCpp-$hipsycl_ver" rm -rf "$src/build" rm -rf "$install_dir" cmake "-B$src/build" "-H$src" \ - -DCMAKE_C_COMPILER="$(which gcc-10)" \ - -DCMAKE_CXX_COMPILER="$(which g++-10)" \ + -DCMAKE_C_COMPILER="$(which gcc-12)" \ + -DCMAKE_CXX_COMPILER="$(which g++-12)" \ -DCMAKE_INSTALL_PREFIX="$install_dir" \ -DWITH_ROCM_BACKEND=OFF \ -DWITH_CUDA_BACKEND=OFF \ @@ -306,25 +322,20 @@ setup_hipsycl() { check_size } -setup_computecpp() { - echo "TODO ComputeCpp requires registration+login to download" -} - if [ "${GITHUB_ACTIONS:-false}" = true ]; then echo "Running in GitHub Actions, defaulting to special export" TERM=xterm export TERM=xterm - # drop the lock in case we got one from a failed run - rm /var/lib/dpkg/lock-frontend || true - 
rm /var/cache/apt/archives/lock || true - - wget -q -O - "https://repo.radeon.com/rocm/rocm.gpg.key" | sudo apt-key add - - echo "deb http://archive.ubuntu.com/ubuntu focal main universe" | sudo tee -a /etc/apt/sources.list - echo 'deb [arch=amd64] https://repo.radeon.com/rocm/apt/4.5 ubuntu main' | sudo tee /etc/apt/sources.list.d/rocm.list + rm -rf /var/lib/dpkg/lock-frontend || true + rm -rf /var/cache/apt/archives/lock || true + mkdir --parents --mode=0755 /etc/apt/keyrings + wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | gpg --dearmor | sudo tee /etc/apt/keyrings/rocm.gpg > /dev/null + echo 'deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/5.7 jammy main' | sudo tee /etc/apt/sources.list.d/rocm.list + echo -e 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600' | sudo tee /etc/apt/preferences.d/rocm-pin-600 sudo apt-get update -qq - sudo apt-get install -y -qq cmake + sudo apt-get install cmake gcc g++ libelf-dev libdrm-amdgpu1 libnuma-dev if [ "$SETUP" = true ]; then echo "Deleting extra packages for space in 2 seconds..." 
@@ -334,6 +345,7 @@ if [ "${GITHUB_ACTIONS:-false}" = true ]; then sudo apt-get autoremove -y check_size fi + sudo apt-get upgrade -qq else echo "Running locally, defaulting to standard export" fi @@ -362,6 +374,18 @@ setup_cmake() { verify_bin_exists "$CMAKE_3_18_BIN" "$CMAKE_3_18_BIN" --version + get "cmake-3.20.sh" "$cmake_release/v3.20.4/cmake-3.20.4-linux-x86_64.sh" + chmod +x "./cmake-3.20.sh" && "./cmake-3.20.sh" --skip-license --include-subdir + export_var CMAKE_3_20_BIN "$PWD/cmake-3.20.4-linux-x86_64/bin/cmake" + verify_bin_exists "$CMAKE_3_20_BIN" + "$CMAKE_3_20_BIN" --version + + get "cmake-3.24.sh" "$cmake_release/v3.24.4/cmake-3.24.4-linux-x86_64.sh" + chmod +x "./cmake-3.24.sh" && "./cmake-3.24.sh" --skip-license --include-subdir + export_var CMAKE_3_24_BIN "$PWD/cmake-3.24.4-linux-x86_64/bin/cmake" + verify_bin_exists "$CMAKE_3_24_BIN" + "$CMAKE_3_24_BIN" --version + check_size } @@ -379,6 +403,10 @@ if [ "$PARALLEL" = true ]; then setup_tbb & wait else + # these need apt + setup_clang_gcc + setup_rocm + setup_hipsycl setup_cmake setup_aocc setup_oclcpu @@ -388,10 +416,6 @@ else setup_kokkos setup_raja setup_tbb - # these need apt - setup_clang_gcc - setup_rocm - setup_hipsycl fi echo "Done!" 
diff --git a/src/ci-test-compile.sh b/src/ci-test-compile.sh index 9388643..a67303c 100755 --- a/src/ci-test-compile.sh +++ b/src/ci-test-compile.sh @@ -120,9 +120,20 @@ run_build() { # CLANG_OMP_OFFLOAD_NVIDIA=false ### +NV_ARCH_CC="70" AMD_ARCH="gfx_903" -NV_ARCH="sm_70" -NV_ARCH_CCXY="cuda11.4,cc80" +NV_ARCH="sm_${NV_ARCH_CC}" +NV_ARCH_CCXY="cuda${NVHPC_CUDA_VER:?},cc80" + +check_cmake_ver(){ + local current=$("$CMAKE_BIN" --version | head -n 1 | cut -d ' ' -f3) + local required=$1 + if [ "$(printf '%s\n' "$required" "$current" | sort -V | head -n1)" = "$required" ]; then + return 0 + else + return 1 + fi +} build_gcc() { local name="gcc_build" @@ -135,49 +146,61 @@ build_gcc() { "./$BUILD_DIR/omp_$name/omp-stream" -s 1048576 -n 10 fi - # some distributions like Ubuntu bionic implements std par with TBB, so conditionally link it here - run_build $name "${GCC_CXX:?}" std-data "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}" - run_build $name "${GCC_CXX:?}" std-indices "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}" - run_build $name "${GCC_CXX:?}" std-ranges "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}" + for use_onedpl in OFF OPENMP TBB; do + case "$use_onedpl" in + OFF) dpl_conditional_flags="-DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}" ;; + *) dpl_conditional_flags="-DFETCH_ONEDPL=ON -DFETCH_TBB=ON -DUSE_TBB=ON -DCXX_EXTRA_FLAGS=-D_GLIBCXX_USE_TBB_PAR_BACKEND=0" ;; + esac + # some distributions like Ubuntu bionic implements std par with TBB, so conditionally link it here + run_build $name "${GCC_CXX:?}" std-data "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl" + run_build $name "${GCC_CXX:?}" std-indices "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl" + run_build $name "${GCC_CXX:?}" std-ranges "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl" + done run_build $name "${GCC_CXX:?}" tbb "$cxx -DONE_TBB_DIR=$TBB_LIB" run_build $name "${GCC_CXX:?}" tbb "$cxx" # build TBB again with the system TBB + run_build $name "${GCC_CXX:?}" tbb "$cxx 
-DUSE_VECTOR=ON" # build with vectors if [ "${GCC_OMP_OFFLOAD_AMD:-false}" != "false" ]; then - run_build "amd_$name" "${GCC_CXX:?}" acc "$cxx -DCXX_EXTRA_FLAGS=-foffload=amdgcn-amdhsa" + run_build "amd_$name" "${GCC_CXX:?}" acc "$cxx -DCXX_EXTRA_FLAGS=-foffload=amdgcn-amdhsa;-fno-stack-protector;-fcf-protection=none" run_build "amd_$name" "${GCC_CXX:?}" omp "$cxx -DOFFLOAD=AMD:$AMD_ARCH" fi if [ "${GCC_OMP_OFFLOAD_NVIDIA:-false}" != "false" ]; then - run_build "nvidia_$name" "${GCC_CXX:?}" acc "$cxx -DCXX_EXTRA_FLAGS=-foffload=nvptx-none" + run_build "nvidia_$name" "${GCC_CXX:?}" acc "$cxx -DCXX_EXTRA_FLAGS=-foffload=nvptx-none;-fno-stack-protector;-fcf-protection=none" run_build "nvidia_$name" "${GCC_CXX:?}" omp "$cxx -DOFFLOAD=NVIDIA:$NV_ARCH" fi run_build $name "${GCC_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH" run_build $name "${GCC_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DMEM=MANAGED" run_build $name "${GCC_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DMEM=PAGEFAULT" - # run_build $name "${CC_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_CUDA=ON" - run_build "cuda_$name" "${GCC_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON" + if check_cmake_ver "3.16.0"; then + # run_build $name "${CC_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_CUDA=ON" + run_build "cuda_$name" "${GCC_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON" + else + echo "Skipping Kokkos models due to CMake version requirement" + fi run_build $name "${GCC_CXX:?}" ocl "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}" - run_build $name "${GCC_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?}" + if check_cmake_ver "3.20.0"; then + run_build $name "${GCC_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?} -DENABLE_OPENMP=ON" + else + echo "Skipping RAJA models due to CMake version requirement" + fi -# 
FIXME fails due to https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100102 -# FIXME we also got https://github.com/NVIDIA/nccl/issues/494 + if check_cmake_ver "3.20.0"; then + run_build "cuda_$name" "${GCC_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?} \ + -DENABLE_CUDA=ON \ + -DTARGET=NVIDIA \ + -DCUDA_TOOLKIT_ROOT_DIR=${NVHPC_CUDA_DIR:?} \ + -DCUDA_ARCH=$NV_ARCH" + else + echo "Skipping RAJA models due to CMake version requirement" + fi -# run_build "cuda_$name" "${GCC_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?} \ -# -DENABLE_CUDA=ON \ -# -DTARGET=NVIDIA \ -# -DCUDA_TOOLKIT_ROOT_DIR=${NVHPC_CUDA_DIR:?} \ -# -DCUDA_ARCH=$NV_ARCH" - - - # CMake >= 3.15 only due to Nvidia's Thrust CMake requirements - local current=$("$CMAKE_BIN" --version | head -n 1 | cut -d ' ' -f3) - local required="3.15.0" - if [ "$(printf '%s\n' "$required" "$current" | sort -V | head -n1)" = "$required" ]; then - run_build $name "${GCC_CXX:?}" thrust "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/include -DTHRUST_IMPL=CUDA -DBACKEND=CUDA" - run_build $name "${GCC_CXX:?}" thrust "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/include -DTHRUST_IMPL=CUDA -DBACKEND=OMP" - run_build $name "${GCC_CXX:?}" thrust "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/include -DTHRUST_IMPL=CUDA -DBACKEND=CPP" + if check_cmake_ver "3.18.0"; then # CMake >= 3.15 only due to Nvidia's Thrust CMake requirements + run_build $name "${GCC_CXX:?}" thrust "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH_CC -DSDK_DIR=$NVHPC_CUDA_DIR/lib64/cmake -DTHRUST_IMPL=CUDA -DBACKEND=CUDA" +# run_build $name "${GCC_CXX:?}" thrust "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/lib64/cmake -DTHRUST_IMPL=CUDA -DBACKEND=OMP" # FIXME + run_build $name "${GCC_CXX:?}" thrust "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH_CC 
-DSDK_DIR=$NVHPC_CUDA_DIR/lib64/cmake -DTHRUST_IMPL=CUDA -DBACKEND=CPP" # FIXME CUDA Thrust + TBB throws the following error: # /usr/lib/gcc/x86_64-linux-gnu/9/include/avx512fintrin.h(9146): error: identifier "__builtin_ia32_rndscaless_round" is undefined @@ -187,9 +210,9 @@ build_gcc() { # /usr/lib/gcc/x86_64-linux-gnu/9/include/avx512dqintrin.h(1365): error: identifier "__builtin_ia32_fpclassss" is undefined # /usr/lib/gcc/x86_64-linux-gnu/9/include/avx512dqintrin.h(1372): error: identifier "__builtin_ia32_fpclasssd" is undefined - # run_build $name "${GCC_CXX:?}" THRUST "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/include -DTHRUST_IMPL=CUDA -DBACKEND=TBB" + # run_build $name "${GCC_CXX:?}" THRUST "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/lib64/cmake -DTHRUST_IMPL=CUDA -DBACKEND=TBB" else - echo "CMake version ${current} < ${required}, skipping Thrust models" + echo "Skipping Thrust models due to CMake version requirement" fi } @@ -207,28 +230,39 @@ build_clang() { run_build "nvidia_$name" "${GCC_CXX:?}" omp "$cxx -DOFFLOAD=NVIDIA:$NV_ARCH" fi + if check_cmake_ver "3.20.0"; then + run_build $name "${CLANG_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?} -DENABLE_OPENMP=ON" + else + echo "Skipping RAJA models due to CMake version requirement" + fi run_build $name "${CLANG_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH" run_build $name "${CLANG_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DMEM=MANAGED" run_build $name "${CLANG_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DMEM=PAGEFAULT" - run_build $name "${CLANG_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON" + if check_cmake_ver "3.16.0"; then + run_build $name "${CLANG_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON" + else + echo "Skipping Kokkos models due 
to CMake version requirement" + fi run_build $name "${CLANG_CXX:?}" ocl "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}" - run_build $name "${CLANG_CXX:?}" std-data "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" - run_build $name "${CLANG_CXX:?}" std-indices "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" - # run_build $name "${LANG_CXX:?}" std20 "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" # not yet supported - run_build $name "${CLANG_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?}" - run_build $name "${CLANG_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH" - run_build $name "${CLANG_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DMEM=MANAGED" - run_build $name "${CLANG_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DMEM=PAGEFAULT" - run_build $name "${CLANG_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON" - run_build $name "${CLANG_CXX:?}" ocl "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}" - run_build $name "${CLANG_CXX:?}" std-data "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" - run_build $name "${CLANG_CXX:?}" std-indices "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" - # run_build $name "${LANG_CXX:?}" std20 "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" # not yet supported + + for use_onedpl in OFF OPENMP TBB; do + case "$use_onedpl" in + OFF) dpl_conditional_flags="-DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" ;; + *) dpl_conditional_flags="-DFETCH_ONEDPL=ON -DFETCH_TBB=ON -DUSE_TBB=ON -DCXX_EXTRA_FLAGS=-D_GLIBCXX_USE_TBB_PAR_BACKEND=0" ;; + esac + run_build $name "${CLANG_CXX:?}" std-data "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl" + run_build $name "${CLANG_CXX:?}" std-indices "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl" + # run_build $name "${CLANG_CXX:?}" std-ranges "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl" # not yet supported + done run_build $name "${CLANG_CXX:?}" tbb "$cxx 
-DONE_TBB_DIR=$TBB_LIB" run_build $name "${CLANG_CXX:?}" tbb "$cxx" # build TBB again with the system TBB - - run_build $name "${CLANG_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?}" + run_build $name "${CLANG_CXX:?}" tbb "$cxx -DUSE_VECTOR=ON" # build with vectors + if check_cmake_ver "3.20.0"; then + run_build $name "${CLANG_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?} -DENABLE_OPENMP=ON" + else + echo "Skipping RAJA models due to CMake version requirement" + fi # no clang /w RAJA+cuda because it needs nvcc which needs gcc } @@ -237,6 +271,7 @@ build_nvhpc() { local cxx="-DCMAKE_CXX_COMPILER=${NVHPC_NVCXX:?}" run_build $name "${NVHPC_NVCXX:?}" std-data "$cxx -DNVHPC_OFFLOAD=$NV_ARCH_CCXY" run_build $name "${NVHPC_NVCXX:?}" std-indices "$cxx -DNVHPC_OFFLOAD=$NV_ARCH_CCXY" + run_build $name "${NVHPC_NVCXX:?}" acc "$cxx -DTARGET_DEVICE=gpu -DTARGET_PROCESSOR=px -DCUDA_ARCH=$NV_ARCH_CCXY" run_build $name "${NVHPC_NVCXX:?}" acc "$cxx -DTARGET_DEVICE=multicore -DTARGET_PROCESSOR=zen" } @@ -254,6 +289,8 @@ build_hip() { local name="hip_build" run_build $name "${HIP_CXX:?}" hip "-DCMAKE_CXX_COMPILER=${HIP_CXX:?}" + run_build $name "${HIP_CXX:?}" hip "-DCMAKE_CXX_COMPILER=${HIP_CXX:?} -DMEM=MANAGED" + run_build $name "${HIP_CXX:?}" hip "-DCMAKE_CXX_COMPILER=${HIP_CXX:?} -DMEM=PAGEFAULT" run_build $name "${GCC_CXX:?}" thrust "-DCMAKE_CXX_COMPILER=${HIP_CXX:?} -DSDK_DIR=$ROCM_PATH -DTHRUST_IMPL=ROCM" } @@ -275,15 +312,18 @@ build_icpc() { local cxx="-DCMAKE_CXX_COMPILER=${ICPC_CXX:?}" run_build $name "${ICPC_CXX:?}" omp "$cxx" run_build $name "${ICPC_CXX:?}" ocl "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}" - run_build $name "${ICPC_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?}" - run_build $name "${ICPC_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON" -} + if check_cmake_ver "3.20.0"; then + run_build $name "${ICPC_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?} -DENABLE_OPENMP=ON" + else + echo "Skipping RAJA models due to CMake version 
requirement" + fi + + if check_cmake_ver "3.16.0"; then + run_build $name "${ICPC_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON" + else + echo "Skipping Kokkos models due to CMake version requirement" + fi -build_computecpp() { - run_build computecpp_build "compute++" sycl "-DCMAKE_CXX_COMPILER=${GCC_CXX:?} \ - -DSYCL_COMPILER=COMPUTECPP \ - -DSYCL_COMPILER_DIR=${COMPUTECPP_DIR:?} \ - -DOpenCL_LIBRARY=${OCL_LIB:?}" } build_dpcpp() { diff --git a/src/cuda/CUDAStream.cu b/src/cuda/CUDAStream.cu index b467d00..75a8f3c 100644 --- a/src/cuda/CUDAStream.cu +++ b/src/cuda/CUDAStream.cu @@ -42,41 +42,57 @@ CUDAStream::CUDAStream(const int ARRAY_SIZE, const int device_index) // Print out device information std::cout << "Using CUDA device " << getDeviceName(device_index) << std::endl; std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl; - +#if defined(MANAGED) + std::cout << "Memory: MANAGED" << std::endl; +#elif defined(PAGEFAULT) + std::cout << "Memory: PAGEFAULT" << std::endl; +#else + std::cout << "Memory: DEFAULT" << std::endl; +#endif array_size = ARRAY_SIZE; + + // Query device for sensible dot kernel block count + cudaDeviceProp props; + cudaGetDeviceProperties(&props, device_index); + check_error(); + dot_num_blocks = props.multiProcessorCount * 4; + // Allocate the host array for partial sums for dot kernels - sums = (T*)malloc(sizeof(T) * DOT_NUM_BLOCKS); + sums = (T*)malloc(sizeof(T) * dot_num_blocks); + + size_t array_bytes = sizeof(T); + array_bytes *= ARRAY_SIZE; + size_t total_bytes = array_bytes * 4; + std::cout << "Reduction kernel config: " << dot_num_blocks << " groups of (fixed) size " << TBSIZE << std::endl; // Check buffers fit on the device - cudaDeviceProp props; - cudaGetDeviceProperties(&props, 0); - if (props.totalGlobalMem < 3*ARRAY_SIZE*sizeof(T)) + if (props.totalGlobalMem < total_bytes) throw std::runtime_error("Device does not have enough memory for all 3 buffers"); // Create device buffers 
#if defined(MANAGED) - cudaMallocManaged(&d_a, ARRAY_SIZE*sizeof(T)); + cudaMallocManaged(&d_a, array_bytes); check_error(); - cudaMallocManaged(&d_b, ARRAY_SIZE*sizeof(T)); + cudaMallocManaged(&d_b, array_bytes); check_error(); - cudaMallocManaged(&d_c, ARRAY_SIZE*sizeof(T)); + cudaMallocManaged(&d_c, array_bytes); check_error(); - cudaMallocManaged(&d_sum, DOT_NUM_BLOCKS*sizeof(T)); + cudaMallocManaged(&d_sum, dot_num_blocks*sizeof(T)); check_error(); #elif defined(PAGEFAULT) - d_a = (T*)malloc(sizeof(T)*ARRAY_SIZE); - d_b = (T*)malloc(sizeof(T)*ARRAY_SIZE); - d_c = (T*)malloc(sizeof(T)*ARRAY_SIZE); - d_sum = (T*)malloc(sizeof(T)*DOT_NUM_BLOCKS); + d_a = (T*)malloc(array_bytes); + d_b = (T*)malloc(array_bytes); + d_c = (T*)malloc(array_bytes); + d_sum = (T*)malloc(sizeof(T)*dot_num_blocks); #else - cudaMalloc(&d_a, ARRAY_SIZE*sizeof(T)); + cudaMalloc(&d_a, array_bytes); check_error(); - cudaMalloc(&d_b, ARRAY_SIZE*sizeof(T)); + cudaMalloc(&d_b, array_bytes); check_error(); - cudaMalloc(&d_c, ARRAY_SIZE*sizeof(T)); + cudaMalloc(&d_c, array_bytes); check_error(); - cudaMalloc(&d_sum, DOT_NUM_BLOCKS*sizeof(T)); + cudaMalloc(&d_sum, dot_num_blocks*sizeof(T)); check_error(); #endif } @@ -237,7 +253,7 @@ __global__ void dot_kernel(const T * a, const T * b, T * sum, int array_size) int i = blockDim.x * blockIdx.x + threadIdx.x; const size_t local_i = threadIdx.x; - tb_sum[local_i] = 0.0; + tb_sum[local_i] = {}; for (; i < array_size; i += blockDim.x*gridDim.x) tb_sum[local_i] += a[i] * b[i]; @@ -257,19 +273,19 @@ __global__ void dot_kernel(const T * a, const T * b, T * sum, int array_size) template T CUDAStream::dot() { - dot_kernel<<>>(d_a, d_b, d_sum, array_size); + dot_kernel<<>>(d_a, d_b, d_sum, array_size); check_error(); #if defined(MANAGED) || defined(PAGEFAULT) cudaDeviceSynchronize(); check_error(); #else - cudaMemcpy(sums, d_sum, DOT_NUM_BLOCKS*sizeof(T), cudaMemcpyDeviceToHost); + cudaMemcpy(sums, d_sum, dot_num_blocks*sizeof(T), cudaMemcpyDeviceToHost); 
check_error(); #endif T sum = 0.0; - for (int i = 0; i < DOT_NUM_BLOCKS; i++) + for (int i = 0; i < dot_num_blocks; i++) { #if defined(MANAGED) || defined(PAGEFAULT) sum += d_sum[i]; diff --git a/src/cuda/CUDAStream.h b/src/cuda/CUDAStream.h index 83b8c66..d16511f 100644 --- a/src/cuda/CUDAStream.h +++ b/src/cuda/CUDAStream.h @@ -13,16 +13,9 @@ #include "Stream.h" -#if defined(PAGEFAULT) - #define IMPLEMENTATION_STRING "CUDA - Page Fault" -#elif defined(MANAGED) - #define IMPLEMENTATION_STRING "CUDA - Managed Memory" -#else - #define IMPLEMENTATION_STRING "CUDA" -#endif +#define IMPLEMENTATION_STRING "CUDA" #define TBSIZE 1024 -#define DOT_NUM_BLOCKS 256 template class CUDAStream : public Stream @@ -40,6 +33,8 @@ class CUDAStream : public Stream T *d_c; T *d_sum; + // Number of blocks for dot kernel + int dot_num_blocks; public: diff --git a/src/cuda/model.cmake b/src/cuda/model.cmake index 8c6b568..7c1b0d6 100644 --- a/src/cuda/model.cmake +++ b/src/cuda/model.cmake @@ -29,10 +29,11 @@ macro(setup) endif() enable_language(CUDA) - register_definitions(MEM=${MEM}) + register_definitions(${MEM}) # add -forward-unknown-to-host-compiler for compatibility reasons - set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "-forward-unknown-to-host-compiler -arch=${CUDA_ARCH}" ${CUDA_EXTRA_FLAGS}) + set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "-forward-unknown-to-host-compiler" "-arch=${CUDA_ARCH}" ${CUDA_EXTRA_FLAGS}) + string(REPLACE ";" " " CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS}") # CMake defaults to -O2 for CUDA at Release, let's wipe that and use the global RELEASE_FLAG # appended later diff --git a/src/dpl_shim.h b/src/dpl_shim.h new file mode 100644 index 0000000..b954019 --- /dev/null +++ b/src/dpl_shim.h @@ -0,0 +1,76 @@ +#pragma once + +#include +#include + +#ifndef ALIGNMENT +#define ALIGNMENT (2*1024*1024) // 2MB +#endif + +#ifdef USE_ONEDPL + +// oneDPL C++17 PSTL + +#include +#include +#include + +#if ONEDPL_USE_DPCPP_BACKEND + +#include + +const static auto exe_policy = 
oneapi::dpl::execution::device_policy<>{ + oneapi::dpl::execution::make_device_policy(cl::sycl::default_selector{}) +}; + +template +T *alloc_raw(size_t size) { return sycl::malloc_shared(size, exe_policy.queue()); } + +template +void dealloc_raw(T *ptr) { sycl::free(ptr, exe_policy.queue()); } + +#else + +// auto exe_policy = dpl::execution::seq; +// auto exe_policy = dpl::execution::par; +static constexpr auto exe_policy = dpl::execution::par_unseq; +#define USE_STD_PTR_ALLOC_DEALLOC + +#endif + +#else + +// Normal C++17 PSTL + +#include +#include +#include + +// auto exe_policy = std::execution::seq; +// auto exe_policy = std::execution::par; +static constexpr auto exe_policy = std::execution::par_unseq; +#define USE_STD_PTR_ALLOC_DEALLOC + + +#endif + +#ifdef USE_STD_PTR_ALLOC_DEALLOC + +#if defined(__HIPSYCL__) || defined(__OPENSYCL__) +#include + +// TODO We temporarily use malloc_shared/free here for hipSYCL stdpar because there's a linking issue if we let it hijack new/delete +// for this to work, we compile with --hipsycl-stdpar-system-usm so that hijacking is disabled +static cl::sycl::queue queue{cl::sycl::default_selector_v}; +template T *alloc_raw(size_t size) { return cl::sycl::malloc_shared(size, queue); } +template void dealloc_raw(T *ptr) { cl::sycl::free(ptr, queue); } + +#else +template +T *alloc_raw(size_t size) { return (T *) aligned_alloc(ALIGNMENT, sizeof(T) * size); } + +template +void dealloc_raw(T *ptr) { free(ptr); } +#endif + +#endif diff --git a/src/fortran/ArrayStream.F90 b/src/fortran/ArrayStream.F90 new file mode 100644 index 0000000..5a8d5bc --- /dev/null +++ b/src/fortran/ArrayStream.F90 @@ -0,0 +1,105 @@ +module ArrayStream + use, intrinsic :: ISO_Fortran_env + use BabelStreamTypes + + implicit none + + character(len=5), parameter :: implementation_name = "Array" + + integer(kind=StreamIntKind) :: N + + real(kind=REAL64), allocatable :: A(:), B(:), C(:) + + contains + + subroutine list_devices() + implicit none + integer :: num + 
write(*,'(a36,a5)') "Listing devices is not supported by ", implementation_name + end subroutine list_devices + + subroutine set_device(dev) + implicit none + integer, intent(in) :: dev + write(*,'(a32,a5)') "Device != 0 is not supported by ", implementation_name + end subroutine set_device + + subroutine alloc(array_size) + implicit none + integer(kind=StreamIntKind) :: array_size + integer :: err + N = array_size + allocate( A(1:N), B(1:N), C(1:N), stat=err) + if (err .ne. 0) then + write(*,'(a20,i3)') 'allocate returned ',err + stop 1 + endif + end subroutine alloc + + subroutine dealloc() + implicit none + integer :: err + deallocate( A, B, C, stat=err) + if (err .ne. 0) then + write(*,'(a20,i3)') 'deallocate returned ',err + stop 1 + endif + end subroutine dealloc + + subroutine init_arrays(initA, initB, initC) + implicit none + real(kind=REAL64), intent(in) :: initA, initB, initC + A = initA + B = initB + C = initC + end subroutine init_arrays + + subroutine read_arrays(h_A, h_B, h_C) + implicit none + real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:) + h_A = A + h_B = B + h_C = C + end subroutine read_arrays + + subroutine copy() + implicit none + C = A + end subroutine copy + + subroutine add() + implicit none + C = A + B + end subroutine add + + subroutine mul(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + scalar = startScalar + B = scalar * C + end subroutine mul + + subroutine triad(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + scalar = startScalar + A = B + scalar * C + end subroutine triad + + subroutine nstream(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + scalar = startScalar + A = A + B + scalar * C + end subroutine nstream + + function dot() result(s) + implicit none + real(kind=REAL64) :: s + s = dot_product(A,B) + end function dot + +end module ArrayStream 
diff --git a/src/fortran/BabelStreamTypes.F90 b/src/fortran/BabelStreamTypes.F90 new file mode 100644 index 0000000..dd01d35 --- /dev/null +++ b/src/fortran/BabelStreamTypes.F90 @@ -0,0 +1,21 @@ +module BabelStreamTypes + use, intrinsic :: ISO_Fortran_env, only: REAL64,REAL32,INT64,INT32 + + implicit none + +#ifdef USE_FLOAT + integer, parameter :: StreamRealKind = REAL32 + character(len=6) :: StreamRealName = "REAL32" +#else + integer, parameter :: StreamRealKind = REAL64 + character(len=6) :: StreamRealName = "REAL64" +#endif + +#ifdef USE_INT32 +#warning There is no checking for overflowing INT32, so be careful. + integer, parameter :: StreamIntKind = INT32 +#else + integer, parameter :: StreamIntKind = INT64 +#endif + +end module BabelStreamTypes diff --git a/src/fortran/CUDAKernelStream.F90 b/src/fortran/CUDAKernelStream.F90 new file mode 100644 index 0000000..01668ea --- /dev/null +++ b/src/fortran/CUDAKernelStream.F90 @@ -0,0 +1,230 @@ +module CUDAKernelStream + use, intrinsic :: ISO_Fortran_env + use BabelStreamTypes + + implicit none + + character(len=10), parameter :: implementation_name = "CUDAKernel" + + integer(kind=StreamIntKind) :: N + +#ifdef USE_MANAGED + real(kind=REAL64), allocatable, managed :: A(:), B(:), C(:) +#else + real(kind=REAL64), allocatable, device :: A(:), B(:), C(:) +#endif + + contains + + subroutine list_devices() + use cudafor + implicit none + integer :: num, err + err = cudaGetDeviceCount(num) + if (err.ne.0) then + write(*,'(a)') "cudaGetDeviceCount failed" + write(*,'(a)') cudaGetErrorString(err) + stop + else if (num.eq.0) then + write(*,'(a17)') "No devices found." + else + write(*,'(a10,i1,a8)') "There are ",num," devices." 
+ end if + end subroutine list_devices + + subroutine set_device(dev) + use cudafor + implicit none + integer, intent(in) :: dev + integer :: num, err + err = cudaGetDeviceCount(num) + if (err.ne.0) then + write(*,'(a)') "cudaGetDeviceCount failed" + write(*,'(a)') cudaGetErrorString(err) + stop + else if (num.eq.0) then + write(*,'(a17)') "No devices found." + stop + else if (dev.ge.num) then + write(*,'(a21)') "Invalid device index." + stop + else + err = cudaSetDevice(dev) + if (err.ne.0) then + write(*,'(a)') "cudaSetDevice failed" + write(*,'(a)') cudaGetErrorString(err) + stop + end if + end if + end subroutine set_device + + subroutine alloc(array_size) + implicit none + integer(kind=StreamIntKind) :: array_size + integer :: err + N = array_size + allocate( A(1:N), B(1:N), C(1:N), stat=err) + if (err .ne. 0) then + write(*,'(a20,i3)') 'allocate returned ',err + stop 1 + endif + end subroutine alloc + + subroutine dealloc() + implicit none + integer :: err + deallocate( A, B, C, stat=err) + if (err .ne. 
0) then + write(*,'(a20,i3)') 'deallocate returned ',err + stop 1 + endif + end subroutine dealloc + + subroutine init_arrays(initA, initB, initC) + use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString + implicit none + real(kind=REAL64), intent(in) :: initA, initB, initC + integer(kind=StreamIntKind) :: i + integer :: err + A = initA + B = initB + C = initC + err = cudaDeviceSynchronize() + if (err.ne.0) then + write(*,'(a)') "cudaDeviceSynchronize failed" + write(*,'(a)') cudaGetErrorString(err) + stop + endif + end subroutine init_arrays + + subroutine read_arrays(h_A, h_B, h_C) + use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString + implicit none + real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:) + integer(kind=StreamIntKind) :: i + integer :: err + h_A = A + h_B = B + h_C = C + err = cudaDeviceSynchronize() + if (err.ne.0) then + write(*,'(a)') "cudaDeviceSynchronize failed" + write(*,'(a)') cudaGetErrorString(err) + stop + endif + end subroutine read_arrays + + subroutine copy() + use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString + implicit none + integer(kind=StreamIntKind) :: i + integer :: err + !$cuf kernel do <<< *, * >>> + do i=1,N + C(i) = A(i) + end do + err = cudaDeviceSynchronize() + if (err.ne.0) then + write(*,'(a)') "cudaDeviceSynchronize failed" + write(*,'(a)') cudaGetErrorString(err) + stop + endif + end subroutine copy + + subroutine add() + use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString + implicit none + integer(kind=StreamIntKind) :: i + integer :: err + !$cuf kernel do <<< *, * >>> + do i=1,N + C(i) = A(i) + B(i) + end do + err = cudaDeviceSynchronize() + if (err.ne.0) then + write(*,'(a)') "cudaDeviceSynchronize failed" + write(*,'(a)') cudaGetErrorString(err) + stop + endif + end subroutine add + + subroutine mul(startScalar) + use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + 
integer(kind=StreamIntKind) :: i + integer :: err + scalar = startScalar + !$cuf kernel do <<< *, * >>> + do i=1,N + B(i) = scalar * C(i) + end do + err = cudaDeviceSynchronize() + if (err.ne.0) then + write(*,'(a)') "cudaDeviceSynchronize failed" + write(*,'(a)') cudaGetErrorString(err) + stop + endif + end subroutine mul + + subroutine triad(startScalar) + use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + integer :: err + scalar = startScalar + !$cuf kernel do <<< *, * >>> + do i=1,N + A(i) = B(i) + scalar * C(i) + end do + err = cudaDeviceSynchronize() + if (err.ne.0) then + write(*,'(a)') "cudaDeviceSynchronize failed" + write(*,'(a)') cudaGetErrorString(err) + stop + endif + end subroutine triad + + subroutine nstream(startScalar) + use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + integer :: err + scalar = startScalar + !$cuf kernel do <<< *, * >>> + do i=1,N + A(i) = A(i) + B(i) + scalar * C(i) + end do + err = cudaDeviceSynchronize() + if (err.ne.0) then + write(*,'(a)') "cudaDeviceSynchronize failed" + write(*,'(a)') cudaGetErrorString(err) + stop + endif + end subroutine nstream + + function dot() result(r) + use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString + implicit none + real(kind=REAL64) :: r + integer(kind=StreamIntKind) :: i + integer :: err + r = real(0,kind=REAL64) + !$cuf kernel do <<< *, * >>> + do i=1,N + r = r + A(i) * B(i) + end do + err = cudaDeviceSynchronize() + if (err.ne.0) then + write(*,'(a)') "cudaDeviceSynchronize failed" + write(*,'(a)') cudaGetErrorString(err) + stop + endif + end function dot + +end module CUDAKernelStream diff --git a/src/fortran/CUDAStream.F90 b/src/fortran/CUDAStream.F90 new file mode 100644 index 0000000..208f1aa --- 
/dev/null +++ b/src/fortran/CUDAStream.F90 @@ -0,0 +1,309 @@ +module CUDAFortranKernels + use, intrinsic :: ISO_Fortran_env + use BabelStreamTypes + + contains + + attributes(global) subroutine do_copy(n,A,C) + implicit none + integer(kind=StreamIntKind), intent(in), value :: n + real(kind=REAL64), intent(in) :: A(n) + real(kind=REAL64), intent(out) :: C(n) + integer(kind=StreamIntKind) :: i + i = blockDim%x * (blockIdx%x - 1) + threadIdx%x + if (i <= N) then + C(i) = A(i) + endif + end subroutine do_copy + + attributes(global) subroutine do_add(n,A,B,C) + implicit none + integer(kind=StreamIntKind), intent(in), value :: n + real(kind=REAL64), intent(in) :: A(n), B(n) + real(kind=REAL64), intent(out) :: C(n) + integer(kind=StreamIntKind) :: i + i = blockDim%x * (blockIdx%x - 1) + threadIdx%x + if (i <= N) then + C(i) = A(i) + B(i) + endif + end subroutine do_add + + attributes(global) subroutine do_mul(n,scalar,B,C) + implicit none + integer(kind=StreamIntKind), intent(in), value :: n + real(kind=REAL64), intent(in), value :: scalar + real(kind=REAL64), intent(out) :: B(n) + real(kind=REAL64), intent(in) :: C(n) + integer(kind=StreamIntKind) :: i + i = blockDim%x * (blockIdx%x - 1) + threadIdx%x + if (i <= N) then + B(i) = scalar * C(i) + endif + end subroutine do_mul + + attributes(global) subroutine do_triad(n,scalar,A,B,C) + implicit none + integer(kind=StreamIntKind), intent(in), value :: n + real(kind=REAL64), intent(in), value :: scalar + real(kind=REAL64), intent(out) :: A(n) + real(kind=REAL64), intent(in) :: B(n), C(n) + integer(kind=StreamIntKind) :: i + i = blockDim%x * (blockIdx%x - 1) + threadIdx%x + if (i <= N) then + A(i) = B(i) + scalar * C(i) + endif + end subroutine do_triad + + attributes(global) subroutine do_nstream(n,scalar,A,B,C) + implicit none + integer(kind=StreamIntKind), intent(in), value :: n + real(kind=REAL64), intent(in), value :: scalar + real(kind=REAL64), intent(inout) :: A(n) + real(kind=REAL64), intent(in) :: B(n), C(n) + 
integer(kind=StreamIntKind) :: i + i = blockDim%x * (blockIdx%x - 1) + threadIdx%x + if (i <= N) then + A(i) = A(i) + B(i) + scalar * C(i) + endif + end subroutine do_nstream + +#if 0 + attributes(global) subroutine do_dot(n,A,B,r) + implicit none + integer(kind=StreamIntKind), intent(in), value :: n + real(kind=REAL64), intent(in) :: A(n), B(n) + real(kind=REAL64), intent(out) :: r + integer(kind=StreamIntKind) :: i + r = real(0,kind=REAL64) + !$cuf kernel do <<< *, * >>> + do i=1,N + r = r + A(i) * B(i) + end do + end subroutine do_dot +#endif + +end module CUDAFortranKernels + +module CUDAStream + use, intrinsic :: ISO_Fortran_env + use BabelStreamTypes + use cudafor, only: dim3 + + implicit none + + character(len=4), parameter :: implementation_name = "CUDA" + + integer(kind=StreamIntKind) :: N + +#ifdef USE_MANAGED + real(kind=REAL64), allocatable, managed :: A(:), B(:), C(:) +#else + real(kind=REAL64), allocatable, device :: A(:), B(:), C(:) +#endif + + type(dim3) :: grid, tblock + + contains + + subroutine list_devices() + use cudafor + implicit none + integer :: num, err + err = cudaGetDeviceCount(num) + if (err.ne.0) then + write(*,'(a)') "cudaGetDeviceCount failed" + write(*,'(a)') cudaGetErrorString(err) + stop + else if (num.eq.0) then + write(*,'(a17)') "No devices found." + else + write(*,'(a10,i1,a8)') "There are ",num," devices." + end if + end subroutine list_devices + + subroutine set_device(dev) + use cudafor + implicit none + integer, intent(in) :: dev + integer :: num, err + err = cudaGetDeviceCount(num) + if (err.ne.0) then + write(*,'(a)') "cudaGetDeviceCount failed" + write(*,'(a)') cudaGetErrorString(err) + stop + else if (num.eq.0) then + write(*,'(a17)') "No devices found." + stop + else if (dev.ge.num) then + write(*,'(a21)') "Invalid device index." 
+ stop + else + err = cudaSetDevice(dev) + if (err.ne.0) then + write(*,'(a)') "cudaSetDevice failed" + write(*,'(a)') cudaGetErrorString(err) + stop + end if + end if + end subroutine set_device + + subroutine alloc(array_size) + implicit none + integer(kind=StreamIntKind) :: array_size + integer :: err + N = array_size + allocate( A(1:N), B(1:N), C(1:N), stat=err) + if (err .ne. 0) then + write(*,'(a20,i3)') 'allocate returned ',err + stop 1 + endif + ! move to separate subroutine later + tblock = dim3(128,1,1) + grid = dim3(ceiling(real(N)/tblock%x),1,1) + end subroutine alloc + + subroutine dealloc() + implicit none + integer :: err + deallocate( A, B, C, stat=err) + if (err .ne. 0) then + write(*,'(a20,i3)') 'deallocate returned ',err + stop 1 + endif + end subroutine dealloc + + subroutine init_arrays(initA, initB, initC) + use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString + implicit none + real(kind=REAL64), intent(in) :: initA, initB, initC + integer(kind=StreamIntKind) :: i + integer :: err + A = initA + B = initB + C = initC + err = cudaDeviceSynchronize() + if (err.ne.0) then + write(*,'(a)') "cudaDeviceSynchronize failed" + write(*,'(a)') cudaGetErrorString(err) + stop + endif + end subroutine init_arrays + + subroutine read_arrays(h_A, h_B, h_C) + use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString + implicit none + real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:) + integer(kind=StreamIntKind) :: i + integer :: err + h_A = A + h_B = B + h_C = C + err = cudaDeviceSynchronize() + if (err.ne.0) then + write(*,'(a)') "cudaDeviceSynchronize failed" + write(*,'(a)') cudaGetErrorString(err) + stop + endif + end subroutine read_arrays + + subroutine copy() + use CUDAFortranKernels, only: do_copy + use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString + implicit none + integer :: err + call do_copy<<>>(N, A, C) + err = cudaDeviceSynchronize() + if (err.ne.0) then + write(*,'(a)') "cudaDeviceSynchronize failed" + 
write(*,'(a)') cudaGetErrorString(err) + stop + endif + end subroutine copy + + subroutine add() + use CUDAFortranKernels, only: do_add + use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString + implicit none + integer :: err + call do_add<<>>(N, A, B, C) + err = cudaDeviceSynchronize() + if (err.ne.0) then + write(*,'(a)') "cudaDeviceSynchronize failed" + write(*,'(a)') cudaGetErrorString(err) + stop + endif + end subroutine add + + subroutine mul(startScalar) + use CUDAFortranKernels, only: do_mul + use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer :: err + scalar = startScalar + call do_mul<<>>(N, scalar, B, C) + err = cudaDeviceSynchronize() + if (err.ne.0) then + write(*,'(a)') "cudaDeviceSynchronize failed" + write(*,'(a)') cudaGetErrorString(err) + stop + endif + end subroutine mul + + subroutine triad(startScalar) + use CUDAFortranKernels, only: do_triad + use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer :: err + scalar = startScalar + call do_triad<<>>(N, scalar, A, B, C) + err = cudaDeviceSynchronize() + if (err.ne.0) then + write(*,'(a)') "cudaDeviceSynchronize failed" + write(*,'(a)') cudaGetErrorString(err) + stop + endif + end subroutine triad + + subroutine nstream(startScalar) + use CUDAFortranKernels, only: do_nstream + use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer :: err + scalar = startScalar + call do_nstream<<>>(N, scalar, A, B, C) + err = cudaDeviceSynchronize() + if (err.ne.0) then + write(*,'(a)') "cudaDeviceSynchronize failed" + write(*,'(a)') cudaGetErrorString(err) + stop + endif + end subroutine nstream + + function dot() result(r) + !use CUDAFortranKernels, only: do_dot + use cudafor, only: 
cudaDeviceSynchronize, cudaGetErrorString + implicit none + real(kind=REAL64) :: r + integer :: err + integer(kind=StreamIntKind) :: i + !call do_dot<<>>(N, B, C, r) + r = real(0,kind=REAL64) + !$cuf kernel do <<< *, * >>> + do i=1,N + r = r + A(i) * B(i) + end do + err = cudaDeviceSynchronize() + if (err.ne.0) then + write(*,'(a)') "cudaDeviceSynchronize failed" + write(*,'(a)') cudaGetErrorString(err) + stop + endif + end function dot + +end module CUDAStream diff --git a/src/fortran/DoConcurrentStream.F90 b/src/fortran/DoConcurrentStream.F90 new file mode 100644 index 0000000..781210d --- /dev/null +++ b/src/fortran/DoConcurrentStream.F90 @@ -0,0 +1,139 @@ +module DoConcurrentStream + use, intrinsic :: ISO_Fortran_env + use BabelStreamTypes + + implicit none + + character(len=12), parameter :: implementation_name = "DoConcurrent" + + integer(kind=StreamIntKind) :: N + +#ifdef USE_DEVICE + real(kind=REAL64), allocatable, device :: A(:), B(:), C(:) +#else + real(kind=REAL64), allocatable :: A(:), B(:), C(:) +#endif + + contains + + subroutine list_devices() + implicit none + integer :: num + write(*,'(a36,a12)') "Listing devices is not supported by ", implementation_name + end subroutine list_devices + + subroutine set_device(dev) + implicit none + integer, intent(in) :: dev + write(*,'(a32,a12)') "Device != 0 is not supported by ", implementation_name + end subroutine set_device + + subroutine alloc(array_size) + implicit none + integer(kind=StreamIntKind) :: array_size + integer :: err + N = array_size + allocate( A(1:N), B(1:N), C(1:N), stat=err) + if (err .ne. 0) then + write(*,'(a20,i3)') 'allocate returned ',err + stop 1 + endif + end subroutine alloc + + subroutine dealloc() + implicit none + integer :: err + deallocate( A, B, C, stat=err) + if (err .ne. 
0) then
+                write(*,'(a20,i3)') 'deallocate returned ',err
+                stop 1
+            endif
+        end subroutine dealloc
+
+        subroutine init_arrays(initA, initB, initC)
+            implicit none
+            real(kind=REAL64), intent(in) :: initA, initB, initC
+            integer(kind=StreamIntKind) :: i
+            do concurrent (i=1:N)
+                A(i) = initA
+                B(i) = initB
+                C(i) = initC
+            end do
+        end subroutine init_arrays
+
+        subroutine read_arrays(h_A, h_B, h_C)
+            implicit none
+            real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
+            integer(kind=StreamIntKind) :: i
+            do concurrent (i=1:N) !shared(A,B,C)
+                h_A(i) = A(i)
+                h_B(i) = B(i)
+                h_C(i) = C(i)
+            end do
+        end subroutine read_arrays
+
+        subroutine copy()
+            implicit none
+            integer(kind=StreamIntKind) :: i
+            do concurrent (i=1:N) !shared(A,C)
+                C(i) = A(i)
+            end do
+        end subroutine copy
+
+        subroutine add()
+            implicit none
+            integer(kind=StreamIntKind) :: i
+            do concurrent (i=1:N) !shared(A,B,C)
+                C(i) = A(i) + B(i)
+            end do
+        end subroutine add
+
+        subroutine mul(startScalar)
+            implicit none
+            real(kind=REAL64), intent(in) :: startScalar
+            real(kind=REAL64) :: scalar
+            integer(kind=StreamIntKind) :: i
+            scalar = startScalar
+            do concurrent (i=1:N) !shared(B,C)
+                B(i) = scalar * C(i)
+            end do
+        end subroutine mul
+
+        subroutine triad(startScalar)
+            implicit none
+            real(kind=REAL64), intent(in) :: startScalar
+            real(kind=REAL64) :: scalar
+            integer(kind=StreamIntKind) :: i
+            scalar = startScalar
+            do concurrent (i=1:N) !shared(A,B,C)
+                A(i) = B(i) + scalar * C(i)
+            end do
+        end subroutine triad
+
+        subroutine nstream(startScalar)
+            implicit none
+            real(kind=REAL64), intent(in) :: startScalar
+            real(kind=REAL64) :: scalar
+            integer(kind=StreamIntKind) :: i
+            scalar = startScalar
+            do concurrent (i=1:N) !shared(A,B,C)
+                A(i) = A(i) + B(i) + scalar * C(i)
+            end do
+        end subroutine nstream
+
+        function dot() result(s)
+            implicit none
+            real(kind=REAL64) :: s
+            integer(kind=StreamIntKind) :: i
+            ! reduction omitted because NVF infers it and other compilers do not support
+            s = real(0,kind=REAL64)
+#ifdef CRAY_THREAD_DOCONCURRENT
+            do i=1,N
+#else
+            do concurrent (i=1:N) !shared(A,B)
+#endif
+                s = s + A(i) * B(i)
+            end do
+        end function dot
+
+end module DoConcurrentStream
diff --git a/src/fortran/Makefile b/src/fortran/Makefile
new file mode 100644
index 0000000..adadcff
--- /dev/null
+++ b/src/fortran/Makefile
@@ -0,0 +1,114 @@
+# Builds one benchmark binary named BabelStream.<COMPILER>.<IMPLEMENTATION>.
+# Usage: make COMPILER=<compiler> IMPLEMENTATION=<implementation>
+
+ifeq ($(COMPILER),nvhpc)
+    include make.inc.nvhpc
+else ifeq ($(COMPILER),oneapi)
+    include make.inc.oneapi
+else ifeq ($(COMPILER),gcc)
+    include make.inc.gcc
+else ifeq ($(COMPILER),amd)
+    include make.inc.amd
+else ifeq ($(COMPILER),arm)
+    include make.inc.arm
+else ifeq ($(COMPILER),cray)
+    include make.inc.cray
+else ifeq ($(COMPILER),fj)
+    include make.inc.fj
+else
+    $(info Set COMPILER={nvhpc,oneapi,amd,arm,cray,fj,gcc}. Default is gcc.)
+    include make.inc.gcc
+    COMPILER=gcc
+endif
+
+FCFLAGS += -DVERSION_STRING="5.0"
+#FCFLAGS += -DUSE_INT32
+
+ifeq ($(IMPLEMENTATION),DoConcurrent)
+    FCFLAGS += -DUSE_DOCONCURRENT $(DOCONCURRENT_FLAG)
+    IMPLEMENTATION_OBJECT = DoConcurrentStream.o
+
+else ifeq ($(IMPLEMENTATION),Array)
+    FCFLAGS += -DUSE_ARRAY $(ARRAY_FLAG)
+    IMPLEMENTATION_OBJECT = ArrayStream.o
+
+else ifeq ($(IMPLEMENTATION),OpenMP)
+    FCFLAGS += -DUSE_OPENMP $(OPENMP_FLAG)
+    IMPLEMENTATION_OBJECT = OpenMPStream.o
+
+else ifeq ($(IMPLEMENTATION),OpenMPWorkshare)
+    FCFLAGS += -DUSE_OPENMPWORKSHARE $(OPENMP_FLAG)
+    IMPLEMENTATION_OBJECT = OpenMPWorkshareStream.o
+
+else ifeq ($(IMPLEMENTATION),OpenMPTarget)
+    FCFLAGS += -DUSE_OPENMPTARGET $(OPENMP_FLAG)
+    IMPLEMENTATION_OBJECT = OpenMPTargetStream.o
+
+else ifeq ($(IMPLEMENTATION),OpenMPTargetLoop)
+    FCFLAGS += -DUSE_OPENMPTARGETLOOP $(OPENMP_FLAG)
+    IMPLEMENTATION_OBJECT = OpenMPTargetLoopStream.o
+
+else ifeq ($(IMPLEMENTATION),OpenMPTaskloop)
+    FCFLAGS += -DUSE_OPENMPTASKLOOP $(OPENMP_FLAG)
+    IMPLEMENTATION_OBJECT = OpenMPTaskloopStream.o
+
+else ifeq ($(IMPLEMENTATION),OpenACC)
+    FCFLAGS += -DUSE_OPENACC $(OPENACC_FLAG)
+    IMPLEMENTATION_OBJECT = OpenACCStream.o
+
+else ifeq ($(IMPLEMENTATION),OpenACCArray)
+    FCFLAGS += -DUSE_OPENACCARRAY $(OPENACC_FLAG)
+    IMPLEMENTATION_OBJECT = OpenACCArrayStream.o
+
+else ifeq ($(IMPLEMENTATION),CUDA)
+    FCFLAGS += -DUSE_CUDA $(CUDA_FLAG)
+    IMPLEMENTATION_OBJECT = CUDAStream.o
+
+else ifeq ($(IMPLEMENTATION),CUDAKernel)
+    FCFLAGS += -DUSE_CUDAKERNEL $(CUDA_FLAG)
+    IMPLEMENTATION_OBJECT = CUDAKernelStream.o
+
+else ifeq ($(IMPLEMENTATION),Sequential)
+    FCFLAGS += -DUSE_SEQUENTIAL $(SEQUENTIAL_FLAG)
+    IMPLEMENTATION_OBJECT = SequentialStream.o
+
+else
+    $(info Set IMPLEMENTATION={DoConcurrent,Array,OpenMP,OpenMPWorkshare,OpenMPTarget,OpenMPTargetLoop,OpenMPTaskloop,OpenACC,OpenACCArray,CUDA,CUDAKernel,Sequential}. Default is Sequential.)
+    FCFLAGS += -DUSE_SEQUENTIAL $(SEQUENTIAL_FLAG)
+    IMPLEMENTATION=Sequential
+    IMPLEMENTATION_OBJECT = SequentialStream.o
+
+endif
+
+.PHONY: all clean realclean
+
+all: BabelStream.$(COMPILER).$(IMPLEMENTATION)
+
+BabelStream.$(COMPILER).$(IMPLEMENTATION): main.F90 $(IMPLEMENTATION_OBJECT)
+	$(FC) $(FCFLAGS) $^ BabelStreamTypes.o -o $@
+
+BabelStreamTypes.o BabelStreamTypes.mod: BabelStreamTypes.F90
+	$(FC) $(FCFLAGS) -c $<
+
+%.o: %.F90 BabelStreamTypes.mod
+	$(FC) $(FCFLAGS) -c $<
+
+clean:
+	-rm -f main.o BabelStreamUtil.mod babelstreamutil.mod
+	-rm -f BabelStreamTypes.o BabelStreamTypes.mod babelstreamtypes.mod
+	-rm -f DoConcurrentStream.o DoConcurrentStream.mod doconcurrentstream.mod
+	-rm -f ArrayStream.o ArrayStream.mod arraystream.mod
+	-rm -f SequentialStream.o SequentialStream.mod sequentialstream.mod
+	-rm -f OpenMPStream.o OpenMPStream.mod openmpstream.mod
+	-rm -f OpenMPWorkshareStream.o OpenMPWorkshareStream.mod openmpworksharestream.mod
+	-rm -f OpenMPTaskloopStream.o OpenMPTaskloopStream.mod openmptaskloopstream.mod
+	-rm -f OpenMPTargetStream.o OpenMPTargetStream.mod openmptargetstream.mod
+	-rm -f OpenMPTargetLoopStream.o OpenMPTargetLoopStream.mod openmptargetloopstream.mod
+	-rm -f OpenACCStream.o OpenACCStream.mod openaccstream.mod
+	-rm -f OpenACCArrayStream.o OpenACCArrayStream.mod openaccarraystream.mod
+	-rm -f CUDAStream.o CUDAStream.mod cudastream.mod CUDAFortranKernels.mod cudafortrankernels.mod
+	-rm -f CUDAKernelStream.o CUDAKernelStream.mod cudakernelstream.mod
+	-rm -f *.modmic *.mod *.o *.cub *.ptx
+
+realclean: clean
+	-rm -f BabelStream.*
diff --git a/src/fortran/OpenACCArrayStream.F90 b/src/fortran/OpenACCArrayStream.F90
new file mode 100644
index 0000000..9225fe7
--- /dev/null
+++ b/src/fortran/OpenACCArrayStream.F90
@@ -0,0 +1,158 @@
+! OpenACC backend of the BabelStream kernels, written with whole-array syntax inside "kernels" regions.
+module OpenACCArrayStream
+    use, intrinsic :: ISO_Fortran_env
+    use BabelStreamTypes
+
+    implicit none
+
+    character(len=12), parameter :: implementation_name = "OpenACCArray"
+
+    integer(kind=StreamIntKind) :: N
+
+    real(kind=REAL64), allocatable :: A(:), B(:), C(:)
+
+    contains
+
+        ! Print how many devices of the current OpenACC device type are available.
+        subroutine list_devices()
+            use openacc
+            implicit none
+            integer :: num
+            num = acc_get_num_devices(acc_get_device_type())
+            if (num.eq.0) then
+                write(*,'(a17)') "No devices found."
+            else
+                write(*,'(a,i0,a)') "There are ",num," devices."
+            end if
+        end subroutine list_devices
+
+        ! Make device number dev current; numbers run from 0 to num-1, anything at or above num stops with status 1.
+        subroutine set_device(dev)
+            use openacc
+            implicit none
+            integer, intent(in) :: dev
+            integer :: num
+            num = acc_get_num_devices(acc_get_device_type())
+            if (num.eq.0) then
+                write(*,'(a17)') "No devices found."
+                stop 1
+            else if (dev.ge.num) then
+                write(*,'(a21)') "Invalid device index."
+                stop 1
+            else
+                call acc_set_device_num(dev, acc_get_device_type())
+            end if
+        end subroutine set_device
+
+        ! Allocate A, B and C with array_size elements each and, unless USE_MANAGED, create them on the device.
+        subroutine alloc(array_size)
+            implicit none
+            integer(kind=StreamIntKind), intent(in) :: array_size
+            integer :: err
+            N = array_size
+            allocate( A(1:N), B(1:N), C(1:N), stat=err)
+            if (err .ne. 0) then
+                write(*,'(a,i0)') 'allocate returned ',err
+                stop 1
+            endif
+#ifndef USE_MANAGED
+            ! device copies are created only once the host allocation is known to have succeeded
+            !$acc enter data create(A,B,C)
+#endif
+        end subroutine alloc
+
+        ! Remove A, B and C from the device (unless USE_MANAGED), then free the host arrays.
+        subroutine dealloc()
+            implicit none
+            integer :: err
+#ifndef USE_MANAGED
+            !$acc exit data delete(A,B,C)
+#endif
+            deallocate( A, B, C, stat=err)
+            if (err .ne. 0) then
+                write(*,'(a,i0)') 'deallocate returned ',err
+                stop 1
+            endif
+        end subroutine dealloc
+
+        ! Fill A, B and C with the constants initA, initB and initC.
+        subroutine init_arrays(initA, initB, initC)
+            implicit none
+            real(kind=REAL64), intent(in) :: initA, initB, initC
+            !$acc kernels
+            A = initA
+            B = initB
+            C = initC
+            !$acc end kernels
+        end subroutine init_arrays
+
+        ! Copy A, B and C into the caller's arrays h_A, h_B and h_C.
+        subroutine read_arrays(h_A, h_B, h_C)
+            implicit none
+            real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
+            !$acc kernels
+            h_A = A
+            h_B = B
+            h_C = C
+            !$acc end kernels
+        end subroutine read_arrays
+
+        ! Copy kernel: C = A
+        subroutine copy()
+            implicit none
+            !$acc kernels
+            C = A
+            !$acc end kernels
+        end subroutine copy
+
+        ! Add kernel: C = A + B
+        subroutine add()
+            implicit none
+            !$acc kernels
+            C = A + B
+            !$acc end kernels
+        end subroutine add
+
+        ! Mul kernel: B = startScalar * C
+        subroutine mul(startScalar)
+            implicit none
+            real(kind=REAL64), intent(in) :: startScalar
+            real(kind=REAL64) :: scalar
+            scalar = startScalar
+            !$acc kernels
+            B = scalar * C
+            !$acc end kernels
+        end subroutine mul
+
+        ! Triad kernel: A = B + startScalar * C
+        subroutine triad(startScalar)
+            implicit none
+            real(kind=REAL64), intent(in) :: startScalar
+            real(kind=REAL64) :: scalar
+            scalar = startScalar
+            !$acc kernels
+            A = B + scalar * C
+            !$acc end kernels
+        end subroutine triad
+
+        ! Nstream kernel: A = A + B + startScalar * C
+        subroutine nstream(startScalar)
+            implicit none
+            real(kind=REAL64), intent(in) :: startScalar
+            real(kind=REAL64) :: scalar
+            scalar = startScalar
+            !$acc kernels
+            A = A + B + scalar * C
+            !$acc end kernels
+        end subroutine nstream
+
+        ! Dot kernel: returns the dot product of A and B.
+        function dot() result(s)
+            implicit none
+            real(kind=REAL64) :: s
+            !$acc kernels
+            s = dot_product(A,B)
+            !$acc end kernels
+        end function dot
+
+end module OpenACCArrayStream
diff --git a/src/fortran/OpenACCStream.F90 b/src/fortran/OpenACCStream.F90
new file mode 100644
index 0000000..7326f38
--- 
/dev/null
+++ b/src/fortran/OpenACCStream.F90
@@ -0,0 +1,175 @@
+! OpenACC backend of the BabelStream kernels, written as explicit "parallel loop" nests.
+module OpenACCStream
+    use, intrinsic :: ISO_Fortran_env
+    use BabelStreamTypes
+
+    implicit none
+
+    character(len=7), parameter :: implementation_name = "OpenACC"
+
+    integer(kind=StreamIntKind) :: N
+
+    real(kind=REAL64), allocatable :: A(:), B(:), C(:)
+
+    contains
+
+        ! Print how many devices of the current OpenACC device type are available.
+        subroutine list_devices()
+            use openacc
+            implicit none
+            integer :: num
+            num = acc_get_num_devices(acc_get_device_type())
+            if (num.eq.0) then
+                write(*,'(a17)') "No devices found."
+            else
+                write(*,'(a,i0,a)') "There are ",num," devices."
+            end if
+        end subroutine list_devices
+
+        ! Make device number dev current; numbers run from 0 to num-1, anything at or above num stops with status 1.
+        subroutine set_device(dev)
+            use openacc
+            implicit none
+            integer, intent(in) :: dev
+            integer :: num
+            num = acc_get_num_devices(acc_get_device_type())
+            if (num.eq.0) then
+                write(*,'(a17)') "No devices found."
+                stop 1
+            else if (dev.ge.num) then
+                write(*,'(a21)') "Invalid device index."
+                stop 1
+            else
+                call acc_set_device_num(dev, acc_get_device_type())
+            end if
+        end subroutine set_device
+
+        ! Allocate A, B and C with array_size elements each and, unless USE_MANAGED, create them on the device.
+        subroutine alloc(array_size)
+            implicit none
+            integer(kind=StreamIntKind), intent(in) :: array_size
+            integer :: err
+            N = array_size
+            allocate( A(1:N), B(1:N), C(1:N), stat=err)
+            if (err .ne. 0) then
+                write(*,'(a,i0)') 'allocate returned ',err
+                stop 1
+            endif
+#ifndef USE_MANAGED
+            ! device copies are created only once the host allocation is known to have succeeded
+            !$acc enter data create(A,B,C)
+#endif
+        end subroutine alloc
+
+        ! Remove A, B and C from the device (unless USE_MANAGED), then free the host arrays.
+        subroutine dealloc()
+            implicit none
+            integer :: err
+#ifndef USE_MANAGED
+            !$acc exit data delete(A,B,C)
+#endif
+            deallocate( A, B, C, stat=err)
+            if (err .ne. 0) then
+                write(*,'(a,i0)') 'deallocate returned ',err
+                stop 1
+            endif
+        end subroutine dealloc
+
+        ! Fill A, B and C with the constants initA, initB and initC.
+        subroutine init_arrays(initA, initB, initC)
+            implicit none
+            real(kind=REAL64), intent(in) :: initA, initB, initC
+            integer(kind=StreamIntKind) :: i
+            !$acc parallel loop
+            do i=1,N
+                A(i) = initA
+                B(i) = initB
+                C(i) = initC
+            end do
+        end subroutine init_arrays
+
+        ! Copy the first N elements of A, B and C into h_A, h_B and h_C.
+        subroutine read_arrays(h_A, h_B, h_C)
+            implicit none
+            real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
+            integer(kind=StreamIntKind) :: i
+            !$acc parallel loop
+            do i=1,N
+                h_A(i) = A(i)
+                h_B(i) = B(i)
+                h_C(i) = C(i)
+            end do
+        end subroutine read_arrays
+
+        ! Copy kernel: C(i) = A(i)
+        subroutine copy()
+            implicit none
+            integer(kind=StreamIntKind) :: i
+            !$acc parallel loop
+            do i=1,N
+                C(i) = A(i)
+            end do
+        end subroutine copy
+
+        ! Add kernel: C(i) = A(i) + B(i)
+        subroutine add()
+            implicit none
+            integer(kind=StreamIntKind) :: i
+            !$acc parallel loop
+            do i=1,N
+                C(i) = A(i) + B(i)
+            end do
+        end subroutine add
+
+        ! Mul kernel: B(i) = startScalar * C(i)
+        subroutine mul(startScalar)
+            implicit none
+            real(kind=REAL64), intent(in) :: startScalar
+            real(kind=REAL64) :: scalar
+            integer(kind=StreamIntKind) :: i
+            scalar = startScalar
+            !$acc parallel loop
+            do i=1,N
+                B(i) = scalar * C(i)
+            end do
+        end subroutine mul
+
+        ! Triad kernel: A(i) = B(i) + startScalar * C(i)
+        subroutine triad(startScalar)
+            implicit none
+            real(kind=REAL64), intent(in) :: startScalar
+            real(kind=REAL64) :: scalar
+            integer(kind=StreamIntKind) :: i
+            scalar = startScalar
+            !$acc parallel loop
+            do i=1,N
+                A(i) = B(i) + scalar * C(i)
+            end do
+        end subroutine triad
+
+        ! Nstream kernel: A(i) = A(i) + B(i) + startScalar * C(i)
+        subroutine nstream(startScalar)
+            implicit none
+            real(kind=REAL64), intent(in) :: startScalar
+            real(kind=REAL64) :: scalar
+            integer(kind=StreamIntKind) :: i
+            scalar = startScalar
+            !$acc parallel loop
+            do i=1,N
+                A(i) = A(i) + B(i) + scalar * C(i)
+            end do
+        end subroutine nstream
+
+        ! Dot kernel: returns the sum over i of A(i) * B(i).
+        function dot() result(s)
+            implicit none
+            real(kind=REAL64) :: s
+            integer(kind=StreamIntKind) :: i
+            s = real(0,kind=REAL64)
+            !$acc parallel loop reduction(+:s)
+            do i=1,N
+                s = s + A(i) * B(i)
+            end do
+        end function dot
+
+end module OpenACCStream
diff --git a/src/fortran/OpenMPStream.F90 b/src/fortran/OpenMPStream.F90
new file mode 100644
index 0000000..7316d5b
--- /dev/null
+++ b/src/fortran/OpenMPStream.F90
@@ -0,0 +1,150 @@
+! Host OpenMP backend of the BabelStream kernels, using "parallel do simd" loops.
+module OpenMPStream
+    use, intrinsic :: ISO_Fortran_env
+    use BabelStreamTypes
+
+    implicit none
+
+    character(len=6), parameter :: implementation_name = "OpenMP"
+
+    integer(kind=StreamIntKind) :: N
+
+    real(kind=REAL64), allocatable :: A(:), B(:), C(:)
+
+    contains
+
+        ! Report that device listing is unavailable for this implementation.
+        subroutine list_devices()
+            implicit none
+            write(*,'(a,a)') "Listing devices is not supported by ", implementation_name
+        end subroutine list_devices
+
+        ! Report that device selection is unavailable; dev is not used.
+        subroutine set_device(dev)
+            implicit none
+            integer, intent(in) :: dev
+            write(*,'(a,a)') "Device != 0 is not supported by ", implementation_name
+        end subroutine set_device
+
+        ! Allocate A, B and C with array_size elements each; stop with status 1 on failure.
+        subroutine alloc(array_size)
+            implicit none
+            integer(kind=StreamIntKind), intent(in) :: array_size
+            integer :: err
+            N = array_size
+            allocate( A(1:N), B(1:N), C(1:N), stat=err)
+            if (err .ne. 0) then
+                write(*,'(a,i0)') 'allocate returned ',err
+                stop 1
+            endif
+        end subroutine alloc
+
+        ! Free A, B and C; stop with status 1 on failure.
+        subroutine dealloc()
+            implicit none
+            integer :: err
+            deallocate( A, B, C, stat=err)
+            if (err .ne. 0) then
+                write(*,'(a,i0)') 'deallocate returned ',err
+                stop 1
+            endif
+        end subroutine dealloc
+
+        ! Fill A, B and C with the constants initA, initB and initC.
+        subroutine init_arrays(initA, initB, initC)
+            implicit none
+            real(kind=REAL64), intent(in) :: initA, initB, initC
+            integer(kind=StreamIntKind) :: i
+            !$omp parallel do simd
+            do i=1,N
+                A(i) = initA
+                B(i) = initB
+                C(i) = initC
+            end do
+        end subroutine init_arrays
+
+        ! Copy the first N elements of A, B and C into h_A, h_B and h_C.
+        subroutine read_arrays(h_A, h_B, h_C)
+            implicit none
+            real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
+            integer(kind=StreamIntKind) :: i
+            !$omp parallel do simd
+            do i=1,N
+                h_A(i) = A(i)
+                h_B(i) = B(i)
+                h_C(i) = C(i)
+            end do
+        end subroutine read_arrays
+
+        ! Copy kernel: C(i) = A(i)
+        subroutine copy()
+            implicit none
+            integer(kind=StreamIntKind) :: i
+            !$omp parallel do simd
+            do i=1,N
+                C(i) = A(i)
+            end do
+        end subroutine copy
+
+        ! Add kernel: C(i) = A(i) + B(i)
+        subroutine add()
+            implicit none
+            integer(kind=StreamIntKind) :: i
+            !$omp parallel do simd
+            do i=1,N
+                C(i) = A(i) + B(i)
+            end do
+        end subroutine add
+
+        ! Mul kernel: B(i) = startScalar * C(i)
+        subroutine mul(startScalar)
+            implicit none
+            real(kind=REAL64), intent(in) :: startScalar
+            real(kind=REAL64) :: scalar
+            integer(kind=StreamIntKind) :: i
+            scalar = startScalar
+            !$omp parallel do simd
+            do i=1,N
+                B(i) = scalar * C(i)
+            end do
+        end subroutine mul
+
+        ! Triad kernel: A(i) = B(i) + startScalar * C(i)
+        subroutine triad(startScalar)
+            implicit none
+            real(kind=REAL64), intent(in) :: startScalar
+            real(kind=REAL64) :: scalar
+            integer(kind=StreamIntKind) :: i
+            scalar = startScalar
+            !$omp parallel do simd
+            do i=1,N
+                A(i) = B(i) + scalar * C(i)
+            end do
+        end subroutine triad
+
+        ! Nstream kernel: A(i) = A(i) + B(i) + startScalar * C(i)
+        subroutine nstream(startScalar)
+            implicit none
+            real(kind=REAL64), intent(in) :: startScalar
+            real(kind=REAL64) :: scalar
+            integer(kind=StreamIntKind) :: i
+            scalar = startScalar
+            !$omp parallel do simd
+            do i=1,N
+                A(i) = A(i) + B(i) + scalar * C(i)
+            end do
+        end subroutine nstream
+
+        ! Dot kernel: returns the sum over i of A(i) * B(i).
+        function dot() result(s)
+            implicit none
+            real(kind=REAL64) :: s
+            integer(kind=StreamIntKind) :: i
+            s = real(0,kind=REAL64)
+            !$omp parallel do simd reduction(+:s)
+            do i=1,N
+                s = s + A(i) * B(i)
+            end do
+        end function dot
+
+end module OpenMPStream
diff --git a/src/fortran/OpenMPTargetLoopStream.F90 b/src/fortran/OpenMPTargetLoopStream.F90
new file mode 100644
index 0000000..9684ced
--- /dev/null
+++ b/src/fortran/OpenMPTargetLoopStream.F90
@@ -0,0 +1,176 @@
+! OpenMP offload backend of the BabelStream kernels, using "target teams loop".
+module OpenMPTargetLoopStream
+    use, intrinsic :: ISO_Fortran_env
+    use BabelStreamTypes
+
+    implicit none
+
+    character(len=16), parameter :: implementation_name = "OpenMPTargetLoop"
+
+    integer(kind=StreamIntKind) :: N
+
+    real(kind=REAL64), allocatable :: A(:), B(:), C(:)
+
+    contains
+
+        ! Print how many OpenMP target devices are available.
+        subroutine list_devices()
+            use omp_lib
+            implicit none
+            integer :: num
+            num = omp_get_num_devices()
+            if (num.eq.0) then
+                write(*,'(a17)') "No devices found."
+            else
+                write(*,'(a,i0,a)') "There are ",num," devices."
+            end if
+        end subroutine list_devices
+
+        ! Make dev the default target device; stop with status 1 when there is no device or dev exceeds the count.
+        subroutine set_device(dev)
+            use omp_lib
+            implicit none
+            integer, intent(in) :: dev
+            integer :: num
+            num = omp_get_num_devices()
+            if (num.eq.0) then
+                write(*,'(a17)') "No devices found."
+                stop 1
+            else if (dev.gt.num) then
+                write(*,'(a21)') "Invalid device index."
+                stop 1
+            else
+                call omp_set_default_device(dev)
+            end if
+        end subroutine set_device
+
+        ! Allocate A, B and C with array_size elements each and, unless USE_MANAGED, map them to the device.
+        subroutine alloc(array_size)
+            implicit none
+            integer(kind=StreamIntKind), intent(in) :: array_size
+            integer :: err
+            N = array_size
+            allocate( A(1:N), B(1:N), C(1:N), stat=err)
+            if (err .ne. 0) then
+                write(*,'(a,i0)') 'allocate returned ',err
+                stop 1
+            endif
+#ifndef USE_MANAGED
+            ! device storage is mapped only once the host allocation is known to have succeeded
+            !$omp target enter data map(alloc: A,B,C)
+#endif
+        end subroutine alloc
+
+        ! Unmap A, B and C from the device (unless USE_MANAGED), then free the host arrays.
+        subroutine dealloc()
+            implicit none
+            integer :: err
+#ifndef USE_MANAGED
+            !$omp target exit data map(delete: A,B,C)
+#endif
+            deallocate( A, B, C, stat=err)
+            if (err .ne. 0) then
+                write(*,'(a,i0)') 'deallocate returned ',err
+                stop 1
+            endif
+        end subroutine dealloc
+
+        ! Fill A, B and C with the constants initA, initB and initC.
+        subroutine init_arrays(initA, initB, initC)
+            implicit none
+            real(kind=REAL64), intent(in) :: initA, initB, initC
+            integer(kind=StreamIntKind) :: i
+            !$omp target teams loop
+            do i=1,N
+                A(i) = initA
+                B(i) = initB
+                C(i) = initC
+            end do
+        end subroutine init_arrays
+
+        ! Copy the first N elements of A, B and C into h_A, h_B and h_C.
+        subroutine read_arrays(h_A, h_B, h_C)
+            implicit none
+            real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
+            integer(kind=StreamIntKind) :: i
+            ! this might need to use a copy API instead...
+            !$omp target teams loop
+            do i=1,N
+                h_A(i) = A(i)
+                h_B(i) = B(i)
+                h_C(i) = C(i)
+            end do
+        end subroutine read_arrays
+
+        ! Copy kernel: C(i) = A(i)
+        subroutine copy()
+            implicit none
+            integer(kind=StreamIntKind) :: i
+            !$omp target teams loop
+            do i=1,N
+                C(i) = A(i)
+            end do
+        end subroutine copy
+
+        ! Add kernel: C(i) = A(i) + B(i)
+        subroutine add()
+            implicit none
+            integer(kind=StreamIntKind) :: i
+            !$omp target teams loop
+            do i=1,N
+                C(i) = A(i) + B(i)
+            end do
+        end subroutine add
+
+        ! Mul kernel: B(i) = startScalar * C(i)
+        subroutine mul(startScalar)
+            implicit none
+            real(kind=REAL64), intent(in) :: startScalar
+            real(kind=REAL64) :: scalar
+            integer(kind=StreamIntKind) :: i
+            scalar = startScalar
+            !$omp target teams loop
+            do i=1,N
+                B(i) = scalar * C(i)
+            end do
+        end subroutine mul
+
+        ! Triad kernel: A(i) = B(i) + startScalar * C(i)
+        subroutine triad(startScalar)
+            implicit none
+            real(kind=REAL64), intent(in) :: startScalar
+            real(kind=REAL64) :: scalar
+            integer(kind=StreamIntKind) :: i
+            scalar = startScalar
+            !$omp target teams loop
+            do i=1,N
+                A(i) = B(i) + scalar * C(i)
+            end do
+        end subroutine triad
+
+        ! Nstream kernel: A(i) = A(i) + B(i) + startScalar * C(i)
+        subroutine nstream(startScalar)
+            implicit none
+            real(kind=REAL64), intent(in) :: startScalar
+            real(kind=REAL64) :: scalar
+            integer(kind=StreamIntKind) :: i
+            scalar = startScalar
+            !$omp target teams loop
+            do i=1,N
+                A(i) = A(i) + B(i) + scalar * C(i)
+            end do
+        end subroutine nstream
+
+        ! Dot kernel: returns the sum over i of A(i) * B(i); s is mapped tofrom so the result reaches the host.
+        function dot() result(s)
+            implicit none
+            real(kind=REAL64) :: s
+            integer(kind=StreamIntKind) :: i
+            s = real(0,kind=REAL64)
+            !$omp target teams loop map(tofrom: s) reduction(+:s)
+            do i=1,N
+                s = s + A(i) * B(i)
+            end do
+        end function dot
+
+end module OpenMPTargetLoopStream
diff --git a/src/fortran/OpenMPTargetStream.F90 b/src/fortran/OpenMPTargetStream.F90
new file mode 100644
index 0000000..0206d78
--- /dev/null
+++ b/src/fortran/OpenMPTargetStream.F90
@@ -0,0 +1,176 @@
+! OpenMP offload backend of the BabelStream kernels, using "target teams distribute parallel do simd".
+module OpenMPTargetStream
+    use, intrinsic :: ISO_Fortran_env
+    use BabelStreamTypes
+
+    implicit none
+
+    character(len=12), parameter :: implementation_name = "OpenMPTarget"
+
+    integer(kind=StreamIntKind) :: N
+
+    real(kind=REAL64), allocatable :: A(:), B(:), C(:)
+
+    contains
+
+        ! Print how many OpenMP target devices are available.
+        subroutine list_devices()
+            use omp_lib
+            implicit none
+            integer :: num
+            num = omp_get_num_devices()
+            if (num.eq.0) then
+                write(*,'(a17)') "No devices found."
+            else
+                write(*,'(a,i0,a)') "There are ",num," devices."
+            end if
+        end subroutine list_devices
+
+        ! Make dev the default target device; stop with status 1 when there is no device or dev exceeds the count.
+        subroutine set_device(dev)
+            use omp_lib
+            implicit none
+            integer, intent(in) :: dev
+            integer :: num
+            num = omp_get_num_devices()
+            if (num.eq.0) then
+                write(*,'(a17)') "No devices found."
+                stop 1
+            else if (dev.gt.num) then
+                write(*,'(a21)') "Invalid device index."
+                stop 1
+            else
+                call omp_set_default_device(dev)
+            end if
+        end subroutine set_device
+
+        ! Allocate A, B and C with array_size elements each and, unless USE_MANAGED, map them to the device.
+        subroutine alloc(array_size)
+            implicit none
+            integer(kind=StreamIntKind), intent(in) :: array_size
+            integer :: err
+            N = array_size
+            allocate( A(1:N), B(1:N), C(1:N), stat=err)
+            if (err .ne. 0) then
+                write(*,'(a,i0)') 'allocate returned ',err
+                stop 1
+            endif
+#ifndef USE_MANAGED
+            ! device storage is mapped only once the host allocation is known to have succeeded
+            !$omp target enter data map(alloc: A,B,C)
+#endif
+        end subroutine alloc
+
+        ! Unmap A, B and C from the device (unless USE_MANAGED), then free the host arrays.
+        subroutine dealloc()
+            implicit none
+            integer :: err
+#ifndef USE_MANAGED
+            !$omp target exit data map(delete: A,B,C)
+#endif
+            deallocate( A, B, C, stat=err)
+            if (err .ne. 0) then
+                write(*,'(a,i0)') 'deallocate returned ',err
+                stop 1
+            endif
+        end subroutine dealloc
+
+        ! Fill A, B and C with the constants initA, initB and initC.
+        subroutine init_arrays(initA, initB, initC)
+            implicit none
+            real(kind=REAL64), intent(in) :: initA, initB, initC
+            integer(kind=StreamIntKind) :: i
+            !$omp target teams distribute parallel do simd
+            do i=1,N
+                A(i) = initA
+                B(i) = initB
+                C(i) = initC
+            end do
+        end subroutine init_arrays
+
+        ! Copy the first N elements of A, B and C into h_A, h_B and h_C.
+        subroutine read_arrays(h_A, h_B, h_C)
+            implicit none
+            real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
+            integer(kind=StreamIntKind) :: i
+            ! this might need to use a copy API instead...
+            !$omp target teams distribute parallel do simd
+            do i=1,N
+                h_A(i) = A(i)
+                h_B(i) = B(i)
+                h_C(i) = C(i)
+            end do
+        end subroutine read_arrays
+
+        ! Copy kernel: C(i) = A(i)
+        subroutine copy()
+            implicit none
+            integer(kind=StreamIntKind) :: i
+            !$omp target teams distribute parallel do simd
+            do i=1,N
+                C(i) = A(i)
+            end do
+        end subroutine copy
+
+        ! Add kernel: C(i) = A(i) + B(i)
+        subroutine add()
+            implicit none
+            integer(kind=StreamIntKind) :: i
+            !$omp target teams distribute parallel do simd
+            do i=1,N
+                C(i) = A(i) + B(i)
+            end do
+        end subroutine add
+
+        ! Mul kernel: B(i) = startScalar * C(i)
+        subroutine mul(startScalar)
+            implicit none
+            real(kind=REAL64), intent(in) :: startScalar
+            real(kind=REAL64) :: scalar
+            integer(kind=StreamIntKind) :: i
+            scalar = startScalar
+            !$omp target teams distribute parallel do simd
+            do i=1,N
+                B(i) = scalar * C(i)
+            end do
+        end subroutine mul
+
+        ! Triad kernel: A(i) = B(i) + startScalar * C(i)
+        subroutine triad(startScalar)
+            implicit none
+            real(kind=REAL64), intent(in) :: startScalar
+            real(kind=REAL64) :: scalar
+            integer(kind=StreamIntKind) :: i
+            scalar = startScalar
+            !$omp target teams distribute parallel do simd
+            do i=1,N
+                A(i) = B(i) + scalar * C(i)
+            end do
+        end subroutine triad
+
+        ! Nstream kernel: A(i) = A(i) + B(i) + startScalar * C(i)
+        subroutine nstream(startScalar)
+            implicit none
+            real(kind=REAL64), intent(in) :: startScalar
+            real(kind=REAL64) :: scalar
+            integer(kind=StreamIntKind) :: i
+            scalar = startScalar
+            !$omp target teams distribute parallel do simd
+            do i=1,N
+                A(i) = A(i) + B(i) + scalar * C(i)
+            end do
+        end subroutine nstream
+
+        ! Dot kernel: returns the sum over i of A(i) * B(i); s is mapped tofrom so the result reaches the host.
+        function dot() result(s)
+            implicit none
+            real(kind=REAL64) :: s
+            integer(kind=StreamIntKind) :: i
+            s = real(0,kind=REAL64)
+            !$omp target teams distribute parallel do simd map(tofrom: s) reduction(+:s)
+            do i=1,N
+                s = s + A(i) * B(i)
+            end do
+        end function dot
+
+end module OpenMPTargetStream
diff --git a/src/fortran/OpenMPTaskloopStream.F90 b/src/fortran/OpenMPTaskloopStream.F90
new file mode 100644
index 0000000..579a761
--- /dev/null
+++ b/src/fortran/OpenMPTaskloopStream.F90
@@ -0,0 +1,182 @@
+! Host OpenMP backend of the BabelStream kernels, using a taskloop issued from the master thread.
+module OpenMPTaskloopStream
+    use, intrinsic :: ISO_Fortran_env
+    use BabelStreamTypes
+
+    implicit none
+
+    character(len=14), parameter :: implementation_name = "OpenMPTaskloop"
+
+    integer(kind=StreamIntKind) :: N
+
+    real(kind=REAL64), allocatable :: A(:), B(:), C(:)
+
+    contains
+
+        ! Report that device listing is unavailable for this implementation.
+        subroutine list_devices()
+            implicit none
+            write(*,'(a,a)') "Listing devices is not supported by ", implementation_name
+        end subroutine list_devices
+
+        ! Report that device selection is unavailable; dev is not used.
+        subroutine set_device(dev)
+            implicit none
+            integer, intent(in) :: dev
+            write(*,'(a,a)') "Device != 0 is not supported by ", implementation_name
+        end subroutine set_device
+
+        ! Allocate A, B and C with array_size elements each; stop with status 1 on failure.
+        subroutine alloc(array_size)
+            implicit none
+            integer(kind=StreamIntKind), intent(in) :: array_size
+            integer :: err
+            N = array_size
+            allocate( A(1:N), B(1:N), C(1:N), stat=err)
+            if (err .ne. 0) then
+                write(*,'(a,i0)') 'allocate returned ',err
+                stop 1
+            endif
+        end subroutine alloc
+
+        ! Free A, B and C; stop with status 1 on failure.
+        subroutine dealloc()
+            implicit none
+            integer :: err
+            deallocate( A, B, C, stat=err)
+            if (err .ne. 0) then
+                write(*,'(a,i0)') 'deallocate returned ',err
+                stop 1
+            endif
+        end subroutine dealloc
+
+        ! Fill A, B and C with the constants initA, initB and initC.
+        subroutine init_arrays(initA, initB, initC)
+            implicit none
+            real(kind=REAL64), intent(in) :: initA, initB, initC
+            integer(kind=StreamIntKind) :: i
+            !$omp parallel
+            !$omp master
+            !$omp taskloop
+            do i=1,N
+                A(i) = initA
+                B(i) = initB
+                C(i) = initC
+            end do
+            !$omp end master
+            !$omp end parallel
+        end subroutine init_arrays
+
+        ! Copy the first N elements of A, B and C into h_A, h_B and h_C.
+        subroutine read_arrays(h_A, h_B, h_C)
+            implicit none
+            real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
+            integer(kind=StreamIntKind) :: i
+            !$omp parallel
+            !$omp master
+            !$omp taskloop
+            do i=1,N
+                h_A(i) = A(i)
+                h_B(i) = B(i)
+                h_C(i) = C(i)
+            end do
+            !$omp end master
+            !$omp end parallel
+        end subroutine read_arrays
+
+        ! Copy kernel: C(i) = A(i)
+        subroutine copy()
+            implicit none
+            integer(kind=StreamIntKind) :: i
+            !$omp parallel
+            !$omp master
+            !$omp taskloop
+            do i=1,N
+                C(i) = A(i)
+            end do
+            !$omp end master
+            !$omp end parallel
+        end subroutine copy
+
+        ! Add kernel: C(i) = A(i) + B(i)
+        subroutine add()
+            implicit none
+            integer(kind=StreamIntKind) :: i
+            !$omp parallel
+            !$omp master
+            !$omp taskloop
+            do i=1,N
+                C(i) = A(i) + B(i)
+            end do
+            !$omp end master
+            !$omp end parallel
+        end subroutine add
+
+        ! Mul kernel: B(i) = startScalar * C(i)
+        subroutine mul(startScalar)
+            implicit none
+            real(kind=REAL64), intent(in) :: startScalar
+            real(kind=REAL64) :: scalar
+            integer(kind=StreamIntKind) :: i
+            scalar = startScalar
+            !$omp parallel
+            !$omp master
+            !$omp taskloop
+            do i=1,N
+                B(i) = scalar * C(i)
+            end do
+            !$omp end master
+            !$omp end parallel
+        end subroutine mul
+
+        ! Triad kernel: A(i) = B(i) + startScalar * C(i)
+        subroutine triad(startScalar)
+            implicit none
+            real(kind=REAL64), intent(in) :: startScalar
+            real(kind=REAL64) :: scalar
+            integer(kind=StreamIntKind) :: i
+            scalar = startScalar
+            !$omp parallel
+            !$omp master
+            !$omp taskloop
+            do i=1,N
+                A(i) = B(i) + scalar * C(i)
+            end do
+            !$omp end master
+            !$omp end parallel
+        end subroutine triad
+
+        ! Nstream kernel: A(i) = A(i) + B(i) + startScalar * C(i)
+        subroutine nstream(startScalar)
+            implicit none
+            real(kind=REAL64), intent(in) :: startScalar
+            real(kind=REAL64) :: scalar
+            integer(kind=StreamIntKind) :: i
+            scalar = startScalar
+            !$omp parallel
+            !$omp master
+            !$omp taskloop
+            do i=1,N
+                A(i) = A(i) + B(i) + scalar * C(i)
+            end do
+            !$omp end master
+            !$omp end parallel
+        end subroutine nstream
+
+        ! Dot kernel: returns the sum over i of A(i) * B(i).
+        function dot() result(s)
+            implicit none
+            real(kind=REAL64) :: s
+            integer(kind=StreamIntKind) :: i
+            s = real(0,kind=REAL64)
+            !$omp parallel
+            !$omp master
+            !$omp taskloop reduction(+:s)
+            do i=1,N
+                s = s + A(i) * B(i)
+            end do
+            !$omp end master
+            !$omp end parallel
+        end function dot
+
+end module OpenMPTaskloopStream
diff --git a/src/fortran/OpenMPWorkshareStream.F90 b/src/fortran/OpenMPWorkshareStream.F90
new file mode 100644
index 0000000..fd50f86
--- /dev/null
+++ b/src/fortran/OpenMPWorkshareStream.F90
@@ -0,0 +1,133 @@
+! Host OpenMP backend of the BabelStream kernels, using whole-array syntax in "parallel workshare" regions.
+module OpenMPWorkshareStream
+    use, intrinsic :: ISO_Fortran_env
+    use BabelStreamTypes
+
+    implicit none
+
+    character(len=15), parameter :: implementation_name = "OpenMPWorkshare"
+
+    integer(kind=StreamIntKind) :: N
+
+    real(kind=REAL64), allocatable :: A(:), B(:), C(:)
+
+    contains
+
+        ! Report that device listing is unavailable for this implementation.
+        subroutine list_devices()
+            implicit none
+            write(*,'(a,a)') "Listing devices is not supported by ", implementation_name
+        end subroutine list_devices
+
+        ! Report that device selection is unavailable; dev is not used.
+        subroutine set_device(dev)
+            implicit none
+            integer, intent(in) :: dev
+            write(*,'(a,a)') "Device != 0 is not supported by ", implementation_name
+        end subroutine set_device
+
+        ! Allocate A, B and C with array_size elements each; stop with status 1 on failure.
+        subroutine alloc(array_size)
+            implicit none
+            integer(kind=StreamIntKind), intent(in) :: array_size
+            integer :: err
+            N = array_size
+            allocate( A(1:N), B(1:N), C(1:N), stat=err)
+            if (err .ne. 0) then
+                write(*,'(a,i0)') 'allocate returned ',err
+                stop 1
+            endif
+        end subroutine alloc
+
+        ! Free A, B and C; stop with status 1 on failure.
+        subroutine dealloc()
+            implicit none
+            integer :: err
+            deallocate( A, B, C, stat=err)
+            if (err .ne. 0) then
+                write(*,'(a,i0)') 'deallocate returned ',err
+                stop 1
+            endif
+        end subroutine dealloc
+
+        ! Fill A, B and C with the constants initA, initB and initC.
+        subroutine init_arrays(initA, initB, initC)
+            implicit none
+            real(kind=REAL64), intent(in) :: initA, initB, initC
+            !$omp parallel workshare
+            A = initA
+            B = initB
+            C = initC
+            !$omp end parallel workshare
+        end subroutine init_arrays
+
+        ! Copy A, B and C into the caller's arrays h_A, h_B and h_C.
+        subroutine read_arrays(h_A, h_B, h_C)
+            implicit none
+            real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
+            !$omp parallel workshare
+            h_A = A
+            h_B = B
+            h_C = C
+            !$omp end parallel workshare
+        end subroutine read_arrays
+
+        ! Copy kernel: C = A
+        subroutine copy()
+            implicit none
+            !$omp parallel workshare
+            C = A
+            !$omp end parallel workshare
+        end subroutine copy
+
+        ! Add kernel: C = A + B
+        subroutine add()
+            implicit none
+            !$omp parallel workshare
+            C = A + B
+            !$omp end parallel workshare
+        end subroutine add
+
+        ! Mul kernel: B = startScalar * C
+        subroutine mul(startScalar)
+            implicit none
+            real(kind=REAL64), intent(in) :: startScalar
+            real(kind=REAL64) :: scalar
+            scalar = startScalar
+            !$omp parallel workshare
+            B = scalar * C
+            !$omp end parallel workshare
+        end subroutine mul
+
+        ! Triad kernel: A = B + startScalar * C
+        subroutine triad(startScalar)
+            implicit none
+            real(kind=REAL64), intent(in) :: startScalar
+            real(kind=REAL64) :: scalar
+            scalar = startScalar
+            !$omp parallel workshare
+            A = B + scalar * C
+            !$omp end parallel workshare
+        end subroutine triad
+
+        ! Nstream kernel: A = A + B + startScalar * C
+        subroutine nstream(startScalar)
+            implicit none
+            real(kind=REAL64), intent(in) :: startScalar
+            real(kind=REAL64) :: scalar
+            scalar = startScalar
+            !$omp parallel workshare
+            A = A + B + scalar * C
+            !$omp end parallel workshare
+        end subroutine nstream
+
+        ! Dot kernel: returns the dot product of A and B.
+        function dot() result(s)
+            implicit none
+            real(kind=REAL64) :: s
+            !$omp parallel workshare
+            s = dot_product(A,B)
+            !$omp end parallel workshare
+        end function dot
+
+end module OpenMPWorkshareStream
diff --git a/src/fortran/SequentialStream.F90 b/src/fortran/SequentialStream.F90
new file mode 100644
index 0000000..a8f6917
--- /dev/null
+++ b/src/fortran/SequentialStream.F90
@@ -0,0 +1,142 @@
+! Sequential backend of the BabelStream kernels, written as plain do loops.
+module SequentialStream
+    use, intrinsic :: ISO_Fortran_env
+    use BabelStreamTypes
+
+    implicit none
+
+    character(len=10), parameter :: implementation_name = "Sequential"
+
+    integer(kind=StreamIntKind) :: N
+
+    real(kind=REAL64), allocatable :: A(:), B(:), C(:)
+
+    contains
+
+        ! Report that device listing is unavailable for this implementation.
+        subroutine list_devices()
+            implicit none
+            write(*,'(a,a)') "Listing devices is not supported by ", implementation_name
+        end subroutine list_devices
+
+        ! Report that device selection is unavailable; dev is not used.
+        subroutine set_device(dev)
+            implicit none
+            integer, intent(in) :: dev
+            write(*,'(a,a)') "Device != 0 is not supported by ", implementation_name
+        end subroutine set_device
+
+        ! Allocate A, B and C with array_size elements each; stop with status 1 on failure.
+        subroutine alloc(array_size)
+            implicit none
+            integer(kind=StreamIntKind), intent(in) :: array_size
+            integer :: err
+            N = array_size
+            allocate( A(1:N), B(1:N), C(1:N), stat=err)
+            if (err .ne. 0) then
+                write(*,'(a,i0)') 'allocate returned ',err
+                stop 1
+            endif
+        end subroutine alloc
+
+        ! Free A, B and C; stop with status 1 on failure.
+        subroutine dealloc()
+            implicit none
+            integer :: err
+            deallocate( A, B, C, stat=err)
+            if (err .ne. 0) then
+                write(*,'(a,i0)') 'deallocate returned ',err
+                stop 1
+            endif
+        end subroutine dealloc
+
+        ! Fill A, B and C with the constants initA, initB and initC.
+        subroutine init_arrays(initA, initB, initC)
+            implicit none
+            real(kind=REAL64), intent(in) :: initA, initB, initC
+            integer(kind=StreamIntKind) :: i
+            do i=1,N
+                A(i) = initA
+                B(i) = initB
+                C(i) = initC
+            end do
+        end subroutine init_arrays
+
+        ! Copy the first N elements of A, B and C into h_A, h_B and h_C.
+        subroutine read_arrays(h_A, h_B, h_C)
+            implicit none
+            real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
+            integer(kind=StreamIntKind) :: i
+            do i=1,N
+                h_A(i) = A(i)
+                h_B(i) = B(i)
+                h_C(i) = C(i)
+            end do
+        end subroutine read_arrays
+
+        ! Copy kernel: C(i) = A(i)
+        subroutine copy()
+            implicit none
+            integer(kind=StreamIntKind) :: i
+            do i=1,N
+                C(i) = A(i)
+            end do
+        end subroutine copy
+
+        ! Add kernel: C(i) = A(i) + B(i)
+        subroutine add()
+            implicit none
+            integer(kind=StreamIntKind) :: i
+            do i=1,N
+                C(i) = A(i) + B(i)
+            end do
+        end subroutine add
+
+        ! Mul kernel: B(i) = startScalar * C(i)
+        subroutine mul(startScalar)
+            implicit none
+            real(kind=REAL64), intent(in) :: startScalar
+            real(kind=REAL64) :: scalar
+            integer(kind=StreamIntKind) :: i
+            scalar = startScalar
+            do i=1,N
+                B(i) = scalar * C(i)
+            end do
+        end subroutine mul
+
+        ! Triad kernel: A(i) = B(i) + startScalar * C(i)
+        subroutine triad(startScalar)
+            implicit none
+            real(kind=REAL64), intent(in) :: startScalar
+            real(kind=REAL64) :: scalar
+            integer(kind=StreamIntKind) :: i
+            scalar = startScalar
+            do i=1,N
+                A(i) = B(i) + scalar * C(i)
+            end do
+        end subroutine triad
+
+        ! Nstream kernel: A(i) = A(i) + B(i) + startScalar * C(i)
+        subroutine nstream(startScalar)
+            implicit none
+            real(kind=REAL64), intent(in) :: startScalar
+            real(kind=REAL64) :: scalar
+            integer(kind=StreamIntKind) :: i
+            scalar = startScalar
+            do i=1,N
+                A(i) = A(i) + B(i) + scalar * C(i)
+            end do
+        end subroutine nstream
+
+        ! Dot kernel: returns the sum over i of A(i) * B(i).
+        function dot() result(s)
+            implicit none
+            real(kind=REAL64) :: s
+            integer(kind=StreamIntKind) :: i
+            s = real(0,kind=REAL64)
+            do i=1,N
+                s = s + A(i) * B(i)
+            end do
+        end function dot
+
+end module SequentialStream
diff --git a/src/fortran/build.sh b/src/fortran/build.sh
new file mode 100755
index 0000000..9343354
--- /dev/null
+++ b/src/fortran/build.sh
@@ -0,0 +1,54 @@ +#!/bin/bash +# Build every Fortran BabelStream implementation with each compiler detected on this host. +# uncomment to disable GPU targets +#HAS_GPU=0 + +# Orin +#if [ "x${compiler}" == "xgcc" ] ; then +# export MCPU=cortex-a78ae +#fi +#if [ "x${compiler}" == "xarm" ] ; then +# export MCPU=cortex-a78 +#fi + +COMPILERS="gcc" +if command -v nvfortran > /dev/null 2>&1 ; then + COMPILERS="${COMPILERS} nvhpc" +fi +if command -v crayftn > /dev/null 2>&1 ; then + COMPILERS="${COMPILERS} cray" +fi +if [ "$(uname -m)" == "aarch64" ] ; then + if command -v armflang > /dev/null 2>&1 ; then + COMPILERS="${COMPILERS} arm" + fi + if command -v frt > /dev/null 2>&1 ; then + COMPILERS="${COMPILERS} fj" + fi +elif [ "$(uname -m)" == "x86_64" ] ; then + if command -v lscpu > /dev/null 2>&1 && lscpu | grep -q GenuineIntel ; then + COMPILERS="${COMPILERS} oneapi" + if [ -f /opt/intel/oneapi/setvars.sh ] ; then + . /opt/intel/oneapi/setvars.sh >& /dev/null + fi + else + # not identified as Intel: assume AMD (this detection can be improved) + COMPILERS="${COMPILERS} amd" + fi +fi + +for compiler in ${COMPILERS} ; do + TARGETS="DoConcurrent Array OpenMP OpenMPTaskloop OpenMPWorkshare" + if [ "${HAS_GPU}" != "0" ] ; then + TARGETS="${TARGETS} OpenMPTarget OpenMPTargetLoop" + if [ "x${compiler}" == "xnvhpc" ] ; then + TARGETS="${TARGETS} CUDA CUDAKernel" + fi + fi + if [ "x${compiler}" == "xnvhpc" ] || [ "x${compiler}" == "xgcc" ] || [ "x${compiler}" == "xcray" ] ; then + TARGETS="${TARGETS} OpenACC OpenACCArray" + fi + for implementation in ${TARGETS} ; do + make COMPILER=${compiler} IMPLEMENTATION=${implementation} + done +done diff --git a/src/fortran/main.F90 b/src/fortran/main.F90 new file mode 100644 index 0000000..153be93 --- /dev/null +++ b/src/fortran/main.F90 @@ -0,0 +1,683 @@ +module BabelStreamUtil + use, intrinsic :: ISO_Fortran_env, only: REAL64,INT64 + use BabelStreamTypes + + implicit none + + integer(kind=StreamIntKind) :: array_size = 33554432 + integer(kind=StreamIntKind) :: num_times = 100 + logical :: mibibytes = .false. + logical :: use_gigs = .false. + logical :: csv = .false. 
+ character(len=1), parameter :: csv_sep = "," + + ! 1 = All + ! 2 = Triad + ! 3 = Nstream + integer :: selection = 1 + + real(kind=REAL64), parameter :: startA = real(0.1d0,kind=REAL64) + real(kind=REAL64), parameter :: startB = real(0.2d0,kind=REAL64) + real(kind=REAL64), parameter :: startC = real(0.0d0,kind=REAL64) + real(kind=REAL64), parameter :: startScalar = real(0.4d0,kind=REAL64) + + contains + + function get_wtime() result(t) +#if defined(USE_OMP_GET_WTIME) + use omp_lib + implicit none + real(kind=REAL64) :: t + t = omp_get_wtime() +#elif defined(USE_CPU_TIME) + implicit none + real(kind=REAL64) :: t + real :: r + call cpu_time(r) + t = r +#else + implicit none + real(kind=REAL64) :: t + integer(kind=INT64) :: c, r + call system_clock(count = c, count_rate = r) + t = real(c,REAL64) / real(r,REAL64) +#endif + end function get_wtime + + subroutine parseArguments() + use, intrinsic :: ISO_Fortran_env, only: compiler_version, compiler_options +#if defined(USE_DOCONCURRENT) + use DoConcurrentStream, only: list_devices, set_device +#elif defined(USE_ARRAY) + use ArrayStream, only: list_devices, set_device +#elif defined(USE_OPENMP) + use OpenMPStream, only: list_devices, set_device +#elif defined(USE_OPENMPWORKSHARE) + use OpenMPWorkshareStream, only: list_devices, set_device +#elif defined(USE_OPENMPTARGET) + use OpenMPTargetStream, only: list_devices, set_device +#elif defined(USE_OPENMPTARGETLOOP) + use OpenMPTargetLoopStream, only: list_devices, set_device +#elif defined(USE_OPENMPTASKLOOP) + use OpenMPTaskloopStream, only: list_devices, set_device +#elif defined(USE_OPENACC) + use OpenACCStream, only: list_devices, set_device +#elif defined(USE_OPENACCARRAY) + use OpenACCArrayStream, only: list_devices, set_device +#elif defined(USE_CUDA) + use CUDAStream, only: list_devices, set_device +#elif defined(USE_CUDAKERNEL) + use CUDAKernelStream, only: list_devices, set_device +#elif defined(USE_SEQUENTIAL) + use SequentialStream, only: list_devices, 
set_device +#endif + implicit none + integer :: i, argc + integer :: arglen,err,pos(2) + character(len=64) :: argtmp + argc = command_argument_count() + do i=1,argc + call get_command_argument(i,argtmp,arglen,err) + if (err.eq.0) then + ! + ! list devices + ! + pos(1) = index(argtmp,"--list") + if (pos(1).eq.1) then + call list_devices() + stop + endif + ! + ! set device number + ! + pos(1) = index(argtmp,"--device") + if (pos(1).eq.1) then + if (i+1.gt.argc) then + print*,'You failed to provide a value for ',argtmp + stop + else + call get_command_argument(i+1,argtmp,arglen,err) + block + integer :: dev + read(argtmp,'(i15)') dev + call set_device(dev) + end block + endif + cycle + endif + ! + ! array size + ! + pos(1) = index(argtmp,"--arraysize") + pos(2) = index(argtmp,"-s") + if (any(pos(:).eq.1) ) then + if (i+1.gt.argc) then + print*,'You failed to provide a value for ',argtmp + else + call get_command_argument(i+1,argtmp,arglen,err) + block + integer(kind=INT64) :: big_size + read(argtmp,'(i15)') big_size + if (big_size .gt. HUGE(array_size)) then + print*,'Array size does not fit into integer:' + print*,big_size,'>',HUGE(array_size) + print*,'Stop using USE_INT32' + stop + else + array_size = INT(big_size,kind=StreamIntKind) + endif + end block + endif + cycle + endif + ! + ! number of iterations + ! + pos(1) = index(argtmp,"--numtimes") + pos(2) = index(argtmp,"-n") + if (any(pos(:).eq.1) ) then + if (i+1.gt.argc) then + print*,'You failed to provide a value for ',argtmp + else + call get_command_argument(i+1,argtmp,arglen,err) + read(argtmp,'(i15)') num_times + if (num_times.lt.2) then + write(*,'(a)') "Number of times must be 2 or more" + stop + end if + endif + cycle + endif + ! + ! precision + ! + pos(1) = index(argtmp,"--float") + if (pos(1).eq.1) then + write(*,'(a46,a39)') "Sorry, you have to recompile with -DUSE_FLOAT ", & + "to run BabelStream in single precision." + stop + endif + ! + ! selection (All, Triad, Nstream) + ! 
+ pos(1) = index(argtmp,"--triad-only") + if (pos(1).eq.1) then + selection = 2 + cycle + endif + pos(1) = index(argtmp,"--nstream-only") + if (pos(1).eq.1) then + selection = 3 + cycle + endif + ! + ! CSV + ! + pos(1) = index(argtmp,"--csv") + if (pos(1).eq.1) then + csv = .true. + !write(*,'(a39)') "Sorry, CSV support isn't available yet." + !stop + endif + ! + ! units + ! + pos(1) = index(argtmp,"--mibibytes") + if (pos(1).eq.1) then + mibibytes = .true. + cycle + endif + ! + ! giga/gibi instead of mega/mebi + ! + pos(1) = index(argtmp,"--gigs") + if (pos(1).eq.1) then + use_gigs = .true. + cycle + endif + ! + ! + ! + pos(1) = index(argtmp,"--compiler-info") + if (pos(1).eq.1) then + write(*,'(a)') 'Compiler version: ',compiler_version() + write(*,'(a)') 'Compiler options: ',compiler_options() + stop + endif + ! + ! help + ! + pos(1) = index(argtmp,"--help") + pos(2) = index(argtmp,"-h") + if (any(pos(:).eq.1) ) then + call get_command_argument(0,argtmp,arglen,err) + write(*,'(a7,a,a10)') "Usage: ", trim(argtmp), " [OPTIONS]" + write(*,'(a)') "Options:" + write(*,'(a)') " -h --help Print the message" + write(*,'(a)') " --list List available devices" + write(*,'(a)') " --device INDEX Select device at INDEX" + write(*,'(a)') " -s --arraysize SIZE Use SIZE elements in the array" + write(*,'(a)') " -n --numtimes NUM Run the test NUM times (NUM >= 2)" + !write(*,'(a)') " --float Use floats (rather than doubles)" + write(*,'(a)') " --triad-only Only run triad" + write(*,'(a)') " --nstream-only Only run nstream" + write(*,'(a)') " --csv Output as csv table" + write(*,'(a)') " --mibibytes Use MiB=2^20 for bandwidth calculation (default MB=10^6)" + write(*,'(a)') " --gigs Use GiB=2^30 or GB=10^9 instead of MiB/MB" + write(*,'(a)') " --compiler-info Print information about compiler and flags, then exit." 
+ stop + endif + end if + end do + end subroutine parseArguments + + subroutine run_all(timings, summ) +#if defined(USE_DOCONCURRENT) + use DoConcurrentStream +#elif defined(USE_ARRAY) + use ArrayStream +#elif defined(USE_OPENMP) + use OpenMPStream +#elif defined(USE_OPENMPWORKSHARE) + use OpenMPWorkshareStream +#elif defined(USE_OPENMPTARGET) + use OpenMPTargetStream +#elif defined(USE_OPENMPTARGETLOOP) + use OpenMPTargetLoopStream +#elif defined(USE_OPENMPTASKLOOP) + use OpenMPTaskloopStream +#elif defined(USE_OPENACC) + use OpenACCStream +#elif defined(USE_OPENACCARRAY) + use OpenACCArrayStream +#elif defined(USE_CUDA) + use CUDAStream +#elif defined(USE_CUDAKERNEL) + use CUDAKernelStream +#elif defined(USE_SEQUENTIAL) + use SequentialStream +#endif + implicit none + real(kind=REAL64), intent(inout) :: timings(:,:) + real(kind=REAL64), intent(out) :: summ + real(kind=REAL64) :: t1, t2 + integer(kind=StreamIntKind) :: i + + do i=1,num_times + + t1 = get_wtime() + call copy() + t2 = get_wtime() + timings(1,i) = t2-t1 + + t1 = get_wtime() + call mul(startScalar) + t2 = get_wtime() + timings(2,i) = t2-t1 + + t1 = get_wtime() + call add() + t2 = get_wtime() + timings(3,i) = t2-t1 + + t1 = get_wtime() + call triad(startScalar) + t2 = get_wtime() + timings(4,i) = t2-t1 + + t1 = get_wtime() + summ = dot() + t2 = get_wtime() + timings(5,i) = t2-t1 + + end do + + end subroutine run_all + + subroutine run_triad(timings) +#if defined(USE_DOCONCURRENT) + use DoConcurrentStream +#elif defined(USE_ARRAY) + use ArrayStream +#elif defined(USE_OPENMP) + use OpenMPStream +#elif defined(USE_OPENMPWORKSHARE) + use OpenMPWorkshareStream +#elif defined(USE_OPENMPTARGET) + use OpenMPTargetStream +#elif defined(USE_OPENMPTARGETLOOP) + use OpenMPTargetLoopStream +#elif defined(USE_OPENMPTASKLOOP) + use OpenMPTaskloopStream +#elif defined(USE_OPENACC) + use OpenACCStream +#elif defined(USE_OPENACCARRAY) + use OpenACCArrayStream +#elif defined(USE_CUDA) + use CUDAStream +#elif 
defined(USE_CUDAKERNEL) + use CUDAKernelStream +#elif defined(USE_SEQUENTIAL) + use SequentialStream +#endif + implicit none + real(kind=REAL64), intent(inout) :: timings(:,:) + real(kind=REAL64) :: t1, t2 + integer(kind=StreamIntKind) :: i + + do i=1,num_times + + t1 = get_wtime() + call triad(startScalar) + t2 = get_wtime() + timings(1,i) = t2-t1 + + end do + + end subroutine run_triad + + subroutine run_nstream(timings) +#if defined(USE_DOCONCURRENT) + use DoConcurrentStream +#elif defined(USE_ARRAY) + use ArrayStream +#elif defined(USE_OPENMP) + use OpenMPStream +#elif defined(USE_OPENMPWORKSHARE) + use OpenMPWorkshareStream +#elif defined(USE_OPENMPTARGET) + use OpenMPTargetStream +#elif defined(USE_OPENMPTARGETLOOP) + use OpenMPTargetLoopStream +#elif defined(USE_OPENMPTASKLOOP) + use OpenMPTaskloopStream +#elif defined(USE_OPENACC) + use OpenACCStream +#elif defined(USE_OPENACCARRAY) + use OpenACCArrayStream +#elif defined(USE_CUDA) + use CUDAStream +#elif defined(USE_CUDAKERNEL) + use CUDAKernelStream +#elif defined(USE_SEQUENTIAL) + use SequentialStream +#endif + implicit none + real(kind=REAL64), intent(inout) :: timings(:,:) + real(kind=REAL64) :: t1, t2 + integer(kind=StreamIntKind) :: i + + do i=1,num_times + + t1 = get_wtime() + call nstream(startScalar) + t2 = get_wtime() + timings(1,i) = t2-t1 + + end do + + end subroutine run_nstream + + subroutine check_solution(A, B, C, summ) + use, intrinsic :: IEEE_Arithmetic, only: IEEE_Is_Normal + implicit none + real(kind=REAL64), intent(in) :: A(:), B(:), C(:) + real(kind=REAL64), intent(in) :: summ + + integer(kind=StreamIntKind) :: i + real(kind=REAL64) :: goldA, goldB, goldC, goldSum + real(kind=REAL64) :: scalar + + ! 
always use double because of accumulation error + real(kind=REAL64) :: errA, errB, errC, errSum, epsi + logical :: cleanA, cleanB, cleanC, cleanSum + + goldA = startA + goldB = startB + goldC = startC + goldSum = 0.0d0 + + scalar = startScalar + + do i=1,num_times + + if (selection.eq.1) then ! replay copy, mul, add, triad on the scalar gold values + goldC = goldA + goldB = scalar * goldC + goldC = goldA + goldB + goldA = goldB + scalar * goldC + else if (selection.eq.2) then + goldA = goldB + scalar * goldC + else if (selection.eq.3) then + goldA = goldA + goldB + scalar * goldC; + endif + + end do + + goldSum = goldA * goldB * array_size ! dot product of two constant-valued arrays + + cleanA = ALL(IEEE_Is_Normal(A)) + cleanB = ALL(IEEE_Is_Normal(B)) + cleanC = ALL(IEEE_Is_Normal(C)) + cleanSum = IEEE_Is_Normal(summ) + + if (.not. cleanA) then + write(*,'(a51)') "Validation failed on A. Contains NaN/Inf/Subnormal." + end if + if (.not. cleanB) then + write(*,'(a51)') "Validation failed on B. Contains NaN/Inf/Subnormal." + end if + if (.not. cleanC) then + write(*,'(a51)') "Validation failed on C. Contains NaN/Inf/Subnormal." + end if + if (.not. cleanSum) then + write(*,'(a54,e20.12)') "Validation failed on Sum. Contains NaN/Inf/Subnormal: ",summ + end if + + errA = SUM( ABS( A - goldA ) ) / array_size + errB = SUM( ABS( B - goldB ) ) / array_size + errC = SUM( ABS( C - goldC ) ) / array_size + errSum = ABS( (summ - goldSum) / goldSum) + + epsi = epsilon(real(0,kind=StreamRealKind)) * 100.0d0 ! tolerance: 100x machine epsilon of the stream real kind + + if (errA .gt. epsi) then + write(*,'(a38,e20.12)') "Validation failed on A. Average error ", errA + end if + if (errB .gt. epsi) then + write(*,'(a38,e20.12)') "Validation failed on B. Average error ", errB + end if + if (errC .gt. epsi) then + write(*,'(a38,e20.12)') "Validation failed on C. Average error ", errC + end if + + if (selection.eq.1) then + if (errSum .gt. 1.0e-8) then + write(*,'(a38,e20.12)') "Validation failed on Sum. 
Error ", errSum + write(*,'(a8,e20.12,a15,e20.12)') "Sum was ",summ, " but should be ", goldSum ! print the expected sum, not the relative error + end if + endif + + end subroutine check_solution + +end module BabelStreamUtil + +program BabelStream + use BabelStreamUtil +#if defined(USE_DOCONCURRENT) + use DoConcurrentStream +#elif defined(USE_ARRAY) + use ArrayStream +#elif defined(USE_OPENMP) + use OpenMPStream +#elif defined(USE_OPENMPWORKSHARE) + use OpenMPWorkshareStream +#elif defined(USE_OPENMPTARGET) + use OpenMPTargetStream +#elif defined(USE_OPENMPTARGETLOOP) + use OpenMPTargetLoopStream +#elif defined(USE_OPENMPTASKLOOP) + use OpenMPTaskloopStream +#elif defined(USE_OPENACC) + use OpenACCStream +#elif defined(USE_OPENACCARRAY) + use OpenACCArrayStream +#elif defined(USE_CUDA) + use CUDAStream +#elif defined(USE_CUDAKERNEL) + use CUDAKernelStream +#elif defined(USE_SEQUENTIAL) + use SequentialStream +#endif + implicit none + integer :: element_size, err + real(kind=REAL64) :: scaling + character(len=3) :: label + real(kind=REAL64), allocatable :: timings(:,:) + real(kind=REAL64), allocatable :: h_A(:), h_B(:), h_C(:) + real(kind=REAL64) :: summ + real(kind=REAL64) :: init_tic, init_toc, read_tic, read_toc + + call parseArguments() + + element_size = storage_size(real(0,kind=StreamRealKind)) / 8 + + if (mibibytes) then + if (use_gigs) then + scaling = 2.0d0**(-30) + label = "GiB" + else + scaling = 2.0d0**(-20) + label = "MiB" + endif + else + if (use_gigs) then + scaling = 1.0d-9 + label = "GB" + else + scaling = 1.0d-6 + label = "MB" + endif + endif + + if (.not.csv) then + + write(*,'(a)') "BabelStream Fortran" + write(*,'(a9,f4.1)') "Version: ", VERSION_STRING + write(*,'(a16,a)') "Implementation: ", implementation_name + + block + character(len=32) :: printout + write(printout,'(i9,1x,a5)') num_times,'times' + write(*,'(a16,a)') 'Running kernels ',ADJUSTL(printout) + end block + write(*,'(a11,a6)') 'Precision: ',ADJUSTL(StreamRealName) + + write(*,'(a12,f9.1,a3)') 'Array size: ',1.0d0 * element_size 
* (array_size * scaling), label + write(*,'(a12,f9.1,a3)') 'Total size: ',3.0d0 * element_size * (array_size * scaling), label + + endif ! csv + + allocate( timings(5,num_times) ) + + call alloc(array_size) + + init_tic = get_wtime() + call init_arrays(startA, startB, startC) + init_toc = get_wtime() + summ = 0.0d0 + + if (.not.csv) then + write(*,'(a6,f9.1,a4,f9.1,a3,a9)') 'Init: ',init_toc-init_tic, 's (=', & + (3.0d0 * element_size * array_size * scaling) / (init_toc-init_tic), TRIM(label), 'ytes/sec)' + end if + + + timings = -1.0d0 + if (selection.eq.1) then + call run_all(timings, summ) + else if (selection.eq.2) then + call run_triad(timings) + else if (selection.eq.3) then + call run_nstream(timings) + endif + + allocate( h_A(1:array_size), h_B(1:array_size), h_C(1:array_size), stat=err) + if (err .ne. 0) then + write(*,'(a20,i3)') 'allocate returned ',err + stop 1 + endif + + read_tic = get_wtime() + call read_arrays(h_A, h_B, h_C) + read_toc = get_wtime() + + if (.not.csv) then + write(*,'(a6,f9.1,a4,f9.1,a3,a9)') 'Read: ',read_toc-read_tic, 's (=', & + (3.0d0 * element_size * array_size * scaling) / (read_toc-read_tic), TRIM(label), 'ytes/sec)' + end if + + call check_solution(h_A, h_B, h_C, summ) + + block + character(len=20) :: printout(8) + real(kind=REAL64) :: tmin,tmax,tavg,nbytes + + if (csv) then + write(*,'(a,a1)',advance='no') 'function', csv_sep + write(*,'(a,a1)',advance='no') 'num_times', csv_sep + write(*,'(a,a1)',advance='no') 'n_elements',csv_sep + write(*,'(a,a1)',advance='no') 'sizeof', csv_sep + if (mibibytes) then + write(*,'(a,a1)',advance='no') 'max_mibytes_per_sec',csv_sep + else + write(*,'(a,a1)',advance='no') 'max_mbytes_per_sec', csv_sep + endif + write(*,'(a,a1)',advance='no') 'min_runtime',csv_sep + write(*,'(a,a1)',advance='no') 'max_runtime',csv_sep + write(*,'(a,a1)',advance='yes') 'avg_runtime' + else + write(printout(1),'(a8)') 'Function' + write(printout(2),'(a3,a8)') TRIM(label),'ytes/sec' + write(printout(3),'(a9)') 
'Min (sec)' + write(printout(4),'(a3)') 'Max' + write(printout(5),'(a7)') 'Average' + write(*,'(5a12)') ADJUSTL(printout(1:5)) + endif ! csv + + if (selection.eq.1) then + block + integer, parameter :: sizes(5) = [2,2,3,3,2] + character(len=5), parameter :: labels(5) = ["Copy ", "Mul ", "Add ", "Triad", "Dot "] + integer :: i + do i=1,5 + tmin = MINVAL(timings(i,2:num_times)) + tmax = MAXVAL(timings(i,2:num_times)) + tavg = SUM(timings(i,2:num_times)) / (num_times-1) + nbytes = element_size * REAL(array_size,kind=REAL64) * sizes(i) + write(printout(1),'(a)') labels(i) + if (csv) then + write(printout(2),'(i20)') num_times + write(printout(3),'(i20)') array_size + write(printout(4),'(i20)') element_size + write(printout(5),'(i20)') INT(scaling*nbytes/tmin) + write(printout(6),'(f20.8)') tmin + write(printout(7),'(f20.8)') tmax + write(printout(8),'(f20.8)') tavg + write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(1))),csv_sep + write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(2))),csv_sep + write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(3))),csv_sep + write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(4))),csv_sep + write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(5))),csv_sep + write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(6))),csv_sep + write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(7))),csv_sep + write(*,'(a,a1)',advance='yes') TRIM(ADJUSTL(printout(8))) + else + write(printout(2),'(f12.3)') scaling*nbytes/tmin + write(printout(3),'(f12.5)') tmin + write(printout(4),'(f12.5)') tmax + write(printout(5),'(f12.5)') tavg + write(*,'(5a12)') ADJUSTL(printout(1:5)) + endif + enddo + end block + else if ((selection.eq.2).or.(selection.eq.3)) then + tmin = MINVAL(timings(1,2:num_times)) + tmax = MAXVAL(timings(1,2:num_times)) + tavg = SUM(timings(1,2:num_times)) / (num_times-1) + if (selection.eq.2) then + nbytes = element_size * REAL(array_size,kind=REAL64) * 3 + write(printout(1),'(a12)') "Triad" + else if (selection.eq.3) then + nbytes 
= element_size * REAL(array_size,kind=REAL64) * 4 + write(printout(1),'(a12)') "Nstream" + endif + if (csv) then + write(printout(2),'(i20)') num_times + write(printout(3),'(i20)') array_size + write(printout(4),'(i20)') element_size + write(printout(5),'(i20)') INT(scaling*nbytes/tmin) + write(printout(6),'(f20.8)') tmin + write(printout(7),'(f20.8)') tmax + write(printout(8),'(f20.8)') tavg + write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(1))),csv_sep + write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(2))),csv_sep + write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(3))),csv_sep + write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(4))),csv_sep + write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(5))),csv_sep + write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(6))),csv_sep + write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(7))),csv_sep + write(*,'(a,a1)',advance='yes') TRIM(ADJUSTL(printout(8))) + else + write(printout(2),'(f12.3)') scaling*nbytes/tmin + write(printout(3),'(f12.5)') tmin + write(printout(4),'(f12.5)') tmax + write(printout(5),'(f12.5)') tavg + write(*,'(5a12)') ADJUSTL(printout(1:5)) + endif + endif + end block + + call dealloc() + +end program BabelStream diff --git a/src/fortran/make.inc.amd b/src/fortran/make.inc.amd new file mode 100644 index 0000000..a863de8 --- /dev/null +++ b/src/fortran/make.inc.amd @@ -0,0 +1,25 @@ +FC := /opt/rocm/llvm/bin/flang +FC := /global/u1/j/jhammond/AMD/aocc-compiler-3.2.0/bin/flang +FCFLAGS := -std=f2018 -O3 +FCFLAGS += -Wall -Wno-unused-variable + +ifdef MARCH +FCFLAGS += -march=$(MARCH) +else +FCFLAGS += -march=native +endif + +DOCONCURRENT_FLAG = -fopenmp # libomp.so required +ARRAY_FLAG = -fopenmp # libomp.so required +OPENMP_FLAG = -fopenmp +#OPENMP_FLAG += -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx908 +OPENACC_FLAG = -fopenacc +CUDA_FLAG = +SEQUENTIAL_FLAG = + +ifeq ($(IMPLEMENTATION),CUDA) + $(error IMPLEMENTATION=$(IMPLEMENTATION) is not 
supported by this compiler.) +endif +ifeq ($(IMPLEMENTATION),CUDAKernel) # matches IMPLEMENTATION=CUDAKernel in build.sh + $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.) +endif diff --git a/src/fortran/make.inc.arm b/src/fortran/make.inc.arm new file mode 100644 index 0000000..a3e2a67 --- /dev/null +++ b/src/fortran/make.inc.arm @@ -0,0 +1,39 @@ +FC = armflang +FCFLAGS = -std=f2018 -O3 +FCFLAGS += -Wall -Wno-unused-variable + +# MARCH=neoverse-v1,neoverse-n1,icelake-server,znver3,cortex-a78 +ARCH=$(shell uname -m) +ifeq ($(ARCH),aarch64) + ifdef MCPU + FCFLAGS += -mcpu=$(MCPU) + else + FCFLAGS += -mcpu=native + endif +else + ifdef MARCH + FCFLAGS += -march=$(MARCH) + else + FCFLAGS += -march=native + endif +endif + +DOCONCURRENT_FLAG = -fopenmp +ARRAY_FLAG = -fopenmp +OPENMP_FLAG = -fopenmp +OPENACC_FLAG = -fopenacc +CUDA_FLAG = +SEQUENTIAL_FLAG = + +ifeq ($(IMPLEMENTATION),OpenACC) + $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.) +endif +ifeq ($(IMPLEMENTATION),OpenACCArray) + $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.) +endif +ifeq ($(IMPLEMENTATION),CUDA) + $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.) +endif +ifeq ($(IMPLEMENTATION),CUDAKernel) # matches IMPLEMENTATION=CUDAKernel in build.sh + $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.) +endif diff --git a/src/fortran/make.inc.cray b/src/fortran/make.inc.cray new file mode 100644 index 0000000..dae4e75 --- /dev/null +++ b/src/fortran/make.inc.cray @@ -0,0 +1,18 @@ +FC := ftn +FCFLAGS = -e F -O3 + +DOCONCURRENT_FLAG = -h thread_do_concurrent -DCRAY_THREAD_DOCONCURRENT +ARRAY_FLAG = -h autothread +OPENMP_FLAG = -h omp +OPENACC_FLAG = -h acc +# CPU only +OPENACC_FLAG += -h omp +CUDA_FLAG = +SEQUENTIAL_FLAG = + +ifeq ($(IMPLEMENTATION),CUDA) + $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.) +endif +ifeq ($(IMPLEMENTATION),CUDAKernel) # matches IMPLEMENTATION=CUDAKernel in build.sh + $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.) 
+endif diff --git a/src/fortran/make.inc.fj b/src/fortran/make.inc.fj new file mode 100644 index 0000000..b4761e5 --- /dev/null +++ b/src/fortran/make.inc.fj @@ -0,0 +1,21 @@ +FC := frt +FCFLAGS = -X08 -Kfast -KA64FX -KSVE -KARMV8_3_A -Kzfill=100 -Kprefetch_sequential=soft -Kprefetch_line=8 -Kprefetch_line_L2=16 -Koptmsg=2 -Keval -DUSE_OMP_GET_WTIME=1 # FJ Fortran system_clock is low resolution + +DOCONCURRENT_FLAG = -Kparallel,reduction -DNOTSHARED +ARRAY_FLAG = -Kparallel,reduction +OPENMP_FLAG = -fopenmp +OPENACC_FLAG = +# CPU only +OPENACC_FLAG += +CUDA_FLAG = +SEQUENTIAL_FLAG = + +ifeq ($(IMPLEMENTATION),OpenACC) # IMPLEMENTATION names are case-sensitive; build.sh passes OpenACC + $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.) +endif +ifeq ($(IMPLEMENTATION),CUDA) + $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.) +endif +ifeq ($(IMPLEMENTATION),CUDAKernel) # matches IMPLEMENTATION=CUDAKernel in build.sh + $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.) +endif diff --git a/src/fortran/make.inc.gcc b/src/fortran/make.inc.gcc new file mode 100644 index 0000000..f59c8bb --- /dev/null +++ b/src/fortran/make.inc.gcc @@ -0,0 +1,33 @@ +FC = gfortran +FCFLAGS = -std=f2018 -O3 +FCFLAGS += -Wall -Wno-unused-dummy-argument -Wno-unused-variable + +# MARCH=neoverse-v1,neoverse-n1,icelake-server,znver3,cortex-a78ae +ARCH=$(shell uname -m) +ifeq ($(ARCH),aarch64) + ifdef MCPU + FCFLAGS += -mcpu=$(MCPU) + else + FCFLAGS += -mcpu=native + endif +else + ifdef MARCH + FCFLAGS += -march=$(MARCH) + else + FCFLAGS += -march=native + endif +endif + +DOCONCURRENT_FLAG = -ftree-parallelize-loops=4 +ARRAY_FLAG = +OPENMP_FLAG = -fopenmp +OPENACC_FLAG = -fopenacc +CUDA_FLAG = +SEQUENTIAL_FLAG = + +ifeq ($(IMPLEMENTATION),CUDA) + $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.) +endif +ifeq ($(IMPLEMENTATION),CUDAKernel) # matches IMPLEMENTATION=CUDAKernel in build.sh + $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.) 
+endif diff --git a/src/fortran/make.inc.nvhpc b/src/fortran/make.inc.nvhpc new file mode 100644 index 0000000..dd4c442 --- /dev/null +++ b/src/fortran/make.inc.nvhpc @@ -0,0 +1,70 @@ +FC := nvfortran +#FCFLAGS := -O3 -Minform=inform -Minfo=all +FCFLAGS := -O3 -Minform=warn + +#TARGET=gpu +TARGET=multicore + +NVARCH=$(shell which nvidia-smi > /dev/null && nvidia-smi -q | grep "Product Architecture") +ifeq ($(findstring Ampere,$(NVARCH)),Ampere) + $(info Ampere detected) + GPU = cc80 +endif +ifeq ($(findstring Turing,$(NVARCH)),Turing) + $(info Turing detected) + GPU = cc75 +endif +ifeq ($(findstring Volta,$(NVARCH)),Volta) + $(info Volta detected) + GPU = cc70 +endif +ifeq ($(findstring Pascal,$(NVARCH)),Pascal) + $(info Pascal detected) + GPU = cc60,cc61 +endif +ifeq ($(shell which jetson_clocks > /dev/null && echo 1),1) + $(info Jetson AGX Orin detected) + GPU = ccn87,cc86 + # figure out Xavier later + #GPU = cc72 +endif +ifeq ($(GPU),) + $(error Your GPU architecture could not be detected. Set it manually.) +endif +GPUFLAG = -gpu=$(GPU) + +# MARCH=neoverse-v1,neoverse-n1,zen3 +ARCH=$(shell uname -m) +ifdef MARCH + ifeq ($(ARCH),aarch64) + ifeq ($(MARCH),neoverse-n1) + FCFLAGS += -tp=$(MARCH) + else + ifeq ($(MARCH),neoverse-v1) + FCFLAGS += -tp=$(MARCH) + else + FCFLAGS += -tp=native + endif + endif + else + FCFLAGS += -tp=$(MARCH) + endif +else + FCFLAGS += -tp=native +endif + +# this is to allow apples-to-apples comparison with DC in non-DC GPU impls +# set exactly one of these! 
+#MANAGED = -DUSE_MANAGED -gpu=managed +#DEVICE = -DUSE_DEVICE -cuda -gpu=nomanaged + +DOCONCURRENT_FLAG = $(GPUFLAG) -stdpar=$(TARGET) $(DEVICE) +ARRAY_FLAG = $(GPUFLAG) -stdpar=$(TARGET) $(MANAGED) +OPENMP_FLAG = $(GPUFLAG) -mp=$(TARGET) $(MANAGED) +OPENACC_FLAG = $(GPUFLAG) -acc=$(TARGET) $(MANAGED) +CUDA_FLAG = $(GPUFLAG) -cuda -acc=gpu $(MANAGED) +SEQUENTIAL_FLAG = + +ifeq ($(IMPLEMENTATION),OpenMPTaskloop) + $(error IMPLEMENTATION=OpenMPTaskloop is not supported by this compiler.) +endif diff --git a/src/fortran/make.inc.oneapi b/src/fortran/make.inc.oneapi new file mode 100644 index 0000000..b7e003c --- /dev/null +++ b/src/fortran/make.inc.oneapi @@ -0,0 +1,32 @@ +FC := ifx +FCFLAGS = -std18 +FCFLAGS += -Ofast -xHOST +FCFLAGS += -qopt-zmm-usage=low + +ifeq ($(FC),ifort) + FCFLAGS += -qopt-streaming-stores=always + PARALLEL = -parallel +endif + +DOCONCURRENT_FLAG = -qopenmp $(PARALLEL) +ARRAY_FLAG = -qopenmp $(PARALLEL) +OPENMP_FLAG = -qopenmp +ifeq ($(FC),ifx) + OPENMP_FLAG += -fopenmp-targets=spir64 -DUSE_FLOAT=1 +endif +OPENACC_FLAG = +CUDA_FLAG = +SEQUENTIAL_FLAG = + +ifeq ($(IMPLEMENTATION),OpenACC) + $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.) +endif +ifeq ($(IMPLEMENTATION),OpenACCArray) + $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.) +endif +ifeq ($(IMPLEMENTATION),CUDA) + $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.) +endif +ifeq ($(IMPLEMENTATION),CUDAKernel) # matches IMPLEMENTATION=CUDAKernel in build.sh + $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.) 
+endif diff --git a/src/fortran/run.sh b/src/fortran/run.sh new file mode 100755 index 0000000..2b41bab --- /dev/null +++ b/src/fortran/run.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +cat ./run.sh + +if [ `uname -s` == Darwin ] ; then + NUM_HWTHREADS=`sysctl -n hw.ncpu` + MEMORY_BYTES=`sysctl -n hw.memsize` +else + NUM_HWTHREADS=`nproc` + MEMORY_KILOS=`grep MemTotal /proc/meminfo | awk '{print $2}'` +fi + +M=128 + +export OMP_NUM_THREADS=8 +export OMP_PROC_BIND=close +export OMP_PLACES=threads + +export ACC_NUM_CORES=${OMP_NUM_THREADS} + +AFFCONTROL="numactl -N 0 -m 0 -C `seq -s "," 0 $((${OMP_NUM_THREADS}-1))`" + +for compiler in gcc nvhpc cray oneapi arm amd fj ; do + #if [ "x$compiler" == "xgcc" ] ; then + # export LD_PRELOAD=/usr/lib/gcc/aarch64-linux-gnu/11/libgomp.so + #fi + for implementation in OpenMP OpenMPTaskloop OpenMPWorkshare DoConcurrent Array OpenACC OpenACCArray CUDA CUDAKernel ; do + if [ -f BabelStream.${compiler}.${implementation} ] ; then + echo "BabelStream.${compiler}.${implementation}" + ldd BabelStream.${compiler}.${implementation} + time $AFFCONTROL \ + ./BabelStream.${compiler}.${implementation} -s $((1024*1024*${M})) + fi + done +done diff --git a/src/futhark/FutharkStream.cpp b/src/futhark/FutharkStream.cpp new file mode 100644 index 0000000..ebd3633 --- /dev/null +++ b/src/futhark/FutharkStream.cpp @@ -0,0 +1,212 @@ +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// Copyright (c) 2022 Troels Henriksen +// University of Copenhagen +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#include // For aligned_alloc +#include +#include "FutharkStream.h" + +template +FutharkStream::FutharkStream(const int ARRAY_SIZE, int device) +{ + this->array_size = ARRAY_SIZE; + this->cfg = futhark_context_config_new(); + this->device = "#" + std::to_string(device); +#if defined(FUTHARK_BACKEND_cuda) || defined(FUTHARK_BACKEND_opencl) + 
futhark_context_config_set_device(cfg, this->device.c_str()); +#endif + this->ctx = futhark_context_new(cfg); + this->a = NULL; + this->b = NULL; + this->c = NULL; +} + +template <> +FutharkStream::~FutharkStream() +{ + if (this->a) { + futhark_free_f32_1d(this->ctx, (futhark_f32_1d*)this->a); + } + if (this->b) { + futhark_free_f32_1d(this->ctx, (futhark_f32_1d*)this->b); + } + if (this->c) { + futhark_free_f32_1d(this->ctx, (futhark_f32_1d*)this->c); + } + futhark_context_free(this->ctx); + futhark_context_config_free(this->cfg); +} + +template <> +FutharkStream::~FutharkStream() +{ + if (this->a) { + futhark_free_f64_1d(this->ctx, (futhark_f64_1d*)this->a); + } + if (this->b) { + futhark_free_f64_1d(this->ctx, (futhark_f64_1d*)this->b); + } + if (this->c) { + futhark_free_f64_1d(this->ctx, (futhark_f64_1d*)this->c); + } + futhark_context_free(this->ctx); + futhark_context_config_free(this->cfg); +} + +template <> +void FutharkStream::init_arrays(float initA, float initB, float initC) { + int array_size = this->array_size; + float *a = new float[array_size]; + float *b = new float[array_size]; + float *c = new float[array_size]; + for (int i = 0; i < array_size; i++) { + a[i] = initA; + b[i] = initB; + c[i] = initC; + } + this->a = (futhark_f32_1d*)futhark_new_f32_1d(this->ctx, a, array_size); + this->b = (futhark_f32_1d*)futhark_new_f32_1d(this->ctx, b, array_size); + this->c = (futhark_f32_1d*)futhark_new_f32_1d(this->ctx, c, array_size); + futhark_context_sync(this->ctx); + delete[] a; + delete[] b; + delete[] c; +} + +template <> +void FutharkStream::init_arrays(double initA, double initB, double initC) { + int array_size = this->array_size; + double *a = new double[array_size]; + double *b = new double[array_size]; + double *c = new double[array_size]; + for (int i = 0; i < array_size; i++) { + a[i] = initA; + b[i] = initB; + c[i] = initC; + } + this->a = (futhark_f64_1d*)futhark_new_f64_1d(this->ctx, a, array_size); + this->b = 
(futhark_f64_1d*)futhark_new_f64_1d(this->ctx, b, array_size); + this->c = (futhark_f64_1d*)futhark_new_f64_1d(this->ctx, c, array_size); + futhark_context_sync(this->ctx); + delete[] a; + delete[] b; + delete[] c; +} + +template <> +void FutharkStream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) { + futhark_values_f32_1d(this->ctx, (futhark_f32_1d*)this->a, h_a.data()); + futhark_values_f32_1d(this->ctx, (futhark_f32_1d*)this->b, h_b.data()); + futhark_values_f32_1d(this->ctx, (futhark_f32_1d*)this->c, h_c.data()); + futhark_context_sync(this->ctx); +} + +template <> +void FutharkStream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) { + futhark_values_f64_1d(this->ctx, (futhark_f64_1d*)this->a, h_a.data()); + futhark_values_f64_1d(this->ctx, (futhark_f64_1d*)this->b, h_b.data()); + futhark_values_f64_1d(this->ctx, (futhark_f64_1d*)this->c, h_c.data()); + futhark_context_sync(this->ctx); +} + +template <> +void FutharkStream::copy() { + futhark_free_f32_1d(this->ctx, (futhark_f32_1d*)this->c); + futhark_entry_f32_copy(this->ctx, (futhark_f32_1d**)&this->c, (futhark_f32_1d*)this->a); + futhark_context_sync(this->ctx); +} + +template <> +void FutharkStream::copy() { + futhark_free_f64_1d(this->ctx, (futhark_f64_1d*)this->c); + futhark_entry_f64_copy(this->ctx, (futhark_f64_1d**)&this->c, (futhark_f64_1d*)this->a); + futhark_context_sync(this->ctx); +} + +template <> +void FutharkStream::mul() { + futhark_free_f32_1d(this->ctx, (futhark_f32_1d*)this->b); + futhark_entry_f32_mul(this->ctx, (futhark_f32_1d**)&this->b, (futhark_f32_1d*)this->c); + futhark_context_sync(this->ctx); +} + +template <> +void FutharkStream::mul() { + futhark_free_f64_1d(this->ctx, (futhark_f64_1d*)this->b); + futhark_entry_f64_mul(this->ctx, (futhark_f64_1d**)&this->b, (futhark_f64_1d*)this->c); + futhark_context_sync(this->ctx); +} + +template <> +void FutharkStream::add() { + futhark_free_f32_1d(this->ctx, (futhark_f32_1d*)this->c); + 
futhark_entry_f32_add(this->ctx, (futhark_f32_1d**)&this->c, (futhark_f32_1d*)this->a, (futhark_f32_1d*)this->b); + futhark_context_sync(this->ctx); +} + +template <> +void FutharkStream::add() { + futhark_free_f64_1d(this->ctx, (futhark_f64_1d*)this->c); + futhark_entry_f64_add(this->ctx, (futhark_f64_1d**)&this->c, (futhark_f64_1d*)this->a, (futhark_f64_1d*)this->b); + futhark_context_sync(this->ctx); +} + +template <> +void FutharkStream::triad() { + futhark_free_f32_1d(this->ctx, (futhark_f32_1d*)this->a); // triad overwrites a, so the old a is released before the call + futhark_entry_f32_triad(this->ctx, (futhark_f32_1d**)&this->a, (futhark_f32_1d*)this->b, (futhark_f32_1d*)this->c); // inputs are b and c, same as the f64 specialisation + futhark_context_sync(this->ctx); +} + +template <> +void FutharkStream::triad() { + futhark_free_f64_1d(this->ctx, (futhark_f64_1d*)this->a); + futhark_entry_f64_triad(this->ctx, (futhark_f64_1d**)&this->a, (futhark_f64_1d*)this->b, (futhark_f64_1d*)this->c); + futhark_context_sync(this->ctx); +} + +template <> +void FutharkStream::nstream() { + futhark_f32_1d* d; + futhark_entry_f32_nstream(this->ctx, &d, (futhark_f32_1d*)this->a, (futhark_f32_1d*)this->b, (futhark_f32_1d*)this->c); // use the nstream entry point, which reads a, b and c + futhark_free_f32_1d(this->ctx, (futhark_f32_1d*)this->a); // old a is an input, so it is freed only after the call + this->a = d; + futhark_context_sync(this->ctx); +} + +template <> +void FutharkStream::nstream() { + futhark_f64_1d* d; + futhark_entry_f64_nstream(this->ctx, &d, (futhark_f64_1d*)this->a, (futhark_f64_1d*)this->b, (futhark_f64_1d*)this->c); // use the nstream entry point, which reads a, b and c + futhark_free_f64_1d(this->ctx, (futhark_f64_1d*)this->a); // old a is an input, so it is freed only after the call + this->a = d; + futhark_context_sync(this->ctx); +} + +template <> +float FutharkStream::dot() { + float res; + futhark_entry_f32_dot(this->ctx, &res, (futhark_f32_1d*)this->a, (futhark_f32_1d*)this->b); + futhark_context_sync(this->ctx); + return res; +} + +template <> +double FutharkStream::dot() { + double res; + futhark_entry_f64_dot(this->ctx, &res, (futhark_f64_1d*)this->a, (futhark_f64_1d*)this->b); + futhark_context_sync(this->ctx); + return res; +} + +void listDevices(void) +{ + std::cout << "Device selection not supported." 
<< std::endl; +} + +template class FutharkStream; +template class FutharkStream; diff --git a/src/futhark/FutharkStream.h b/src/futhark/FutharkStream.h new file mode 100644 index 0000000..6290e79 --- /dev/null +++ b/src/futhark/FutharkStream.h @@ -0,0 +1,60 @@ +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// Copyright (c) 2022 Troels Henriksen +// University of Copenhagen +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#pragma once + +#include +#include + +#include "Stream.h" +#include "babelstream.h" + +#if defined(FUTHARK_BACKEND_c) +#define IMPLEMENTATION_STRING "Futhark (sequential)" +#elif defined(FUTHARK_BACKEND_multicore) +#define IMPLEMENTATION_STRING "Futhark (parallel CPU)" +#elif defined(FUTHARK_BACKEND_opencl) +#define IMPLEMENTATION_STRING "Futhark (OpenCL)" +#elif defined(FUTHARK_BACKEND_cuda) +#define IMPLEMENTATION_STRING "Futhark (CUDA)" +#else +#define IMPLEMENTATION_STRING "Futhark (unknown backend)" +#endif + +template +class FutharkStream : public Stream +{ +protected: + // Size of arrays + int array_size; + // For device selection. 
+ std::string device; + + // Futhark stuff + struct futhark_context_config *cfg; + struct futhark_context *ctx; + + // Device side arrays + void* a; + void* b; + void* c; + +public: + FutharkStream(const int, int); + ~FutharkStream(); + + virtual void copy() override; + virtual void add() override; + virtual void mul() override; + virtual void triad() override; + virtual void nstream() override; + virtual T dot() override; + + virtual void init_arrays(T initA, T initB, T initC) override; + virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; +}; diff --git a/src/futhark/babelstream.fut b/src/futhark/babelstream.fut new file mode 100644 index 0000000..d513a60 --- /dev/null +++ b/src/futhark/babelstream.fut @@ -0,0 +1,62 @@ +module type kernels = { + type t + val copy [n] : [n]t -> *[n]t + val mul [n] : t -> [n]t -> [n]t + val add [n] : [n]t -> [n]t -> [n]t + val triad [n] : t -> [n]t -> [n]t -> [n]t + val dot [n] : [n]t -> [n]t -> t + -- Uniqueness allows nstream to mutate the 'a' array. 
+ val nstream [n] : t -> *[n]t -> [n]t -> [n]t -> [n]t +} + +module kernels (P: real) : kernels with t = P.t = { + type t = P.t + def copy = copy + def mul scalar c = map (P.*scalar) c + def add = map2 (P.+) + def triad scalar b c = map2 (P.+) b (map (P.* scalar) c) + def dot a b = reduce (P.+) (P.i32 0) (map2 (P.*) a b) + def nstream scalar a b c = map2 (P.+) a (map2 (P.+) b (map (P.*scalar) c)) +} + +module f32_kernels = kernels f32 +def f32_start_scalar : f32 = 0.4 +entry f32_copy = f32_kernels.copy +entry f32_mul = f32_kernels.mul f32_start_scalar +entry f32_add = f32_kernels.add +entry f32_triad = f32_kernels.triad f32_start_scalar +entry f32_nstream = f32_kernels.nstream f32_start_scalar +entry f32_dot = f32_kernels.dot + +module f64_kernels = kernels f64 +def f64_start_scalar : f64 = 0.4 +entry f64_copy = f64_kernels.copy +entry f64_mul = f64_kernels.mul f64_start_scalar +entry f64_add = f64_kernels.add +entry f64_triad = f64_kernels.triad f64_start_scalar +entry f64_nstream = f64_kernels.nstream f64_start_scalar +entry f64_dot = f64_kernels.dot + +-- == +-- entry: f32_copy f32_mul +-- random input { [33554432]f32 } + +-- == +-- entry: f32_add f32_dot f32_triad +-- random input { [33554432]f32 [33554432]f32 } + +-- == +-- entry: f32_nstream +-- random input { [33554432]f32 [33554432]f32 [33554432]f32 } + +-- == +-- entry: f64_copy f64_mul +-- random input { [33554432]f64 } + +-- == +-- entry: f64_add f64_dot f64_triad +-- random input { [33554432]f64 [33554432]f64 } + +-- == +-- entry: f64_nstream +-- random input { [33554432]f64 [33554432]f64 [33554432]f64 } diff --git a/src/futhark/model.cmake b/src/futhark/model.cmake new file mode 100644 index 0000000..edd21fa --- /dev/null +++ b/src/futhark/model.cmake @@ -0,0 +1,55 @@ +# Use +# +# cmake -Bbuild -H. -DMODEL=futhark -DFUTHARK_BACKEND=foo -DFUTHARK_COMPILER=foo/bar/bin/futhark +# +# to use the Futhark backend, where 'foo' must be one of 'multicore', +# 'c', 'opencl', or 'cuda'. Defaults to 'multicore'. 
+# +# Use -DFUTHARK_COMPILER to set the path to the Futhark compiler +# binary. Defaults to 'futhark' on the PATH. + +register_flag_optional(FUTHARK_BACKEND + "Use a specific Futhark backend, possible options are: + - c + - multicore + - opencl + - cuda" + "multicore") + +register_flag_optional(FUTHARK_COMPILER + "Absolute path to the Futhark compiler, defaults to the futhark compiler on PATH" + "futhark") + +macro(setup) + add_custom_command( + OUTPUT + ${CMAKE_CURRENT_BINARY_DIR}/babelstream.c + ${CMAKE_CURRENT_BINARY_DIR}/babelstream.h + COMMAND ${FUTHARK_COMPILER} ${FUTHARK_BACKEND} + --library src/futhark/babelstream.fut + -o ${CMAKE_CURRENT_BINARY_DIR}/babelstream + DEPENDS src/futhark/babelstream.fut + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + VERBATIM + ) + if (${FUTHARK_BACKEND} STREQUAL "c") + # Nothing to do. + elseif (${FUTHARK_BACKEND} STREQUAL "multicore") + set(THREADS_PREFER_PTHREAD_FLAG ON) + find_package(Threads REQUIRED) + register_link_library(Threads::Threads) + elseif (${FUTHARK_BACKEND} STREQUAL "opencl") + find_package(OpenCL REQUIRED) + register_link_library(OpenCL::OpenCL) + elseif (${FUTHARK_BACKEND} STREQUAL "cuda") + find_package(CUDA REQUIRED) + register_link_library("nvrtc" "cuda" "cudart") + else () + message(FATAL_ERROR "Unsupported Futhark backend: ${FUTHARK_BACKEND}") + endif() +endmacro() + +macro(setup_target) + target_sources(${EXE_NAME} PUBLIC "${CMAKE_CURRENT_BINARY_DIR}/babelstream.c") + include_directories("${CMAKE_CURRENT_BINARY_DIR}") +endmacro() diff --git a/src/hip/HIPStream.cpp b/src/hip/HIPStream.cpp index fbc3b71..ed4ef77 100644 --- a/src/hip/HIPStream.cpp +++ b/src/hip/HIPStream.cpp @@ -9,7 +9,7 @@ #include "hip/hip_runtime.h" #define TBSIZE 1024 -#define DOT_NUM_BLOCKS 256 + void check_error(void) { @@ -45,34 +45,63 @@ HIPStream::HIPStream(const int ARRAY_SIZE, const int device_index) // Print out device information std::cout << "Using HIP device " << getDeviceName(device_index) << std::endl; std::cout << 
"Driver: " << getDeviceDriver(device_index) << std::endl; +#if defined(MANAGED) + std::cout << "Memory: MANAGED" << std::endl; +#elif defined(PAGEFAULT) + std::cout << "Memory: PAGEFAULT" << std::endl; +#else + std::cout << "Memory: DEFAULT" << std::endl; +#endif array_size = ARRAY_SIZE; + // Round dot_num_blocks up to next multiple of (TBSIZE * dot_elements_per_lane) + dot_num_blocks = (array_size + (TBSIZE * dot_elements_per_lane - 1)) / (TBSIZE * dot_elements_per_lane); - // Allocate the host array for partial sums for dot kernels - sums = (T*)malloc(sizeof(T) * DOT_NUM_BLOCKS); + size_t array_bytes = sizeof(T); + array_bytes *= ARRAY_SIZE; + size_t total_bytes = array_bytes * 3; + + // Allocate the host array for partial sums for dot kernels using hipHostMalloc. + // This creates an array on the host which is visible to the device. However, it requires + // synchronization (e.g. hipDeviceSynchronize) for the result to be available on the host + // after it has been passed through to a kernel. 
+ hipHostMalloc(&sums, sizeof(T) * dot_num_blocks, hipHostMallocNonCoherent); + check_error(); // Check buffers fit on the device hipDeviceProp_t props; hipGetDeviceProperties(&props, 0); - if (props.totalGlobalMem < 3*ARRAY_SIZE*sizeof(T)) + if (props.totalGlobalMem < std::size_t{3}*ARRAY_SIZE*sizeof(T)) throw std::runtime_error("Device does not have enough memory for all 3 buffers"); - // Create device buffers - hipMalloc(&d_a, ARRAY_SIZE*sizeof(T)); + // Create device buffers +#if defined(MANAGED) + hipMallocManaged(&d_a, array_bytes); check_error(); - hipMalloc(&d_b, ARRAY_SIZE*sizeof(T)); + hipMallocManaged(&d_b, array_bytes); check_error(); - hipMalloc(&d_c, ARRAY_SIZE*sizeof(T)); + hipMallocManaged(&d_c, array_bytes); check_error(); - hipMalloc(&d_sum, DOT_NUM_BLOCKS*sizeof(T)); +#elif defined(PAGEFAULT) + d_a = (T*)malloc(array_bytes); + d_b = (T*)malloc(array_bytes); + d_c = (T*)malloc(array_bytes); +#else + hipMalloc(&d_a, array_bytes); check_error(); + hipMalloc(&d_b, array_bytes); + check_error(); + hipMalloc(&d_c, array_bytes); + check_error(); +#endif } template HIPStream::~HIPStream() { - free(sums); + hipHostFree(sums); + check_error(); hipFree(d_a); check_error(); @@ -80,15 +109,13 @@ HIPStream::~HIPStream() check_error(); hipFree(d_c); check_error(); - hipFree(d_sum); - check_error(); } template __global__ void init_kernel(T * a, T * b, T * c, T initA, T initB, T initC) { - const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + const size_t i = blockDim.x * blockIdx.x + threadIdx.x; a[i] = initA; b[i] = initB; c[i] = initC; @@ -97,7 +124,7 @@ __global__ void init_kernel(T * a, T * b, T * c, T initA, T initB, T initC) template void HIPStream::init_arrays(T initA, T initB, T initC) { - hipLaunchKernelGGL(HIP_KERNEL_NAME(init_kernel), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_a, d_b, d_c, initA, initB, initC); + init_kernel<<>>(d_a, d_b, d_c, initA, initB, initC); check_error(); hipDeviceSynchronize(); check_error(); @@ -106,27 +133,37 
@@ void HIPStream::init_arrays(T initA, T initB, T initC) template void HIPStream::read_arrays(std::vector& a, std::vector& b, std::vector& c) { + // Copy device memory to host +#if defined(PAGEFAULT) || defined(MANAGED) + hipDeviceSynchronize(); + for (int i = 0; i < array_size; i++) + { + a[i] = d_a[i]; + b[i] = d_b[i]; + c[i] = d_c[i]; + } +#else hipMemcpy(a.data(), d_a, a.size()*sizeof(T), hipMemcpyDeviceToHost); check_error(); hipMemcpy(b.data(), d_b, b.size()*sizeof(T), hipMemcpyDeviceToHost); check_error(); hipMemcpy(c.data(), d_c, c.size()*sizeof(T), hipMemcpyDeviceToHost); check_error(); +#endif } - template __global__ void copy_kernel(const T * a, T * c) { - const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + const size_t i = threadIdx.x + blockIdx.x * blockDim.x; c[i] = a[i]; } template void HIPStream::copy() { - hipLaunchKernelGGL(HIP_KERNEL_NAME(copy_kernel), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_a, d_c); + copy_kernel<<>>(d_a, d_c); check_error(); hipDeviceSynchronize(); check_error(); @@ -136,14 +173,14 @@ template __global__ void mul_kernel(T * b, const T * c) { const T scalar = startScalar; - const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + const size_t i = threadIdx.x + blockIdx.x * blockDim.x; b[i] = scalar * c[i]; } template void HIPStream::mul() { - hipLaunchKernelGGL(HIP_KERNEL_NAME(mul_kernel), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_b, d_c); + mul_kernel<<>>(d_b, d_c); check_error(); hipDeviceSynchronize(); check_error(); @@ -152,14 +189,14 @@ void HIPStream::mul() template __global__ void add_kernel(const T * a, const T * b, T * c) { - const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + const size_t i = threadIdx.x + blockIdx.x * blockDim.x; c[i] = a[i] + b[i]; } template void HIPStream::add() { - hipLaunchKernelGGL(HIP_KERNEL_NAME(add_kernel), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_a, d_b, d_c); + add_kernel<<>>(d_a, d_b, d_c); check_error(); hipDeviceSynchronize(); 
check_error(); @@ -169,14 +206,14 @@ template __global__ void triad_kernel(T * a, const T * b, const T * c) { const T scalar = startScalar; - const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + const size_t i = threadIdx.x + blockIdx.x * blockDim.x; a[i] = b[i] + scalar * c[i]; } template void HIPStream::triad() { - hipLaunchKernelGGL(HIP_KERNEL_NAME(triad_kernel), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_a, d_b, d_c); + triad_kernel<<>>(d_a, d_b, d_c); check_error(); hipDeviceSynchronize(); check_error(); @@ -186,32 +223,32 @@ template __global__ void nstream_kernel(T * a, const T * b, const T * c) { const T scalar = startScalar; - const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + const size_t i = threadIdx.x + blockIdx.x * blockDim.x; a[i] += b[i] + scalar * c[i]; } template void HIPStream::nstream() { - hipLaunchKernelGGL(HIP_KERNEL_NAME(nstream_kernel), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_a, d_b, d_c); + nstream_kernel<<>>(d_a, d_b, d_c); check_error(); hipDeviceSynchronize(); check_error(); } -template +template __global__ void dot_kernel(const T * a, const T * b, T * sum, int array_size) { __shared__ T tb_sum[TBSIZE]; - int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; - const size_t local_i = hipThreadIdx_x; + const size_t local_i = threadIdx.x; + size_t i = blockDim.x * blockIdx.x + local_i; - tb_sum[local_i] = 0.0; - for (; i < array_size; i += hipBlockDim_x*hipGridDim_x) + tb_sum[local_i] = {}; + for (; i < array_size; i += blockDim.x*gridDim.x) tb_sum[local_i] += a[i] * b[i]; - for (int offset = hipBlockDim_x / 2; offset > 0; offset /= 2) + for (size_t offset = blockDim.x / 2; offset > 0; offset /= 2) { __syncthreads(); if (local_i < offset) @@ -221,20 +258,19 @@ __global__ void dot_kernel(const T * a, const T * b, T * sum, int array_size) } if (local_i == 0) - sum[hipBlockIdx_x] = tb_sum[local_i]; + sum[blockIdx.x] = tb_sum[local_i]; } template T HIPStream::dot() { - 
hipLaunchKernelGGL(HIP_KERNEL_NAME(dot_kernel), dim3(DOT_NUM_BLOCKS), dim3(TBSIZE), 0, 0, d_a, d_b, d_sum, array_size); + dot_kernel<<>>(d_a, d_b, sums, array_size); + check_error(); + hipDeviceSynchronize(); check_error(); - hipMemcpy(sums, d_sum, DOT_NUM_BLOCKS*sizeof(T), hipMemcpyDeviceToHost); - check_error(); - - T sum = 0.0; - for (int i = 0; i < DOT_NUM_BLOCKS; i++) + T sum{}; + for (int i = 0; i < dot_num_blocks; i++) sum += sums[i]; return sum; diff --git a/src/hip/HIPStream.h b/src/hip/HIPStream.h index 44a2893..3c603e0 100644 --- a/src/hip/HIPStream.h +++ b/src/hip/HIPStream.h @@ -14,13 +14,31 @@ #include "Stream.h" #define IMPLEMENTATION_STRING "HIP" +#define DOT_READ_DWORDS_PER_LANE 4 + template class HIPStream : public Stream { + // Make sure that either: + // DOT_READ_DWORDS_PER_LANE is less than sizeof(T), in which case we default to 1 element + // or + // DOT_READ_DWORDS_PER_LANE is divisible by sizeof(T) + static_assert((DOT_READ_DWORDS_PER_LANE * sizeof(unsigned int) < sizeof(T)) || + (DOT_READ_DWORDS_PER_LANE * sizeof(unsigned int) % sizeof(T) == 0), + "DOT_READ_DWORDS_PER_LANE not divisible by sizeof(element_type)"); + + // Take into account the datatype size + // That is, for 4 DOT_READ_DWORDS_PER_LANE, this is 2 FP64 elements + // and 4 FP32 elements + static constexpr unsigned int dot_elements_per_lane{ + (DOT_READ_DWORDS_PER_LANE * sizeof(unsigned int)) < sizeof(T) ? 
1 : ( + DOT_READ_DWORDS_PER_LANE * sizeof(unsigned int) / sizeof(T))}; + protected: // Size of arrays int array_size; + int dot_num_blocks; // Host array for partial sums for dot kernel T *sums; @@ -29,7 +47,6 @@ class HIPStream : public Stream T *d_a; T *d_b; T *d_c; - T *d_sum; public: diff --git a/src/hip/model.cmake b/src/hip/model.cmake index 78150c4..a63efec 100644 --- a/src/hip/model.cmake +++ b/src/hip/model.cmake @@ -2,6 +2,13 @@ register_flag_required(CMAKE_CXX_COMPILER "Absolute path to the AMD HIP C++ compiler") +register_flag_optional(MEM "Device memory mode: + DEFAULT - allocate host and device memory pointers. + MANAGED - use HIP Managed Memory. + PAGEFAULT - shared memory, only host pointers allocated." + "DEFAULT") + macro(setup) # nothing to do here as hipcc does everything correctly, what a surprise! + register_definitions(${MEM}) endmacro() \ No newline at end of file diff --git a/src/java/java-stream/pom.xml b/src/java/java-stream/pom.xml index d28a3d5..8cf229f 100644 --- a/src/java/java-stream/pom.xml +++ b/src/java/java-stream/pom.xml @@ -7,12 +7,12 @@ java-stream javastream - 4.0 + 5.0 UTF-8 UTF-8 - 5.7.2 + 5.9.2 @@ -27,19 +27,19 @@ com.beust jcommander - 1.81 + 1.82 tornado tornado-api - 0.9 + 0.15.1 com.aparapi aparapi - 2.0.0 + 3.0.0 diff --git a/src/java/java-stream/src/main/java/javastream/JavaStream.java b/src/java/java-stream/src/main/java/javastream/JavaStream.java index 7ab96cb..4fdb229 100644 --- a/src/java/java-stream/src/main/java/javastream/JavaStream.java +++ b/src/java/java-stream/src/main/java/javastream/JavaStream.java @@ -56,7 +56,7 @@ public abstract class JavaStream { protected abstract T dot(); - protected abstract Data data(); + protected abstract Data readArrays(); public static class EnumeratedStream extends JavaStream { @@ -113,8 +113,8 @@ public abstract class JavaStream { } @Override - public Data data() { - return actual.data(); + public Data readArrays() { + return actual.readArrays(); } } @@ -140,6 +140,14 @@ 
public abstract class JavaStream { return Duration.ofNanos(end - start); } + final Duration runInitArrays() { + return timed(this::initArrays); + } + + final SimpleImmutableEntry> runReadArrays() { + return timed(this::readArrays); + } + final SimpleImmutableEntry, T> runAll(int times) { Timings timings = new Timings<>(); T lastSum = null; diff --git a/src/java/java-stream/src/main/java/javastream/Main.java b/src/java/java-stream/src/main/java/javastream/Main.java index 2442128..ecd9499 100644 --- a/src/java/java-stream/src/main/java/javastream/Main.java +++ b/src/java/java-stream/src/main/java/javastream/Main.java @@ -128,6 +128,40 @@ public class Main { } } + @SuppressWarnings("unchecked") + static void showInit( + int totalBytes, double megaScale, Options opt, Duration init, Duration read) { + List> setup = + Arrays.asList( + new SimpleImmutableEntry<>("Init", durationToSeconds(init)), + new SimpleImmutableEntry<>("Read", durationToSeconds(read))); + if (opt.csv) { + tabulateCsv( + true, + setup.stream() + .map( + x -> + Arrays.asList( + new SimpleImmutableEntry<>("function", x.getKey()), + new SimpleImmutableEntry<>("n_elements", opt.arraysize + ""), + new SimpleImmutableEntry<>("sizeof", totalBytes + ""), + new SimpleImmutableEntry<>( + "max_m" + (opt.mibibytes ? "i" : "") + "bytes_per_sec", + ((megaScale * (double) totalBytes / x.getValue())) + ""), + new SimpleImmutableEntry<>("runtime", x.getValue() + ""))) + .toArray(List[]::new)); + } else { + for (Entry e : setup) { + System.out.printf( + "%s: %.5f s (%.5f M%sBytes/sec)%n", + e.getKey(), + e.getValue(), + megaScale * (double) totalBytes / e.getValue(), + opt.mibibytes ? 
"i" : ""); + } + } + } + static boolean run( String name, Config config, Function, JavaStream> mkStream) { @@ -183,35 +217,46 @@ public class Main { JavaStream stream = mkStream.apply(config); - stream.initArrays(); - + Duration init = stream.runInitArrays(); final boolean ok; switch (config.benchmark) { case ALL: - Entry, T> results = stream.runAll(opt.numtimes); - ok = checkSolutions(stream.data(), config, Optional.of(results.getValue())); - Timings timings = results.getKey(); - tabulateCsv( - opt.csv, - mkCsvRow(timings.copy, "Copy", 2 * arrayBytes, megaScale, opt), - mkCsvRow(timings.mul, "Mul", 2 * arrayBytes, megaScale, opt), - mkCsvRow(timings.add, "Add", 3 * arrayBytes, megaScale, opt), - mkCsvRow(timings.triad, "Triad", 3 * arrayBytes, megaScale, opt), - mkCsvRow(timings.dot, "Dot", 2 * arrayBytes, megaScale, opt)); - break; + { + Entry, T> results = stream.runAll(opt.numtimes); + SimpleImmutableEntry> read = stream.runReadArrays(); + showInit(totalBytes, megaScale, opt, init, read.getKey()); + ok = checkSolutions(read.getValue(), config, Optional.of(results.getValue())); + Timings timings = results.getKey(); + tabulateCsv( + opt.csv, + mkCsvRow(timings.copy, "Copy", 2 * arrayBytes, megaScale, opt), + mkCsvRow(timings.mul, "Mul", 2 * arrayBytes, megaScale, opt), + mkCsvRow(timings.add, "Add", 3 * arrayBytes, megaScale, opt), + mkCsvRow(timings.triad, "Triad", 3 * arrayBytes, megaScale, opt), + mkCsvRow(timings.dot, "Dot", 2 * arrayBytes, megaScale, opt)); + break; + } case NSTREAM: - List nstreamResults = stream.runNStream(opt.numtimes); - ok = checkSolutions(stream.data(), config, Optional.empty()); - tabulateCsv(opt.csv, mkCsvRow(nstreamResults, "Nstream", 4 * arrayBytes, megaScale, opt)); - break; + { + List nstreamResults = stream.runNStream(opt.numtimes); + SimpleImmutableEntry> read = stream.runReadArrays(); + showInit(totalBytes, megaScale, opt, init, read.getKey()); + ok = checkSolutions(read.getValue(), config, Optional.empty()); + 
tabulateCsv(opt.csv, mkCsvRow(nstreamResults, "Nstream", 4 * arrayBytes, megaScale, opt)); + break; + } case TRIAD: - Duration triadResult = stream.runTriad(opt.numtimes); - ok = checkSolutions(stream.data(), config, Optional.empty()); - int triadTotalBytes = 3 * arrayBytes * opt.numtimes; - double bandwidth = megaScale * (triadTotalBytes / durationToSeconds(triadResult)); - System.out.printf("Runtime (seconds): %.5f", durationToSeconds(triadResult)); - System.out.printf("Bandwidth (%s/s): %.3f ", gigaSuffix, bandwidth); - break; + { + Duration triadResult = stream.runTriad(opt.numtimes); + SimpleImmutableEntry> read = stream.runReadArrays(); + showInit(totalBytes, megaScale, opt, init, read.getKey()); + ok = checkSolutions(read.getValue(), config, Optional.empty()); + long triadTotalBytes = 3L * arrayBytes * opt.numtimes; // widened to long: the int product overflows for large arrays or many iterations + double bandwidth = megaScale * (triadTotalBytes / durationToSeconds(triadResult)); + System.out.printf("Runtime (seconds): %.5f", durationToSeconds(triadResult)); + System.out.printf("Bandwidth (%s/s): %.3f ", gigaSuffix, bandwidth); + break; + } default: throw new AssertionError(); } @@ -337,7 +382,7 @@ public class Main { } } - private static final String VERSION = "4.0"; + private static final String VERSION = "5.0"; private static final float START_SCALAR = 0.4f; private static final float START_A = 0.1f; diff --git a/src/java/java-stream/src/main/java/javastream/aparapi/AparapiStreams.java b/src/java/java-stream/src/main/java/javastream/aparapi/AparapiStreams.java index ab2de52..052c807 100644 --- a/src/java/java-stream/src/main/java/javastream/aparapi/AparapiStreams.java +++ b/src/java/java-stream/src/main/java/javastream/aparapi/AparapiStreams.java @@ -122,7 +122,7 @@ public final class AparapiStreams { } @Override - public Data data() { + public Data readArrays() { return kernels.syncAndDispose(); } } diff --git a/src/java/java-stream/src/main/java/javastream/jdk/GenericPlainStream.java 
b/src/java/java-stream/src/main/java/javastream/jdk/GenericPlainStream.java index 7f210fa..8075603 100644 --- a/src/java/java-stream/src/main/java/javastream/jdk/GenericPlainStream.java +++ b/src/java/java-stream/src/main/java/javastream/jdk/GenericPlainStream.java @@ -86,7 +86,7 @@ final class GenericPlainStream extends JavaStream { } @Override - public Data data() { + public Data readArrays() { return new Data<>(a, b, c); } } diff --git a/src/java/java-stream/src/main/java/javastream/jdk/GenericStream.java b/src/java/java-stream/src/main/java/javastream/jdk/GenericStream.java index 1e65b8f..3cacf3a 100644 --- a/src/java/java-stream/src/main/java/javastream/jdk/GenericStream.java +++ b/src/java/java-stream/src/main/java/javastream/jdk/GenericStream.java @@ -80,7 +80,7 @@ final class GenericStream extends JavaStream { } @Override - public Data data() { + public Data readArrays() { return new Data<>(a, b, c); } } diff --git a/src/java/java-stream/src/main/java/javastream/jdk/SpecialisedDoubleStream.java b/src/java/java-stream/src/main/java/javastream/jdk/SpecialisedDoubleStream.java index 26406a6..1b54bc3 100644 --- a/src/java/java-stream/src/main/java/javastream/jdk/SpecialisedDoubleStream.java +++ b/src/java/java-stream/src/main/java/javastream/jdk/SpecialisedDoubleStream.java @@ -78,7 +78,7 @@ final class SpecialisedDoubleStream extends JavaStream { } @Override - public Data data() { + public Data readArrays() { return new Data<>(boxed(a), boxed(b), boxed(c)); } } diff --git a/src/java/java-stream/src/main/java/javastream/jdk/SpecialisedFloatStream.java b/src/java/java-stream/src/main/java/javastream/jdk/SpecialisedFloatStream.java index 6c414c1..4d8c137 100644 --- a/src/java/java-stream/src/main/java/javastream/jdk/SpecialisedFloatStream.java +++ b/src/java/java-stream/src/main/java/javastream/jdk/SpecialisedFloatStream.java @@ -78,7 +78,7 @@ final class SpecialisedFloatStream extends JavaStream { } @Override - public Data data() { + public Data readArrays() { 
return new Data<>(boxed(a), boxed(b), boxed(c)); } } diff --git a/src/java/java-stream/src/main/java/javastream/jdk/SpecialisedPlainDoubleStream.java b/src/java/java-stream/src/main/java/javastream/jdk/SpecialisedPlainDoubleStream.java index afda2ef..c4f38d0 100644 --- a/src/java/java-stream/src/main/java/javastream/jdk/SpecialisedPlainDoubleStream.java +++ b/src/java/java-stream/src/main/java/javastream/jdk/SpecialisedPlainDoubleStream.java @@ -78,7 +78,7 @@ final class SpecialisedPlainDoubleStream extends JavaStream { } @Override - public Data data() { + public Data readArrays() { return new Data<>(boxed(a), boxed(b), boxed(c)); } } diff --git a/src/java/java-stream/src/main/java/javastream/jdk/SpecialisedPlainFloatStream.java b/src/java/java-stream/src/main/java/javastream/jdk/SpecialisedPlainFloatStream.java index 9ccee53..5178ed2 100644 --- a/src/java/java-stream/src/main/java/javastream/jdk/SpecialisedPlainFloatStream.java +++ b/src/java/java-stream/src/main/java/javastream/jdk/SpecialisedPlainFloatStream.java @@ -78,7 +78,7 @@ final class SpecialisedPlainFloatStream extends JavaStream { } @Override - public Data data() { + public Data readArrays() { return new Data<>(boxed(a), boxed(b), boxed(c)); } } diff --git a/src/java/java-stream/src/main/java/javastream/tornadovm/GenericTornadoVMStream.java b/src/java/java-stream/src/main/java/javastream/tornadovm/GenericTornadoVMStream.java index d936df6..a65c32a 100644 --- a/src/java/java-stream/src/main/java/javastream/tornadovm/GenericTornadoVMStream.java +++ b/src/java/java-stream/src/main/java/javastream/tornadovm/GenericTornadoVMStream.java @@ -4,8 +4,8 @@ import java.util.List; import java.util.stream.Collectors; import javastream.JavaStream; import javastream.Main.Config; -import uk.ac.manchester.tornado.api.TaskSchedule; -import uk.ac.manchester.tornado.api.TornadoRuntimeCI; +import uk.ac.manchester.tornado.api.TornadoExecutionPlan; +import uk.ac.manchester.tornado.api.TornadoRuntimeInterface; import 
uk.ac.manchester.tornado.api.common.TornadoDevice; import uk.ac.manchester.tornado.api.runtime.TornadoRuntime; @@ -13,18 +13,18 @@ abstract class GenericTornadoVMStream extends JavaStream { protected final TornadoDevice device; - protected TaskSchedule copyTask; - protected TaskSchedule mulTask; - protected TaskSchedule addTask; - protected TaskSchedule triadTask; - protected TaskSchedule nstreamTask; - protected TaskSchedule dotTask; + protected TornadoExecutionPlan copyTask; + protected TornadoExecutionPlan mulTask; + protected TornadoExecutionPlan addTask; + protected TornadoExecutionPlan triadTask; + protected TornadoExecutionPlan nstreamTask; + protected TornadoExecutionPlan dotTask; GenericTornadoVMStream(Config config) { super(config); try { - TornadoRuntimeCI runtime = TornadoRuntime.getTornadoRuntime(); + TornadoRuntimeInterface runtime = TornadoRuntime.getTornadoRuntime(); List devices = TornadoVMStreams.enumerateDevices(runtime); device = devices.get(config.options.device); @@ -42,10 +42,6 @@ abstract class GenericTornadoVMStream extends JavaStream { } } - protected static TaskSchedule mkSchedule() { - return new TaskSchedule(""); - } - @Override public List listDevices() { return TornadoVMStreams.enumerateDevices(TornadoRuntime.getTornadoRuntime()).stream() @@ -55,12 +51,12 @@ abstract class GenericTornadoVMStream extends JavaStream { @Override public void initArrays() { - this.copyTask.warmup(); - this.mulTask.warmup(); - this.addTask.warmup(); - this.triadTask.warmup(); - this.nstreamTask.warmup(); - this.dotTask.warmup(); + this.copyTask.withWarmUp(); + this.mulTask.withWarmUp(); + this.addTask.withWarmUp(); + this.triadTask.withWarmUp(); + this.nstreamTask.withWarmUp(); + this.dotTask.withWarmUp(); } @Override diff --git a/src/java/java-stream/src/main/java/javastream/tornadovm/SpecialisedDouble.java b/src/java/java-stream/src/main/java/javastream/tornadovm/SpecialisedDouble.java index 7712e31..c10153e 100644 --- 
a/src/java/java-stream/src/main/java/javastream/tornadovm/SpecialisedDouble.java +++ b/src/java/java-stream/src/main/java/javastream/tornadovm/SpecialisedDouble.java @@ -2,8 +2,11 @@ package javastream.tornadovm; import java.util.Arrays; import javastream.Main.Config; +import uk.ac.manchester.tornado.api.TaskGraph; +import uk.ac.manchester.tornado.api.TornadoExecutionPlan; import uk.ac.manchester.tornado.api.annotations.Parallel; import uk.ac.manchester.tornado.api.annotations.Reduce; +import uk.ac.manchester.tornado.api.enums.DataTransferMode; final class SpecialisedDouble extends GenericTornadoVMStream { @@ -49,7 +52,7 @@ final class SpecialisedDouble extends GenericTornadoVMStream { private final double[] a, b, c; private final double[] dotSum; - @SuppressWarnings({"PrimitiveArrayArgumentToVarargsMethod", "DuplicatedCode"}) + @SuppressWarnings({"DuplicatedCode"}) SpecialisedDouble(Config config) { super(config); final int size = config.options.arraysize; @@ -58,12 +61,43 @@ final class SpecialisedDouble extends GenericTornadoVMStream { b = new double[size]; c = new double[size]; dotSum = new double[1]; - this.copyTask = mkSchedule().task("", SpecialisedDouble::copy, size, a, c); - this.mulTask = mkSchedule().task("", SpecialisedDouble::mul, size, b, c, scalar); - this.addTask = mkSchedule().task("", SpecialisedDouble::add, size, a, b, c); - this.triadTask = mkSchedule().task("", SpecialisedDouble::triad, size, a, b, c, scalar); - this.nstreamTask = mkSchedule().task("", SpecialisedDouble::nstream, size, a, b, c, scalar); - this.dotTask = mkSchedule().task("", SpecialisedDouble::dot_, a, b, dotSum).streamOut(dotSum); + this.copyTask = + new TornadoExecutionPlan( + new TaskGraph("copy") + .task("copy", SpecialisedDouble::copy, size, a, c) + .transferToDevice(DataTransferMode.FIRST_EXECUTION, a, c) + .snapshot()); + this.mulTask = + new TornadoExecutionPlan( + new TaskGraph("mul") + .task("mul", SpecialisedDouble::mul, size, b, c, scalar) + 
.transferToDevice(DataTransferMode.FIRST_EXECUTION, b, c) + .snapshot()); + this.addTask = + new TornadoExecutionPlan( + new TaskGraph("add") + .task("add", SpecialisedDouble::add, size, a, b, c) + .transferToDevice(DataTransferMode.FIRST_EXECUTION, a, b, c) + .snapshot()); + this.triadTask = + new TornadoExecutionPlan( + new TaskGraph("triad") + .task("triad", SpecialisedDouble::triad, size, a, b, c, scalar) + .transferToDevice(DataTransferMode.FIRST_EXECUTION, a, b, c) + .snapshot()); + this.nstreamTask = + new TornadoExecutionPlan( + new TaskGraph("nstream") + .task("nstream", SpecialisedDouble::nstream, size, a, b, c, scalar) + .transferToDevice(DataTransferMode.FIRST_EXECUTION, a, b, c) + .snapshot()); + this.dotTask = + new TornadoExecutionPlan( + new TaskGraph("dot") + .task("dot", SpecialisedDouble::dot_, a, b, dotSum) + .transferToDevice(DataTransferMode.FIRST_EXECUTION, a, b) + .transferToHost(DataTransferMode.EVERY_EXECUTION, new Object[] {dotSum}) + .snapshot()); } @Override @@ -72,7 +106,7 @@ final class SpecialisedDouble extends GenericTornadoVMStream { Arrays.fill(a, config.initA); Arrays.fill(b, config.initB); Arrays.fill(c, config.initC); - TornadoVMStreams.xferToDevice(device, a, b, c); + TornadoVMStreams.allocAndXferToDevice(device, a, b, c); } @Override @@ -81,7 +115,7 @@ final class SpecialisedDouble extends GenericTornadoVMStream { } @Override - public Data data() { + public Data readArrays() { TornadoVMStreams.xferFromDevice(device, a, b, c); return new Data<>(boxed(a), boxed(b), boxed(c)); } diff --git a/src/java/java-stream/src/main/java/javastream/tornadovm/SpecialisedFloat.java b/src/java/java-stream/src/main/java/javastream/tornadovm/SpecialisedFloat.java index e61cfe9..0f3fffa 100644 --- a/src/java/java-stream/src/main/java/javastream/tornadovm/SpecialisedFloat.java +++ b/src/java/java-stream/src/main/java/javastream/tornadovm/SpecialisedFloat.java @@ -2,8 +2,11 @@ package javastream.tornadovm; import java.util.Arrays; import 
javastream.Main.Config; +import uk.ac.manchester.tornado.api.TaskGraph; +import uk.ac.manchester.tornado.api.TornadoExecutionPlan; import uk.ac.manchester.tornado.api.annotations.Parallel; import uk.ac.manchester.tornado.api.annotations.Reduce; +import uk.ac.manchester.tornado.api.enums.DataTransferMode; final class SpecialisedFloat extends GenericTornadoVMStream { @@ -49,7 +52,7 @@ final class SpecialisedFloat extends GenericTornadoVMStream { private final float[] a, b, c; private final float[] dotSum; - @SuppressWarnings({"PrimitiveArrayArgumentToVarargsMethod", "DuplicatedCode"}) + @SuppressWarnings({"DuplicatedCode"}) SpecialisedFloat(Config config) { super(config); final int size = config.options.arraysize; @@ -58,12 +61,43 @@ final class SpecialisedFloat extends GenericTornadoVMStream { b = new float[size]; c = new float[size]; dotSum = new float[1]; - this.copyTask = mkSchedule().task("", SpecialisedFloat::copy, size, a, c); - this.mulTask = mkSchedule().task("", SpecialisedFloat::mul, size, b, c, scalar); - this.addTask = mkSchedule().task("", SpecialisedFloat::add, size, a, b, c); - this.triadTask = mkSchedule().task("", SpecialisedFloat::triad, size, a, b, c, scalar); - this.nstreamTask = mkSchedule().task("", SpecialisedFloat::nstream, size, a, b, c, scalar); - this.dotTask = mkSchedule().task("", SpecialisedFloat::dot_, a, b, dotSum).streamOut(dotSum); + this.copyTask = + new TornadoExecutionPlan( + new TaskGraph("copy") + .task("copy", SpecialisedFloat::copy, size, a, c) + .transferToDevice(DataTransferMode.FIRST_EXECUTION, a, c) + .snapshot()); + this.mulTask = + new TornadoExecutionPlan( + new TaskGraph("mul") + .task("mul", SpecialisedFloat::mul, size, b, c, scalar) + .transferToDevice(DataTransferMode.FIRST_EXECUTION, b, c) + .snapshot()); + this.addTask = + new TornadoExecutionPlan( + new TaskGraph("add") + .task("add", SpecialisedFloat::add, size, a, b, c) + .transferToDevice(DataTransferMode.FIRST_EXECUTION, a, b, c) + .snapshot()); + 
this.triadTask = + new TornadoExecutionPlan( + new TaskGraph("triad") + .task("triad", SpecialisedFloat::triad, size, a, b, c, scalar) + .transferToDevice(DataTransferMode.FIRST_EXECUTION, a, b, c) + .snapshot()); + this.nstreamTask = + new TornadoExecutionPlan( + new TaskGraph("nstream") + .task("nstream", SpecialisedFloat::nstream, size, a, b, c, scalar) + .transferToDevice(DataTransferMode.FIRST_EXECUTION, a, b, c) + .snapshot()); + this.dotTask = + new TornadoExecutionPlan( + new TaskGraph("dot") + .task("dot", SpecialisedFloat::dot_, a, b, dotSum) + .transferToDevice(DataTransferMode.FIRST_EXECUTION, a, b) + .transferToHost(DataTransferMode.EVERY_EXECUTION, new Object[] {dotSum}) + .snapshot()); } @Override @@ -72,7 +106,7 @@ final class SpecialisedFloat extends GenericTornadoVMStream { Arrays.fill(a, config.initA); Arrays.fill(b, config.initB); Arrays.fill(c, config.initC); - TornadoVMStreams.xferToDevice(device, a, b, c); + TornadoVMStreams.allocAndXferToDevice(device, a, b, c); } @Override @@ -81,7 +115,7 @@ final class SpecialisedFloat extends GenericTornadoVMStream { } @Override - public Data data() { + public Data readArrays() { TornadoVMStreams.xferFromDevice(device, a, b, c); return new Data<>(boxed(a), boxed(b), boxed(c)); } diff --git a/src/java/java-stream/src/main/java/javastream/tornadovm/TornadoVMStreams.java b/src/java/java-stream/src/main/java/javastream/tornadovm/TornadoVMStreams.java index 68eecad..a43c7c8 100644 --- a/src/java/java-stream/src/main/java/javastream/tornadovm/TornadoVMStreams.java +++ b/src/java/java-stream/src/main/java/javastream/tornadovm/TornadoVMStreams.java @@ -1,36 +1,46 @@ package javastream.tornadovm; +import java.util.Arrays; import java.util.List; import java.util.function.Function; import java.util.stream.Collectors; import java.util.stream.IntStream; import javastream.JavaStream; import javastream.Main.Config; -import uk.ac.manchester.tornado.api.TornadoRuntimeCI; +import 
uk.ac.manchester.tornado.api.TornadoRuntimeInterface; +import uk.ac.manchester.tornado.api.common.Event; import uk.ac.manchester.tornado.api.common.TornadoDevice; -import uk.ac.manchester.tornado.api.mm.TornadoGlobalObjectState; +import uk.ac.manchester.tornado.api.memory.TornadoDeviceObjectState; +import uk.ac.manchester.tornado.api.memory.TornadoGlobalObjectState; import uk.ac.manchester.tornado.api.runtime.TornadoRuntime; public final class TornadoVMStreams { private TornadoVMStreams() {} - static void xferToDevice(TornadoDevice device, Object... xs) { + static void allocAndXferToDevice(TornadoDevice device, Object... xs) { for (Object x : xs) { TornadoGlobalObjectState state = TornadoRuntime.getTornadoRuntime().resolveObject(x); + device.allocateObjects( + new Object[] {x}, 0, new TornadoDeviceObjectState[] {state.getDeviceState(device)}); List writeEvent = device.ensurePresent(x, state.getDeviceState(device), null, 0, 0); if (writeEvent != null) writeEvent.forEach(e -> device.resolveEvent(e).waitOn()); } } static void xferFromDevice(TornadoDevice device, Object... 
xs) { - for (Object x : xs) { - TornadoGlobalObjectState state = TornadoRuntime.getTornadoRuntime().resolveObject(x); - device.resolveEvent(device.streamOut(x, 0, state.getDeviceState(device), null)).waitOn(); - } + Arrays.stream(xs) + .map( + x -> { + TornadoGlobalObjectState state = TornadoRuntime.getTornadoRuntime().resolveObject(x); + return device.resolveEvent( + device.streamOut(x, 0, state.getDeviceState(device), null)); + }) + .collect(Collectors.toList()) + .forEach(Event::waitOn); } - static List enumerateDevices(TornadoRuntimeCI runtime) { + static List enumerateDevices(TornadoRuntimeInterface runtime) { return IntStream.range(0, runtime.getNumDrivers()) .mapToObj(runtime::getDriver) .flatMap(d -> IntStream.range(0, d.getDeviceCount()).mapToObj(d::getDevice)) diff --git a/src/julia/JuliaStream.jl/AMDGPU/Manifest.toml b/src/julia/JuliaStream.jl/AMDGPU/Manifest.toml index 170213c..9415ddc 100644 --- a/src/julia/JuliaStream.jl/AMDGPU/Manifest.toml +++ b/src/julia/JuliaStream.jl/AMDGPU/Manifest.toml @@ -1,415 +1,423 @@ # This file is machine-generated - editing it directly is not advised -[[AMDGPU]] -deps = ["AbstractFFTs", "Adapt", "BinaryProvider", "CEnum", "GPUArrays", "GPUCompiler", "HIP_jll", "LLVM", "Libdl", "LinearAlgebra", "MacroTools", "Pkg", "Printf", "ROCmDeviceLibs_jll", "Random", "Requires", "Setfield", "Statistics", "hsa_rocr_jll"] -git-tree-sha1 = "d59f1cf3f90ae6cf6626e8a21f337850cb3792f7" +julia_version = "1.9.3" +manifest_format = "2.0" +project_hash = "05982ec0602af8ada9509107382dd6c8b21db9b9" + +[[deps.AMDGPU]] +deps = ["AbstractFFTs", "Adapt", "CEnum", "ExprTools", "GPUArrays", "GPUCompiler", "KernelAbstractions", "LLD_jll", "LLVM", "LLVM_jll", "Libdl", "LinearAlgebra", "MacroTools", "Pkg", "Preferences", "Printf", "Random", "SparseArrays", "SpecialFunctions", "Statistics", "UnsafeAtomicsLLVM"] +git-tree-sha1 = "95437cf4c0ad651ca8463475de8af6a6935e23bd" uuid = "21141c5a-9bdb-4563-92ae-f87d6854732e" -version = "0.2.17" +version = "0.6.1" 
-[[AbstractFFTs]] +[[deps.AbstractFFTs]] deps = ["LinearAlgebra"] -git-tree-sha1 = "485ee0867925449198280d4af84bdb46a2a404d0" +git-tree-sha1 = "d92ad398961a3ed262d8bf04a1a2b8340f915fef" uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c" -version = "1.0.1" +version = "1.5.0" -[[Adapt]] -deps = ["LinearAlgebra"] -git-tree-sha1 = "84918055d15b3114ede17ac6a7182f68870c16f7" + [deps.AbstractFFTs.extensions] + AbstractFFTsChainRulesCoreExt = "ChainRulesCore" + AbstractFFTsTestExt = "Test" + + [deps.AbstractFFTs.weakdeps] + ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" + Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" + +[[deps.Adapt]] +deps = ["LinearAlgebra", "Requires"] +git-tree-sha1 = "76289dc51920fdc6e0013c872ba9551d54961c24" uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" -version = "3.3.1" +version = "3.6.2" +weakdeps = ["StaticArrays"] -[[ArgParse]] + [deps.Adapt.extensions] + AdaptStaticArraysExt = "StaticArrays" + +[[deps.ArgParse]] deps = ["Logging", "TextWrap"] git-tree-sha1 = "3102bce13da501c9104df33549f511cd25264d7d" uuid = "c7e460c6-2fb9-53a9-8c5b-16f535851c63" version = "1.1.4" -[[ArgTools]] +[[deps.ArgTools]] uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f" +version = "1.1.1" -[[Artifacts]] +[[deps.Artifacts]] uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" -[[Base64]] +[[deps.Atomix]] +deps = ["UnsafeAtomics"] +git-tree-sha1 = "c06a868224ecba914baa6942988e2f2aade419be" +uuid = "a9b6321e-bd34-4604-b9c9-b65b8de01458" +version = "0.1.0" + +[[deps.Base64]] uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" -[[BinaryProvider]] -deps = ["Libdl", "Logging", "SHA"] -git-tree-sha1 = "ecdec412a9abc8db54c0efc5548c64dfce072058" -uuid = "b99e7846-7c00-51b0-8f62-c81ae34c0232" -version = "0.5.10" - -[[Bzip2_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "19a35467a82e236ff51bc17a3a44b69ef35185a2" -uuid = "6e34b625-4abd-537c-b88f-471c36dfa7a0" -version = "1.0.8+0" - -[[CEnum]] -git-tree-sha1 = "215a9aa4a1f23fbd05b92769fdd62559488d70e9" +[[deps.CEnum]] 
+git-tree-sha1 = "eb4cb44a499229b3b8426dcfb5dd85333951ff90" uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82" -version = "0.4.1" +version = "0.4.2" -[[ConstructionBase]] -deps = ["LinearAlgebra"] -git-tree-sha1 = "f74e9d5388b8620b4cee35d4c5a618dd4dc547f4" -uuid = "187b0558-2788-49d3-abe0-74a17ed4e7c9" -version = "1.3.0" +[[deps.CompilerSupportLibraries_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae" +version = "1.0.5+0" -[[Dates]] +[[deps.Dates]] deps = ["Printf"] uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" -[[Downloads]] -deps = ["ArgTools", "LibCURL", "NetworkOptions"] +[[deps.DocStringExtensions]] +deps = ["LibGit2"] +git-tree-sha1 = "2fb1e02f2b635d0845df5d7c167fec4dd739b00d" +uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae" +version = "0.9.3" + +[[deps.Downloads]] +deps = ["ArgTools", "FileWatching", "LibCURL", "NetworkOptions"] uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6" +version = "1.6.0" -[[Elfutils_jll]] -deps = ["Artifacts", "Bzip2_jll", "JLLWrappers", "Libdl", "Pkg", "XZ_jll", "Zlib_jll", "argp_standalone_jll", "fts_jll", "obstack_jll"] -git-tree-sha1 = "8f9fcde6d89b0a3ca51cb2028beab462705c5436" -uuid = "ab5a07f8-06af-567f-a878-e8bb879eba5a" -version = "0.182.0+0" - -[[ExprTools]] -git-tree-sha1 = "b7e3d17636b348f005f11040025ae8c6f645fe92" +[[deps.ExprTools]] +git-tree-sha1 = "27415f162e6028e81c72b82ef756bf321213b6ec" uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04" -version = "0.1.6" +version = "0.1.10" -[[Future]] -deps = ["Random"] -uuid = "9fa8497b-333b-5362-9e8d-4d0656e87820" +[[deps.FileWatching]] +uuid = "7b1f6079-737a-58dc-b8bc-7a2ca5c1b5ee" -[[GPUArrays]] -deps = ["Adapt", "LinearAlgebra", "Printf", "Random", "Serialization", "Statistics"] -git-tree-sha1 = "7772508f17f1d482fe0df72cabc5b55bec06bbe0" +[[deps.GPUArrays]] +deps = ["Adapt", "GPUArraysCore", "LLVM", "LinearAlgebra", "Printf", "Random", "Reexport", "Serialization", "Statistics"] +git-tree-sha1 = "8ad8f375ae365aa1eb2f42e2565a40b55a4b69a8" uuid = 
"0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" -version = "8.1.2" +version = "9.0.0" -[[GPUCompiler]] -deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "TimerOutputs", "UUIDs"] -git-tree-sha1 = "4ed2616d5e656c8716736b64da86755467f26cf5" +[[deps.GPUArraysCore]] +deps = ["Adapt"] +git-tree-sha1 = "2d6ca471a6c7b536127afccfa7564b5b39227fe0" +uuid = "46192b85-c4d5-4398-a991-12ede77f4527" +version = "0.1.5" + +[[deps.GPUCompiler]] +deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "Scratch", "TimerOutputs", "UUIDs"] +git-tree-sha1 = "5e4487558477f191c043166f8301dd0b4be4e2b2" uuid = "61eb1bfa-7361-4325-ad38-22787b887f55" -version = "0.12.9" +version = "0.24.5" -[[HIP_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "ROCmCompilerSupport_jll", "ROCmDeviceLibs_jll", "ROCmOpenCLRuntime_jll", "hsa_rocr_jll"] -git-tree-sha1 = "5097d8f7b6842156ab0928371b3d03fefd8decab" -uuid = "2696aab5-0948-5276-aa9a-2a86a37016b8" -version = "4.0.0+1" - -[[InteractiveUtils]] +[[deps.InteractiveUtils]] deps = ["Markdown"] uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" -[[JLLWrappers]] -deps = ["Preferences"] -git-tree-sha1 = "642a199af8b68253517b80bd3bfd17eb4e84df6e" +[[deps.IrrationalConstants]] +git-tree-sha1 = "630b497eafcc20001bba38a4651b327dcfc491d2" +uuid = "92d709cd-6900-40b7-9082-c6be49f344b6" +version = "0.2.2" + +[[deps.JLLWrappers]] +deps = ["Artifacts", "Preferences"] +git-tree-sha1 = "7e5d6779a1e09a36db2a7b6cff50942a0a7d0fca" uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210" -version = "1.3.0" +version = "1.5.0" -[[LLVM]] +[[deps.KernelAbstractions]] +deps = ["Adapt", "Atomix", "InteractiveUtils", "LinearAlgebra", "MacroTools", "PrecompileTools", "Requires", "SparseArrays", "StaticArrays", "UUIDs", "UnsafeAtomics", "UnsafeAtomicsLLVM"] +git-tree-sha1 = "4c5875e4c228247e1c2b087669846941fb6e0118" +uuid = "63c18a36-062a-441e-b654-da1e3ab1ce7c" +version = "0.9.8" + + [deps.KernelAbstractions.extensions] + EnzymeExt = "EnzymeCore" + + 
[deps.KernelAbstractions.weakdeps] + EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869" + +[[deps.LLD_jll]] +deps = ["Artifacts", "Libdl", "Pkg", "TOML", "Zlib_jll", "libLLVM_jll"] +uuid = "d55e3150-da41-5e91-b323-ecfd1eec6109" +version = "14.0.6+3" + +[[deps.LLVM]] deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Printf", "Unicode"] -git-tree-sha1 = "7cc22e69995e2329cc047a879395b2b74647ab5f" +git-tree-sha1 = "a9d2ce1d5007b1e8f6c5b89c5a31ff8bd146db5c" uuid = "929cbde3-209d-540e-8aea-75f648917ca0" -version = "4.7.0" +version = "6.2.1" -[[LLVMExtra_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "c5fc4bef251ecd37685bea1c4068a9cfa41e8b9a" +[[deps.LLVMExtra_jll]] +deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML"] +git-tree-sha1 = "7ca6850ae880cc99b59b88517545f91a52020afa" uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab" -version = "0.0.13+0" +version = "0.0.25+0" -[[LibCURL]] +[[deps.LLVM_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "TOML", "Zlib_jll", "libLLVM_jll"] +git-tree-sha1 = "c5131b433876973cf29a2d9ec426cc099567e68c" +uuid = "86de99a1-58d6-5da7-8064-bd56ce2e322c" +version = "14.0.6+4" + +[[deps.LazyArtifacts]] +deps = ["Artifacts", "Pkg"] +uuid = "4af54fe1-eca0-43a8-85a7-787d91b784e3" + +[[deps.LibCURL]] deps = ["LibCURL_jll", "MozillaCACerts_jll"] uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21" +version = "0.6.3" -[[LibCURL_jll]] +[[deps.LibCURL_jll]] deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"] uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0" +version = "7.84.0+0" -[[LibGit2]] +[[deps.LibGit2]] deps = ["Base64", "NetworkOptions", "Printf", "SHA"] uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" -[[LibSSH2_jll]] +[[deps.LibSSH2_jll]] deps = ["Artifacts", "Libdl", "MbedTLS_jll"] uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8" +version = "1.10.2+0" -[[Libdl]] +[[deps.Libdl]] uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" -[[Libgcrypt_jll]] -deps = ["Artifacts", "JLLWrappers", 
"Libdl", "Libgpg_error_jll", "Pkg"] -git-tree-sha1 = "64613c82a59c120435c067c2b809fc61cf5166ae" -uuid = "d4300ac3-e22c-5743-9152-c294e39db1e4" -version = "1.8.7+0" - -[[Libglvnd_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libX11_jll", "Xorg_libXext_jll"] -git-tree-sha1 = "7739f837d6447403596a75d19ed01fd08d6f56bf" -uuid = "7e76a0d4-f3c7-5321-8279-8d96eeed0f29" -version = "1.3.0+3" - -[[Libgpg_error_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "c333716e46366857753e273ce6a69ee0945a6db9" -uuid = "7add5ba3-2f88-524e-9cd5-f83b8a55f7b8" -version = "1.42.0+0" - -[[Libiconv_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "42b62845d70a619f063a7da093d995ec8e15e778" -uuid = "94ce4f54-9a6c-5748-9c1c-f9c7231a4531" -version = "1.16.1+1" - -[[LinearAlgebra]] -deps = ["Libdl"] +[[deps.LinearAlgebra]] +deps = ["Libdl", "OpenBLAS_jll", "libblastrampoline_jll"] uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" -[[Logging]] +[[deps.LogExpFunctions]] +deps = ["DocStringExtensions", "IrrationalConstants", "LinearAlgebra"] +git-tree-sha1 = "7d6dd4e9212aebaeed356de34ccf262a3cd415aa" +uuid = "2ab3a3ac-af41-5b50-aa03-7779005ae688" +version = "0.3.26" + + [deps.LogExpFunctions.extensions] + LogExpFunctionsChainRulesCoreExt = "ChainRulesCore" + LogExpFunctionsChangesOfVariablesExt = "ChangesOfVariables" + LogExpFunctionsInverseFunctionsExt = "InverseFunctions" + + [deps.LogExpFunctions.weakdeps] + ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" + ChangesOfVariables = "9e997f8a-9a97-42d5-a9f1-ce6bfc15e2c0" + InverseFunctions = "3587e190-3f89-42d0-90ee-14403ec27112" + +[[deps.Logging]] uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" -[[MacroTools]] +[[deps.MacroTools]] deps = ["Markdown", "Random"] -git-tree-sha1 = "3d3e902b31198a27340d0bf00d6ac452866021cf" +git-tree-sha1 = "9ee1618cbf5240e6d4e0371d6f24065083f60c48" uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" -version = "0.5.9" +version = "0.5.11" -[[Markdown]] 
+[[deps.Markdown]] deps = ["Base64"] uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" -[[MbedTLS_jll]] +[[deps.MbedTLS_jll]] deps = ["Artifacts", "Libdl"] uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1" +version = "2.28.2+0" -[[MozillaCACerts_jll]] +[[deps.MozillaCACerts_jll]] uuid = "14a3606d-f60d-562e-9121-12d972cd8159" +version = "2022.10.11" -[[NUMA_jll]] -deps = ["Libdl", "Pkg"] -git-tree-sha1 = "778f9bd14400cff2c32ed357e12766ac0e3d766e" -uuid = "7f51dc2b-bb24-59f8-b771-bb1490e4195d" -version = "2.0.13+1" - -[[NetworkOptions]] +[[deps.NetworkOptions]] uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908" +version = "1.2.0" -[[OrderedCollections]] -git-tree-sha1 = "85f8e6578bf1f9ee0d11e7bb1b1456435479d47c" +[[deps.OpenBLAS_jll]] +deps = ["Artifacts", "CompilerSupportLibraries_jll", "Libdl"] +uuid = "4536629a-c528-5b80-bd46-f80d51c5b363" +version = "0.3.21+4" + +[[deps.OpenLibm_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "05823500-19ac-5b8b-9628-191a04bc5112" +version = "0.8.1+0" + +[[deps.OpenSpecFun_jll]] +deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "13652491f6856acfd2db29360e1bbcd4565d04f1" +uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e" +version = "0.5.5+0" + +[[deps.OrderedCollections]] +git-tree-sha1 = "2e73fe17cac3c62ad1aebe70d44c963c3cfdc3e3" uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" -version = "1.4.1" +version = "1.6.2" -[[Parameters]] +[[deps.Parameters]] deps = ["OrderedCollections", "UnPack"] git-tree-sha1 = "34c0e9ad262e5f7fc75b10a9952ca7692cfc5fbe" uuid = "d96e819e-fc66-5662-9728-84c9c7592b0a" version = "0.12.3" -[[Pkg]] -deps = ["Artifacts", "Dates", "Downloads", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"] +[[deps.Pkg]] +deps = ["Artifacts", "Dates", "Downloads", "FileWatching", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"] uuid 
= "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" +version = "1.9.2" -[[Preferences]] +[[deps.PrecompileTools]] +deps = ["Preferences"] +git-tree-sha1 = "03b4c25b43cb84cee5c90aa9b5ea0a78fd848d2f" +uuid = "aea7be01-6a6a-4083-8856-8a6e6704d82a" +version = "1.2.0" + +[[deps.Preferences]] deps = ["TOML"] -git-tree-sha1 = "00cfd92944ca9c760982747e9a1d0d5d86ab1e5a" +git-tree-sha1 = "00805cd429dcb4870060ff49ef443486c262e38e" uuid = "21216c6a-2e73-6563-6e65-726566657250" -version = "1.2.2" +version = "1.4.1" -[[Printf]] +[[deps.Printf]] deps = ["Unicode"] uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" -[[REPL]] +[[deps.REPL]] deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"] uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" -[[ROCmCompilerSupport_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "ROCmDeviceLibs_jll", "hsa_rocr_jll"] -git-tree-sha1 = "56ddcfb5d8b60c9f8c1bc619886f8d363fd1926d" -uuid = "8fbdd1d2-db62-5cd0-981e-905da1486e17" -version = "4.0.0+1" - -[[ROCmDeviceLibs_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Zlib_jll"] -git-tree-sha1 = "d764f0f28b5af89aa004871a6a38e5d061f77257" -uuid = "873c0968-716b-5aa7-bb8d-d1e2e2aeff2d" -version = "4.0.0+0" - -[[ROCmOpenCLRuntime_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Libglvnd_jll", "Pkg", "ROCmCompilerSupport_jll", "ROCmDeviceLibs_jll", "Xorg_libX11_jll", "Xorg_xorgproto_jll", "hsa_rocr_jll"] -git-tree-sha1 = "f9e3e2cb40a7990535efa7da9b9dd0e0b458a973" -uuid = "10ae2a08-2eea-53f8-8c20-eec175020e9f" -version = "4.0.0+1" - -[[Random]] -deps = ["Serialization"] +[[deps.Random]] +deps = ["SHA", "Serialization"] uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" -[[Requires]] +[[deps.Reexport]] +git-tree-sha1 = "45e428421666073eab6f2da5c9d310d99bb12f9b" +uuid = "189a3867-3050-52da-a836-e630ba90ab69" +version = "1.2.2" + +[[deps.Requires]] deps = ["UUIDs"] -git-tree-sha1 = "8f82019e525f4d5c669692772a6f4b0a58b06a6a" +git-tree-sha1 = "838a3a4188e2ded87a4f9f184b4b0d78a1e91cb7" uuid = 
"ae029012-a4dd-5104-9daa-d747884805df" +version = "1.3.0" + +[[deps.SHA]] +uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" +version = "0.7.0" + +[[deps.Scratch]] +deps = ["Dates"] +git-tree-sha1 = "30449ee12237627992a99d5e30ae63e4d78cd24a" +uuid = "6c6a2e73-6563-6170-7368-637461726353" version = "1.2.0" -[[SHA]] -uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" - -[[Serialization]] +[[deps.Serialization]] uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" -[[Setfield]] -deps = ["ConstructionBase", "Future", "MacroTools", "Requires"] -git-tree-sha1 = "fca29e68c5062722b5b4435594c3d1ba557072a3" -uuid = "efcf1570-3423-57d1-acb7-fd33fddbac46" -version = "0.7.1" - -[[Sockets]] +[[deps.Sockets]] uuid = "6462fe0b-24de-5631-8697-dd941f90decc" -[[SparseArrays]] -deps = ["LinearAlgebra", "Random"] +[[deps.SparseArrays]] +deps = ["Libdl", "LinearAlgebra", "Random", "Serialization", "SuiteSparse_jll"] uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" -[[Statistics]] +[[deps.SpecialFunctions]] +deps = ["IrrationalConstants", "LogExpFunctions", "OpenLibm_jll", "OpenSpecFun_jll"] +git-tree-sha1 = "e2cfc4012a19088254b3950b85c3c1d8882d864d" +uuid = "276daf66-3868-5448-9aa4-cd146d93841b" +version = "2.3.1" + + [deps.SpecialFunctions.extensions] + SpecialFunctionsChainRulesCoreExt = "ChainRulesCore" + + [deps.SpecialFunctions.weakdeps] + ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" + +[[deps.StaticArrays]] +deps = ["LinearAlgebra", "Random", "StaticArraysCore"] +git-tree-sha1 = "d5fb407ec3179063214bc6277712928ba78459e2" +uuid = "90137ffa-7385-5640-81b9-e52037218182" +version = "1.6.4" +weakdeps = ["Statistics"] + + [deps.StaticArrays.extensions] + StaticArraysStatisticsExt = "Statistics" + +[[deps.StaticArraysCore]] +git-tree-sha1 = "36b3d696ce6366023a0ea192b4cd442268995a0d" +uuid = "1e83bf80-4336-4d27-bf5d-d5a4f845583c" +version = "1.4.2" + +[[deps.Statistics]] deps = ["LinearAlgebra", "SparseArrays"] uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" +version = "1.9.0" -[[TOML]] 
+[[deps.SuiteSparse_jll]] +deps = ["Artifacts", "Libdl", "Pkg", "libblastrampoline_jll"] +uuid = "bea87d4a-7f5b-5778-9afe-8cc45184846c" +version = "5.10.1+6" + +[[deps.TOML]] deps = ["Dates"] uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76" +version = "1.0.3" -[[Tar]] +[[deps.Tar]] deps = ["ArgTools", "SHA"] uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e" +version = "1.10.0" -[[TextWrap]] +[[deps.TextWrap]] git-tree-sha1 = "9250ef9b01b66667380cf3275b3f7488d0e25faf" uuid = "b718987f-49a8-5099-9789-dcd902bef87d" version = "1.0.1" -[[TimerOutputs]] +[[deps.TimerOutputs]] deps = ["ExprTools", "Printf"] -git-tree-sha1 = "7cb456f358e8f9d102a8b25e8dfedf58fa5689bc" +git-tree-sha1 = "f548a9e9c490030e545f72074a41edfd0e5bcdd7" uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" -version = "0.5.13" +version = "0.5.23" -[[UUIDs]] +[[deps.UUIDs]] deps = ["Random", "SHA"] uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" -[[UnPack]] +[[deps.UnPack]] git-tree-sha1 = "387c1f73762231e86e0c9c5443ce3b4a0a9a0c2b" uuid = "3a884ed6-31ef-47d7-9d2a-63182c4928ed" version = "1.0.2" -[[Unicode]] +[[deps.Unicode]] uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" -[[XML2_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Libiconv_jll", "Pkg", "Zlib_jll"] -git-tree-sha1 = "1acf5bdf07aa0907e0a37d3718bb88d4b687b74a" -uuid = "02c8fc9c-b97f-50b9-bbe4-9be30ff0a78a" -version = "2.9.12+0" +[[deps.UnsafeAtomics]] +git-tree-sha1 = "6331ac3440856ea1988316b46045303bef658278" +uuid = "013be700-e6cd-48c3-b4a1-df204f14c38f" +version = "0.2.1" -[[XSLT_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Libgcrypt_jll", "Libgpg_error_jll", "Libiconv_jll", "Pkg", "XML2_jll", "Zlib_jll"] -git-tree-sha1 = "91844873c4085240b95e795f692c4cec4d805f8a" -uuid = "aed1982a-8fda-507f-9586-7b0439959a61" -version = "1.1.34+0" +[[deps.UnsafeAtomicsLLVM]] +deps = ["LLVM", "UnsafeAtomics"] +git-tree-sha1 = "323e3d0acf5e78a56dfae7bd8928c989b4f3083e" +uuid = "d80eeb9a-aca5-4d75-85e5-170c8b632249" +version = "0.1.3" -[[XZ_jll]] -deps = 
["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "a921669cd9a45c23031fd4eb904f5cc3d20de415" -uuid = "ffd25f8a-64ca-5728-b0f7-c24cf3aae800" -version = "5.2.5+2" - -[[Xorg_libX11_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libxcb_jll", "Xorg_xtrans_jll"] -git-tree-sha1 = "5be649d550f3f4b95308bf0183b82e2582876527" -uuid = "4f6342f7-b3d2-589e-9d20-edeb45f2b2bc" -version = "1.6.9+4" - -[[Xorg_libXau_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "4e490d5c960c314f33885790ed410ff3a94ce67e" -uuid = "0c0b7dd1-d40b-584c-a123-a41640f87eec" -version = "1.0.9+4" - -[[Xorg_libXdmcp_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "4fe47bd2247248125c428978740e18a681372dd4" -uuid = "a3789734-cfe1-5b06-b2d0-1dd0d9d62d05" -version = "1.1.3+4" - -[[Xorg_libXext_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libX11_jll"] -git-tree-sha1 = "b7c0aa8c376b31e4852b360222848637f481f8c3" -uuid = "1082639a-0dae-5f34-9b06-72781eeb8cb3" -version = "1.3.4+4" - -[[Xorg_libpthread_stubs_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "6783737e45d3c59a4a4c4091f5f88cdcf0908cbb" -uuid = "14d82f49-176c-5ed1-bb49-ad3f5cbd8c74" -version = "0.1.0+3" - -[[Xorg_libxcb_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "XSLT_jll", "Xorg_libXau_jll", "Xorg_libXdmcp_jll", "Xorg_libpthread_stubs_jll"] -git-tree-sha1 = "daf17f441228e7a3833846cd048892861cff16d6" -uuid = "c7cfdc94-dc32-55de-ac96-5a1b8d977c5b" -version = "1.13.0+3" - -[[Xorg_xorgproto_jll]] -deps = ["Libdl", "Pkg"] -git-tree-sha1 = "9a9eb8ce756fe0bca01b4be16da770e18d264972" -uuid = "c4d99508-4286-5418-9131-c86396af500b" -version = "2019.2.0+2" - -[[Xorg_xtrans_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "79c31e7844f6ecf779705fbc12146eb190b7d845" -uuid = "c5fb5394-a638-5e4d-96e5-b29de1b5cf10" -version = "1.4.0+3" - -[[Zlib_jll]] +[[deps.Zlib_jll]] deps = ["Libdl"] uuid = 
"83775a58-1f1d-513f-b197-d71354ab007a" +version = "1.2.13+0" -[[argp_standalone_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "feaf9f6293003c2bf53056fd6930d677ed340b34" -uuid = "c53206cc-00f7-50bf-ad1e-3ae1f6e49bc3" -version = "1.3.1+0" +[[deps.libLLVM_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "8f36deef-c2a5-5394-99ed-8e07531fb29a" +version = "14.0.6+3" -[[fts_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "78732b942383d2cb521df8a1a0814911144e663d" -uuid = "d65627f6-89bd-53e8-8ab5-8b75ff535eee" -version = "1.2.7+1" +[[deps.libblastrampoline_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "8e850b90-86db-534c-a0d3-1478176c7d93" +version = "5.8.0+0" -[[hsa_rocr_jll]] -deps = ["Artifacts", "Elfutils_jll", "JLLWrappers", "Libdl", "NUMA_jll", "Pkg", "Zlib_jll", "hsakmt_roct_jll"] -git-tree-sha1 = "df8d73efec8b1e53ad527d208f5343c0368f0fcd" -uuid = "dd59ff1a-a01a-568d-8b29-0669330f116a" -version = "4.0.0+0" - -[[hsakmt_roct_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "NUMA_jll", "Pkg"] -git-tree-sha1 = "ea54f6be23c6d25613a0872ec23dc5a0b77b4a00" -uuid = "1cecccd7-a9b6-5045-9cdc-a44c19b16d76" -version = "4.2.0+0" - -[[nghttp2_jll]] +[[deps.nghttp2_jll]] deps = ["Artifacts", "Libdl"] uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d" +version = "1.48.0+0" -[[obstack_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "1c4a6b66e934fc6db4649cb2910c72f53bbfea7e" -uuid = "c88a4935-d25e-5644-aacc-5db6f1b8ef79" -version = "1.2.2+0" - -[[p7zip_jll]] +[[deps.p7zip_jll]] deps = ["Artifacts", "Libdl"] uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0" +version = "17.4.0+0" diff --git a/src/julia/JuliaStream.jl/AMDGPU/Project.toml b/src/julia/JuliaStream.jl/AMDGPU/Project.toml index 5ab8447..66596df 100644 --- a/src/julia/JuliaStream.jl/AMDGPU/Project.toml +++ b/src/julia/JuliaStream.jl/AMDGPU/Project.toml @@ -4,4 +4,4 @@ ArgParse = "c7e460c6-2fb9-53a9-8c5b-16f535851c63" Parameters = 
"d96e819e-fc66-5662-9728-84c9c7592b0a" [compat] -julia = "1.6" +julia = "1.9" diff --git a/src/julia/JuliaStream.jl/CUDA/Manifest.toml b/src/julia/JuliaStream.jl/CUDA/Manifest.toml index 92af4d1..cf7c0e9 100644 --- a/src/julia/JuliaStream.jl/CUDA/Manifest.toml +++ b/src/julia/JuliaStream.jl/CUDA/Manifest.toml @@ -1,332 +1,555 @@ # This file is machine-generated - editing it directly is not advised -[[AbstractFFTs]] +julia_version = "1.9.3" +manifest_format = "2.0" +project_hash = "6909ef39c97ad6037791040bed70b7aa111e1f64" + +[[deps.AbstractFFTs]] deps = ["LinearAlgebra"] -git-tree-sha1 = "485ee0867925449198280d4af84bdb46a2a404d0" +git-tree-sha1 = "d92ad398961a3ed262d8bf04a1a2b8340f915fef" uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c" -version = "1.0.1" +version = "1.5.0" -[[Adapt]] -deps = ["LinearAlgebra"] -git-tree-sha1 = "84918055d15b3114ede17ac6a7182f68870c16f7" + [deps.AbstractFFTs.extensions] + AbstractFFTsChainRulesCoreExt = "ChainRulesCore" + AbstractFFTsTestExt = "Test" + + [deps.AbstractFFTs.weakdeps] + ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" + Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" + +[[deps.Adapt]] +deps = ["LinearAlgebra", "Requires"] +git-tree-sha1 = "76289dc51920fdc6e0013c872ba9551d54961c24" uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" -version = "3.3.1" +version = "3.6.2" +weakdeps = ["StaticArrays"] -[[ArgParse]] + [deps.Adapt.extensions] + AdaptStaticArraysExt = "StaticArrays" + +[[deps.ArgParse]] deps = ["Logging", "TextWrap"] git-tree-sha1 = "3102bce13da501c9104df33549f511cd25264d7d" uuid = "c7e460c6-2fb9-53a9-8c5b-16f535851c63" version = "1.1.4" -[[ArgTools]] +[[deps.ArgTools]] uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f" +version = "1.1.1" -[[Artifacts]] +[[deps.Artifacts]] uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" -[[BFloat16s]] -deps = ["LinearAlgebra", "Printf", "Random", "Test"] -git-tree-sha1 = "a598ecb0d717092b5539dbbe890c98bac842b072" -uuid = "ab4f0b2a-ad5b-11e8-123f-65d77653426b" -version = "0.2.0" 
+[[deps.Atomix]] +deps = ["UnsafeAtomics"] +git-tree-sha1 = "c06a868224ecba914baa6942988e2f2aade419be" +uuid = "a9b6321e-bd34-4604-b9c9-b65b8de01458" +version = "0.1.0" -[[Base64]] +[[deps.BFloat16s]] +deps = ["LinearAlgebra", "Printf", "Random", "Test"] +git-tree-sha1 = "dbf84058d0a8cbbadee18d25cf606934b22d7c66" +uuid = "ab4f0b2a-ad5b-11e8-123f-65d77653426b" +version = "0.4.2" + +[[deps.Base64]] uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" -[[CEnum]] -git-tree-sha1 = "215a9aa4a1f23fbd05b92769fdd62559488d70e9" +[[deps.CEnum]] +git-tree-sha1 = "eb4cb44a499229b3b8426dcfb5dd85333951ff90" uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82" -version = "0.4.1" +version = "0.4.2" -[[CUDA]] -deps = ["AbstractFFTs", "Adapt", "BFloat16s", "CEnum", "CompilerSupportLibraries_jll", "ExprTools", "GPUArrays", "GPUCompiler", "LLVM", "LazyArtifacts", "Libdl", "LinearAlgebra", "Logging", "Printf", "Random", "Random123", "RandomNumbers", "Reexport", "Requires", "SparseArrays", "SpecialFunctions", "TimerOutputs"] -git-tree-sha1 = "1f8ebf85abb7d1eff965730e592794a27c1350d8" +[[deps.CUDA]] +deps = ["AbstractFFTs", "Adapt", "BFloat16s", "CEnum", "CUDA_Driver_jll", "CUDA_Runtime_Discovery", "CUDA_Runtime_jll", "Crayons", "DataFrames", "ExprTools", "GPUArrays", "GPUCompiler", "KernelAbstractions", "LLVM", "LazyArtifacts", "Libdl", "LinearAlgebra", "Logging", "NVTX", "Preferences", "PrettyTables", "Printf", "Random", "Random123", "RandomNumbers", "Reexport", "Requires", "SparseArrays", "Statistics", "UnsafeAtomicsLLVM"] +git-tree-sha1 = "f062a48c26ae027f70c44f48f244862aec47bf99" uuid = "052768ef-5323-5732-b1bb-66c8b64840ba" -version = "3.6.0" +version = "5.0.0" -[[ChainRulesCore]] -deps = ["Compat", "LinearAlgebra", "SparseArrays"] -git-tree-sha1 = "4c26b4e9e91ca528ea212927326ece5918a04b47" -uuid = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" -version = "1.11.2" + [deps.CUDA.extensions] + SpecialFunctionsExt = "SpecialFunctions" -[[ChangesOfVariables]] -deps = ["ChainRulesCore", "LinearAlgebra", 
"Test"] -git-tree-sha1 = "bf98fa45a0a4cee295de98d4c1462be26345b9a1" -uuid = "9e997f8a-9a97-42d5-a9f1-ce6bfc15e2c0" -version = "0.1.2" + [deps.CUDA.weakdeps] + SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b" -[[Compat]] -deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "SHA", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"] -git-tree-sha1 = "44c37b4636bc54afac5c574d2d02b625349d6582" +[[deps.CUDA_Driver_jll]] +deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "Pkg"] +git-tree-sha1 = "35a37bb72b35964f2895c12c687ae263b4ac170c" +uuid = "4ee394cb-3365-5eb0-8335-949819d2adfc" +version = "0.6.0+3" + +[[deps.CUDA_Runtime_Discovery]] +deps = ["Libdl"] +git-tree-sha1 = "bcc4a23cbbd99c8535a5318455dcf0f2546ec536" +uuid = "1af6417a-86b4-443c-805f-a4643ffb695f" +version = "0.2.2" + +[[deps.CUDA_Runtime_jll]] +deps = ["Artifacts", "CUDA_Driver_jll", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML"] +git-tree-sha1 = "bfe5a693a11522d58392f742243f2b50dc27afd6" +uuid = "76a88914-d11a-5bdc-97e0-2f5a05c973a2" +version = "0.9.2+0" + +[[deps.ColorTypes]] +deps = ["FixedPointNumbers", "Random"] +git-tree-sha1 = "eb7f0f8307f71fac7c606984ea5fb2817275d6e4" +uuid = "3da002f7-5984-5a60-b8a6-cbb66c0b333f" +version = "0.11.4" + +[[deps.Colors]] +deps = ["ColorTypes", "FixedPointNumbers", "Reexport"] +git-tree-sha1 = "fc08e5930ee9a4e03f84bfb5211cb54e7769758a" +uuid = "5ae59095-9a9b-59fe-a467-6f913c188581" +version = "0.12.10" + +[[deps.Compat]] +deps = ["UUIDs"] +git-tree-sha1 = "e460f044ca8b99be31d35fe54fc33a5c33dd8ed7" uuid = "34da2185-b29b-5c13-b0c7-acf172513d20" -version = "3.41.0" +version = "4.9.0" +weakdeps = ["Dates", "LinearAlgebra"] -[[CompilerSupportLibraries_jll]] + [deps.Compat.extensions] + CompatLinearAlgebraExt = "LinearAlgebra" + +[[deps.CompilerSupportLibraries_jll]] deps = 
["Artifacts", "Libdl"] uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae" +version = "1.0.5+0" -[[Dates]] +[[deps.Crayons]] +git-tree-sha1 = "249fe38abf76d48563e2f4556bebd215aa317e15" +uuid = "a8cc5b0e-0ffa-5ad4-8c14-923d3ee1735f" +version = "4.1.1" + +[[deps.DataAPI]] +git-tree-sha1 = "8da84edb865b0b5b0100c0666a9bc9a0b71c553c" +uuid = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a" +version = "1.15.0" + +[[deps.DataFrames]] +deps = ["Compat", "DataAPI", "DataStructures", "Future", "InlineStrings", "InvertedIndices", "IteratorInterfaceExtensions", "LinearAlgebra", "Markdown", "Missings", "PooledArrays", "PrecompileTools", "PrettyTables", "Printf", "REPL", "Random", "Reexport", "SentinelArrays", "SortingAlgorithms", "Statistics", "TableTraits", "Tables", "Unicode"] +git-tree-sha1 = "04c738083f29f86e62c8afc341f0967d8717bdb8" +uuid = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" +version = "1.6.1" + +[[deps.DataStructures]] +deps = ["Compat", "InteractiveUtils", "OrderedCollections"] +git-tree-sha1 = "3dbd312d370723b6bb43ba9d02fc36abade4518d" +uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" +version = "0.18.15" + +[[deps.DataValueInterfaces]] +git-tree-sha1 = "bfc1187b79289637fa0ef6d4436ebdfe6905cbd6" +uuid = "e2d170a0-9d28-54be-80f0-106bbe20a464" +version = "1.0.0" + +[[deps.Dates]] deps = ["Printf"] uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" -[[DelimitedFiles]] -deps = ["Mmap"] -uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab" - -[[Distributed]] -deps = ["Random", "Serialization", "Sockets"] -uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b" - -[[DocStringExtensions]] -deps = ["LibGit2"] -git-tree-sha1 = "b19534d1895d702889b219c382a6e18010797f0b" -uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae" -version = "0.8.6" - -[[Downloads]] -deps = ["ArgTools", "LibCURL", "NetworkOptions"] +[[deps.Downloads]] +deps = ["ArgTools", "FileWatching", "LibCURL", "NetworkOptions"] uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6" +version = "1.6.0" -[[ExprTools]] -git-tree-sha1 = 
"b7e3d17636b348f005f11040025ae8c6f645fe92" +[[deps.ExprTools]] +git-tree-sha1 = "27415f162e6028e81c72b82ef756bf321213b6ec" uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04" -version = "0.1.6" +version = "0.1.10" -[[GPUArrays]] -deps = ["Adapt", "LinearAlgebra", "Printf", "Random", "Serialization", "Statistics"] -git-tree-sha1 = "7772508f17f1d482fe0df72cabc5b55bec06bbe0" +[[deps.FileWatching]] +uuid = "7b1f6079-737a-58dc-b8bc-7a2ca5c1b5ee" + +[[deps.FixedPointNumbers]] +deps = ["Statistics"] +git-tree-sha1 = "335bfdceacc84c5cdf16aadc768aa5ddfc5383cc" +uuid = "53c48c17-4a7d-5ca2-90c5-79b7896eea93" +version = "0.8.4" + +[[deps.Future]] +deps = ["Random"] +uuid = "9fa8497b-333b-5362-9e8d-4d0656e87820" + +[[deps.GPUArrays]] +deps = ["Adapt", "GPUArraysCore", "LLVM", "LinearAlgebra", "Printf", "Random", "Reexport", "Serialization", "Statistics"] +git-tree-sha1 = "8ad8f375ae365aa1eb2f42e2565a40b55a4b69a8" uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" -version = "8.1.2" +version = "9.0.0" -[[GPUCompiler]] -deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "TimerOutputs", "UUIDs"] -git-tree-sha1 = "2cac236070c2c4b36de54ae9146b55ee2c34ac7a" +[[deps.GPUArraysCore]] +deps = ["Adapt"] +git-tree-sha1 = "2d6ca471a6c7b536127afccfa7564b5b39227fe0" +uuid = "46192b85-c4d5-4398-a991-12ede77f4527" +version = "0.1.5" + +[[deps.GPUCompiler]] +deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "Scratch", "TimerOutputs", "UUIDs"] +git-tree-sha1 = "5e4487558477f191c043166f8301dd0b4be4e2b2" uuid = "61eb1bfa-7361-4325-ad38-22787b887f55" -version = "0.13.10" +version = "0.24.5" -[[InteractiveUtils]] +[[deps.InlineStrings]] +deps = ["Parsers"] +git-tree-sha1 = "9cc2baf75c6d09f9da536ddf58eb2f29dedaf461" +uuid = "842dd82b-1e85-43dc-bf29-5d0ee9dffc48" +version = "1.4.0" + +[[deps.InteractiveUtils]] deps = ["Markdown"] uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" -[[InverseFunctions]] -deps = ["Test"] -git-tree-sha1 = "a7254c0acd8e62f1ac75ad24d5db43f5f19f3c65" 
-uuid = "3587e190-3f89-42d0-90ee-14403ec27112" -version = "0.1.2" - -[[IrrationalConstants]] -git-tree-sha1 = "7fd44fd4ff43fc60815f8e764c0f352b83c49151" -uuid = "92d709cd-6900-40b7-9082-c6be49f344b6" -version = "0.1.1" - -[[JLLWrappers]] -deps = ["Preferences"] -git-tree-sha1 = "642a199af8b68253517b80bd3bfd17eb4e84df6e" -uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210" +[[deps.InvertedIndices]] +git-tree-sha1 = "0dc7b50b8d436461be01300fd8cd45aa0274b038" +uuid = "41ab1584-1d38-5bbf-9106-f11c6c58b48f" version = "1.3.0" -[[LLVM]] -deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Printf", "Unicode"] -git-tree-sha1 = "7cc22e69995e2329cc047a879395b2b74647ab5f" -uuid = "929cbde3-209d-540e-8aea-75f648917ca0" -version = "4.7.0" +[[deps.IteratorInterfaceExtensions]] +git-tree-sha1 = "a3f24677c21f5bbe9d2a714f95dcd58337fb2856" +uuid = "82899510-4779-5014-852e-03e436cf321d" +version = "1.0.0" -[[LLVMExtra_jll]] +[[deps.JLLWrappers]] +deps = ["Artifacts", "Preferences"] +git-tree-sha1 = "7e5d6779a1e09a36db2a7b6cff50942a0a7d0fca" +uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210" +version = "1.5.0" + +[[deps.JuliaNVTXCallbacks_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "c5fc4bef251ecd37685bea1c4068a9cfa41e8b9a" -uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab" -version = "0.0.13+0" +git-tree-sha1 = "af433a10f3942e882d3c671aacb203e006a5808f" +uuid = "9c1d0b0a-7046-5b2e-a33f-ea22f176ac7e" +version = "0.2.1+0" -[[LazyArtifacts]] +[[deps.KernelAbstractions]] +deps = ["Adapt", "Atomix", "InteractiveUtils", "LinearAlgebra", "MacroTools", "PrecompileTools", "Requires", "SparseArrays", "StaticArrays", "UUIDs", "UnsafeAtomics", "UnsafeAtomicsLLVM"] +git-tree-sha1 = "4c5875e4c228247e1c2b087669846941fb6e0118" +uuid = "63c18a36-062a-441e-b654-da1e3ab1ce7c" +version = "0.9.8" + + [deps.KernelAbstractions.extensions] + EnzymeExt = "EnzymeCore" + + [deps.KernelAbstractions.weakdeps] + EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869" + +[[deps.LLVM]] +deps = ["CEnum", 
"LLVMExtra_jll", "Libdl", "Printf", "Unicode"] +git-tree-sha1 = "a9d2ce1d5007b1e8f6c5b89c5a31ff8bd146db5c" +uuid = "929cbde3-209d-540e-8aea-75f648917ca0" +version = "6.2.1" + +[[deps.LLVMExtra_jll]] +deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML"] +git-tree-sha1 = "7ca6850ae880cc99b59b88517545f91a52020afa" +uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab" +version = "0.0.25+0" + +[[deps.LaTeXStrings]] +git-tree-sha1 = "f2355693d6778a178ade15952b7ac47a4ff97996" +uuid = "b964fa9f-0449-5b57-a5c2-d3ea65f4040f" +version = "1.3.0" + +[[deps.LazyArtifacts]] deps = ["Artifacts", "Pkg"] uuid = "4af54fe1-eca0-43a8-85a7-787d91b784e3" -[[LibCURL]] +[[deps.LibCURL]] deps = ["LibCURL_jll", "MozillaCACerts_jll"] uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21" +version = "0.6.3" -[[LibCURL_jll]] +[[deps.LibCURL_jll]] deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"] uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0" +version = "7.84.0+0" -[[LibGit2]] +[[deps.LibGit2]] deps = ["Base64", "NetworkOptions", "Printf", "SHA"] uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" -[[LibSSH2_jll]] +[[deps.LibSSH2_jll]] deps = ["Artifacts", "Libdl", "MbedTLS_jll"] uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8" +version = "1.10.2+0" -[[Libdl]] +[[deps.Libdl]] uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" -[[LinearAlgebra]] -deps = ["Libdl"] +[[deps.LinearAlgebra]] +deps = ["Libdl", "OpenBLAS_jll", "libblastrampoline_jll"] uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" -[[LogExpFunctions]] -deps = ["ChainRulesCore", "ChangesOfVariables", "DocStringExtensions", "InverseFunctions", "IrrationalConstants", "LinearAlgebra"] -git-tree-sha1 = "e5718a00af0ab9756305a0392832c8952c7426c1" -uuid = "2ab3a3ac-af41-5b50-aa03-7779005ae688" -version = "0.3.6" - -[[Logging]] +[[deps.Logging]] uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" -[[Markdown]] +[[deps.MacroTools]] +deps = ["Markdown", "Random"] +git-tree-sha1 = "9ee1618cbf5240e6d4e0371d6f24065083f60c48" +uuid = 
"1914dd2f-81c6-5fcd-8719-6d5c9610ff09" +version = "0.5.11" + +[[deps.Markdown]] deps = ["Base64"] uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" -[[MbedTLS_jll]] +[[deps.MbedTLS_jll]] deps = ["Artifacts", "Libdl"] uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1" +version = "2.28.2+0" -[[Mmap]] -uuid = "a63ad114-7e13-5084-954f-fe012c677804" +[[deps.Missings]] +deps = ["DataAPI"] +git-tree-sha1 = "f66bdc5de519e8f8ae43bdc598782d35a25b1272" +uuid = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28" +version = "1.1.0" -[[MozillaCACerts_jll]] +[[deps.MozillaCACerts_jll]] uuid = "14a3606d-f60d-562e-9121-12d972cd8159" +version = "2022.10.11" -[[NetworkOptions]] +[[deps.NVTX]] +deps = ["Colors", "JuliaNVTXCallbacks_jll", "Libdl", "NVTX_jll"] +git-tree-sha1 = "8bc9ce4233be3c63f8dcd78ccaf1b63a9c0baa34" +uuid = "5da4648a-3479-48b8-97b9-01cb529c0a1f" +version = "0.3.3" + +[[deps.NVTX_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "ce3269ed42816bf18d500c9f63418d4b0d9f5a3b" +uuid = "e98f9f5b-d649-5603-91fd-7774390e6439" +version = "3.1.0+2" + +[[deps.NetworkOptions]] uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908" +version = "1.2.0" -[[OpenLibm_jll]] -deps = ["Artifacts", "Libdl"] -uuid = "05823500-19ac-5b8b-9628-191a04bc5112" +[[deps.OpenBLAS_jll]] +deps = ["Artifacts", "CompilerSupportLibraries_jll", "Libdl"] +uuid = "4536629a-c528-5b80-bd46-f80d51c5b363" +version = "0.3.21+4" -[[OpenSpecFun_jll]] -deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "13652491f6856acfd2db29360e1bbcd4565d04f1" -uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e" -version = "0.5.5+0" - -[[OrderedCollections]] -git-tree-sha1 = "85f8e6578bf1f9ee0d11e7bb1b1456435479d47c" +[[deps.OrderedCollections]] +git-tree-sha1 = "2e73fe17cac3c62ad1aebe70d44c963c3cfdc3e3" uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" -version = "1.4.1" +version = "1.6.2" -[[Parameters]] +[[deps.Parameters]] deps = ["OrderedCollections", "UnPack"] git-tree-sha1 = 
"34c0e9ad262e5f7fc75b10a9952ca7692cfc5fbe" uuid = "d96e819e-fc66-5662-9728-84c9c7592b0a" version = "0.12.3" -[[Pkg]] -deps = ["Artifacts", "Dates", "Downloads", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"] +[[deps.Parsers]] +deps = ["Dates", "PrecompileTools", "UUIDs"] +git-tree-sha1 = "716e24b21538abc91f6205fd1d8363f39b442851" +uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0" +version = "2.7.2" + +[[deps.Pkg]] +deps = ["Artifacts", "Dates", "Downloads", "FileWatching", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"] uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" +version = "1.9.2" -[[Preferences]] +[[deps.PooledArrays]] +deps = ["DataAPI", "Future"] +git-tree-sha1 = "36d8b4b899628fb92c2749eb488d884a926614d3" +uuid = "2dfb63ee-cc39-5dd5-95bd-886bf059d720" +version = "1.4.3" + +[[deps.PrecompileTools]] +deps = ["Preferences"] +git-tree-sha1 = "03b4c25b43cb84cee5c90aa9b5ea0a78fd848d2f" +uuid = "aea7be01-6a6a-4083-8856-8a6e6704d82a" +version = "1.2.0" + +[[deps.Preferences]] deps = ["TOML"] -git-tree-sha1 = "00cfd92944ca9c760982747e9a1d0d5d86ab1e5a" +git-tree-sha1 = "00805cd429dcb4870060ff49ef443486c262e38e" uuid = "21216c6a-2e73-6563-6e65-726566657250" -version = "1.2.2" +version = "1.4.1" -[[Printf]] +[[deps.PrettyTables]] +deps = ["Crayons", "LaTeXStrings", "Markdown", "Printf", "Reexport", "StringManipulation", "Tables"] +git-tree-sha1 = "ee094908d720185ddbdc58dbe0c1cbe35453ec7a" +uuid = "08abe8d2-0d0c-5749-adfa-8a2ac140af0d" +version = "2.2.7" + +[[deps.Printf]] deps = ["Unicode"] uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" -[[REPL]] +[[deps.REPL]] deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"] uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" -[[Random]] -deps = ["Serialization"] +[[deps.Random]] +deps = ["SHA", "Serialization"] uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" 
-[[Random123]] -deps = ["Libdl", "Random", "RandomNumbers"] -git-tree-sha1 = "0e8b146557ad1c6deb1367655e052276690e71a3" +[[deps.Random123]] +deps = ["Random", "RandomNumbers"] +git-tree-sha1 = "552f30e847641591ba3f39fd1bed559b9deb0ef3" uuid = "74087812-796a-5b5d-8853-05524746bad3" -version = "1.4.2" +version = "1.6.1" -[[RandomNumbers]] +[[deps.RandomNumbers]] deps = ["Random", "Requires"] git-tree-sha1 = "043da614cc7e95c703498a491e2c21f58a2b8111" uuid = "e6cf234a-135c-5ec9-84dd-332b85af5143" version = "1.5.3" -[[Reexport]] +[[deps.Reexport]] git-tree-sha1 = "45e428421666073eab6f2da5c9d310d99bb12f9b" uuid = "189a3867-3050-52da-a836-e630ba90ab69" version = "1.2.2" -[[Requires]] +[[deps.Requires]] deps = ["UUIDs"] -git-tree-sha1 = "8f82019e525f4d5c669692772a6f4b0a58b06a6a" +git-tree-sha1 = "838a3a4188e2ded87a4f9f184b4b0d78a1e91cb7" uuid = "ae029012-a4dd-5104-9daa-d747884805df" +version = "1.3.0" + +[[deps.SHA]] +uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" +version = "0.7.0" + +[[deps.Scratch]] +deps = ["Dates"] +git-tree-sha1 = "30449ee12237627992a99d5e30ae63e4d78cd24a" +uuid = "6c6a2e73-6563-6170-7368-637461726353" version = "1.2.0" -[[SHA]] -uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" +[[deps.SentinelArrays]] +deps = ["Dates", "Random"] +git-tree-sha1 = "04bdff0b09c65ff3e06a05e3eb7b120223da3d39" +uuid = "91c51154-3ec4-41a3-a24f-3f23e20d615c" +version = "1.4.0" -[[Serialization]] +[[deps.Serialization]] uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" -[[SharedArrays]] -deps = ["Distributed", "Mmap", "Random", "Serialization"] -uuid = "1a1011a3-84de-559e-8e89-a11a2f7dc383" - -[[Sockets]] +[[deps.Sockets]] uuid = "6462fe0b-24de-5631-8697-dd941f90decc" -[[SparseArrays]] -deps = ["LinearAlgebra", "Random"] +[[deps.SortingAlgorithms]] +deps = ["DataStructures"] +git-tree-sha1 = "c60ec5c62180f27efea3ba2908480f8055e17cee" +uuid = "a2af1166-a08f-5f64-846c-94a0d3cef48c" +version = "1.1.1" + +[[deps.SparseArrays]] +deps = ["Libdl", "LinearAlgebra", "Random", 
"Serialization", "SuiteSparse_jll"] uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" -[[SpecialFunctions]] -deps = ["ChainRulesCore", "IrrationalConstants", "LogExpFunctions", "OpenLibm_jll", "OpenSpecFun_jll"] -git-tree-sha1 = "e08890d19787ec25029113e88c34ec20cac1c91e" -uuid = "276daf66-3868-5448-9aa4-cd146d93841b" -version = "2.0.0" +[[deps.StaticArrays]] +deps = ["LinearAlgebra", "Random", "StaticArraysCore"] +git-tree-sha1 = "d5fb407ec3179063214bc6277712928ba78459e2" +uuid = "90137ffa-7385-5640-81b9-e52037218182" +version = "1.6.4" +weakdeps = ["Statistics"] -[[Statistics]] + [deps.StaticArrays.extensions] + StaticArraysStatisticsExt = "Statistics" + +[[deps.StaticArraysCore]] +git-tree-sha1 = "36b3d696ce6366023a0ea192b4cd442268995a0d" +uuid = "1e83bf80-4336-4d27-bf5d-d5a4f845583c" +version = "1.4.2" + +[[deps.Statistics]] deps = ["LinearAlgebra", "SparseArrays"] uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" +version = "1.9.0" -[[TOML]] +[[deps.StringManipulation]] +deps = ["PrecompileTools"] +git-tree-sha1 = "a04cabe79c5f01f4d723cc6704070ada0b9d46d5" +uuid = "892a3eda-7b42-436c-8928-eab12a02cf0e" +version = "0.3.4" + +[[deps.SuiteSparse_jll]] +deps = ["Artifacts", "Libdl", "Pkg", "libblastrampoline_jll"] +uuid = "bea87d4a-7f5b-5778-9afe-8cc45184846c" +version = "5.10.1+6" + +[[deps.TOML]] deps = ["Dates"] uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76" +version = "1.0.3" -[[Tar]] +[[deps.TableTraits]] +deps = ["IteratorInterfaceExtensions"] +git-tree-sha1 = "c06b2f539df1c6efa794486abfb6ed2022561a39" +uuid = "3783bdb8-4a98-5b6b-af9a-565f29a5fe9c" +version = "1.0.1" + +[[deps.Tables]] +deps = ["DataAPI", "DataValueInterfaces", "IteratorInterfaceExtensions", "LinearAlgebra", "OrderedCollections", "TableTraits"] +git-tree-sha1 = "a1f34829d5ac0ef499f6d84428bd6b4c71f02ead" +uuid = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" +version = "1.11.0" + +[[deps.Tar]] deps = ["ArgTools", "SHA"] uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e" +version = "1.10.0" -[[Test]] +[[deps.Test]] 
deps = ["InteractiveUtils", "Logging", "Random", "Serialization"] uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" -[[TextWrap]] +[[deps.TextWrap]] git-tree-sha1 = "9250ef9b01b66667380cf3275b3f7488d0e25faf" uuid = "b718987f-49a8-5099-9789-dcd902bef87d" version = "1.0.1" -[[TimerOutputs]] +[[deps.TimerOutputs]] deps = ["ExprTools", "Printf"] -git-tree-sha1 = "7cb456f358e8f9d102a8b25e8dfedf58fa5689bc" +git-tree-sha1 = "f548a9e9c490030e545f72074a41edfd0e5bcdd7" uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" -version = "0.5.13" +version = "0.5.23" -[[UUIDs]] +[[deps.UUIDs]] deps = ["Random", "SHA"] uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" -[[UnPack]] +[[deps.UnPack]] git-tree-sha1 = "387c1f73762231e86e0c9c5443ce3b4a0a9a0c2b" uuid = "3a884ed6-31ef-47d7-9d2a-63182c4928ed" version = "1.0.2" -[[Unicode]] +[[deps.Unicode]] uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" -[[Zlib_jll]] +[[deps.UnsafeAtomics]] +git-tree-sha1 = "6331ac3440856ea1988316b46045303bef658278" +uuid = "013be700-e6cd-48c3-b4a1-df204f14c38f" +version = "0.2.1" + +[[deps.UnsafeAtomicsLLVM]] +deps = ["LLVM", "UnsafeAtomics"] +git-tree-sha1 = "323e3d0acf5e78a56dfae7bd8928c989b4f3083e" +uuid = "d80eeb9a-aca5-4d75-85e5-170c8b632249" +version = "0.1.3" + +[[deps.Zlib_jll]] deps = ["Libdl"] uuid = "83775a58-1f1d-513f-b197-d71354ab007a" +version = "1.2.13+0" -[[nghttp2_jll]] +[[deps.libblastrampoline_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "8e850b90-86db-534c-a0d3-1478176c7d93" +version = "5.8.0+0" + +[[deps.nghttp2_jll]] deps = ["Artifacts", "Libdl"] uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d" +version = "1.48.0+0" -[[p7zip_jll]] +[[deps.p7zip_jll]] deps = ["Artifacts", "Libdl"] uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0" +version = "17.4.0+0" diff --git a/src/julia/JuliaStream.jl/CUDA/Project.toml b/src/julia/JuliaStream.jl/CUDA/Project.toml index e50582e..22cdf06 100644 --- a/src/julia/JuliaStream.jl/CUDA/Project.toml +++ b/src/julia/JuliaStream.jl/CUDA/Project.toml @@ -4,4 +4,4 @@ CUDA = 
"052768ef-5323-5732-b1bb-66c8b64840ba" Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a" [compat] -julia = "1.6" +julia = "1.9" diff --git a/src/julia/JuliaStream.jl/KernelAbstractions/Manifest.toml b/src/julia/JuliaStream.jl/KernelAbstractions/Manifest.toml index 91093a7..a5f5053 100644 --- a/src/julia/JuliaStream.jl/KernelAbstractions/Manifest.toml +++ b/src/julia/JuliaStream.jl/KernelAbstractions/Manifest.toml @@ -1,557 +1,735 @@ # This file is machine-generated - editing it directly is not advised -[[AMDGPU]] -deps = ["AbstractFFTs", "Adapt", "BinaryProvider", "CEnum", "GPUArrays", "GPUCompiler", "HIP_jll", "LLVM", "Libdl", "LinearAlgebra", "MacroTools", "Pkg", "Printf", "ROCmDeviceLibs_jll", "Random", "Requires", "Setfield", "Statistics", "hsa_rocr_jll"] -git-tree-sha1 = "d59f1cf3f90ae6cf6626e8a21f337850cb3792f7" +julia_version = "1.9.3" +manifest_format = "2.0" +project_hash = "d273a081dfaa413b3d1144a4c6d874ffbde3e0d7" + +[[deps.AMDGPU]] +deps = ["AbstractFFTs", "Adapt", "BinaryProvider", "CEnum", "ExprTools", "GPUArrays", "GPUCompiler", "HIP_jll", "LLD_jll", "LLVM", "LLVM_jll", "Libdl", "LinearAlgebra", "MacroTools", "MsgPack", "ObjectFile", "Pkg", "Preferences", "Printf", "ROCmDeviceLibs_jll", "Random", "Requires", "Setfield", "SpecialFunctions", "Statistics", "TimespanLogging", "hsa_rocr_jll", "rocBLAS_jll", "rocRAND_jll", "rocSPARSE_jll"] +git-tree-sha1 = "06f51480c4fbd88edae71c7e60fd9a7362a579f2" uuid = "21141c5a-9bdb-4563-92ae-f87d6854732e" -version = "0.2.17" +version = "0.4.8" -[[AbstractFFTs]] +[[deps.AbstractFFTs]] deps = ["LinearAlgebra"] -git-tree-sha1 = "485ee0867925449198280d4af84bdb46a2a404d0" +git-tree-sha1 = "d92ad398961a3ed262d8bf04a1a2b8340f915fef" uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c" -version = "1.0.1" +version = "1.5.0" -[[Adapt]] -deps = ["LinearAlgebra"] -git-tree-sha1 = "84918055d15b3114ede17ac6a7182f68870c16f7" + [deps.AbstractFFTs.extensions] + AbstractFFTsChainRulesCoreExt = "ChainRulesCore" + AbstractFFTsTestExt = "Test" 
+ + [deps.AbstractFFTs.weakdeps] + ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" + Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" + +[[deps.Adapt]] +deps = ["LinearAlgebra", "Requires"] +git-tree-sha1 = "76289dc51920fdc6e0013c872ba9551d54961c24" uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" -version = "3.3.1" +version = "3.6.2" +weakdeps = ["StaticArrays"] -[[ArgParse]] + [deps.Adapt.extensions] + AdaptStaticArraysExt = "StaticArrays" + +[[deps.ArgParse]] deps = ["Logging", "TextWrap"] git-tree-sha1 = "3102bce13da501c9104df33549f511cd25264d7d" uuid = "c7e460c6-2fb9-53a9-8c5b-16f535851c63" version = "1.1.4" -[[ArgTools]] +[[deps.ArgTools]] uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f" +version = "1.1.1" -[[Artifacts]] +[[deps.Artifacts]] uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" -[[BFloat16s]] -deps = ["LinearAlgebra", "Test"] -git-tree-sha1 = "4af69e205efc343068dc8722b8dfec1ade89254a" -uuid = "ab4f0b2a-ad5b-11e8-123f-65d77653426b" +[[deps.Atomix]] +deps = ["UnsafeAtomics"] +git-tree-sha1 = "c06a868224ecba914baa6942988e2f2aade419be" +uuid = "a9b6321e-bd34-4604-b9c9-b65b8de01458" version = "0.1.0" -[[Base64]] +[[deps.BFloat16s]] +deps = ["LinearAlgebra", "Printf", "Random", "Test"] +git-tree-sha1 = "dbf84058d0a8cbbadee18d25cf606934b22d7c66" +uuid = "ab4f0b2a-ad5b-11e8-123f-65d77653426b" +version = "0.4.2" + +[[deps.Base64]] uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" -[[BinaryProvider]] +[[deps.BinaryProvider]] deps = ["Libdl", "Logging", "SHA"] git-tree-sha1 = "ecdec412a9abc8db54c0efc5548c64dfce072058" uuid = "b99e7846-7c00-51b0-8f62-c81ae34c0232" version = "0.5.10" -[[Bzip2_jll]] +[[deps.Bzip2_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] git-tree-sha1 = "19a35467a82e236ff51bc17a3a44b69ef35185a2" uuid = "6e34b625-4abd-537c-b88f-471c36dfa7a0" version = "1.0.8+0" -[[CEnum]] -git-tree-sha1 = "215a9aa4a1f23fbd05b92769fdd62559488d70e9" +[[deps.CEnum]] +git-tree-sha1 = "eb4cb44a499229b3b8426dcfb5dd85333951ff90" uuid = 
"fa961155-64e5-5f13-b03f-caf6b980ea82" -version = "0.4.1" +version = "0.4.2" -[[CUDA]] -deps = ["AbstractFFTs", "Adapt", "BFloat16s", "CEnum", "CompilerSupportLibraries_jll", "ExprTools", "GPUArrays", "GPUCompiler", "LLVM", "LazyArtifacts", "Libdl", "LinearAlgebra", "Logging", "Printf", "Random", "Random123", "RandomNumbers", "Reexport", "Requires", "SparseArrays", "SpecialFunctions", "TimerOutputs"] -git-tree-sha1 = "335b3d2373733919b4972a51215a6840c7a33828" +[[deps.CUDA]] +deps = ["AbstractFFTs", "Adapt", "BFloat16s", "CEnum", "CUDA_Driver_jll", "CUDA_Runtime_Discovery", "CUDA_Runtime_jll", "CompilerSupportLibraries_jll", "ExprTools", "GPUArrays", "GPUCompiler", "LLVM", "LazyArtifacts", "Libdl", "LinearAlgebra", "Logging", "Preferences", "Printf", "Random", "Random123", "RandomNumbers", "Reexport", "Requires", "SparseArrays", "SpecialFunctions"] +git-tree-sha1 = "edff14c60784c8f7191a62a23b15a421185bc8a8" uuid = "052768ef-5323-5732-b1bb-66c8b64840ba" -version = "3.4.2" +version = "4.0.1" -[[CUDAKernels]] -deps = ["Adapt", "CUDA", "Cassette", "KernelAbstractions", "SpecialFunctions", "StaticArrays"] -git-tree-sha1 = "81f76297b63c67723b1d60f5e7e002ae3393974b" +[[deps.CUDAKernels]] +deps = ["Adapt", "CUDA", "KernelAbstractions", "StaticArrays", "UnsafeAtomicsLLVM"] +git-tree-sha1 = "1680366a69e9c95744ef23a239e6cfe61cf2e1ca" uuid = "72cfdca4-0801-4ab0-bf6a-d52aa10adc57" -version = "0.3.0" +version = "0.4.7" -[[Cassette]] -git-tree-sha1 = "6ce3cd755d4130d43bab24ea5181e77b89b51839" -uuid = "7057c7e9-c182-5462-911a-8362d720325c" -version = "0.3.9" +[[deps.CUDA_Driver_jll]] +deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "Pkg"] +git-tree-sha1 = "75d7896d1ec079ef10d3aee8f3668c11354c03a1" +uuid = "4ee394cb-3365-5eb0-8335-949819d2adfc" +version = "0.2.0+0" -[[ChainRulesCore]] -deps = ["Compat", "LinearAlgebra", "SparseArrays"] -git-tree-sha1 = "4c26b4e9e91ca528ea212927326ece5918a04b47" -uuid = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" -version = "1.11.2" - 
-[[ChangesOfVariables]] -deps = ["ChainRulesCore", "LinearAlgebra", "Test"] -git-tree-sha1 = "bf98fa45a0a4cee295de98d4c1462be26345b9a1" -uuid = "9e997f8a-9a97-42d5-a9f1-ce6bfc15e2c0" +[[deps.CUDA_Runtime_Discovery]] +deps = ["Libdl"] +git-tree-sha1 = "d6b227a1cfa63ae89cb969157c6789e36b7c9624" +uuid = "1af6417a-86b4-443c-805f-a4643ffb695f" version = "0.1.2" -[[Compat]] -deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "SHA", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"] -git-tree-sha1 = "44c37b4636bc54afac5c574d2d02b625349d6582" -uuid = "34da2185-b29b-5c13-b0c7-acf172513d20" -version = "3.41.0" +[[deps.CUDA_Runtime_jll]] +deps = ["Artifacts", "CUDA_Driver_jll", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML"] +git-tree-sha1 = "ed00f777d2454c45f5f49634ed0a589da07ee0b0" +uuid = "76a88914-d11a-5bdc-97e0-2f5a05c973a2" +version = "0.2.4+1" -[[CompilerSupportLibraries_jll]] +[[deps.CompilerSupportLibraries_jll]] deps = ["Artifacts", "Libdl"] uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae" +version = "1.0.5+0" -[[ConstructionBase]] +[[deps.ConstructionBase]] deps = ["LinearAlgebra"] -git-tree-sha1 = "f74e9d5388b8620b4cee35d4c5a618dd4dc547f4" +git-tree-sha1 = "c53fc348ca4d40d7b371e71fd52251839080cbc9" uuid = "187b0558-2788-49d3-abe0-74a17ed4e7c9" -version = "1.3.0" +version = "1.5.4" -[[Dates]] + [deps.ConstructionBase.extensions] + ConstructionBaseIntervalSetsExt = "IntervalSets" + ConstructionBaseStaticArraysExt = "StaticArrays" + + [deps.ConstructionBase.weakdeps] + IntervalSets = "8197267c-284f-5f27-9208-e0e47529a953" + StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" + +[[deps.Dates]] deps = ["Printf"] uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" -[[DelimitedFiles]] -deps = ["Mmap"] -uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab" - -[[Distributed]] +[[deps.Distributed]] deps = ["Random", 
"Serialization", "Sockets"] uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b" -[[DocStringExtensions]] +[[deps.DocStringExtensions]] deps = ["LibGit2"] -git-tree-sha1 = "b19534d1895d702889b219c382a6e18010797f0b" +git-tree-sha1 = "2fb1e02f2b635d0845df5d7c167fec4dd739b00d" uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae" -version = "0.8.6" +version = "0.9.3" -[[Downloads]] -deps = ["ArgTools", "LibCURL", "NetworkOptions"] +[[deps.Downloads]] +deps = ["ArgTools", "FileWatching", "LibCURL", "NetworkOptions"] uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6" +version = "1.6.0" -[[Elfutils_jll]] +[[deps.Elfutils_jll]] deps = ["Artifacts", "Bzip2_jll", "JLLWrappers", "Libdl", "Pkg", "XZ_jll", "Zlib_jll", "argp_standalone_jll", "fts_jll", "obstack_jll"] -git-tree-sha1 = "8f9fcde6d89b0a3ca51cb2028beab462705c5436" +git-tree-sha1 = "6880e234507b4b4eaabccb80c2316458d608f1c7" uuid = "ab5a07f8-06af-567f-a878-e8bb879eba5a" -version = "0.182.0+0" +version = "0.182.0+1" -[[ExprTools]] -git-tree-sha1 = "b7e3d17636b348f005f11040025ae8c6f645fe92" +[[deps.ExprTools]] +git-tree-sha1 = "27415f162e6028e81c72b82ef756bf321213b6ec" uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04" -version = "0.1.6" +version = "0.1.10" -[[Future]] +[[deps.FileWatching]] +uuid = "7b1f6079-737a-58dc-b8bc-7a2ca5c1b5ee" + +[[deps.Future]] deps = ["Random"] uuid = "9fa8497b-333b-5362-9e8d-4d0656e87820" -[[GPUArrays]] -deps = ["Adapt", "LinearAlgebra", "Printf", "Random", "Serialization", "Statistics"] -git-tree-sha1 = "7772508f17f1d482fe0df72cabc5b55bec06bbe0" +[[deps.GPUArrays]] +deps = ["Adapt", "GPUArraysCore", "LLVM", "LinearAlgebra", "Printf", "Random", "Reexport", "Serialization", "Statistics"] +git-tree-sha1 = "2e57b4a4f9cc15e85a24d603256fe08e527f48d1" uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" -version = "8.1.2" +version = "8.8.1" -[[GPUCompiler]] +[[deps.GPUArraysCore]] +deps = ["Adapt"] +git-tree-sha1 = "2d6ca471a6c7b536127afccfa7564b5b39227fe0" +uuid = "46192b85-c4d5-4398-a991-12ede77f4527" +version = "0.1.5" + 
+[[deps.GPUCompiler]] deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "TimerOutputs", "UUIDs"] -git-tree-sha1 = "4ed2616d5e656c8716736b64da86755467f26cf5" +git-tree-sha1 = "19d693666a304e8c371798f4900f7435558c7cde" uuid = "61eb1bfa-7361-4325-ad38-22787b887f55" -version = "0.12.9" +version = "0.17.3" -[[HIP_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "ROCmCompilerSupport_jll", "ROCmDeviceLibs_jll", "ROCmOpenCLRuntime_jll", "hsa_rocr_jll"] -git-tree-sha1 = "5097d8f7b6842156ab0928371b3d03fefd8decab" +[[deps.HIP_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "ROCmCompilerSupport_jll", "ROCmDeviceLibs_jll", "ROCmOpenCLRuntime_jll", "hsa_rocr_jll", "hsakmt_roct_jll", "rocminfo_jll"] +git-tree-sha1 = "6b91ab9bea10197163cb19ee57e52a1ebe0b28dc" uuid = "2696aab5-0948-5276-aa9a-2a86a37016b8" -version = "4.0.0+1" +version = "5.4.4+0" -[[InteractiveUtils]] +[[deps.InteractiveUtils]] deps = ["Markdown"] uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" -[[InverseFunctions]] -deps = ["Test"] -git-tree-sha1 = "a7254c0acd8e62f1ac75ad24d5db43f5f19f3c65" -uuid = "3587e190-3f89-42d0-90ee-14403ec27112" -version = "0.1.2" - -[[IrrationalConstants]] -git-tree-sha1 = "7fd44fd4ff43fc60815f8e764c0f352b83c49151" +[[deps.IrrationalConstants]] +git-tree-sha1 = "630b497eafcc20001bba38a4651b327dcfc491d2" uuid = "92d709cd-6900-40b7-9082-c6be49f344b6" -version = "0.1.1" +version = "0.2.2" -[[JLLWrappers]] -deps = ["Preferences"] -git-tree-sha1 = "642a199af8b68253517b80bd3bfd17eb4e84df6e" +[[deps.JLLWrappers]] +deps = ["Artifacts", "Preferences"] +git-tree-sha1 = "7e5d6779a1e09a36db2a7b6cff50942a0a7d0fca" uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210" -version = "1.3.0" +version = "1.5.0" -[[KernelAbstractions]] -deps = ["Adapt", "Cassette", "InteractiveUtils", "MacroTools", "SpecialFunctions", "StaticArrays", "UUIDs"] -git-tree-sha1 = "cb7d8b805413025a5bc866fc036b426223ffc059" +[[deps.KernelAbstractions]] +deps = ["Adapt", "Atomix", "InteractiveUtils", 
"LinearAlgebra", "MacroTools", "SparseArrays", "StaticArrays", "UUIDs", "UnsafeAtomics", "UnsafeAtomicsLLVM"] +git-tree-sha1 = "cf9cae1c4c1ff83f6c02cfaf01698f05448e8325" uuid = "63c18a36-062a-441e-b654-da1e3ab1ce7c" -version = "0.7.2" +version = "0.8.6" -[[LLVM]] +[[deps.LLD_jll]] +deps = ["Artifacts", "Libdl", "Pkg", "TOML", "Zlib_jll", "libLLVM_jll"] +uuid = "d55e3150-da41-5e91-b323-ecfd1eec6109" +version = "14.0.6+3" + +[[deps.LLVM]] deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Printf", "Unicode"] -git-tree-sha1 = "7cc22e69995e2329cc047a879395b2b74647ab5f" +git-tree-sha1 = "f044a2796a9e18e0531b9b3072b0019a61f264bc" uuid = "929cbde3-209d-540e-8aea-75f648917ca0" -version = "4.7.0" +version = "4.17.1" -[[LLVMExtra_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "c5fc4bef251ecd37685bea1c4068a9cfa41e8b9a" +[[deps.LLVMExtra_jll]] +deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML"] +git-tree-sha1 = "070e4b5b65827f82c16ae0916376cb47377aa1b5" uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab" -version = "0.0.13+0" +version = "0.0.18+0" -[[LazyArtifacts]] +[[deps.LLVM_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "TOML", "Zlib_jll", "libLLVM_jll"] +git-tree-sha1 = "c5131b433876973cf29a2d9ec426cc099567e68c" +uuid = "86de99a1-58d6-5da7-8064-bd56ce2e322c" +version = "14.0.6+4" + +[[deps.LazyArtifacts]] deps = ["Artifacts", "Pkg"] uuid = "4af54fe1-eca0-43a8-85a7-787d91b784e3" -[[LibCURL]] +[[deps.LibCURL]] deps = ["LibCURL_jll", "MozillaCACerts_jll"] uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21" +version = "0.6.3" -[[LibCURL_jll]] +[[deps.LibCURL_jll]] deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"] uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0" +version = "7.84.0+0" -[[LibGit2]] +[[deps.LibGit2]] deps = ["Base64", "NetworkOptions", "Printf", "SHA"] uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" -[[LibSSH2_jll]] +[[deps.LibSSH2_jll]] deps = ["Artifacts", "Libdl", "MbedTLS_jll"] uuid = 
"29816b5a-b9ab-546f-933c-edad1886dfa8" +version = "1.10.2+0" -[[Libdl]] +[[deps.Libdl]] uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" -[[Libgcrypt_jll]] +[[deps.Libgcrypt_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Libgpg_error_jll", "Pkg"] git-tree-sha1 = "64613c82a59c120435c067c2b809fc61cf5166ae" uuid = "d4300ac3-e22c-5743-9152-c294e39db1e4" version = "1.8.7+0" -[[Libglvnd_jll]] +[[deps.Libglvnd_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libX11_jll", "Xorg_libXext_jll"] -git-tree-sha1 = "7739f837d6447403596a75d19ed01fd08d6f56bf" +git-tree-sha1 = "6f73d1dd803986947b2c750138528a999a6c7733" uuid = "7e76a0d4-f3c7-5321-8279-8d96eeed0f29" -version = "1.3.0+3" +version = "1.6.0+0" -[[Libgpg_error_jll]] +[[deps.Libgpg_error_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] git-tree-sha1 = "c333716e46366857753e273ce6a69ee0945a6db9" uuid = "7add5ba3-2f88-524e-9cd5-f83b8a55f7b8" version = "1.42.0+0" -[[Libiconv_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "42b62845d70a619f063a7da093d995ec8e15e778" +[[deps.Libiconv_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl"] +git-tree-sha1 = "f9557a255370125b405568f9767d6d195822a175" uuid = "94ce4f54-9a6c-5748-9c1c-f9c7231a4531" -version = "1.16.1+1" +version = "1.17.0+0" -[[LinearAlgebra]] -deps = ["Libdl"] +[[deps.LinearAlgebra]] +deps = ["Libdl", "OpenBLAS_jll", "libblastrampoline_jll"] uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" -[[LogExpFunctions]] -deps = ["ChainRulesCore", "ChangesOfVariables", "DocStringExtensions", "InverseFunctions", "IrrationalConstants", "LinearAlgebra"] -git-tree-sha1 = "e5718a00af0ab9756305a0392832c8952c7426c1" +[[deps.LogExpFunctions]] +deps = ["DocStringExtensions", "IrrationalConstants", "LinearAlgebra"] +git-tree-sha1 = "7d6dd4e9212aebaeed356de34ccf262a3cd415aa" uuid = "2ab3a3ac-af41-5b50-aa03-7779005ae688" -version = "0.3.6" +version = "0.3.26" -[[Logging]] + [deps.LogExpFunctions.extensions] + LogExpFunctionsChainRulesCoreExt = 
"ChainRulesCore" + LogExpFunctionsChangesOfVariablesExt = "ChangesOfVariables" + LogExpFunctionsInverseFunctionsExt = "InverseFunctions" + + [deps.LogExpFunctions.weakdeps] + ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" + ChangesOfVariables = "9e997f8a-9a97-42d5-a9f1-ce6bfc15e2c0" + InverseFunctions = "3587e190-3f89-42d0-90ee-14403ec27112" + +[[deps.Logging]] uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" -[[MacroTools]] +[[deps.MacroTools]] deps = ["Markdown", "Random"] -git-tree-sha1 = "3d3e902b31198a27340d0bf00d6ac452866021cf" +git-tree-sha1 = "9ee1618cbf5240e6d4e0371d6f24065083f60c48" uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" -version = "0.5.9" +version = "0.5.11" -[[Markdown]] +[[deps.Markdown]] deps = ["Base64"] uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" -[[MbedTLS_jll]] +[[deps.MbedTLS_jll]] deps = ["Artifacts", "Libdl"] uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1" +version = "2.28.2+0" -[[Mmap]] -uuid = "a63ad114-7e13-5084-954f-fe012c677804" - -[[MozillaCACerts_jll]] +[[deps.MozillaCACerts_jll]] uuid = "14a3606d-f60d-562e-9121-12d972cd8159" +version = "2022.10.11" -[[NUMA_jll]] -deps = ["Libdl", "Pkg"] -git-tree-sha1 = "778f9bd14400cff2c32ed357e12766ac0e3d766e" +[[deps.MsgPack]] +deps = ["Serialization"] +git-tree-sha1 = "fc8c15ca848b902015bd4a745d350f02cf791c2a" +uuid = "99f44e22-a591-53d1-9472-aa23ef4bd671" +version = "1.2.0" + +[[deps.NUMA_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "3da12251003f08e819c907c645879c362206f5b4" uuid = "7f51dc2b-bb24-59f8-b771-bb1490e4195d" -version = "2.0.13+1" +version = "2.0.14+0" -[[NetworkOptions]] +[[deps.NetworkOptions]] uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908" +version = "1.2.0" -[[OpenLibm_jll]] +[[deps.ObjectFile]] +deps = ["Reexport", "StructIO"] +git-tree-sha1 = "55ce61d43409b1fb0279d1781bf3b0f22c83ab3b" +uuid = "d8793406-e978-5875-9003-1fc021f44a92" +version = "0.3.7" + +[[deps.OpenBLAS_jll]] +deps = ["Artifacts", "CompilerSupportLibraries_jll", "Libdl"] 
+uuid = "4536629a-c528-5b80-bd46-f80d51c5b363" +version = "0.3.21+4" + +[[deps.OpenLibm_jll]] deps = ["Artifacts", "Libdl"] uuid = "05823500-19ac-5b8b-9628-191a04bc5112" +version = "0.8.1+0" -[[OpenSpecFun_jll]] +[[deps.OpenSpecFun_jll]] deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "Libdl", "Pkg"] git-tree-sha1 = "13652491f6856acfd2db29360e1bbcd4565d04f1" uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e" version = "0.5.5+0" -[[OrderedCollections]] -git-tree-sha1 = "85f8e6578bf1f9ee0d11e7bb1b1456435479d47c" +[[deps.OrderedCollections]] +git-tree-sha1 = "2e73fe17cac3c62ad1aebe70d44c963c3cfdc3e3" uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" -version = "1.4.1" +version = "1.6.2" -[[Parameters]] +[[deps.Parameters]] deps = ["OrderedCollections", "UnPack"] git-tree-sha1 = "34c0e9ad262e5f7fc75b10a9952ca7692cfc5fbe" uuid = "d96e819e-fc66-5662-9728-84c9c7592b0a" version = "0.12.3" -[[Pkg]] -deps = ["Artifacts", "Dates", "Downloads", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"] +[[deps.Pkg]] +deps = ["Artifacts", "Dates", "Downloads", "FileWatching", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"] uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" +version = "1.9.2" -[[Preferences]] +[[deps.Preferences]] deps = ["TOML"] -git-tree-sha1 = "00cfd92944ca9c760982747e9a1d0d5d86ab1e5a" +git-tree-sha1 = "00805cd429dcb4870060ff49ef443486c262e38e" uuid = "21216c6a-2e73-6563-6e65-726566657250" -version = "1.2.2" +version = "1.4.1" -[[Printf]] +[[deps.Printf]] deps = ["Unicode"] uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" -[[REPL]] +[[deps.Profile]] +deps = ["Printf"] +uuid = "9abbd945-dff8-562f-b5e8-e1ebf5ef1b79" + +[[deps.REPL]] deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"] uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" -[[ROCKernels]] -deps = ["AMDGPU", "Adapt", "Cassette", 
"KernelAbstractions", "SpecialFunctions", "StaticArrays"] -git-tree-sha1 = "5e13faac6e566cb30c6620ad0be967a747121aeb" +[[deps.ROCKernels]] +deps = ["AMDGPU", "Adapt", "KernelAbstractions", "LLVM", "StaticArrays", "UnsafeAtomicsLLVM"] +git-tree-sha1 = "4d4973642639c249ccf8f50392f7f04ee3fcca22" uuid = "7eb9e9f0-4bd3-4c4c-8bef-26bd9629d9b9" -version = "0.2.2" +version = "0.3.5" -[[ROCmCompilerSupport_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "ROCmDeviceLibs_jll", "hsa_rocr_jll"] -git-tree-sha1 = "56ddcfb5d8b60c9f8c1bc619886f8d363fd1926d" +[[deps.ROCmCompilerSupport_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "ROCmDeviceLibs_jll", "hsa_rocr_jll"] +git-tree-sha1 = "7a3f25087b24d33b89f2e32cccd26af39275d14d" uuid = "8fbdd1d2-db62-5cd0-981e-905da1486e17" -version = "4.0.0+1" +version = "5.4.4+0" -[[ROCmDeviceLibs_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Zlib_jll"] -git-tree-sha1 = "d764f0f28b5af89aa004871a6a38e5d061f77257" +[[deps.ROCmDeviceLibs_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Zlib_jll"] +git-tree-sha1 = "45d5a53be418b740fe740714c8100650aebba041" uuid = "873c0968-716b-5aa7-bb8d-d1e2e2aeff2d" -version = "4.0.0+0" +version = "5.4.4+0" -[[ROCmOpenCLRuntime_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Libglvnd_jll", "Pkg", "ROCmCompilerSupport_jll", "ROCmDeviceLibs_jll", "Xorg_libX11_jll", "Xorg_xorgproto_jll", "hsa_rocr_jll"] -git-tree-sha1 = "f9e3e2cb40a7990535efa7da9b9dd0e0b458a973" +[[deps.ROCmOpenCLRuntime_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Libglvnd_jll", "ROCmCompilerSupport_jll", "ROCmDeviceLibs_jll", "Xorg_libX11_jll", "Xorg_xorgproto_jll", "hsa_rocr_jll", "hsakmt_roct_jll"] +git-tree-sha1 = "f7cbafcda3eec208831f22ae7816f34a90ce8e0f" uuid = "10ae2a08-2eea-53f8-8c20-eec175020e9f" -version = "4.0.0+1" +version = "5.4.4+0" -[[Random]] -deps = ["Serialization"] +[[deps.Random]] +deps = ["SHA", "Serialization"] uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" -[[Random123]] -deps = 
["Libdl", "Random", "RandomNumbers"] -git-tree-sha1 = "0e8b146557ad1c6deb1367655e052276690e71a3" +[[deps.Random123]] +deps = ["Random", "RandomNumbers"] +git-tree-sha1 = "552f30e847641591ba3f39fd1bed559b9deb0ef3" uuid = "74087812-796a-5b5d-8853-05524746bad3" -version = "1.4.2" +version = "1.6.1" -[[RandomNumbers]] +[[deps.RandomNumbers]] deps = ["Random", "Requires"] git-tree-sha1 = "043da614cc7e95c703498a491e2c21f58a2b8111" uuid = "e6cf234a-135c-5ec9-84dd-332b85af5143" version = "1.5.3" -[[Reexport]] +[[deps.Reexport]] git-tree-sha1 = "45e428421666073eab6f2da5c9d310d99bb12f9b" uuid = "189a3867-3050-52da-a836-e630ba90ab69" version = "1.2.2" -[[Requires]] +[[deps.Requires]] deps = ["UUIDs"] -git-tree-sha1 = "8f82019e525f4d5c669692772a6f4b0a58b06a6a" +git-tree-sha1 = "838a3a4188e2ded87a4f9f184b4b0d78a1e91cb7" uuid = "ae029012-a4dd-5104-9daa-d747884805df" -version = "1.2.0" +version = "1.3.0" -[[SHA]] +[[deps.SHA]] uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" +version = "0.7.0" -[[Serialization]] +[[deps.Serialization]] uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" -[[Setfield]] -deps = ["ConstructionBase", "Future", "MacroTools", "Requires"] -git-tree-sha1 = "fca29e68c5062722b5b4435594c3d1ba557072a3" +[[deps.Setfield]] +deps = ["ConstructionBase", "Future", "MacroTools", "StaticArraysCore"] +git-tree-sha1 = "e2cc6d8c88613c05e1defb55170bf5ff211fbeac" uuid = "efcf1570-3423-57d1-acb7-fd33fddbac46" -version = "0.7.1" +version = "1.1.1" -[[SharedArrays]] -deps = ["Distributed", "Mmap", "Random", "Serialization"] -uuid = "1a1011a3-84de-559e-8e89-a11a2f7dc383" - -[[Sockets]] +[[deps.Sockets]] uuid = "6462fe0b-24de-5631-8697-dd941f90decc" -[[SparseArrays]] -deps = ["LinearAlgebra", "Random"] +[[deps.SparseArrays]] +deps = ["Libdl", "LinearAlgebra", "Random", "Serialization", "SuiteSparse_jll"] uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" -[[SpecialFunctions]] -deps = ["ChainRulesCore", "IrrationalConstants", "LogExpFunctions", "OpenLibm_jll", "OpenSpecFun_jll"] 
-git-tree-sha1 = "f0bccf98e16759818ffc5d97ac3ebf87eb950150" +[[deps.SpecialFunctions]] +deps = ["IrrationalConstants", "LogExpFunctions", "OpenLibm_jll", "OpenSpecFun_jll"] +git-tree-sha1 = "e2cfc4012a19088254b3950b85c3c1d8882d864d" uuid = "276daf66-3868-5448-9aa4-cd146d93841b" -version = "1.8.1" +version = "2.3.1" -[[StaticArrays]] -deps = ["LinearAlgebra", "Random", "Statistics"] -git-tree-sha1 = "3c76dde64d03699e074ac02eb2e8ba8254d428da" + [deps.SpecialFunctions.extensions] + SpecialFunctionsChainRulesCoreExt = "ChainRulesCore" + + [deps.SpecialFunctions.weakdeps] + ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" + +[[deps.StaticArrays]] +deps = ["LinearAlgebra", "Random", "StaticArraysCore"] +git-tree-sha1 = "d5fb407ec3179063214bc6277712928ba78459e2" uuid = "90137ffa-7385-5640-81b9-e52037218182" -version = "1.2.13" +version = "1.6.4" +weakdeps = ["Statistics"] -[[Statistics]] + [deps.StaticArrays.extensions] + StaticArraysStatisticsExt = "Statistics" + +[[deps.StaticArraysCore]] +git-tree-sha1 = "36b3d696ce6366023a0ea192b4cd442268995a0d" +uuid = "1e83bf80-4336-4d27-bf5d-d5a4f845583c" +version = "1.4.2" + +[[deps.Statistics]] deps = ["LinearAlgebra", "SparseArrays"] uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" +version = "1.9.0" -[[TOML]] +[[deps.StructIO]] +deps = ["Test"] +git-tree-sha1 = "010dc73c7146869c042b49adcdb6bf528c12e859" +uuid = "53d494c1-5632-5724-8f4c-31dff12d585f" +version = "0.3.0" + +[[deps.SuiteSparse_jll]] +deps = ["Artifacts", "Libdl", "Pkg", "libblastrampoline_jll"] +uuid = "bea87d4a-7f5b-5778-9afe-8cc45184846c" +version = "5.10.1+6" + +[[deps.TOML]] deps = ["Dates"] uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76" +version = "1.0.3" -[[Tar]] +[[deps.Tar]] deps = ["ArgTools", "SHA"] uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e" +version = "1.10.0" -[[Test]] +[[deps.Test]] deps = ["InteractiveUtils", "Logging", "Random", "Serialization"] uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" -[[TextWrap]] +[[deps.TextWrap]] git-tree-sha1 = 
"9250ef9b01b66667380cf3275b3f7488d0e25faf" uuid = "b718987f-49a8-5099-9789-dcd902bef87d" version = "1.0.1" -[[TimerOutputs]] +[[deps.TimerOutputs]] deps = ["ExprTools", "Printf"] -git-tree-sha1 = "7cb456f358e8f9d102a8b25e8dfedf58fa5689bc" +git-tree-sha1 = "f548a9e9c490030e545f72074a41edfd0e5bcdd7" uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" -version = "0.5.13" +version = "0.5.23" -[[UUIDs]] +[[deps.TimespanLogging]] +deps = ["Distributed", "Profile"] +git-tree-sha1 = "51be7dd35b0c8a5a613dc7af272d587ea6943d24" +uuid = "a526e669-04d3-4846-9525-c66122c55f63" +version = "0.1.0" + +[[deps.UUIDs]] deps = ["Random", "SHA"] uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" -[[UnPack]] +[[deps.UnPack]] git-tree-sha1 = "387c1f73762231e86e0c9c5443ce3b4a0a9a0c2b" uuid = "3a884ed6-31ef-47d7-9d2a-63182c4928ed" version = "1.0.2" -[[Unicode]] +[[deps.Unicode]] uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" -[[XML2_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Libiconv_jll", "Pkg", "Zlib_jll"] -git-tree-sha1 = "1acf5bdf07aa0907e0a37d3718bb88d4b687b74a" -uuid = "02c8fc9c-b97f-50b9-bbe4-9be30ff0a78a" -version = "2.9.12+0" +[[deps.UnsafeAtomics]] +git-tree-sha1 = "6331ac3440856ea1988316b46045303bef658278" +uuid = "013be700-e6cd-48c3-b4a1-df204f14c38f" +version = "0.2.1" -[[XSLT_jll]] +[[deps.UnsafeAtomicsLLVM]] +deps = ["LLVM", "UnsafeAtomics"] +git-tree-sha1 = "ead6292c02aab389cb29fe64cc9375765ab1e219" +uuid = "d80eeb9a-aca5-4d75-85e5-170c8b632249" +version = "0.1.1" + +[[deps.XML2_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Libiconv_jll", "Zlib_jll"] +git-tree-sha1 = "04a51d15436a572301b5abbb9d099713327e9fc4" +uuid = "02c8fc9c-b97f-50b9-bbe4-9be30ff0a78a" +version = "2.10.4+0" + +[[deps.XSLT_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Libgcrypt_jll", "Libgpg_error_jll", "Libiconv_jll", "Pkg", "XML2_jll", "Zlib_jll"] git-tree-sha1 = "91844873c4085240b95e795f692c4cec4d805f8a" uuid = "aed1982a-8fda-507f-9586-7b0439959a61" version = "1.1.34+0" -[[XZ_jll]] -deps = 
["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "a921669cd9a45c23031fd4eb904f5cc3d20de415" +[[deps.XZ_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl"] +git-tree-sha1 = "cf2c7de82431ca6f39250d2fc4aacd0daa1675c0" uuid = "ffd25f8a-64ca-5728-b0f7-c24cf3aae800" -version = "5.2.5+2" +version = "5.4.4+0" -[[Xorg_libX11_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libxcb_jll", "Xorg_xtrans_jll"] -git-tree-sha1 = "5be649d550f3f4b95308bf0183b82e2582876527" +[[deps.Xorg_libX11_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Xorg_libxcb_jll", "Xorg_xtrans_jll"] +git-tree-sha1 = "afead5aba5aa507ad5a3bf01f58f82c8d1403495" uuid = "4f6342f7-b3d2-589e-9d20-edeb45f2b2bc" -version = "1.6.9+4" +version = "1.8.6+0" -[[Xorg_libXau_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "4e490d5c960c314f33885790ed410ff3a94ce67e" +[[deps.Xorg_libXau_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl"] +git-tree-sha1 = "6035850dcc70518ca32f012e46015b9beeda49d8" uuid = "0c0b7dd1-d40b-584c-a123-a41640f87eec" -version = "1.0.9+4" +version = "1.0.11+0" -[[Xorg_libXdmcp_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "4fe47bd2247248125c428978740e18a681372dd4" +[[deps.Xorg_libXdmcp_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl"] +git-tree-sha1 = "34d526d318358a859d7de23da945578e8e8727b7" uuid = "a3789734-cfe1-5b06-b2d0-1dd0d9d62d05" -version = "1.1.3+4" +version = "1.1.4+0" -[[Xorg_libXext_jll]] +[[deps.Xorg_libXext_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libX11_jll"] git-tree-sha1 = "b7c0aa8c376b31e4852b360222848637f481f8c3" uuid = "1082639a-0dae-5f34-9b06-72781eeb8cb3" version = "1.3.4+4" -[[Xorg_libpthread_stubs_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "6783737e45d3c59a4a4c4091f5f88cdcf0908cbb" +[[deps.Xorg_libpciaccess_jll]] +deps = ["Libdl", "Pkg"] +git-tree-sha1 = "79a09b8c1d3a2659937503788ce11173ba29681b" +uuid = 
"a65dc6b1-eb27-53a1-bb3e-dea574b5389e" +version = "0.16.0+1" + +[[deps.Xorg_libpthread_stubs_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl"] +git-tree-sha1 = "8fdda4c692503d44d04a0603d9ac0982054635f9" uuid = "14d82f49-176c-5ed1-bb49-ad3f5cbd8c74" -version = "0.1.0+3" +version = "0.1.1+0" -[[Xorg_libxcb_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "XSLT_jll", "Xorg_libXau_jll", "Xorg_libXdmcp_jll", "Xorg_libpthread_stubs_jll"] -git-tree-sha1 = "daf17f441228e7a3833846cd048892861cff16d6" +[[deps.Xorg_libxcb_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "XSLT_jll", "Xorg_libXau_jll", "Xorg_libXdmcp_jll", "Xorg_libpthread_stubs_jll"] +git-tree-sha1 = "b4bfde5d5b652e22b9c790ad00af08b6d042b97d" uuid = "c7cfdc94-dc32-55de-ac96-5a1b8d977c5b" -version = "1.13.0+3" +version = "1.15.0+0" -[[Xorg_xorgproto_jll]] +[[deps.Xorg_xorgproto_jll]] deps = ["Libdl", "Pkg"] git-tree-sha1 = "9a9eb8ce756fe0bca01b4be16da770e18d264972" uuid = "c4d99508-4286-5418-9131-c86396af500b" version = "2019.2.0+2" -[[Xorg_xtrans_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "79c31e7844f6ecf779705fbc12146eb190b7d845" +[[deps.Xorg_xtrans_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl"] +git-tree-sha1 = "e92a1a012a10506618f10b7047e478403a046c77" uuid = "c5fb5394-a638-5e4d-96e5-b29de1b5cf10" -version = "1.4.0+3" +version = "1.5.0+0" -[[Zlib_jll]] +[[deps.Zlib_jll]] deps = ["Libdl"] uuid = "83775a58-1f1d-513f-b197-d71354ab007a" +version = "1.2.13+0" -[[argp_standalone_jll]] +[[deps.argp_standalone_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] git-tree-sha1 = "feaf9f6293003c2bf53056fd6930d677ed340b34" uuid = "c53206cc-00f7-50bf-ad1e-3ae1f6e49bc3" version = "1.3.1+0" -[[fts_jll]] +[[deps.fts_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "78732b942383d2cb521df8a1a0814911144e663d" +git-tree-sha1 = "aa21810b841ae26d2fc7f780cb1596b4170a4c49" uuid = "d65627f6-89bd-53e8-8ab5-8b75ff535eee" -version = "1.2.7+1" +version 
= "1.2.8+0" -[[hsa_rocr_jll]] -deps = ["Artifacts", "Elfutils_jll", "JLLWrappers", "Libdl", "NUMA_jll", "Pkg", "Zlib_jll", "hsakmt_roct_jll"] -git-tree-sha1 = "df8d73efec8b1e53ad527d208f5343c0368f0fcd" +[[deps.hsa_rocr_jll]] +deps = ["Artifacts", "Elfutils_jll", "JLLWrappers", "Libdl", "NUMA_jll", "ROCmDeviceLibs_jll", "XML2_jll", "Zlib_jll", "hsakmt_roct_jll"] +git-tree-sha1 = "0458f0ff5d72a270fbab764d354dc35d90b28ba9" uuid = "dd59ff1a-a01a-568d-8b29-0669330f116a" -version = "4.0.0+0" +version = "5.4.4+0" -[[hsakmt_roct_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "NUMA_jll", "Pkg"] -git-tree-sha1 = "ea54f6be23c6d25613a0872ec23dc5a0b77b4a00" +[[deps.hsakmt_roct_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "NUMA_jll", "libdrm_jll"] +git-tree-sha1 = "49db943b2bf868b1fa2866b93faf4d2222fa28ae" uuid = "1cecccd7-a9b6-5045-9cdc-a44c19b16d76" -version = "4.2.0+0" +version = "5.4.4+0" -[[nghttp2_jll]] +[[deps.libLLVM_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "8f36deef-c2a5-5394-99ed-8e07531fb29a" +version = "14.0.6+3" + +[[deps.libblastrampoline_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "8e850b90-86db-534c-a0d3-1478176c7d93" +version = "5.8.0+0" + +[[deps.libdrm_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libpciaccess_jll"] +git-tree-sha1 = "89b30a68162c12118311b77e57b20c8fa2685496" +uuid = "8e53e030-5e6c-5a89-a30b-be5b7263a166" +version = "2.4.110+0" + +[[deps.msgpack_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "dcbef55311e8e3d0a15dbe7dd86900c501ca2359" +uuid = "43dd8cde-e9ee-5d59-924a-18d3f2773c4d" +version = "3.0.1+0" + +[[deps.nghttp2_jll]] deps = ["Artifacts", "Libdl"] uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d" +version = "1.48.0+0" -[[obstack_jll]] +[[deps.obstack_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] git-tree-sha1 = "1c4a6b66e934fc6db4649cb2910c72f53bbfea7e" uuid = "c88a4935-d25e-5644-aacc-5db6f1b8ef79" version = "1.2.2+0" -[[p7zip_jll]] +[[deps.p7zip_jll]] deps = 
["Artifacts", "Libdl"] uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0" +version = "17.4.0+0" + +[[deps.rocBLAS_jll]] +deps = ["Artifacts", "HIP_jll", "JLLWrappers", "Libdl", "Pkg", "ROCmCompilerSupport_jll", "ROCmDeviceLibs_jll", "ROCmOpenCLRuntime_jll", "hsa_rocr_jll", "msgpack_jll", "rocminfo_jll"] +git-tree-sha1 = "92d224a9e10a9ad04195d943a2b1bcbdafcaf06a" +uuid = "1ef8cab2-a151-54b4-a57f-5fbb4046a4ab" +version = "5.2.3+2" + +[[deps.rocPRIM_jll]] +deps = ["Artifacts", "HIP_jll", "JLLWrappers", "Libdl", "Pkg", "ROCmCompilerSupport_jll", "ROCmDeviceLibs_jll", "ROCmOpenCLRuntime_jll", "hsa_rocr_jll", "rocminfo_jll"] +git-tree-sha1 = "7a100de0bae8363cbd33fa429d37be45a0247d2c" +uuid = "52935e6f-76c5-5ebb-b227-36676f75be9c" +version = "5.2.3+1" + +[[deps.rocRAND_jll]] +deps = ["Artifacts", "HIP_jll", "JLLWrappers", "Libdl", "Pkg", "ROCmCompilerSupport_jll", "ROCmDeviceLibs_jll", "ROCmOpenCLRuntime_jll", "hsa_rocr_jll", "rocminfo_jll"] +git-tree-sha1 = "58a35917ddb4d79f7a0c2f6d438a210d2f398e85" +uuid = "a6151927-a32b-54c0-bc8c-bbd7b3f1a996" +version = "5.2.3+1" + +[[deps.rocSPARSE_jll]] +deps = ["Artifacts", "HIP_jll", "JLLWrappers", "Libdl", "Pkg", "ROCmCompilerSupport_jll", "ROCmDeviceLibs_jll", "ROCmOpenCLRuntime_jll", "hsa_rocr_jll", "rocPRIM_jll", "rocminfo_jll"] +git-tree-sha1 = "67bc29d47ab636ef1471e48d7f730c03a0edfcf8" +uuid = "8c6ce2ba-659c-5ec7-ba4c-37596cf1f22a" +version = "5.2.3+1" + +[[deps.rocminfo_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "hsa_rocr_jll", "hsakmt_roct_jll"] +git-tree-sha1 = "840acd2135e7bd025870d063e99ff70d05c0de46" +uuid = "5a766526-3cf8-5128-8c31-4f7b7ad60f0e" +version = "5.4.4+0" diff --git a/src/julia/JuliaStream.jl/KernelAbstractions/Project.toml b/src/julia/JuliaStream.jl/KernelAbstractions/Project.toml index 71715ff..a328acd 100644 --- a/src/julia/JuliaStream.jl/KernelAbstractions/Project.toml +++ b/src/julia/JuliaStream.jl/KernelAbstractions/Project.toml @@ -8,4 +8,4 @@ Parameters = 
"d96e819e-fc66-5662-9728-84c9c7592b0a" ROCKernels = "7eb9e9f0-4bd3-4c4c-8bef-26bd9629d9b9" [compat] -julia = "1.6" +julia = "1.9" diff --git a/src/julia/JuliaStream.jl/Manifest.toml b/src/julia/JuliaStream.jl/Manifest.toml index 927a399..cf65e8d 100644 --- a/src/julia/JuliaStream.jl/Manifest.toml +++ b/src/julia/JuliaStream.jl/Manifest.toml @@ -1,605 +1,789 @@ # This file is machine-generated - editing it directly is not advised -[[AMDGPU]] -deps = ["AbstractFFTs", "Adapt", "BinaryProvider", "CEnum", "GPUArrays", "GPUCompiler", "HIP_jll", "LLVM", "Libdl", "LinearAlgebra", "MacroTools", "Pkg", "Printf", "ROCmDeviceLibs_jll", "Random", "Requires", "Setfield", "Statistics", "hsa_rocr_jll"] -git-tree-sha1 = "d59f1cf3f90ae6cf6626e8a21f337850cb3792f7" +julia_version = "1.9.3" +manifest_format = "2.0" +project_hash = "d5cae1000e576b2ee3d194306272f6931085d077" + +[[deps.AMDGPU]] +deps = ["AbstractFFTs", "Adapt", "BinaryProvider", "CEnum", "ExprTools", "GPUArrays", "GPUCompiler", "HIP_jll", "LLD_jll", "LLVM", "LLVM_jll", "Libdl", "LinearAlgebra", "MacroTools", "MsgPack", "ObjectFile", "Pkg", "Preferences", "Printf", "ROCmDeviceLibs_jll", "Random", "Requires", "Setfield", "SpecialFunctions", "Statistics", "TimespanLogging", "hsa_rocr_jll", "rocBLAS_jll", "rocRAND_jll", "rocSPARSE_jll"] +git-tree-sha1 = "06f51480c4fbd88edae71c7e60fd9a7362a579f2" uuid = "21141c5a-9bdb-4563-92ae-f87d6854732e" -version = "0.2.17" +version = "0.4.8" -[[AbstractFFTs]] +[[deps.AbstractFFTs]] deps = ["LinearAlgebra"] -git-tree-sha1 = "485ee0867925449198280d4af84bdb46a2a404d0" +git-tree-sha1 = "d92ad398961a3ed262d8bf04a1a2b8340f915fef" uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c" -version = "1.0.1" +version = "1.5.0" -[[Adapt]] -deps = ["LinearAlgebra"] -git-tree-sha1 = "84918055d15b3114ede17ac6a7182f68870c16f7" + [deps.AbstractFFTs.extensions] + AbstractFFTsChainRulesCoreExt = "ChainRulesCore" + AbstractFFTsTestExt = "Test" + + [deps.AbstractFFTs.weakdeps] + ChainRulesCore = 
"d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" + Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" + +[[deps.Adapt]] +deps = ["LinearAlgebra", "Requires"] +git-tree-sha1 = "76289dc51920fdc6e0013c872ba9551d54961c24" uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" -version = "3.3.1" +version = "3.6.2" +weakdeps = ["StaticArrays"] -[[ArgParse]] + [deps.Adapt.extensions] + AdaptStaticArraysExt = "StaticArrays" + +[[deps.ArgParse]] deps = ["Logging", "TextWrap"] git-tree-sha1 = "3102bce13da501c9104df33549f511cd25264d7d" uuid = "c7e460c6-2fb9-53a9-8c5b-16f535851c63" version = "1.1.4" -[[ArgTools]] +[[deps.ArgTools]] uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f" +version = "1.1.1" -[[Artifacts]] +[[deps.Artifacts]] uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" -[[BFloat16s]] -deps = ["LinearAlgebra", "Test"] -git-tree-sha1 = "4af69e205efc343068dc8722b8dfec1ade89254a" -uuid = "ab4f0b2a-ad5b-11e8-123f-65d77653426b" +[[deps.Atomix]] +deps = ["UnsafeAtomics"] +git-tree-sha1 = "c06a868224ecba914baa6942988e2f2aade419be" +uuid = "a9b6321e-bd34-4604-b9c9-b65b8de01458" version = "0.1.0" -[[Base64]] +[[deps.BFloat16s]] +deps = ["LinearAlgebra", "Printf", "Random", "Test"] +git-tree-sha1 = "dbf84058d0a8cbbadee18d25cf606934b22d7c66" +uuid = "ab4f0b2a-ad5b-11e8-123f-65d77653426b" +version = "0.4.2" + +[[deps.Base64]] uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" -[[BinaryProvider]] +[[deps.BinaryProvider]] deps = ["Libdl", "Logging", "SHA"] git-tree-sha1 = "ecdec412a9abc8db54c0efc5548c64dfce072058" uuid = "b99e7846-7c00-51b0-8f62-c81ae34c0232" version = "0.5.10" -[[Bzip2_jll]] +[[deps.Bzip2_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] git-tree-sha1 = "19a35467a82e236ff51bc17a3a44b69ef35185a2" uuid = "6e34b625-4abd-537c-b88f-471c36dfa7a0" version = "1.0.8+0" -[[CEnum]] -git-tree-sha1 = "215a9aa4a1f23fbd05b92769fdd62559488d70e9" +[[deps.CEnum]] +git-tree-sha1 = "eb4cb44a499229b3b8426dcfb5dd85333951ff90" uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82" -version = "0.4.1" +version = "0.4.2" 
-[[CUDA]] -deps = ["AbstractFFTs", "Adapt", "BFloat16s", "CEnum", "CompilerSupportLibraries_jll", "ExprTools", "GPUArrays", "GPUCompiler", "LLVM", "LazyArtifacts", "Libdl", "LinearAlgebra", "Logging", "Printf", "Random", "Random123", "RandomNumbers", "Reexport", "Requires", "SparseArrays", "SpecialFunctions", "TimerOutputs"] -git-tree-sha1 = "335b3d2373733919b4972a51215a6840c7a33828" +[[deps.CUDA]] +deps = ["AbstractFFTs", "Adapt", "BFloat16s", "CEnum", "CUDA_Driver_jll", "CUDA_Runtime_Discovery", "CUDA_Runtime_jll", "CompilerSupportLibraries_jll", "ExprTools", "GPUArrays", "GPUCompiler", "LLVM", "LazyArtifacts", "Libdl", "LinearAlgebra", "Logging", "Preferences", "Printf", "Random", "Random123", "RandomNumbers", "Reexport", "Requires", "SparseArrays", "SpecialFunctions"] +git-tree-sha1 = "edff14c60784c8f7191a62a23b15a421185bc8a8" uuid = "052768ef-5323-5732-b1bb-66c8b64840ba" -version = "3.4.2" +version = "4.0.1" -[[CUDAKernels]] -deps = ["Adapt", "CUDA", "Cassette", "KernelAbstractions", "SpecialFunctions", "StaticArrays"] -git-tree-sha1 = "81f76297b63c67723b1d60f5e7e002ae3393974b" +[[deps.CUDAKernels]] +deps = ["Adapt", "CUDA", "KernelAbstractions", "StaticArrays", "UnsafeAtomicsLLVM"] +git-tree-sha1 = "1680366a69e9c95744ef23a239e6cfe61cf2e1ca" uuid = "72cfdca4-0801-4ab0-bf6a-d52aa10adc57" -version = "0.3.0" +version = "0.4.7" -[[Cassette]] -git-tree-sha1 = "6ce3cd755d4130d43bab24ea5181e77b89b51839" -uuid = "7057c7e9-c182-5462-911a-8362d720325c" -version = "0.3.9" +[[deps.CUDA_Driver_jll]] +deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "Pkg"] +git-tree-sha1 = "75d7896d1ec079ef10d3aee8f3668c11354c03a1" +uuid = "4ee394cb-3365-5eb0-8335-949819d2adfc" +version = "0.2.0+0" -[[ChainRulesCore]] -deps = ["Compat", "LinearAlgebra", "SparseArrays"] -git-tree-sha1 = "4c26b4e9e91ca528ea212927326ece5918a04b47" -uuid = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" -version = "1.11.2" - -[[ChangesOfVariables]] -deps = ["ChainRulesCore", "LinearAlgebra", "Test"] 
-git-tree-sha1 = "bf98fa45a0a4cee295de98d4c1462be26345b9a1" -uuid = "9e997f8a-9a97-42d5-a9f1-ce6bfc15e2c0" +[[deps.CUDA_Runtime_Discovery]] +deps = ["Libdl"] +git-tree-sha1 = "d6b227a1cfa63ae89cb969157c6789e36b7c9624" +uuid = "1af6417a-86b4-443c-805f-a4643ffb695f" version = "0.1.2" -[[Compat]] -deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "SHA", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"] -git-tree-sha1 = "44c37b4636bc54afac5c574d2d02b625349d6582" -uuid = "34da2185-b29b-5c13-b0c7-acf172513d20" -version = "3.41.0" +[[deps.CUDA_Runtime_jll]] +deps = ["Artifacts", "CUDA_Driver_jll", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML"] +git-tree-sha1 = "ed00f777d2454c45f5f49634ed0a589da07ee0b0" +uuid = "76a88914-d11a-5bdc-97e0-2f5a05c973a2" +version = "0.2.4+1" -[[CompilerSupportLibraries_jll]] +[[deps.CompilerSupportLibraries_jll]] deps = ["Artifacts", "Libdl"] uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae" +version = "1.0.5+0" -[[ConstructionBase]] +[[deps.ConstructionBase]] deps = ["LinearAlgebra"] -git-tree-sha1 = "f74e9d5388b8620b4cee35d4c5a618dd4dc547f4" +git-tree-sha1 = "c53fc348ca4d40d7b371e71fd52251839080cbc9" uuid = "187b0558-2788-49d3-abe0-74a17ed4e7c9" -version = "1.3.0" +version = "1.5.4" -[[Dates]] + [deps.ConstructionBase.extensions] + ConstructionBaseIntervalSetsExt = "IntervalSets" + ConstructionBaseStaticArraysExt = "StaticArrays" + + [deps.ConstructionBase.weakdeps] + IntervalSets = "8197267c-284f-5f27-9208-e0e47529a953" + StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" + +[[deps.Dates]] deps = ["Printf"] uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" -[[DelimitedFiles]] -deps = ["Mmap"] -uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab" - -[[Distributed]] +[[deps.Distributed]] deps = ["Random", "Serialization", "Sockets"] uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b" 
-[[DocStringExtensions]] +[[deps.DocStringExtensions]] deps = ["LibGit2"] -git-tree-sha1 = "b19534d1895d702889b219c382a6e18010797f0b" +git-tree-sha1 = "2fb1e02f2b635d0845df5d7c167fec4dd739b00d" uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae" -version = "0.8.6" +version = "0.9.3" -[[Downloads]] -deps = ["ArgTools", "LibCURL", "NetworkOptions"] +[[deps.Downloads]] +deps = ["ArgTools", "FileWatching", "LibCURL", "NetworkOptions"] uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6" +version = "1.6.0" -[[Elfutils_jll]] +[[deps.Elfutils_jll]] deps = ["Artifacts", "Bzip2_jll", "JLLWrappers", "Libdl", "Pkg", "XZ_jll", "Zlib_jll", "argp_standalone_jll", "fts_jll", "obstack_jll"] -git-tree-sha1 = "8f9fcde6d89b0a3ca51cb2028beab462705c5436" +git-tree-sha1 = "6880e234507b4b4eaabccb80c2316458d608f1c7" uuid = "ab5a07f8-06af-567f-a878-e8bb879eba5a" -version = "0.182.0+0" +version = "0.182.0+1" -[[ExprTools]] -git-tree-sha1 = "b7e3d17636b348f005f11040025ae8c6f645fe92" +[[deps.ExprTools]] +git-tree-sha1 = "27415f162e6028e81c72b82ef756bf321213b6ec" uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04" -version = "0.1.6" +version = "0.1.10" -[[Future]] +[[deps.FileWatching]] +uuid = "7b1f6079-737a-58dc-b8bc-7a2ca5c1b5ee" + +[[deps.Future]] deps = ["Random"] uuid = "9fa8497b-333b-5362-9e8d-4d0656e87820" -[[GPUArrays]] -deps = ["Adapt", "LinearAlgebra", "Printf", "Random", "Serialization", "Statistics"] -git-tree-sha1 = "7772508f17f1d482fe0df72cabc5b55bec06bbe0" +[[deps.GPUArrays]] +deps = ["Adapt", "GPUArraysCore", "LLVM", "LinearAlgebra", "Printf", "Random", "Reexport", "Serialization", "Statistics"] +git-tree-sha1 = "2e57b4a4f9cc15e85a24d603256fe08e527f48d1" uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" -version = "8.1.2" +version = "8.8.1" -[[GPUCompiler]] +[[deps.GPUArraysCore]] +deps = ["Adapt"] +git-tree-sha1 = "2d6ca471a6c7b536127afccfa7564b5b39227fe0" +uuid = "46192b85-c4d5-4398-a991-12ede77f4527" +version = "0.1.5" + +[[deps.GPUCompiler]] deps = ["ExprTools", "InteractiveUtils", "LLVM", 
"Libdl", "Logging", "TimerOutputs", "UUIDs"] -git-tree-sha1 = "4ed2616d5e656c8716736b64da86755467f26cf5" +git-tree-sha1 = "19d693666a304e8c371798f4900f7435558c7cde" uuid = "61eb1bfa-7361-4325-ad38-22787b887f55" -version = "0.12.9" +version = "0.17.3" -[[HIP_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "ROCmCompilerSupport_jll", "ROCmDeviceLibs_jll", "ROCmOpenCLRuntime_jll", "hsa_rocr_jll"] -git-tree-sha1 = "5097d8f7b6842156ab0928371b3d03fefd8decab" +[[deps.HIP_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "ROCmCompilerSupport_jll", "ROCmDeviceLibs_jll", "ROCmOpenCLRuntime_jll", "hsa_rocr_jll", "hsakmt_roct_jll", "rocminfo_jll"] +git-tree-sha1 = "6b91ab9bea10197163cb19ee57e52a1ebe0b28dc" uuid = "2696aab5-0948-5276-aa9a-2a86a37016b8" -version = "4.0.0+1" +version = "5.4.4+0" -[[InteractiveUtils]] +[[deps.InteractiveUtils]] deps = ["Markdown"] uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" -[[InverseFunctions]] -deps = ["Test"] -git-tree-sha1 = "a7254c0acd8e62f1ac75ad24d5db43f5f19f3c65" -uuid = "3587e190-3f89-42d0-90ee-14403ec27112" -version = "0.1.2" - -[[IrrationalConstants]] -git-tree-sha1 = "7fd44fd4ff43fc60815f8e764c0f352b83c49151" +[[deps.IrrationalConstants]] +git-tree-sha1 = "630b497eafcc20001bba38a4651b327dcfc491d2" uuid = "92d709cd-6900-40b7-9082-c6be49f344b6" -version = "0.1.1" +version = "0.2.2" -[[JLLWrappers]] -deps = ["Preferences"] -git-tree-sha1 = "642a199af8b68253517b80bd3bfd17eb4e84df6e" +[[deps.JLLWrappers]] +deps = ["Artifacts", "Preferences"] +git-tree-sha1 = "7e5d6779a1e09a36db2a7b6cff50942a0a7d0fca" uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210" -version = "1.3.0" +version = "1.5.0" -[[KernelAbstractions]] -deps = ["Adapt", "Cassette", "InteractiveUtils", "MacroTools", "SpecialFunctions", "StaticArrays", "UUIDs"] -git-tree-sha1 = "cb7d8b805413025a5bc866fc036b426223ffc059" +[[deps.KernelAbstractions]] +deps = ["Adapt", "Atomix", "InteractiveUtils", "LinearAlgebra", "MacroTools", "SparseArrays", "StaticArrays", "UUIDs", 
"UnsafeAtomics", "UnsafeAtomicsLLVM"] +git-tree-sha1 = "cf9cae1c4c1ff83f6c02cfaf01698f05448e8325" uuid = "63c18a36-062a-441e-b654-da1e3ab1ce7c" -version = "0.7.2" +version = "0.8.6" -[[LLVM]] +[[deps.LLD_jll]] +deps = ["Artifacts", "Libdl", "Pkg", "TOML", "Zlib_jll", "libLLVM_jll"] +uuid = "d55e3150-da41-5e91-b323-ecfd1eec6109" +version = "14.0.6+3" + +[[deps.LLVM]] deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Printf", "Unicode"] -git-tree-sha1 = "7cc22e69995e2329cc047a879395b2b74647ab5f" +git-tree-sha1 = "f044a2796a9e18e0531b9b3072b0019a61f264bc" uuid = "929cbde3-209d-540e-8aea-75f648917ca0" -version = "4.7.0" +version = "4.17.1" -[[LLVMExtra_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "c5fc4bef251ecd37685bea1c4068a9cfa41e8b9a" +[[deps.LLVMExtra_jll]] +deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML"] +git-tree-sha1 = "070e4b5b65827f82c16ae0916376cb47377aa1b5" uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab" -version = "0.0.13+0" +version = "0.0.18+0" -[[LazyArtifacts]] +[[deps.LLVM_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "TOML", "Zlib_jll", "libLLVM_jll"] +git-tree-sha1 = "c5131b433876973cf29a2d9ec426cc099567e68c" +uuid = "86de99a1-58d6-5da7-8064-bd56ce2e322c" +version = "14.0.6+4" + +[[deps.LazyArtifacts]] deps = ["Artifacts", "Pkg"] uuid = "4af54fe1-eca0-43a8-85a7-787d91b784e3" -[[LibCURL]] +[[deps.LibCURL]] deps = ["LibCURL_jll", "MozillaCACerts_jll"] uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21" +version = "0.6.3" -[[LibCURL_jll]] +[[deps.LibCURL_jll]] deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"] uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0" +version = "7.84.0+0" -[[LibGit2]] +[[deps.LibGit2]] deps = ["Base64", "NetworkOptions", "Printf", "SHA"] uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" -[[LibSSH2_jll]] +[[deps.LibSSH2_jll]] deps = ["Artifacts", "Libdl", "MbedTLS_jll"] uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8" +version = "1.10.2+0" -[[Libdl]] 
+[[deps.Libdl]] uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" -[[Libgcrypt_jll]] +[[deps.Libgcrypt_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Libgpg_error_jll", "Pkg"] git-tree-sha1 = "64613c82a59c120435c067c2b809fc61cf5166ae" uuid = "d4300ac3-e22c-5743-9152-c294e39db1e4" version = "1.8.7+0" -[[Libglvnd_jll]] +[[deps.Libglvnd_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libX11_jll", "Xorg_libXext_jll"] -git-tree-sha1 = "7739f837d6447403596a75d19ed01fd08d6f56bf" +git-tree-sha1 = "6f73d1dd803986947b2c750138528a999a6c7733" uuid = "7e76a0d4-f3c7-5321-8279-8d96eeed0f29" -version = "1.3.0+3" +version = "1.6.0+0" -[[Libgpg_error_jll]] +[[deps.Libgpg_error_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] git-tree-sha1 = "c333716e46366857753e273ce6a69ee0945a6db9" uuid = "7add5ba3-2f88-524e-9cd5-f83b8a55f7b8" version = "1.42.0+0" -[[Libiconv_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "42b62845d70a619f063a7da093d995ec8e15e778" +[[deps.Libiconv_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl"] +git-tree-sha1 = "f9557a255370125b405568f9767d6d195822a175" uuid = "94ce4f54-9a6c-5748-9c1c-f9c7231a4531" -version = "1.16.1+1" +version = "1.17.0+0" -[[LinearAlgebra]] -deps = ["Libdl"] +[[deps.LinearAlgebra]] +deps = ["Libdl", "OpenBLAS_jll", "libblastrampoline_jll"] uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" -[[LogExpFunctions]] -deps = ["ChainRulesCore", "ChangesOfVariables", "DocStringExtensions", "InverseFunctions", "IrrationalConstants", "LinearAlgebra"] -git-tree-sha1 = "e5718a00af0ab9756305a0392832c8952c7426c1" +[[deps.LogExpFunctions]] +deps = ["DocStringExtensions", "IrrationalConstants", "LinearAlgebra"] +git-tree-sha1 = "7d6dd4e9212aebaeed356de34ccf262a3cd415aa" uuid = "2ab3a3ac-af41-5b50-aa03-7779005ae688" -version = "0.3.6" +version = "0.3.26" -[[Logging]] + [deps.LogExpFunctions.extensions] + LogExpFunctionsChainRulesCoreExt = "ChainRulesCore" + LogExpFunctionsChangesOfVariablesExt = 
"ChangesOfVariables" + LogExpFunctionsInverseFunctionsExt = "InverseFunctions" + + [deps.LogExpFunctions.weakdeps] + ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" + ChangesOfVariables = "9e997f8a-9a97-42d5-a9f1-ce6bfc15e2c0" + InverseFunctions = "3587e190-3f89-42d0-90ee-14403ec27112" + +[[deps.Logging]] uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" -[[MacroTools]] +[[deps.MacroTools]] deps = ["Markdown", "Random"] -git-tree-sha1 = "3d3e902b31198a27340d0bf00d6ac452866021cf" +git-tree-sha1 = "9ee1618cbf5240e6d4e0371d6f24065083f60c48" uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" -version = "0.5.9" +version = "0.5.11" -[[Markdown]] +[[deps.Markdown]] deps = ["Base64"] uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" -[[MbedTLS_jll]] +[[deps.MbedTLS_jll]] deps = ["Artifacts", "Libdl"] uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1" +version = "2.28.2+0" -[[Mmap]] -uuid = "a63ad114-7e13-5084-954f-fe012c677804" - -[[MozillaCACerts_jll]] +[[deps.MozillaCACerts_jll]] uuid = "14a3606d-f60d-562e-9121-12d972cd8159" +version = "2022.10.11" -[[NEO_jll]] +[[deps.MsgPack]] +deps = ["Serialization"] +git-tree-sha1 = "fc8c15ca848b902015bd4a745d350f02cf791c2a" +uuid = "99f44e22-a591-53d1-9472-aa23ef4bd671" +version = "1.2.0" + +[[deps.NEO_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "gmmlib_jll", "libigc_jll", "oneAPI_Level_Zero_Headers_jll"] -git-tree-sha1 = "15deea2649d70f1bbaedf0aa87c9fa20fb21f22c" +git-tree-sha1 = "48337227b88be34125e1b4f5402789694a184f5a" uuid = "700fe977-ac61-5f37-bbc8-c6c4b2b6a9fd" -version = "21.44.21506+0" +version = "22.53.25242+0" -[[NUMA_jll]] -deps = ["Libdl", "Pkg"] -git-tree-sha1 = "778f9bd14400cff2c32ed357e12766ac0e3d766e" +[[deps.NUMA_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "3da12251003f08e819c907c645879c362206f5b4" uuid = "7f51dc2b-bb24-59f8-b771-bb1490e4195d" -version = "2.0.13+1" +version = "2.0.14+0" -[[NetworkOptions]] +[[deps.NetworkOptions]] uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908" 
+version = "1.2.0" -[[OpenLibm_jll]] +[[deps.ObjectFile]] +deps = ["Reexport", "StructIO"] +git-tree-sha1 = "55ce61d43409b1fb0279d1781bf3b0f22c83ab3b" +uuid = "d8793406-e978-5875-9003-1fc021f44a92" +version = "0.3.7" + +[[deps.OpenBLAS_jll]] +deps = ["Artifacts", "CompilerSupportLibraries_jll", "Libdl"] +uuid = "4536629a-c528-5b80-bd46-f80d51c5b363" +version = "0.3.21+4" + +[[deps.OpenLibm_jll]] deps = ["Artifacts", "Libdl"] uuid = "05823500-19ac-5b8b-9628-191a04bc5112" +version = "0.8.1+0" -[[OpenSpecFun_jll]] +[[deps.OpenSpecFun_jll]] deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "Libdl", "Pkg"] git-tree-sha1 = "13652491f6856acfd2db29360e1bbcd4565d04f1" uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e" version = "0.5.5+0" -[[OrderedCollections]] -git-tree-sha1 = "85f8e6578bf1f9ee0d11e7bb1b1456435479d47c" +[[deps.OrderedCollections]] +git-tree-sha1 = "2e73fe17cac3c62ad1aebe70d44c963c3cfdc3e3" uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" -version = "1.4.1" +version = "1.6.2" -[[Parameters]] +[[deps.Parameters]] deps = ["OrderedCollections", "UnPack"] git-tree-sha1 = "34c0e9ad262e5f7fc75b10a9952ca7692cfc5fbe" uuid = "d96e819e-fc66-5662-9728-84c9c7592b0a" version = "0.12.3" -[[Pkg]] -deps = ["Artifacts", "Dates", "Downloads", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"] +[[deps.Pkg]] +deps = ["Artifacts", "Dates", "Downloads", "FileWatching", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"] uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" +version = "1.9.2" -[[Preferences]] +[[deps.Preferences]] deps = ["TOML"] -git-tree-sha1 = "00cfd92944ca9c760982747e9a1d0d5d86ab1e5a" +git-tree-sha1 = "00805cd429dcb4870060ff49ef443486c262e38e" uuid = "21216c6a-2e73-6563-6e65-726566657250" -version = "1.2.2" +version = "1.4.1" -[[Printf]] +[[deps.Printf]] deps = ["Unicode"] uuid = 
"de0858da-6303-5e67-8744-51eddeeeb8d7" -[[REPL]] +[[deps.Profile]] +deps = ["Printf"] +uuid = "9abbd945-dff8-562f-b5e8-e1ebf5ef1b79" + +[[deps.REPL]] deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"] uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" -[[ROCKernels]] -deps = ["AMDGPU", "Adapt", "Cassette", "KernelAbstractions", "SpecialFunctions", "StaticArrays"] -git-tree-sha1 = "5e13faac6e566cb30c6620ad0be967a747121aeb" +[[deps.ROCKernels]] +deps = ["AMDGPU", "Adapt", "KernelAbstractions", "LLVM", "StaticArrays", "UnsafeAtomicsLLVM"] +git-tree-sha1 = "4d4973642639c249ccf8f50392f7f04ee3fcca22" uuid = "7eb9e9f0-4bd3-4c4c-8bef-26bd9629d9b9" -version = "0.2.2" +version = "0.3.5" -[[ROCmCompilerSupport_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "ROCmDeviceLibs_jll", "hsa_rocr_jll"] -git-tree-sha1 = "56ddcfb5d8b60c9f8c1bc619886f8d363fd1926d" +[[deps.ROCmCompilerSupport_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "ROCmDeviceLibs_jll", "hsa_rocr_jll"] +git-tree-sha1 = "7a3f25087b24d33b89f2e32cccd26af39275d14d" uuid = "8fbdd1d2-db62-5cd0-981e-905da1486e17" -version = "4.0.0+1" +version = "5.4.4+0" -[[ROCmDeviceLibs_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Zlib_jll"] -git-tree-sha1 = "d764f0f28b5af89aa004871a6a38e5d061f77257" +[[deps.ROCmDeviceLibs_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Zlib_jll"] +git-tree-sha1 = "45d5a53be418b740fe740714c8100650aebba041" uuid = "873c0968-716b-5aa7-bb8d-d1e2e2aeff2d" -version = "4.0.0+0" +version = "5.4.4+0" -[[ROCmOpenCLRuntime_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Libglvnd_jll", "Pkg", "ROCmCompilerSupport_jll", "ROCmDeviceLibs_jll", "Xorg_libX11_jll", "Xorg_xorgproto_jll", "hsa_rocr_jll"] -git-tree-sha1 = "f9e3e2cb40a7990535efa7da9b9dd0e0b458a973" +[[deps.ROCmOpenCLRuntime_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Libglvnd_jll", "ROCmCompilerSupport_jll", "ROCmDeviceLibs_jll", "Xorg_libX11_jll", "Xorg_xorgproto_jll", "hsa_rocr_jll", 
"hsakmt_roct_jll"] +git-tree-sha1 = "f7cbafcda3eec208831f22ae7816f34a90ce8e0f" uuid = "10ae2a08-2eea-53f8-8c20-eec175020e9f" -version = "4.0.0+1" +version = "5.4.4+0" -[[Random]] -deps = ["Serialization"] +[[deps.Random]] +deps = ["SHA", "Serialization"] uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" -[[Random123]] -deps = ["Libdl", "Random", "RandomNumbers"] -git-tree-sha1 = "0e8b146557ad1c6deb1367655e052276690e71a3" +[[deps.Random123]] +deps = ["Random", "RandomNumbers"] +git-tree-sha1 = "552f30e847641591ba3f39fd1bed559b9deb0ef3" uuid = "74087812-796a-5b5d-8853-05524746bad3" -version = "1.4.2" +version = "1.6.1" -[[RandomNumbers]] +[[deps.RandomNumbers]] deps = ["Random", "Requires"] git-tree-sha1 = "043da614cc7e95c703498a491e2c21f58a2b8111" uuid = "e6cf234a-135c-5ec9-84dd-332b85af5143" version = "1.5.3" -[[Reexport]] +[[deps.Reexport]] git-tree-sha1 = "45e428421666073eab6f2da5c9d310d99bb12f9b" uuid = "189a3867-3050-52da-a836-e630ba90ab69" version = "1.2.2" -[[Requires]] +[[deps.Requires]] deps = ["UUIDs"] -git-tree-sha1 = "8f82019e525f4d5c669692772a6f4b0a58b06a6a" +git-tree-sha1 = "838a3a4188e2ded87a4f9f184b4b0d78a1e91cb7" uuid = "ae029012-a4dd-5104-9daa-d747884805df" -version = "1.2.0" +version = "1.3.0" -[[SHA]] +[[deps.SHA]] uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" +version = "0.7.0" -[[SPIRV_LLVM_Translator_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "8cca87d57f6ddf19373cc9791fddc741406c8fbf" -uuid = "4a5d46fc-d8cf-5151-a261-86b458210efb" -version = "11.0.0+2" +[[deps.SPIRV_LLVM_Translator_unified_jll]] +deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "Pkg", "TOML"] +git-tree-sha1 = "2f9c006df258116f90874e47207229c83d06c845" +uuid = "85f0d8ed-5b39-5caa-b1ae-7472de402361" +version = "0.2.0+0" -[[SPIRV_Tools_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "c0324b7e07bc4649f755bfe7e00f7c6ed6aa353f" +[[deps.SPIRV_Tools_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl"] +git-tree-sha1 
= "c5ab754aa7d71ea015783a9884a25e196860707c" uuid = "6ac6d60f-d740-5983-97d7-a4482c0689f4" -version = "2021.2.0+0" +version = "2023.2.0+0" -[[Serialization]] +[[deps.Serialization]] uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" -[[Setfield]] -deps = ["ConstructionBase", "Future", "MacroTools", "Requires"] -git-tree-sha1 = "fca29e68c5062722b5b4435594c3d1ba557072a3" +[[deps.Setfield]] +deps = ["ConstructionBase", "Future", "MacroTools", "StaticArraysCore"] +git-tree-sha1 = "e2cc6d8c88613c05e1defb55170bf5ff211fbeac" uuid = "efcf1570-3423-57d1-acb7-fd33fddbac46" -version = "0.7.1" +version = "1.1.1" -[[SharedArrays]] -deps = ["Distributed", "Mmap", "Random", "Serialization"] -uuid = "1a1011a3-84de-559e-8e89-a11a2f7dc383" - -[[Sockets]] +[[deps.Sockets]] uuid = "6462fe0b-24de-5631-8697-dd941f90decc" -[[SparseArrays]] -deps = ["LinearAlgebra", "Random"] +[[deps.SparseArrays]] +deps = ["Libdl", "LinearAlgebra", "Random", "Serialization", "SuiteSparse_jll"] uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" -[[SpecialFunctions]] -deps = ["ChainRulesCore", "IrrationalConstants", "LogExpFunctions", "OpenLibm_jll", "OpenSpecFun_jll"] -git-tree-sha1 = "f0bccf98e16759818ffc5d97ac3ebf87eb950150" +[[deps.SpecialFunctions]] +deps = ["IrrationalConstants", "LogExpFunctions", "OpenLibm_jll", "OpenSpecFun_jll"] +git-tree-sha1 = "e2cfc4012a19088254b3950b85c3c1d8882d864d" uuid = "276daf66-3868-5448-9aa4-cd146d93841b" -version = "1.8.1" +version = "2.3.1" -[[StaticArrays]] -deps = ["LinearAlgebra", "Random", "Statistics"] -git-tree-sha1 = "3c76dde64d03699e074ac02eb2e8ba8254d428da" + [deps.SpecialFunctions.extensions] + SpecialFunctionsChainRulesCoreExt = "ChainRulesCore" + + [deps.SpecialFunctions.weakdeps] + ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" + +[[deps.StaticArrays]] +deps = ["LinearAlgebra", "Random", "StaticArraysCore"] +git-tree-sha1 = "d5fb407ec3179063214bc6277712928ba78459e2" uuid = "90137ffa-7385-5640-81b9-e52037218182" -version = "1.2.13" +version = "1.6.4" 
+weakdeps = ["Statistics"] -[[Statistics]] + [deps.StaticArrays.extensions] + StaticArraysStatisticsExt = "Statistics" + +[[deps.StaticArraysCore]] +git-tree-sha1 = "36b3d696ce6366023a0ea192b4cd442268995a0d" +uuid = "1e83bf80-4336-4d27-bf5d-d5a4f845583c" +version = "1.4.2" + +[[deps.Statistics]] deps = ["LinearAlgebra", "SparseArrays"] uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" +version = "1.9.0" -[[TOML]] +[[deps.StructIO]] +deps = ["Test"] +git-tree-sha1 = "010dc73c7146869c042b49adcdb6bf528c12e859" +uuid = "53d494c1-5632-5724-8f4c-31dff12d585f" +version = "0.3.0" + +[[deps.SuiteSparse_jll]] +deps = ["Artifacts", "Libdl", "Pkg", "libblastrampoline_jll"] +uuid = "bea87d4a-7f5b-5778-9afe-8cc45184846c" +version = "5.10.1+6" + +[[deps.TOML]] deps = ["Dates"] uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76" +version = "1.0.3" -[[Tar]] +[[deps.Tar]] deps = ["ArgTools", "SHA"] uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e" +version = "1.10.0" -[[Test]] +[[deps.Test]] deps = ["InteractiveUtils", "Logging", "Random", "Serialization"] uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" -[[TextWrap]] +[[deps.TextWrap]] git-tree-sha1 = "9250ef9b01b66667380cf3275b3f7488d0e25faf" uuid = "b718987f-49a8-5099-9789-dcd902bef87d" version = "1.0.1" -[[TimerOutputs]] +[[deps.TimerOutputs]] deps = ["ExprTools", "Printf"] -git-tree-sha1 = "7cb456f358e8f9d102a8b25e8dfedf58fa5689bc" +git-tree-sha1 = "f548a9e9c490030e545f72074a41edfd0e5bcdd7" uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" -version = "0.5.13" +version = "0.5.23" -[[UUIDs]] +[[deps.TimespanLogging]] +deps = ["Distributed", "Profile"] +git-tree-sha1 = "51be7dd35b0c8a5a613dc7af272d587ea6943d24" +uuid = "a526e669-04d3-4846-9525-c66122c55f63" +version = "0.1.0" + +[[deps.UUIDs]] deps = ["Random", "SHA"] uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" -[[UnPack]] +[[deps.UnPack]] git-tree-sha1 = "387c1f73762231e86e0c9c5443ce3b4a0a9a0c2b" uuid = "3a884ed6-31ef-47d7-9d2a-63182c4928ed" version = "1.0.2" -[[Unicode]] +[[deps.Unicode]] uuid = 
"4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" -[[XML2_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Libiconv_jll", "Pkg", "Zlib_jll"] -git-tree-sha1 = "1acf5bdf07aa0907e0a37d3718bb88d4b687b74a" -uuid = "02c8fc9c-b97f-50b9-bbe4-9be30ff0a78a" -version = "2.9.12+0" +[[deps.UnsafeAtomics]] +git-tree-sha1 = "6331ac3440856ea1988316b46045303bef658278" +uuid = "013be700-e6cd-48c3-b4a1-df204f14c38f" +version = "0.2.1" -[[XSLT_jll]] +[[deps.UnsafeAtomicsLLVM]] +deps = ["LLVM", "UnsafeAtomics"] +git-tree-sha1 = "ead6292c02aab389cb29fe64cc9375765ab1e219" +uuid = "d80eeb9a-aca5-4d75-85e5-170c8b632249" +version = "0.1.1" + +[[deps.XML2_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Libiconv_jll", "Zlib_jll"] +git-tree-sha1 = "04a51d15436a572301b5abbb9d099713327e9fc4" +uuid = "02c8fc9c-b97f-50b9-bbe4-9be30ff0a78a" +version = "2.10.4+0" + +[[deps.XSLT_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Libgcrypt_jll", "Libgpg_error_jll", "Libiconv_jll", "Pkg", "XML2_jll", "Zlib_jll"] git-tree-sha1 = "91844873c4085240b95e795f692c4cec4d805f8a" uuid = "aed1982a-8fda-507f-9586-7b0439959a61" version = "1.1.34+0" -[[XZ_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "a921669cd9a45c23031fd4eb904f5cc3d20de415" +[[deps.XZ_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl"] +git-tree-sha1 = "cf2c7de82431ca6f39250d2fc4aacd0daa1675c0" uuid = "ffd25f8a-64ca-5728-b0f7-c24cf3aae800" -version = "5.2.5+2" +version = "5.4.4+0" -[[Xorg_libX11_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libxcb_jll", "Xorg_xtrans_jll"] -git-tree-sha1 = "5be649d550f3f4b95308bf0183b82e2582876527" +[[deps.Xorg_libX11_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Xorg_libxcb_jll", "Xorg_xtrans_jll"] +git-tree-sha1 = "afead5aba5aa507ad5a3bf01f58f82c8d1403495" uuid = "4f6342f7-b3d2-589e-9d20-edeb45f2b2bc" -version = "1.6.9+4" +version = "1.8.6+0" -[[Xorg_libXau_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = 
"4e490d5c960c314f33885790ed410ff3a94ce67e" +[[deps.Xorg_libXau_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl"] +git-tree-sha1 = "6035850dcc70518ca32f012e46015b9beeda49d8" uuid = "0c0b7dd1-d40b-584c-a123-a41640f87eec" -version = "1.0.9+4" +version = "1.0.11+0" -[[Xorg_libXdmcp_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "4fe47bd2247248125c428978740e18a681372dd4" +[[deps.Xorg_libXdmcp_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl"] +git-tree-sha1 = "34d526d318358a859d7de23da945578e8e8727b7" uuid = "a3789734-cfe1-5b06-b2d0-1dd0d9d62d05" -version = "1.1.3+4" +version = "1.1.4+0" -[[Xorg_libXext_jll]] +[[deps.Xorg_libXext_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libX11_jll"] git-tree-sha1 = "b7c0aa8c376b31e4852b360222848637f481f8c3" uuid = "1082639a-0dae-5f34-9b06-72781eeb8cb3" version = "1.3.4+4" -[[Xorg_libpthread_stubs_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "6783737e45d3c59a4a4c4091f5f88cdcf0908cbb" +[[deps.Xorg_libpciaccess_jll]] +deps = ["Libdl", "Pkg"] +git-tree-sha1 = "79a09b8c1d3a2659937503788ce11173ba29681b" +uuid = "a65dc6b1-eb27-53a1-bb3e-dea574b5389e" +version = "0.16.0+1" + +[[deps.Xorg_libpthread_stubs_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl"] +git-tree-sha1 = "8fdda4c692503d44d04a0603d9ac0982054635f9" uuid = "14d82f49-176c-5ed1-bb49-ad3f5cbd8c74" -version = "0.1.0+3" +version = "0.1.1+0" -[[Xorg_libxcb_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "XSLT_jll", "Xorg_libXau_jll", "Xorg_libXdmcp_jll", "Xorg_libpthread_stubs_jll"] -git-tree-sha1 = "daf17f441228e7a3833846cd048892861cff16d6" +[[deps.Xorg_libxcb_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "XSLT_jll", "Xorg_libXau_jll", "Xorg_libXdmcp_jll", "Xorg_libpthread_stubs_jll"] +git-tree-sha1 = "b4bfde5d5b652e22b9c790ad00af08b6d042b97d" uuid = "c7cfdc94-dc32-55de-ac96-5a1b8d977c5b" -version = "1.13.0+3" +version = "1.15.0+0" -[[Xorg_xorgproto_jll]] +[[deps.Xorg_xorgproto_jll]] 
deps = ["Libdl", "Pkg"] git-tree-sha1 = "9a9eb8ce756fe0bca01b4be16da770e18d264972" uuid = "c4d99508-4286-5418-9131-c86396af500b" version = "2019.2.0+2" -[[Xorg_xtrans_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "79c31e7844f6ecf779705fbc12146eb190b7d845" +[[deps.Xorg_xtrans_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl"] +git-tree-sha1 = "e92a1a012a10506618f10b7047e478403a046c77" uuid = "c5fb5394-a638-5e4d-96e5-b29de1b5cf10" -version = "1.4.0+3" +version = "1.5.0+0" -[[Zlib_jll]] +[[deps.Zlib_jll]] deps = ["Libdl"] uuid = "83775a58-1f1d-513f-b197-d71354ab007a" +version = "1.2.13+0" -[[argp_standalone_jll]] +[[deps.argp_standalone_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] git-tree-sha1 = "feaf9f6293003c2bf53056fd6930d677ed340b34" uuid = "c53206cc-00f7-50bf-ad1e-3ae1f6e49bc3" version = "1.3.1+0" -[[fts_jll]] +[[deps.fts_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "78732b942383d2cb521df8a1a0814911144e663d" +git-tree-sha1 = "aa21810b841ae26d2fc7f780cb1596b4170a4c49" uuid = "d65627f6-89bd-53e8-8ab5-8b75ff535eee" -version = "1.2.7+1" +version = "1.2.8+0" -[[gmmlib_jll]] +[[deps.gmmlib_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "0d5e5461d21b14853b4c332045c57d2601c403bd" +git-tree-sha1 = "228b09be83d88cc5d2236ef7b516d988d2639dfc" uuid = "09858cae-167c-5acb-9302-fddc6874d481" -version = "21.2.1+0" +version = "22.3.0+0" -[[hsa_rocr_jll]] -deps = ["Artifacts", "Elfutils_jll", "JLLWrappers", "Libdl", "NUMA_jll", "Pkg", "Zlib_jll", "hsakmt_roct_jll"] -git-tree-sha1 = "df8d73efec8b1e53ad527d208f5343c0368f0fcd" +[[deps.hsa_rocr_jll]] +deps = ["Artifacts", "Elfutils_jll", "JLLWrappers", "Libdl", "NUMA_jll", "ROCmDeviceLibs_jll", "XML2_jll", "Zlib_jll", "hsakmt_roct_jll"] +git-tree-sha1 = "0458f0ff5d72a270fbab764d354dc35d90b28ba9" uuid = "dd59ff1a-a01a-568d-8b29-0669330f116a" -version = "4.0.0+0" +version = "5.4.4+0" -[[hsakmt_roct_jll]] -deps = ["Artifacts", 
"JLLWrappers", "Libdl", "NUMA_jll", "Pkg"] -git-tree-sha1 = "ea54f6be23c6d25613a0872ec23dc5a0b77b4a00" +[[deps.hsakmt_roct_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "NUMA_jll", "libdrm_jll"] +git-tree-sha1 = "49db943b2bf868b1fa2866b93faf4d2222fa28ae" uuid = "1cecccd7-a9b6-5045-9cdc-a44c19b16d76" -version = "4.2.0+0" +version = "5.4.4+0" -[[libigc_jll]] +[[deps.libLLVM_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "8f36deef-c2a5-5394-99ed-8e07531fb29a" +version = "14.0.6+3" + +[[deps.libblastrampoline_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "8e850b90-86db-534c-a0d3-1478176c7d93" +version = "5.8.0+0" + +[[deps.libdrm_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libpciaccess_jll"] +git-tree-sha1 = "89b30a68162c12118311b77e57b20c8fa2685496" +uuid = "8e53e030-5e6c-5a89-a30b-be5b7263a166" +version = "2.4.110+0" + +[[deps.libigc_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "4f7a6c63ee113ee6da9a6afd06c77eb44998b1f3" +git-tree-sha1 = "d577d44c9e92244cf60fbc183cb5506860916647" uuid = "94295238-5935-5bd7-bb0f-b00942e9bdd5" -version = "1.0.8744+0" +version = "1.0.12812+0" -[[nghttp2_jll]] +[[deps.msgpack_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "dcbef55311e8e3d0a15dbe7dd86900c501ca2359" +uuid = "43dd8cde-e9ee-5d59-924a-18d3f2773c4d" +version = "3.0.1+0" + +[[deps.nghttp2_jll]] deps = ["Artifacts", "Libdl"] uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d" +version = "1.48.0+0" -[[obstack_jll]] +[[deps.obstack_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] git-tree-sha1 = "1c4a6b66e934fc6db4649cb2910c72f53bbfea7e" uuid = "c88a4935-d25e-5644-aacc-5db6f1b8ef79" version = "1.2.2+0" -[[oneAPI]] -deps = ["Adapt", "CEnum", "ExprTools", "GPUArrays", "GPUCompiler", "LLVM", "LinearAlgebra", "NEO_jll", "Printf", "Random", "SPIRV_LLVM_Translator_jll", "SPIRV_Tools_jll", "SpecialFunctions", "oneAPI_Level_Zero_Headers_jll", "oneAPI_Level_Zero_Loader_jll"] -git-tree-sha1 = 
"efabcff2a259b0f1b10505db99aa18fc2de181ce" +[[deps.oneAPI]] +deps = ["Adapt", "CEnum", "ExprTools", "GPUArrays", "GPUCompiler", "LLVM", "LinearAlgebra", "NEO_jll", "Printf", "Random", "SPIRV_LLVM_Translator_unified_jll", "SPIRV_Tools_jll", "SpecialFunctions", "oneAPI_Level_Zero_Headers_jll", "oneAPI_Level_Zero_Loader_jll", "oneAPI_Support_jll"] +git-tree-sha1 = "1e562c5fc737870053e62c6001d742545000ee24" uuid = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b" -version = "0.2.1" +version = "1.0.2" -[[oneAPI_Level_Zero_Headers_jll]] +[[deps.oneAPI_Level_Zero_Headers_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "e106a6eed53928cd1864f544562ea991b5f11464" +git-tree-sha1 = "412efcf5d55c65d3352c3915cffec1e53955570f" uuid = "f4bc562b-d309-54f8-9efb-476e56f0410d" -version = "1.2.43+0" +version = "1.6.3+0" -[[oneAPI_Level_Zero_Loader_jll]] +[[deps.oneAPI_Level_Zero_Loader_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "oneAPI_Level_Zero_Headers_jll"] -git-tree-sha1 = "0f0fd4a92c4785454e4929c2e4db22c3d03d6889" +git-tree-sha1 = "87980483b19f0a00c8d62e8b6682acac1894c638" uuid = "13eca655-d68d-5b81-8367-6d99d727ab01" -version = "1.5.0+0" +version = "1.11.0+0" -[[p7zip_jll]] +[[deps.oneAPI_Support_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "oneAPI_Level_Zero_Loader_jll"] +git-tree-sha1 = "39a73e1fcd9a33eeadfd69f9027e9c62d3c58219" +uuid = "b049733a-a71d-5ed3-8eba-7d323ac00b36" +version = "0.2.2+0" + +[[deps.p7zip_jll]] deps = ["Artifacts", "Libdl"] uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0" +version = "17.4.0+0" + +[[deps.rocBLAS_jll]] +deps = ["Artifacts", "HIP_jll", "JLLWrappers", "Libdl", "Pkg", "ROCmCompilerSupport_jll", "ROCmDeviceLibs_jll", "ROCmOpenCLRuntime_jll", "hsa_rocr_jll", "msgpack_jll", "rocminfo_jll"] +git-tree-sha1 = "92d224a9e10a9ad04195d943a2b1bcbdafcaf06a" +uuid = "1ef8cab2-a151-54b4-a57f-5fbb4046a4ab" +version = "5.2.3+2" + +[[deps.rocPRIM_jll]] +deps = ["Artifacts", "HIP_jll", "JLLWrappers", "Libdl", "Pkg", 
"ROCmCompilerSupport_jll", "ROCmDeviceLibs_jll", "ROCmOpenCLRuntime_jll", "hsa_rocr_jll", "rocminfo_jll"] +git-tree-sha1 = "7a100de0bae8363cbd33fa429d37be45a0247d2c" +uuid = "52935e6f-76c5-5ebb-b227-36676f75be9c" +version = "5.2.3+1" + +[[deps.rocRAND_jll]] +deps = ["Artifacts", "HIP_jll", "JLLWrappers", "Libdl", "Pkg", "ROCmCompilerSupport_jll", "ROCmDeviceLibs_jll", "ROCmOpenCLRuntime_jll", "hsa_rocr_jll", "rocminfo_jll"] +git-tree-sha1 = "58a35917ddb4d79f7a0c2f6d438a210d2f398e85" +uuid = "a6151927-a32b-54c0-bc8c-bbd7b3f1a996" +version = "5.2.3+1" + +[[deps.rocSPARSE_jll]] +deps = ["Artifacts", "HIP_jll", "JLLWrappers", "Libdl", "Pkg", "ROCmCompilerSupport_jll", "ROCmDeviceLibs_jll", "ROCmOpenCLRuntime_jll", "hsa_rocr_jll", "rocPRIM_jll", "rocminfo_jll"] +git-tree-sha1 = "67bc29d47ab636ef1471e48d7f730c03a0edfcf8" +uuid = "8c6ce2ba-659c-5ec7-ba4c-37596cf1f22a" +version = "5.2.3+1" + +[[deps.rocminfo_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "hsa_rocr_jll", "hsakmt_roct_jll"] +git-tree-sha1 = "840acd2135e7bd025870d063e99ff70d05c0de46" +uuid = "5a766526-3cf8-5128-8c31-4f7b7ad60f0e" +version = "5.4.4+0" diff --git a/src/julia/JuliaStream.jl/Project.toml b/src/julia/JuliaStream.jl/Project.toml index f8095e0..76c9202 100644 --- a/src/julia/JuliaStream.jl/Project.toml +++ b/src/julia/JuliaStream.jl/Project.toml @@ -16,4 +16,4 @@ ROCKernels = "7eb9e9f0-4bd3-4c4c-8bef-26bd9629d9b9" oneAPI = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b" [compat] -julia = "1.6" +julia = "1.9" diff --git a/src/julia/JuliaStream.jl/Threaded/Manifest.toml b/src/julia/JuliaStream.jl/Threaded/Manifest.toml index dc0737e..5445f32 100644 --- a/src/julia/JuliaStream.jl/Threaded/Manifest.toml +++ b/src/julia/JuliaStream.jl/Threaded/Manifest.toml @@ -1,31 +1,35 @@ # This file is machine-generated - editing it directly is not advised -[[ArgParse]] +julia_version = "1.9.3" +manifest_format = "2.0" +project_hash = "fbff310f722a52622a273a48a8a6b3b64f06b029" + +[[deps.ArgParse]] deps = 
["Logging", "TextWrap"] git-tree-sha1 = "3102bce13da501c9104df33549f511cd25264d7d" uuid = "c7e460c6-2fb9-53a9-8c5b-16f535851c63" version = "1.1.4" -[[Logging]] +[[deps.Logging]] uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" -[[OrderedCollections]] -git-tree-sha1 = "85f8e6578bf1f9ee0d11e7bb1b1456435479d47c" +[[deps.OrderedCollections]] +git-tree-sha1 = "2e73fe17cac3c62ad1aebe70d44c963c3cfdc3e3" uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" -version = "1.4.1" +version = "1.6.2" -[[Parameters]] +[[deps.Parameters]] deps = ["OrderedCollections", "UnPack"] git-tree-sha1 = "34c0e9ad262e5f7fc75b10a9952ca7692cfc5fbe" uuid = "d96e819e-fc66-5662-9728-84c9c7592b0a" version = "0.12.3" -[[TextWrap]] +[[deps.TextWrap]] git-tree-sha1 = "9250ef9b01b66667380cf3275b3f7488d0e25faf" uuid = "b718987f-49a8-5099-9789-dcd902bef87d" version = "1.0.1" -[[UnPack]] +[[deps.UnPack]] git-tree-sha1 = "387c1f73762231e86e0c9c5443ce3b4a0a9a0c2b" uuid = "3a884ed6-31ef-47d7-9d2a-63182c4928ed" version = "1.0.2" diff --git a/src/julia/JuliaStream.jl/Threaded/Project.toml b/src/julia/JuliaStream.jl/Threaded/Project.toml index b65bdf5..367e0ef 100644 --- a/src/julia/JuliaStream.jl/Threaded/Project.toml +++ b/src/julia/JuliaStream.jl/Threaded/Project.toml @@ -3,4 +3,4 @@ ArgParse = "c7e460c6-2fb9-53a9-8c5b-16f535851c63" Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a" [compat] -julia = "1.6" +julia = "1.9" diff --git a/src/julia/JuliaStream.jl/oneAPI/Manifest.toml b/src/julia/JuliaStream.jl/oneAPI/Manifest.toml index 649ea53..ed47c3a 100644 --- a/src/julia/JuliaStream.jl/oneAPI/Manifest.toml +++ b/src/julia/JuliaStream.jl/oneAPI/Manifest.toml @@ -1,335 +1,441 @@ # This file is machine-generated - editing it directly is not advised -[[Adapt]] -deps = ["LinearAlgebra"] -git-tree-sha1 = "84918055d15b3114ede17ac6a7182f68870c16f7" -uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" -version = "3.3.1" +julia_version = "1.9.3" +manifest_format = "2.0" +project_hash = "01f328e925b86927b3f24c30aee6ecdce5bd28cc" 
-[[ArgParse]] +[[deps.Adapt]] +deps = ["LinearAlgebra", "Requires"] +git-tree-sha1 = "76289dc51920fdc6e0013c872ba9551d54961c24" +uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" +version = "3.6.2" +weakdeps = ["StaticArrays"] + + [deps.Adapt.extensions] + AdaptStaticArraysExt = "StaticArrays" + +[[deps.ArgParse]] deps = ["Logging", "TextWrap"] git-tree-sha1 = "3102bce13da501c9104df33549f511cd25264d7d" uuid = "c7e460c6-2fb9-53a9-8c5b-16f535851c63" version = "1.1.4" -[[ArgTools]] +[[deps.ArgTools]] uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f" +version = "1.1.1" -[[Artifacts]] +[[deps.Artifacts]] uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" -[[Base64]] +[[deps.Atomix]] +deps = ["UnsafeAtomics"] +git-tree-sha1 = "c06a868224ecba914baa6942988e2f2aade419be" +uuid = "a9b6321e-bd34-4604-b9c9-b65b8de01458" +version = "0.1.0" + +[[deps.Base64]] uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" -[[CEnum]] -git-tree-sha1 = "215a9aa4a1f23fbd05b92769fdd62559488d70e9" +[[deps.CEnum]] +git-tree-sha1 = "eb4cb44a499229b3b8426dcfb5dd85333951ff90" uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82" -version = "0.4.1" +version = "0.4.2" -[[ChainRulesCore]] -deps = ["Compat", "LinearAlgebra", "SparseArrays"] -git-tree-sha1 = "4c26b4e9e91ca528ea212927326ece5918a04b47" -uuid = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" -version = "1.11.2" - -[[ChangesOfVariables]] -deps = ["ChainRulesCore", "LinearAlgebra", "Test"] -git-tree-sha1 = "bf98fa45a0a4cee295de98d4c1462be26345b9a1" -uuid = "9e997f8a-9a97-42d5-a9f1-ce6bfc15e2c0" -version = "0.1.2" - -[[Compat]] -deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "SHA", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"] -git-tree-sha1 = "44c37b4636bc54afac5c574d2d02b625349d6582" -uuid = "34da2185-b29b-5c13-b0c7-acf172513d20" -version = "3.41.0" - -[[CompilerSupportLibraries_jll]] 
+[[deps.CompilerSupportLibraries_jll]] deps = ["Artifacts", "Libdl"] uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae" +version = "1.0.5+0" -[[Dates]] +[[deps.Dates]] deps = ["Printf"] uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" -[[DelimitedFiles]] -deps = ["Mmap"] -uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab" - -[[Distributed]] -deps = ["Random", "Serialization", "Sockets"] -uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b" - -[[DocStringExtensions]] +[[deps.DocStringExtensions]] deps = ["LibGit2"] -git-tree-sha1 = "b19534d1895d702889b219c382a6e18010797f0b" +git-tree-sha1 = "2fb1e02f2b635d0845df5d7c167fec4dd739b00d" uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae" -version = "0.8.6" +version = "0.9.3" -[[Downloads]] -deps = ["ArgTools", "LibCURL", "NetworkOptions"] +[[deps.Downloads]] +deps = ["ArgTools", "FileWatching", "LibCURL", "NetworkOptions"] uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6" +version = "1.6.0" -[[ExprTools]] -git-tree-sha1 = "b7e3d17636b348f005f11040025ae8c6f645fe92" +[[deps.ExprTools]] +git-tree-sha1 = "27415f162e6028e81c72b82ef756bf321213b6ec" uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04" -version = "0.1.6" +version = "0.1.10" -[[GPUArrays]] -deps = ["Adapt", "LinearAlgebra", "Printf", "Random", "Serialization", "Statistics"] -git-tree-sha1 = "7772508f17f1d482fe0df72cabc5b55bec06bbe0" +[[deps.FileWatching]] +uuid = "7b1f6079-737a-58dc-b8bc-7a2ca5c1b5ee" + +[[deps.GPUArrays]] +deps = ["Adapt", "GPUArraysCore", "LLVM", "LinearAlgebra", "Printf", "Random", "Reexport", "Serialization", "Statistics"] +git-tree-sha1 = "2e57b4a4f9cc15e85a24d603256fe08e527f48d1" uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" -version = "8.1.2" +version = "8.8.1" -[[GPUCompiler]] -deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "TimerOutputs", "UUIDs"] -git-tree-sha1 = "2cac236070c2c4b36de54ae9146b55ee2c34ac7a" +[[deps.GPUArraysCore]] +deps = ["Adapt"] +git-tree-sha1 = "2d6ca471a6c7b536127afccfa7564b5b39227fe0" +uuid = 
"46192b85-c4d5-4398-a991-12ede77f4527" +version = "0.1.5" + +[[deps.GPUCompiler]] +deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "Scratch", "TimerOutputs", "UUIDs"] +git-tree-sha1 = "72b2e3c2ba583d1a7aa35129e56cf92e07c083e3" uuid = "61eb1bfa-7361-4325-ad38-22787b887f55" -version = "0.13.10" +version = "0.21.4" -[[InteractiveUtils]] +[[deps.InteractiveUtils]] deps = ["Markdown"] uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" -[[InverseFunctions]] -deps = ["Test"] -git-tree-sha1 = "a7254c0acd8e62f1ac75ad24d5db43f5f19f3c65" -uuid = "3587e190-3f89-42d0-90ee-14403ec27112" -version = "0.1.2" - -[[IrrationalConstants]] -git-tree-sha1 = "7fd44fd4ff43fc60815f8e764c0f352b83c49151" +[[deps.IrrationalConstants]] +git-tree-sha1 = "630b497eafcc20001bba38a4651b327dcfc491d2" uuid = "92d709cd-6900-40b7-9082-c6be49f344b6" -version = "0.1.1" +version = "0.2.2" -[[JLLWrappers]] -deps = ["Preferences"] -git-tree-sha1 = "642a199af8b68253517b80bd3bfd17eb4e84df6e" +[[deps.JLLWrappers]] +deps = ["Artifacts", "Preferences"] +git-tree-sha1 = "7e5d6779a1e09a36db2a7b6cff50942a0a7d0fca" uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210" -version = "1.3.0" +version = "1.5.0" -[[LLVM]] +[[deps.KernelAbstractions]] +deps = ["Adapt", "Atomix", "InteractiveUtils", "LinearAlgebra", "MacroTools", "PrecompileTools", "Requires", "SparseArrays", "StaticArrays", "UUIDs", "UnsafeAtomics", "UnsafeAtomicsLLVM"] +git-tree-sha1 = "4c5875e4c228247e1c2b087669846941fb6e0118" +uuid = "63c18a36-062a-441e-b654-da1e3ab1ce7c" +version = "0.9.8" + + [deps.KernelAbstractions.extensions] + EnzymeExt = "EnzymeCore" + + [deps.KernelAbstractions.weakdeps] + EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869" + +[[deps.LLVM]] deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Printf", "Unicode"] -git-tree-sha1 = "7cc22e69995e2329cc047a879395b2b74647ab5f" +git-tree-sha1 = "a9d2ce1d5007b1e8f6c5b89c5a31ff8bd146db5c" uuid = "929cbde3-209d-540e-8aea-75f648917ca0" -version = "4.7.0" +version = "6.2.1" -[[LLVMExtra_jll]] 
-deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "c5fc4bef251ecd37685bea1c4068a9cfa41e8b9a" +[[deps.LLVMExtra_jll]] +deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML"] +git-tree-sha1 = "7ca6850ae880cc99b59b88517545f91a52020afa" uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab" -version = "0.0.13+0" +version = "0.0.25+0" -[[LibCURL]] +[[deps.LazyArtifacts]] +deps = ["Artifacts", "Pkg"] +uuid = "4af54fe1-eca0-43a8-85a7-787d91b784e3" + +[[deps.LibCURL]] deps = ["LibCURL_jll", "MozillaCACerts_jll"] uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21" +version = "0.6.3" -[[LibCURL_jll]] +[[deps.LibCURL_jll]] deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"] uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0" +version = "7.84.0+0" -[[LibGit2]] +[[deps.LibGit2]] deps = ["Base64", "NetworkOptions", "Printf", "SHA"] uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" -[[LibSSH2_jll]] +[[deps.LibSSH2_jll]] deps = ["Artifacts", "Libdl", "MbedTLS_jll"] uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8" +version = "1.10.2+0" -[[Libdl]] +[[deps.Libdl]] uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" -[[LinearAlgebra]] -deps = ["Libdl"] +[[deps.LinearAlgebra]] +deps = ["Libdl", "OpenBLAS_jll", "libblastrampoline_jll"] uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" -[[LogExpFunctions]] -deps = ["ChainRulesCore", "ChangesOfVariables", "DocStringExtensions", "InverseFunctions", "IrrationalConstants", "LinearAlgebra"] -git-tree-sha1 = "e5718a00af0ab9756305a0392832c8952c7426c1" +[[deps.LogExpFunctions]] +deps = ["DocStringExtensions", "IrrationalConstants", "LinearAlgebra"] +git-tree-sha1 = "7d6dd4e9212aebaeed356de34ccf262a3cd415aa" uuid = "2ab3a3ac-af41-5b50-aa03-7779005ae688" -version = "0.3.6" +version = "0.3.26" -[[Logging]] + [deps.LogExpFunctions.extensions] + LogExpFunctionsChainRulesCoreExt = "ChainRulesCore" + LogExpFunctionsChangesOfVariablesExt = "ChangesOfVariables" + LogExpFunctionsInverseFunctionsExt = "InverseFunctions" + + 
[deps.LogExpFunctions.weakdeps] + ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" + ChangesOfVariables = "9e997f8a-9a97-42d5-a9f1-ce6bfc15e2c0" + InverseFunctions = "3587e190-3f89-42d0-90ee-14403ec27112" + +[[deps.Logging]] uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" -[[Markdown]] +[[deps.MacroTools]] +deps = ["Markdown", "Random"] +git-tree-sha1 = "9ee1618cbf5240e6d4e0371d6f24065083f60c48" +uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" +version = "0.5.11" + +[[deps.Markdown]] deps = ["Base64"] uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" -[[MbedTLS_jll]] +[[deps.MbedTLS_jll]] deps = ["Artifacts", "Libdl"] uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1" +version = "2.28.2+0" -[[Mmap]] -uuid = "a63ad114-7e13-5084-954f-fe012c677804" - -[[MozillaCACerts_jll]] +[[deps.MozillaCACerts_jll]] uuid = "14a3606d-f60d-562e-9121-12d972cd8159" +version = "2022.10.11" -[[NEO_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "gmmlib_jll", "libigc_jll", "oneAPI_Level_Zero_Headers_jll"] -git-tree-sha1 = "15deea2649d70f1bbaedf0aa87c9fa20fb21f22c" +[[deps.NEO_jll]] +deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML", "gmmlib_jll", "libigc_jll", "oneAPI_Level_Zero_Headers_jll"] +git-tree-sha1 = "9846d87fd254cdaa1879dff93999e1bc32ed2658" uuid = "700fe977-ac61-5f37-bbc8-c6c4b2b6a9fd" -version = "21.44.21506+0" +version = "23.17.26241+0" -[[NetworkOptions]] +[[deps.NetworkOptions]] uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908" +version = "1.2.0" -[[OpenLibm_jll]] +[[deps.OpenBLAS_jll]] +deps = ["Artifacts", "CompilerSupportLibraries_jll", "Libdl"] +uuid = "4536629a-c528-5b80-bd46-f80d51c5b363" +version = "0.3.21+4" + +[[deps.OpenLibm_jll]] deps = ["Artifacts", "Libdl"] uuid = "05823500-19ac-5b8b-9628-191a04bc5112" +version = "0.8.1+0" -[[OpenSpecFun_jll]] +[[deps.OpenSpecFun_jll]] deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "Libdl", "Pkg"] git-tree-sha1 = "13652491f6856acfd2db29360e1bbcd4565d04f1" uuid = 
"efe28fd5-8261-553b-a9e1-b2916fc3738e" version = "0.5.5+0" -[[OrderedCollections]] -git-tree-sha1 = "85f8e6578bf1f9ee0d11e7bb1b1456435479d47c" +[[deps.OrderedCollections]] +git-tree-sha1 = "2e73fe17cac3c62ad1aebe70d44c963c3cfdc3e3" uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" -version = "1.4.1" +version = "1.6.2" -[[Parameters]] +[[deps.Parameters]] deps = ["OrderedCollections", "UnPack"] git-tree-sha1 = "34c0e9ad262e5f7fc75b10a9952ca7692cfc5fbe" uuid = "d96e819e-fc66-5662-9728-84c9c7592b0a" version = "0.12.3" -[[Pkg]] -deps = ["Artifacts", "Dates", "Downloads", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"] +[[deps.Pkg]] +deps = ["Artifacts", "Dates", "Downloads", "FileWatching", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"] uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" +version = "1.9.2" -[[Preferences]] +[[deps.PrecompileTools]] +deps = ["Preferences"] +git-tree-sha1 = "03b4c25b43cb84cee5c90aa9b5ea0a78fd848d2f" +uuid = "aea7be01-6a6a-4083-8856-8a6e6704d82a" +version = "1.2.0" + +[[deps.Preferences]] deps = ["TOML"] -git-tree-sha1 = "00cfd92944ca9c760982747e9a1d0d5d86ab1e5a" +git-tree-sha1 = "00805cd429dcb4870060ff49ef443486c262e38e" uuid = "21216c6a-2e73-6563-6e65-726566657250" -version = "1.2.2" +version = "1.4.1" -[[Printf]] +[[deps.Printf]] deps = ["Unicode"] uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" -[[REPL]] +[[deps.REPL]] deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"] uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" -[[Random]] -deps = ["Serialization"] +[[deps.Random]] +deps = ["SHA", "Serialization"] uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" -[[SHA]] +[[deps.Reexport]] +git-tree-sha1 = "45e428421666073eab6f2da5c9d310d99bb12f9b" +uuid = "189a3867-3050-52da-a836-e630ba90ab69" +version = "1.2.2" + +[[deps.Requires]] +deps = ["UUIDs"] +git-tree-sha1 = 
"838a3a4188e2ded87a4f9f184b4b0d78a1e91cb7" +uuid = "ae029012-a4dd-5104-9daa-d747884805df" +version = "1.3.0" + +[[deps.SHA]] uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" +version = "0.7.0" -[[SPIRV_LLVM_Translator_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "8cca87d57f6ddf19373cc9791fddc741406c8fbf" -uuid = "4a5d46fc-d8cf-5151-a261-86b458210efb" -version = "11.0.0+2" +[[deps.SPIRV_LLVM_Translator_unified_jll]] +deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML"] +git-tree-sha1 = "fe95f28a96975bd1d473e9273873b36402b79a54" +uuid = "85f0d8ed-5b39-5caa-b1ae-7472de402361" +version = "0.3.0+0" -[[SPIRV_Tools_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "c0324b7e07bc4649f755bfe7e00f7c6ed6aa353f" +[[deps.SPIRV_Tools_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl"] +git-tree-sha1 = "c5ab754aa7d71ea015783a9884a25e196860707c" uuid = "6ac6d60f-d740-5983-97d7-a4482c0689f4" -version = "2021.2.0+0" +version = "2023.2.0+0" -[[Serialization]] +[[deps.Scratch]] +deps = ["Dates"] +git-tree-sha1 = "30449ee12237627992a99d5e30ae63e4d78cd24a" +uuid = "6c6a2e73-6563-6170-7368-637461726353" +version = "1.2.0" + +[[deps.Serialization]] uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" -[[SharedArrays]] -deps = ["Distributed", "Mmap", "Random", "Serialization"] -uuid = "1a1011a3-84de-559e-8e89-a11a2f7dc383" - -[[Sockets]] +[[deps.Sockets]] uuid = "6462fe0b-24de-5631-8697-dd941f90decc" -[[SparseArrays]] -deps = ["LinearAlgebra", "Random"] +[[deps.SparseArrays]] +deps = ["Libdl", "LinearAlgebra", "Random", "Serialization", "SuiteSparse_jll"] uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" -[[SpecialFunctions]] -deps = ["ChainRulesCore", "IrrationalConstants", "LogExpFunctions", "OpenLibm_jll", "OpenSpecFun_jll"] -git-tree-sha1 = "f0bccf98e16759818ffc5d97ac3ebf87eb950150" +[[deps.SpecialFunctions]] +deps = ["IrrationalConstants", "LogExpFunctions", "OpenLibm_jll", "OpenSpecFun_jll"] +git-tree-sha1 = 
"e2cfc4012a19088254b3950b85c3c1d8882d864d" uuid = "276daf66-3868-5448-9aa4-cd146d93841b" -version = "1.8.1" +version = "2.3.1" -[[Statistics]] + [deps.SpecialFunctions.extensions] + SpecialFunctionsChainRulesCoreExt = "ChainRulesCore" + + [deps.SpecialFunctions.weakdeps] + ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" + +[[deps.StaticArrays]] +deps = ["LinearAlgebra", "Random", "StaticArraysCore"] +git-tree-sha1 = "d5fb407ec3179063214bc6277712928ba78459e2" +uuid = "90137ffa-7385-5640-81b9-e52037218182" +version = "1.6.4" +weakdeps = ["Statistics"] + + [deps.StaticArrays.extensions] + StaticArraysStatisticsExt = "Statistics" + +[[deps.StaticArraysCore]] +git-tree-sha1 = "36b3d696ce6366023a0ea192b4cd442268995a0d" +uuid = "1e83bf80-4336-4d27-bf5d-d5a4f845583c" +version = "1.4.2" + +[[deps.Statistics]] deps = ["LinearAlgebra", "SparseArrays"] uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" +version = "1.9.0" -[[TOML]] +[[deps.SuiteSparse_jll]] +deps = ["Artifacts", "Libdl", "Pkg", "libblastrampoline_jll"] +uuid = "bea87d4a-7f5b-5778-9afe-8cc45184846c" +version = "5.10.1+6" + +[[deps.TOML]] deps = ["Dates"] uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76" +version = "1.0.3" -[[Tar]] +[[deps.Tar]] deps = ["ArgTools", "SHA"] uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e" +version = "1.10.0" -[[Test]] -deps = ["InteractiveUtils", "Logging", "Random", "Serialization"] -uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" - -[[TextWrap]] +[[deps.TextWrap]] git-tree-sha1 = "9250ef9b01b66667380cf3275b3f7488d0e25faf" uuid = "b718987f-49a8-5099-9789-dcd902bef87d" version = "1.0.1" -[[TimerOutputs]] +[[deps.TimerOutputs]] deps = ["ExprTools", "Printf"] -git-tree-sha1 = "7cb456f358e8f9d102a8b25e8dfedf58fa5689bc" +git-tree-sha1 = "f548a9e9c490030e545f72074a41edfd0e5bcdd7" uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" -version = "0.5.13" +version = "0.5.23" -[[UUIDs]] +[[deps.UUIDs]] deps = ["Random", "SHA"] uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" -[[UnPack]] +[[deps.UnPack]] 
git-tree-sha1 = "387c1f73762231e86e0c9c5443ce3b4a0a9a0c2b" uuid = "3a884ed6-31ef-47d7-9d2a-63182c4928ed" version = "1.0.2" -[[Unicode]] +[[deps.Unicode]] uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" -[[Zlib_jll]] -deps = ["Libdl"] -uuid = "83775a58-1f1d-513f-b197-d71354ab007a" - -[[gmmlib_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "0d5e5461d21b14853b4c332045c57d2601c403bd" -uuid = "09858cae-167c-5acb-9302-fddc6874d481" -version = "21.2.1+0" - -[[libigc_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "4f7a6c63ee113ee6da9a6afd06c77eb44998b1f3" -uuid = "94295238-5935-5bd7-bb0f-b00942e9bdd5" -version = "1.0.8744+0" - -[[nghttp2_jll]] -deps = ["Artifacts", "Libdl"] -uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d" - -[[oneAPI]] -deps = ["Adapt", "CEnum", "ExprTools", "GPUArrays", "GPUCompiler", "LLVM", "LinearAlgebra", "NEO_jll", "Printf", "Random", "SPIRV_LLVM_Translator_jll", "SPIRV_Tools_jll", "SpecialFunctions", "oneAPI_Level_Zero_Headers_jll", "oneAPI_Level_Zero_Loader_jll"] -git-tree-sha1 = "efabcff2a259b0f1b10505db99aa18fc2de181ce" -uuid = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b" +[[deps.UnsafeAtomics]] +git-tree-sha1 = "6331ac3440856ea1988316b46045303bef658278" +uuid = "013be700-e6cd-48c3-b4a1-df204f14c38f" version = "0.2.1" -[[oneAPI_Level_Zero_Headers_jll]] +[[deps.UnsafeAtomicsLLVM]] +deps = ["LLVM", "UnsafeAtomics"] +git-tree-sha1 = "323e3d0acf5e78a56dfae7bd8928c989b4f3083e" +uuid = "d80eeb9a-aca5-4d75-85e5-170c8b632249" +version = "0.1.3" + +[[deps.Zlib_jll]] +deps = ["Libdl"] +uuid = "83775a58-1f1d-513f-b197-d71354ab007a" +version = "1.2.13+0" + +[[deps.gmmlib_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "e106a6eed53928cd1864f544562ea991b5f11464" +git-tree-sha1 = "228b09be83d88cc5d2236ef7b516d988d2639dfc" +uuid = "09858cae-167c-5acb-9302-fddc6874d481" +version = "22.3.0+0" + +[[deps.libblastrampoline_jll]] +deps = ["Artifacts", "Libdl"] +uuid = 
"8e850b90-86db-534c-a0d3-1478176c7d93" +version = "5.8.0+0" + +[[deps.libigc_jll]] +deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML"] +git-tree-sha1 = "7c0b5fa2ff90d96af106fd4a67ff6923cd3f9cb9" +uuid = "94295238-5935-5bd7-bb0f-b00942e9bdd5" +version = "1.0.13822+0" + +[[deps.nghttp2_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d" +version = "1.48.0+0" + +[[deps.oneAPI]] +deps = ["Adapt", "CEnum", "ExprTools", "GPUArrays", "GPUCompiler", "KernelAbstractions", "LLVM", "LinearAlgebra", "NEO_jll", "Preferences", "Printf", "Random", "SPIRV_LLVM_Translator_unified_jll", "SPIRV_Tools_jll", "SpecialFunctions", "UnsafeAtomicsLLVM", "oneAPI_Level_Zero_Headers_jll", "oneAPI_Level_Zero_Loader_jll", "oneAPI_Support_jll"] +git-tree-sha1 = "9e6a675faf3ea27d08018c9bd0a03596003ff5cf" +uuid = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b" +version = "1.3.0" + +[[deps.oneAPI_Level_Zero_Headers_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "412efcf5d55c65d3352c3915cffec1e53955570f" uuid = "f4bc562b-d309-54f8-9efb-476e56f0410d" -version = "1.2.43+0" +version = "1.6.3+0" -[[oneAPI_Level_Zero_Loader_jll]] +[[deps.oneAPI_Level_Zero_Loader_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "oneAPI_Level_Zero_Headers_jll"] -git-tree-sha1 = "0f0fd4a92c4785454e4929c2e4db22c3d03d6889" +git-tree-sha1 = "87980483b19f0a00c8d62e8b6682acac1894c638" uuid = "13eca655-d68d-5b81-8367-6d99d727ab01" -version = "1.5.0+0" +version = "1.11.0+0" -[[p7zip_jll]] +[[deps.oneAPI_Support_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "oneAPI_Level_Zero_Loader_jll"] +git-tree-sha1 = "39a73e1fcd9a33eeadfd69f9027e9c62d3c58219" +uuid = "b049733a-a71d-5ed3-8eba-7d323ac00b36" +version = "0.2.2+0" + +[[deps.p7zip_jll]] deps = ["Artifacts", "Libdl"] uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0" +version = "17.4.0+0" diff --git a/src/julia/JuliaStream.jl/oneAPI/Project.toml b/src/julia/JuliaStream.jl/oneAPI/Project.toml index 
9f89f82..2a1b49d 100644 --- a/src/julia/JuliaStream.jl/oneAPI/Project.toml +++ b/src/julia/JuliaStream.jl/oneAPI/Project.toml @@ -4,4 +4,4 @@ Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a" oneAPI = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b" [compat] -julia = "1.6" +julia = "1.9" diff --git a/src/julia/JuliaStream.jl/src/Stream.jl b/src/julia/JuliaStream.jl/src/Stream.jl index 755fe2b..97ba943 100644 --- a/src/julia/JuliaStream.jl/src/Stream.jl +++ b/src/julia/JuliaStream.jl/src/Stream.jl @@ -20,6 +20,18 @@ end @enum Benchmark All Triad Nstream + +function run_init_arrays!(data::StreamData{T,C}, context, init::Tuple{T,T,T})::Float64 where {T,C} + return @elapsed init_arrays!(data, context, init) +end + +function run_read_data(data::StreamData{T,C}, context)::Tuple{Float64,VectorData{T}} where {T,C} + elapsed = @elapsed begin + result = read_data(data, context) + end + return (elapsed, result) +end + function run_all!(data::StreamData{T,C}, context, times::Int)::Tuple{Timings,T} where {T,C} timings = Timings(times) lastSum::T = 0 @@ -39,11 +51,7 @@ function run_triad!(data::StreamData{T,C}, context, times::Int)::Float64 where { end end -function run_nstream!( - data::StreamData{T,C}, - context, - times::Int, -)::Vector{Float64} where {T,C} +function run_nstream!(data::StreamData{T,C}, context, times::Int)::Vector{Float64} where {T,C} timings::Vector{Float64} = zeros(times) for i = 1:times @inbounds timings[i] = @elapsed nstream!(data, context) @@ -93,9 +101,7 @@ function check_solutions( error = abs((dot - gold_sum) / gold_sum) failed = error > 1.0e-8 if failed - println( - "Validation failed on sum. Error $error \nSum was $dot but should be $gold_sum", - ) + println("Validation failed on sum. 
Error $error \nSum was $dot but should be $gold_sum") end !failed end : true @@ -158,7 +164,7 @@ end const DefaultInit = (0.1, 0.2, 0.0) const DefaultScalar = 0.4 -const Version = "4.0" +const Version = "5.0" function main() @@ -166,7 +172,7 @@ function main() parse_options(config) if config.list - for (i, (_,repr, impl)) in enumerate(devices()) + for (i, (_, repr, impl)) in enumerate(devices()) println("[$i] ($impl) $repr") end exit(0) @@ -175,9 +181,7 @@ function main() ds = devices() # TODO implement substring device match if config.device < 1 || config.device > length(ds) - error( - "Device $(config.device) out of range (1..$(length(ds))), NOTE: Julia is 1-indexed", - ) + error("Device $(config.device) out of range (1..$(length(ds))), NOTE: Julia is 1-indexed") else device = ds[config.device] end @@ -220,10 +224,10 @@ function main() end function mk_row(xs::Vector{Float64}, name::String, total_bytes::Int) - tail = Base.rest(xs) - min = Iterators.minimum(tail) - max = Iterators.maximum(tail) - avg = Iterators.sum(tail) / Iterators.length(tail) + tail = Iterators.rest(xs) + min = Base.minimum(tail) + max = Base.maximum(tail) + avg = Base.sum(tail) / Base.length(tail) mbps = mega_scale * total_bytes / min if config.csv return [ @@ -257,16 +261,42 @@ function main() end end + function show_init(init::Float64, read::Float64) + setup = [("Init", init, 3 * array_bytes), ("Read", read, 3 * array_bytes)] + if config.csv + tabulate( + map( + x -> [ + ("phase", x[1]), + ("n_elements", config.arraysize), + ("sizeof", x[3]), + ("max_m$(config.mibibytes ? "i" : "")bytes_per_sec", mega_scale * x[3] / x[2]), + ("runtime", x[2]), + ], + setup, + )..., + ) + else + for (name, elapsed, total_bytes) in setup + println( + "$name: $(round(elapsed; digits=5)) s (=$(round(( mega_scale * total_bytes) / elapsed; digits = 5)) M$(config.mibibytes ?
"i" : "")Bytes/sec)", + ) + end + end + end + init::Tuple{type,type,type} = DefaultInit scalar::type = DefaultScalar GC.enable(false) (data, context) = make_stream(config.arraysize, scalar, device, config.csv) - init_arrays!(data, context, init) + tInit = run_init_arrays!(data, context, init) if benchmark == All (timings, sum) = run_all!(data, context, config.numtimes) - valid = check_solutions(read_data(data, context), config.numtimes, init, benchmark, sum) + (tRead, result) = run_read_data(data, context) + show_init(tInit, tRead) + valid = check_solutions(result, config.numtimes, init, benchmark, sum) tabulate( mk_row(timings.copy, "Copy", 2 * array_bytes), mk_row(timings.mul, "Mul", 2 * array_bytes), @@ -276,13 +306,15 @@ function main() ) elseif benchmark == Nstream timings = run_nstream!(data, context, config.numtimes) - valid = - check_solutions(read_data(data, context), config.numtimes, init, benchmark, nothing) + (tRead, result) = run_read_data(data, context) + show_init(tInit, tRead) + valid = check_solutions(result, config.numtimes, init, benchmark, nothing) tabulate(mk_row(timings, "Nstream", 4 * array_bytes)) elseif benchmark == Triad elapsed = run_triad!(data, context, config.numtimes) - valid = - check_solutions(read_data(data, context), config.numtimes, init, benchmark, nothing) + (tRead, result) = run_read_data(data, context) + show_init(tInit, tRead) + valid = check_solutions(result, config.numtimes, init, benchmark, nothing) total_bytes = 3 * array_bytes * config.numtimes bandwidth = mega_scale * (total_bytes / elapsed) println("Runtime (seconds): $(round(elapsed; digits=5))") @@ -290,7 +322,6 @@ function main() else error("Bad benchmark $(benchmark)") end - GC.enable(true) if !valid diff --git a/src/julia/JuliaStream.jl/update_all.sh b/src/julia/JuliaStream.jl/update_all.sh index ad6c2ee..648b481 100755 --- a/src/julia/JuliaStream.jl/update_all.sh +++ b/src/julia/JuliaStream.jl/update_all.sh @@ -3,5 +3,6 @@ for BACKEND in "." 
"AMDGPU" "CUDA" "oneAPI" "Threaded" "KernelAbstractions" do - julia --project="$BACKEND" -e 'import Pkg; Pkg.resolve(); Pkg.instantiate(); Pkg.update(); Pkg.gc();' -done \ No newline at end of file + echo "Updating subproject $BACKEND" + julia --project="$BACKEND" -e 'import Pkg; Pkg.resolve(); Pkg.instantiate(); Pkg.update(); Pkg.gc();' +done diff --git a/src/kokkos/KokkosStream.cpp b/src/kokkos/KokkosStream.cpp index 00efe92..66b9662 100644 --- a/src/kokkos/KokkosStream.cpp +++ b/src/kokkos/KokkosStream.cpp @@ -1,4 +1,4 @@ -// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// Copyright (c) 2015-23 Tom Deakin, Simon McIntosh-Smith, Wei-Chen (Tom) Lin // University of Bristol HPC // // For full license terms please see the LICENSE file distributed with this @@ -14,9 +14,9 @@ KokkosStream::KokkosStream( { Kokkos::initialize(); - d_a = new Kokkos::View("d_a", ARRAY_SIZE); - d_b = new Kokkos::View("d_b", ARRAY_SIZE); - d_c = new Kokkos::View("d_c", ARRAY_SIZE); + d_a = new Kokkos::View(Kokkos::ViewAllocateWithoutInitializing("d_a"), ARRAY_SIZE); + d_b = new Kokkos::View(Kokkos::ViewAllocateWithoutInitializing("d_b"), ARRAY_SIZE); + d_c = new Kokkos::View(Kokkos::ViewAllocateWithoutInitializing("d_c"), ARRAY_SIZE); hm_a = new typename Kokkos::View::HostMirror(); hm_b = new typename Kokkos::View::HostMirror(); hm_c = new typename Kokkos::View::HostMirror(); @@ -140,7 +140,7 @@ T KokkosStream::dot() Kokkos::View a(*d_a); Kokkos::View b(*d_b); - T sum = 0.0; + T sum{}; Kokkos::parallel_reduce(array_size, KOKKOS_LAMBDA (const long index, T &tmp) { diff --git a/src/kokkos/KokkosStream.hpp b/src/kokkos/KokkosStream.hpp index 3aa7cf5..a410a86 100644 --- a/src/kokkos/KokkosStream.hpp +++ b/src/kokkos/KokkosStream.hpp @@ -10,9 +10,6 @@ #include #include -#include -#include - #include "Stream.h" #define IMPLEMENTATION_STRING "Kokkos" diff --git a/src/kokkos/model.cmake b/src/kokkos/model.cmake index 445991d..7457eeb 100644 --- a/src/kokkos/model.cmake +++ 
b/src/kokkos/model.cmake @@ -1,32 +1,38 @@ - register_flag_optional(CMAKE_CXX_COMPILER "Any CXX compiler that is supported by CMake detection and RAJA. See https://github.com/kokkos/kokkos#primary-tested-compilers-on-x86-are" "c++") -register_flag_required(KOKKOS_IN_TREE +register_flag_optional(KOKKOS_IN_TREE "Absolute path to the *source* distribution directory of Kokkos. Remember to append Kokkos specific flags as well, for example: - -DKOKKOS_IN_TREE=... -DKokkos_ENABLE_OPENMP=ON -DKokkos_ARCH_ZEN=ON ... + See https://github.com/kokkos/kokkos/blob/master/BUILD.md for all available options" "") - See https://github.com/kokkos/kokkos/blob/master/BUILD.md for all available options") +register_flag_optional(KOKKOS_IN_PACKAGE + "Absolute path to package R-Path containing Kokkos libs. + Use this instead of KOKKOS_IN_TREE if Kokkos is from a package manager like Spack." "") # compiler vendor and arch specific flags set(KOKKOS_FLAGS_CPU_INTEL -qopt-streaming-stores=always) macro(setup) - set(CMAKE_CXX_STANDARD 14) + set(CMAKE_CXX_STANDARD 17) # Kokkos 4+ requires CXX >= 17 cmake_policy(SET CMP0074 NEW) #see https://github.com/kokkos/kokkos/blob/master/BUILD.md - message(STATUS "Building using in-tree Kokkos source at `${KOKKOS_IN_TREE}`") if (EXISTS "${KOKKOS_IN_TREE}") + message(STATUS "Build using in-tree Kokkos source at `${KOKKOS_IN_TREE}`") add_subdirectory(${KOKKOS_IN_TREE} ${CMAKE_BINARY_DIR}/kokkos) register_link_library(Kokkos::kokkos) - else () - message(FATAL_ERROR "`${KOKKOS_IN_TREE}` does not exist") + elseif (EXISTS "${KOKKOS_IN_PACKAGE}") + message(STATUS "Build using packaged Kokkos at `${KOKKOS_IN_PACKAGE}`") + set (Kokkos_DIR "${KOKKOS_IN_PACKAGE}/lib64/cmake/Kokkos") + find_package(Kokkos REQUIRED) + register_link_library(Kokkos::kokkos) + else() + message(FATAL_ERROR "Neither `KOKKOS_IN_TREE` (`${KOKKOS_IN_TREE}`) nor `KOKKOS_IN_PACKAGE` (`${KOKKOS_IN_PACKAGE}`) points to an existing path!") endif () register_append_compiler_and_arch_specific_cxx_flags( @@ -36,5 +42,3 @@ macro(setup) ) endmacro() - - diff --git
a/src/main.cpp b/src/main.cpp index 3035da0..abfc14e 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -15,7 +15,7 @@ #include #include -#define VERSION_STRING "4.0" +#define VERSION_STRING "5.0" #include "Stream.h" @@ -49,6 +49,8 @@ #include "SYCLStream2020.h" #elif defined(OMP) #include "OMPStream.h" +#elif defined(FUTHARK) +#include "FutharkStream.h" #endif // Default size of 2^25 @@ -222,10 +224,10 @@ void run() { // MiB = 2^20 std::cout << std::setprecision(1) << std::fixed - << "Array size: " << ARRAY_SIZE*sizeof(T)*pow(2.0, -20.0) << " MiB" - << " (=" << ARRAY_SIZE*sizeof(T)*pow(2.0, -30.0) << " GiB)" << std::endl; - std::cout << "Total size: " << 3.0*ARRAY_SIZE*sizeof(T)*pow(2.0, -20.0) << " MiB" - << " (=" << 3.0*ARRAY_SIZE*sizeof(T)*pow(2.0, -30.0) << " GiB)" << std::endl; + << "Array size: " << ARRAY_SIZE*sizeof(T)*std::pow(2.0, -20.0) << " MiB" + << " (=" << ARRAY_SIZE*sizeof(T)*std::pow(2.0, -30.0) << " GiB)" << std::endl; + std::cout << "Total size: " << 3.0*ARRAY_SIZE*sizeof(T)*std::pow(2.0, -20.0) << " MiB" + << " (=" << 3.0*ARRAY_SIZE*sizeof(T)*std::pow(2.0, -30.0) << " GiB)" << std::endl; } else { @@ -298,12 +300,18 @@ void run() // Use the OpenMP implementation stream = new OMPStream(ARRAY_SIZE, deviceIndex); +#elif defined(FUTHARK) + // Use the Futhark implementation + stream = new FutharkStream(ARRAY_SIZE, deviceIndex); + #endif + auto init1 = std::chrono::high_resolution_clock::now(); stream->init_arrays(startA, startB, startC); + auto init2 = std::chrono::high_resolution_clock::now(); // Result of the Dot kernel, if used. 
- T sum = 0.0; + T sum{}; std::vector> timings; @@ -327,7 +335,54 @@ void run() std::vector c(ARRAY_SIZE); + auto read1 = std::chrono::high_resolution_clock::now(); stream->read_arrays(a, b, c); + auto read2 = std::chrono::high_resolution_clock::now(); + + auto initElapsedS = std::chrono::duration_cast>(init2 - init1).count(); + auto readElapsedS = std::chrono::duration_cast>(read2 - read1).count(); + auto initBWps = ((mibibytes ? std::pow(2.0, -20.0) : 1.0E-6) * (3 * sizeof(T) * ARRAY_SIZE)) / initElapsedS; + auto readBWps = ((mibibytes ? std::pow(2.0, -20.0) : 1.0E-6) * (3 * sizeof(T) * ARRAY_SIZE)) / readElapsedS; + + if (output_as_csv) + { + std::cout + << "phase" << csv_separator + << "n_elements" << csv_separator + << "sizeof" << csv_separator + << ((mibibytes) ? "max_mibytes_per_sec" : "max_mbytes_per_sec") << csv_separator + << "runtime" << std::endl; + std::cout + << "Init" << csv_separator + << ARRAY_SIZE << csv_separator + << sizeof(T) << csv_separator + << initBWps << csv_separator + << initElapsedS << std::endl; + std::cout + << "Read" << csv_separator + << ARRAY_SIZE << csv_separator + << sizeof(T) << csv_separator + << readBWps << csv_separator + << readElapsedS << std::endl; + } + else + { + std::cout << "Init: " + << std::setw(7) + << initElapsedS + << " s (=" + << initBWps + << (mibibytes ? " MiBytes/sec" : " MBytes/sec") + << ")" << std::endl; + std::cout << "Read: " + << std::setw(7) + << readElapsedS + << " s (=" + << readBWps + << (mibibytes ? " MiBytes/sec" : " MBytes/sec") + << ")" << std::endl; + } + check_solution(num_times, a, b, c, sum); // Display timing results @@ -393,7 +448,7 @@ void run() << num_times << csv_separator << ARRAY_SIZE << csv_separator << sizeof(T) << csv_separator - << ((mibibytes) ? pow(2.0, -20.0) : 1.0E-6) * sizes[i] / (*minmax.first) << csv_separator + << ((mibibytes) ?
std::pow(2.0, -20.0) : 1.0E-6) * sizes[i] / (*minmax.first) << csv_separator << *minmax.first << csv_separator << *minmax.second << csv_separator << average @@ -404,7 +459,7 @@ void run() std::cout << std::left << std::setw(12) << labels[i] << std::left << std::setw(12) << std::setprecision(3) << - ((mibibytes) ? pow(2.0, -20.0) : 1.0E-6) * sizes[i] / (*minmax.first) + ((mibibytes) ? std::pow(2.0, -20.0) : 1.0E-6) * sizes[i] / (*minmax.first) << std::left << std::setw(12) << std::setprecision(5) << *minmax.first << std::left << std::setw(12) << std::setprecision(5) << *minmax.second << std::left << std::setw(12) << std::setprecision(5) << average @@ -415,7 +470,7 @@ void run() { // Display timing results double total_bytes = 3 * sizeof(T) * ARRAY_SIZE * num_times; - double bandwidth = ((mibibytes) ? pow(2.0, -30.0) : 1.0E-9) * (total_bytes / timings[0][0]); + double bandwidth = ((mibibytes) ? std::pow(2.0, -30.0) : 1.0E-9) * (total_bytes / timings[0][0]); if (output_as_csv) { @@ -461,7 +516,7 @@ void check_solution(const unsigned int ntimes, std::vector& a, std::vector T goldA = startA; T goldB = startB; T goldC = startC; - T goldSum = 0.0; + T goldSum{}; const T scalar = startScalar; @@ -487,15 +542,15 @@ void check_solution(const unsigned int ntimes, std::vector& a, std::vector goldSum = goldA * goldB * ARRAY_SIZE; // Calculate the average error - double errA = std::accumulate(a.begin(), a.end(), 0.0, [&](double sum, const T val){ return sum + fabs(val - goldA); }); + long double errA = std::accumulate(a.begin(), a.end(), T{}, [&](double sum, const T val){ return sum + std::fabs(val - goldA); }); errA /= a.size(); - double errB = std::accumulate(b.begin(), b.end(), 0.0, [&](double sum, const T val){ return sum + fabs(val - goldB); }); + long double errB = std::accumulate(b.begin(), b.end(), T{}, [&](double sum, const T val){ return sum + std::fabs(val - goldB); }); errB /= b.size(); - double errC = std::accumulate(c.begin(), c.end(), 0.0, [&](double sum, const T 
val){ return sum + fabs(val - goldC); }); + long double errC = std::accumulate(c.begin(), c.end(), T{}, [&](double sum, const T val){ return sum + std::fabs(val - goldC); }); errC /= c.size(); - double errSum = fabs((sum - goldSum)/goldSum); + long double errSum = std::fabs((sum - goldSum)/goldSum); - double epsi = std::numeric_limits::epsilon() * 100.0; + long double epsi = std::numeric_limits::epsilon() * 100.0; if (errA > epsi) std::cerr diff --git a/src/ocl/OCLStream.cpp b/src/ocl/OCLStream.cpp index be88ba9..26b525a 100644 --- a/src/ocl/OCLStream.cpp +++ b/src/ocl/OCLStream.cpp @@ -260,7 +260,7 @@ T OCLStream::dot() ); cl::copy(queue, d_sum, sums.begin(), sums.end()); - T sum = 0.0; + T sum{}; for (T val : sums) sum += val; diff --git a/src/omp/OMPStream.cpp b/src/omp/OMPStream.cpp index 0cd8035..774f61b 100644 --- a/src/omp/OMPStream.cpp +++ b/src/omp/OMPStream.cpp @@ -220,7 +220,7 @@ void OMPStream::nstream() template T OMPStream::dot() { - T sum = 0.0; + T sum{}; #ifdef OMP_TARGET_GPU int array_size = this->array_size; diff --git a/src/raja/RAJAStream.cpp b/src/raja/RAJAStream.cpp index d271ea4..6a99999 100644 --- a/src/raja/RAJAStream.cpp +++ b/src/raja/RAJAStream.cpp @@ -131,7 +131,7 @@ T RAJAStream::dot() T* RAJA_RESTRICT a = d_a; T* RAJA_RESTRICT b = d_b; - RAJA::ReduceSum sum(0.0); + RAJA::ReduceSum sum(T{}); forall(range, [=] RAJA_DEVICE (RAJA::Index_type index) { diff --git a/src/raja/model.cmake b/src/raja/model.cmake index 4da4af6..bf30631 100644 --- a/src/raja/model.cmake +++ b/src/raja/model.cmake @@ -1,25 +1,26 @@ - register_flag_optional(CMAKE_CXX_COMPILER "Any CXX compiler that is supported by CMake detection and RAJA. See https://raja.readthedocs.io/en/main/getting_started.html#build-and-install" "c++") -register_flag_required(RAJA_IN_TREE +register_flag_optional(RAJA_IN_TREE "Absolute path to the *source* distribution directory of RAJA. Make sure to use the release version of RAJA or clone RAJA recursively with submodules. 
 Remember to append RAJA specific flags as well, for example: - -DRAJA_IN_TREE=... -DENABLE_OPENMP=ON -DENABLE_CUDA=ON ... - See https://github.com/LLNL/RAJA/blob/08cbbafd2d21589ebf341f7275c229412d0fe903/CMakeLists.txt#L44 for all available options -") +" "") + +register_flag_optional(RAJA_IN_PACKAGE + "Use if Raja is part of a package dependency: + Path to installation" "") register_flag_optional(TARGET "Target offload device, implemented values are CPU, NVIDIA" CPU) register_flag_optional(CUDA_TOOLKIT_ROOT_DIR - "[TARGET==NVIDIA only] Path to the CUDA toolkit directory (e.g `/opt/cuda-11.2`) if the ENABLE_CUDA flag is specified for RAJA" "") + "[TARGET==NVIDIA only] Path to the CUDA toolkit directory (e.g `/opt/cuda-11.2`) if the RAJA_ENABLE_CUDA or ENABLE_CUDA flag is specified for RAJA" "") # XXX CMake 3.18 supports CMAKE_CUDA_ARCHITECTURES/CUDA_ARCHITECTURES but we support older CMakes register_flag_optional(CUDA_ARCH @@ -57,7 +58,20 @@ macro(setup) set(ENABLE_BENCHMARKS OFF CACHE BOOL "") set(ENABLE_CUDA ${ENABLE_CUDA} CACHE BOOL "" FORCE) - if (ENABLE_CUDA) + # RAJA >= v2022.03.0 switched to prefixed variables, we keep the legacy ones for backwards compatibility + set(RAJA_ENABLE_TESTS OFF CACHE BOOL "") + set(RAJA_ENABLE_EXAMPLES OFF CACHE BOOL "") + set(RAJA_ENABLE_REPRODUCERS OFF CACHE BOOL "") + set(RAJA_ENABLE_EXERCISES OFF CACHE BOOL "") + set(RAJA_ENABLE_DOCUMENTATION OFF CACHE BOOL "") + set(RAJA_ENABLE_BENCHMARKS OFF CACHE BOOL "") + set(RAJA_ENABLE_CUDA ${RAJA_ENABLE_CUDA} CACHE BOOL "" FORCE) + + if (ENABLE_CUDA OR RAJA_ENABLE_CUDA) + + # RAJA still needs ENABLE_CUDA for internal use, so if either is on, assert both. 
+ set(RAJA_ENABLE_CUDA ON) + set(ENABLE_CUDA ON) # XXX CMake 3.18 supports CMAKE_CUDA_ARCHITECTURES/CUDA_ARCHITECTURES but we support older CMakes if(POLICY CMP0104) @@ -69,6 +83,10 @@ macro(setup) set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "-forward-unknown-to-host-compiler -extended-lambda -arch=${CUDA_ARCH}" ${CUDA_EXTRA_FLAGS}) list(APPEND CMAKE_CUDA_FLAGS) + # See https://github.com/LLNL/RAJA/pull/1302 + # And https://github.com/LLNL/RAJA/pull/1339 + set(RAJA_ENABLE_VECTORIZATION OFF CACHE BOOL "") + message(STATUS "NVCC flags: ${CMAKE_CUDA_FLAGS}") endif () @@ -76,8 +94,14 @@ macro(setup) register_link_library(RAJA) # RAJA's cmake screws with where the binary will end up, resetting it here: set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) + + elseif (EXISTS "${RAJA_IN_PACKAGE}") + message(STATUS "Building using packaged Raja at `${RAJA_IN_PACKAGE}`") + find_package(RAJA REQUIRED) + register_link_library(RAJA) + else () - message(FATAL_ERROR "`${RAJA_IN_TREE}` does not exist") + message(FATAL_ERROR "Neither `${RAJA_IN_TREE}` or `${RAJA_IN_PACKAGE}` exists") endif () diff --git a/src/rust/rust-stream/Cargo.lock b/src/rust/rust-stream/Cargo.lock index cb86dab..20be287 100644 --- a/src/rust/rust-stream/Cargo.lock +++ b/src/rust/rust-stream/Cargo.lock @@ -4,29 +4,147 @@ version = 3 [[package]] name = "ansi_term" -version = "0.11.0" +version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ee49baf6cb617b853aa8d93bf420db2383fab46d314482ca2803b40d5fde979b" +checksum = "d52a9bb7ec0cf484c551830a7ce27bd20d67eac647e1befb56b0be4ee39a55d2" dependencies = [ "winapi 0.3.9", ] +[[package]] +name = "async-attributes" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a3203e79f4dd9bdda415ed03cf14dae5a2bf775c683a00f94e9cd1faf0f596e5" +dependencies = [ + "quote", + "syn 1.0.109", +] + +[[package]] +name = "async-channel" +version = "1.9.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "81953c529336010edd6d8e358f886d9581267795c61b19475b71314bffa46d35" +dependencies = [ + "concurrent-queue", + "event-listener", + "futures-core", +] + +[[package]] +name = "async-executor" +version = "1.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6fa3dc5f2a8564f07759c008b9109dc0d39de92a88d5588b8a5036d286383afb" +dependencies = [ + "async-lock", + "async-task", + "concurrent-queue", + "fastrand", + "futures-lite", + "slab", +] + +[[package]] +name = "async-global-executor" +version = "2.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1b6f5d7df27bd294849f8eec66ecfc63d11814df7a4f5d74168a2394467b776" +dependencies = [ + "async-channel", + "async-executor", + "async-io", + "async-lock", + "blocking", + "futures-lite", + "once_cell", +] + +[[package]] +name = "async-io" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fc5b45d93ef0529756f812ca52e44c221b35341892d3dcc34132ac02f3dd2af" +dependencies = [ + "async-lock", + "autocfg", + "cfg-if", + "concurrent-queue", + "futures-lite", + "log", + "parking", + "polling", + "rustix", + "slab", + "socket2", + "waker-fn", +] + +[[package]] +name = "async-lock" +version = "2.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "287272293e9d8c41773cec55e365490fe034813a2f172f502d6ddcf75b2f582b" +dependencies = [ + "event-listener", +] + +[[package]] +name = "async-std" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62565bb4402e926b29953c785397c6dc0391b7b446e45008b0049eb43cec6f5d" +dependencies = [ + "async-attributes", + "async-channel", + "async-global-executor", + "async-io", + "async-lock", + "crossbeam-utils", + "futures-channel", + "futures-core", + "futures-io", + "futures-lite", + "gloo-timers", + "kv-log-macro", + "log", + "memchr", + "once_cell", + 
"pin-project-lite", + "pin-utils", + "slab", + "wasm-bindgen-futures", +] + +[[package]] +name = "async-task" +version = "4.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ecc7ab41815b3c653ccd2978ec3255c81349336702dfdf62ee6f7069b12a3aae" + +[[package]] +name = "atomic-waker" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1181e1e0d1fce796a03db1ae795d67167da795f9cf4a39c37589e85ef57f26d3" + [[package]] name = "atty" version = "0.2.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" dependencies = [ - "hermit-abi", + "hermit-abi 0.1.19", "libc", "winapi 0.3.9", ] [[package]] name = "autocfg" -version = "1.0.1" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cdb031dd78e28731d87d56cc8ffef4a8f36ca26c38fe2de700543e627f8a464a" +checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" [[package]] name = "bitflags" @@ -34,6 +152,36 @@ version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" +[[package]] +name = "blocking" +version = "1.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77231a1c8f801696fc0123ec6150ce92cffb8e164a02afb9c8ddee0e9b65ad65" +dependencies = [ + "async-channel", + "async-lock", + "async-task", + "atomic-waker", + "fastrand", + "futures-lite", + "log", +] + +[[package]] +name = "bumpalo" +version = "3.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f30e7476521f6f8af1a1c4c0b8cc94f0bee37d91763d0ca2665f299b6cd8aec" + +[[package]] +name = "cc" +version = "1.0.83" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1174fb0b6ec23863f8b971027804a42614e347eafb0a95bf0b12cdae21fc4d0" +dependencies = [ + "libc", +] + 
[[package]] name = "cfg-if" version = "1.0.0" @@ -42,9 +190,9 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "clap" -version = "2.33.3" +version = "2.34.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "37e58ac78573c40708d45522f0d80fa2f01cc4f9b4e2bf749807255454312002" +checksum = "a0610544180c38b88101fecf2dd634b174a62eef6946f84dfc6a7127512b381c" dependencies = [ "ansi_term", "atty", @@ -64,6 +212,15 @@ dependencies = [ "crossterm", ] +[[package]] +name = "concurrent-queue" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62ec6771ecfa0762d24683ee5a32ad78487a3d3afdc0fb8cae19d2c5deb50b7c" +dependencies = [ + "crossbeam-utils", +] + [[package]] name = "core_affinity" version = "0.5.10" @@ -78,9 +235,9 @@ dependencies = [ [[package]] name = "crossbeam" -version = "0.8.1" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4ae5588f6b3c3cb05239e90bd110f257254aecd01e4635400391aeae07497845" +checksum = "2801af0d36612ae591caa9568261fddce32ce6e08a7275ea334a06a4ad021a2c" dependencies = [ "cfg-if", "crossbeam-channel", @@ -92,9 +249,9 @@ dependencies = [ [[package]] name = "crossbeam-channel" -version = "0.5.1" +version = "0.5.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "06ed27e177f16d65f0f0c22a213e17c696ace5dd64b14258b52f9417ccb52db4" +checksum = "a33c2bf77f2df06183c3aa30d1e96c0695a313d4f9c453cc3762a6db39f99200" dependencies = [ "cfg-if", "crossbeam-utils", @@ -102,9 +259,9 @@ dependencies = [ [[package]] name = "crossbeam-deque" -version = "0.8.1" +version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6455c0ca19f0d2fbf751b908d5c55c1f5cbc65e03c4225427254b46890bdde1e" +checksum = "ce6fd6f855243022dcecf8702fef0c297d4338e226845fe067f6341ad9fa0cef" dependencies = [ "cfg-if", "crossbeam-epoch", @@ -113,22 +270,22 @@ dependencies = 
[ [[package]] name = "crossbeam-epoch" -version = "0.9.5" +version = "0.9.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4ec02e091aa634e2c3ada4a392989e7c3116673ef0ac5b72232439094d73b7fd" +checksum = "ae211234986c545741a7dc064309f67ee1e5ad243d0e48335adc0484d960bcc7" dependencies = [ + "autocfg", "cfg-if", "crossbeam-utils", - "lazy_static", "memoffset", "scopeguard", ] [[package]] name = "crossbeam-queue" -version = "0.3.2" +version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b10ddc024425c88c2ad148c1b0fd53f4c6d38db9697c9f1588381212fa657c9" +checksum = "d1cfb3ea8a53f37c40dea2c7bedcbd88bdfae54f5e2175d6ecaff1c988353add" dependencies = [ "cfg-if", "crossbeam-utils", @@ -136,12 +293,11 @@ dependencies = [ [[package]] name = "crossbeam-utils" -version = "0.8.8" +version = "0.8.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0bf124c720b7686e3c2663cf54062ab0f68a88af2fb6a030e87e30bf721fcb38" +checksum = "5a22b2d63d4d1dc0b7f1b6b2747dd0088008a9be28b6ddf0b1e7d335e3037294" dependencies = [ "cfg-if", - "lazy_static", ] [[package]] @@ -171,9 +327,167 @@ dependencies = [ [[package]] name = "either" -version = "1.6.1" +version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457" +checksum = "a26ae43d7bcc3b814de94796a5e736d4029efb0ee900c12e2d54c993ad1a1e07" + +[[package]] +name = "errno" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "136526188508e25c6fef639d7927dfb3e0e3084488bf202267829cf7fc23dbdd" +dependencies = [ + "errno-dragonfly", + "libc", + "windows-sys", +] + +[[package]] +name = "errno-dragonfly" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf" +dependencies = [ + "cc", + "libc", +] + +[[package]] 
+name = "event-listener" +version = "2.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0206175f82b8d6bf6652ff7d71a1e27fd2e4efde587fd368662814d6ec1d9ce0" + +[[package]] +name = "fastrand" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e51093e27b0797c359783294ca4f0a911c270184cb10f85783b118614a1501be" +dependencies = [ + "instant", +] + +[[package]] +name = "futures" +version = "0.3.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "23342abe12aba583913b2e62f22225ff9c950774065e4bfb61a19cd9770fec40" +dependencies = [ + "futures-channel", + "futures-core", + "futures-executor", + "futures-io", + "futures-sink", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-channel" +version = "0.3.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "955518d47e09b25bbebc7a18df10b81f0c766eaf4c4f1cccef2fca5f2a4fb5f2" +dependencies = [ + "futures-core", + "futures-sink", +] + +[[package]] +name = "futures-core" +version = "0.3.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4bca583b7e26f571124fe5b7561d49cb2868d79116cfa0eefce955557c6fee8c" + +[[package]] +name = "futures-executor" +version = "0.3.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ccecee823288125bd88b4d7f565c9e58e41858e47ab72e8ea2d64e93624386e0" +dependencies = [ + "futures-core", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-io" +version = "0.3.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fff74096e71ed47f8e023204cfd0aa1289cd54ae5430a9523be060cdb849964" + +[[package]] +name = "futures-lite" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49a9d51ce47660b1e808d3c990b4709f2f415d928835a17dfd16991515c46bce" +dependencies = [ + "fastrand", + "futures-core", + "futures-io", + 
"memchr", + "parking", + "pin-project-lite", + "waker-fn", +] + +[[package]] +name = "futures-macro" +version = "0.3.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "89ca545a94061b6365f2c7355b4b32bd20df3ff95f02da9329b34ccc3bd6ee72" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.37", +] + +[[package]] +name = "futures-sink" +version = "0.3.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f43be4fe21a13b9781a69afa4985b0f6ee0e1afab2c6f454a8cf30e2b2237b6e" + +[[package]] +name = "futures-task" +version = "0.3.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76d3d132be6c0e6aa1534069c705a74a5997a356c0dc2f86a47765e5617c5b65" + +[[package]] +name = "futures-timer" +version = "3.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e64b03909df88034c26dc1547e8970b91f98bdb65165d6a4e9110d94263dbb2c" + +[[package]] +name = "futures-util" +version = "0.3.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26b01e40b772d54cf6c6d721c1d1abd0647a0106a12ecaa1c186273392a69533" +dependencies = [ + "futures-channel", + "futures-core", + "futures-io", + "futures-macro", + "futures-sink", + "futures-task", + "memchr", + "pin-project-lite", + "pin-utils", + "slab", +] + +[[package]] +name = "gloo-timers" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b995a66bb87bebce9a0f4a95aed01daca4872c050bfcb21653361c03bc35e5c" +dependencies = [ + "futures-channel", + "futures-core", + "js-sys", + "wasm-bindgen", +] [[package]] name = "heck" @@ -193,6 +507,12 @@ dependencies = [ "libc", ] +[[package]] +name = "hermit-abi" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d77f7ec81a6d05a3abb01ab6eb7590f6083d08449fe5a1c8b1e620283546ccb7" + [[package]] name = "instant" version = "0.1.12" @@ -202,6 +522,26 @@ dependencies = [ "cfg-if", ] 
+[[package]] +name = "io-lifetimes" +version = "1.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eae7b9aee968036d54dce06cebaefd919e4472e753296daccd6d344e3e2df0c2" +dependencies = [ + "hermit-abi 0.3.3", + "libc", + "windows-sys", +] + +[[package]] +name = "js-sys" +version = "0.3.64" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c5f195fe497f702db0f318b07fdd68edb16955aed830df8363d837542f8f935a" +dependencies = [ + "wasm-bindgen", +] + [[package]] name = "kernel32-sys" version = "0.2.2" @@ -212,6 +552,15 @@ dependencies = [ "winapi-build", ] +[[package]] +name = "kv-log-macro" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0de8b303297635ad57c9f5059fd9cee7a47f8e8daa09df0fcd07dd39fb22977f" +dependencies = [ + "log", +] + [[package]] name = "lazy_static" version = "1.4.0" @@ -220,33 +569,46 @@ checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" [[package]] name = "libc" -version = "0.2.108" +version = "0.2.148" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8521a1b57e76b1ec69af7599e75e38e7b7fad6610f037db8c79b127201b5d119" +checksum = "9cdc71e17332e86d2e1d38c1f99edcb6288ee11b815fb1a4b049eaa2114d369b" + +[[package]] +name = "linux-raw-sys" +version = "0.3.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519" [[package]] name = "lock_api" -version = "0.4.5" +version = "0.4.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "712a4d093c9976e24e7dbca41db895dabcbac38eb5f4045393d17a95bdfb1109" +checksum = "c1cc9717a20b1bb222f333e6a92fd32f7d8a18ddc5a3191a11af45dcbf4dcd16" dependencies = [ + "autocfg", "scopeguard", ] [[package]] name = "log" -version = "0.4.14" +version = "0.4.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"51b9bbe6c47d51fc3e1a9b945965946b4c44142ab8792c50835a980d362c2710" +checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" dependencies = [ - "cfg-if", + "value-bag", ] [[package]] -name = "memoffset" -version = "0.6.4" +name = "memchr" +version = "2.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "59accc507f1338036a0477ef61afdae33cde60840f4dfe481319ce3ad116ddf9" +checksum = "8f232d6ef707e1956a43342693d2a31e72989554d58299d7a88738cc95b0d35c" + +[[package]] +name = "memoffset" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a634b1c61a95585bd15607c6ab0c4e5b226e695ff2800ba0cdccddf208c406c" dependencies = [ "autocfg", ] @@ -275,32 +637,44 @@ dependencies = [ [[package]] name = "ntapi" -version = "0.3.6" +version = "0.3.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f6bb902e437b6d86e03cce10a7e2af662292c5dfef23b65899ea3ac9354ad44" +checksum = "c28774a7fd2fbb4f0babd8237ce554b73af68021b5f695a3cebd6c59bac0980f" dependencies = [ "winapi 0.3.9", ] [[package]] name = "num-traits" -version = "0.2.14" +version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a64b1ec5cda2586e284722486d802acf1f7dbdc623e2bfc57e65ca1cd099290" +checksum = "f30b0abd723be7e2ffca1272140fac1a2f084c77ec3e123c192b66af1ee9e6c2" dependencies = [ "autocfg", ] [[package]] name = "num_cpus" -version = "1.13.0" +version = "1.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05499f3756671c15885fee9034446956fff3f243d6077b91e5767df161f766b3" +checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43" dependencies = [ - "hermit-abi", + "hermit-abi 0.3.3", "libc", ] +[[package]] +name = "once_cell" +version = "1.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d" + +[[package]] +name = 
"parking" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14f2252c834a40ed9bb5422029649578e63aa341ac401f74e719dd1afda8394e" + [[package]] name = "parking_lot" version = "0.11.2" @@ -314,9 +688,9 @@ dependencies = [ [[package]] name = "parking_lot_core" -version = "0.8.5" +version = "0.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d76e8e1493bcac0d2766c42737f34458f1c8c50c0d23bcb24ea953affb273216" +checksum = "60a2cfe6f0ad2bfc16aefa463b497d5c7a5ecd44a23efa72aa342d90177356dc" dependencies = [ "cfg-if", "instant", @@ -327,12 +701,31 @@ dependencies = [ ] [[package]] -name = "pest" -version = "2.1.3" +name = "pin-project-lite" +version = "0.2.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "10f4872ae94d7b90ae48754df22fd42ad52ce740b8f370b03da4835417403e53" +checksum = "8afb450f006bf6385ca15ef45d71d2288452bc3683ce2e2cacc0d18e4be60b58" + +[[package]] +name = "pin-utils" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" + +[[package]] +name = "polling" +version = "2.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b2d323e8ca7996b3e23126511a523f7e62924d93ecd5ae73b333815b0eb3dce" dependencies = [ - "ucd-trie", + "autocfg", + "bitflags", + "cfg-if", + "concurrent-queue", + "libc", + "log", + "pin-project-lite", + "windows-sys", ] [[package]] @@ -344,7 +737,7 @@ dependencies = [ "proc-macro-error-attr", "proc-macro2", "quote", - "syn", + "syn 1.0.109", "version_check", ] @@ -361,67 +754,75 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.32" +version = "1.0.67" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba508cc11742c0dc5c1659771673afbab7a0efab23aa17e854cbab0837ed0b43" +checksum = "3d433d9f1a3e8c1263d9456598b16fec66f4acc9a74dacffd35c7bb09b3a1328" dependencies = 
[ - "unicode-xid", + "unicode-ident", ] [[package]] name = "quote" -version = "1.0.10" +version = "1.0.33" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38bc8cc6a5f2e3655e0899c1b848643b2562f853f114bfec7be120678e3ace05" +checksum = "5267fca4496028628a95160fc423a33e8b2e6af8a5302579e322e4b520293cae" dependencies = [ "proc-macro2", ] [[package]] name = "rayon" -version = "1.5.1" +version = "1.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c06aca804d41dbc8ba42dfd964f0d01334eceb64314b9ecf7c5fad5188a06d90" +checksum = "9c27db03db7734835b3f53954b534c91069375ce6ccaa2e065441e07d9b6cdb1" dependencies = [ - "autocfg", - "crossbeam-deque", "either", "rayon-core", ] [[package]] name = "rayon-core" -version = "1.9.1" +version = "1.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d78120e2c850279833f1dd3582f730c4ab53ed95aeaaaa862a2a5c71b1656d8e" +checksum = "5ce3fb6ad83f861aac485e76e1985cd109d9a3713802152be56c3b1f0e0658ed" dependencies = [ - "crossbeam-channel", "crossbeam-deque", "crossbeam-utils", - "lazy_static", - "num_cpus", ] [[package]] name = "redox_syscall" -version = "0.2.10" +version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8383f39639269cde97d255a32bdb68c047337295414940c68bdd30c2e13203ff" +checksum = "fb5a58c1855b4b6819d59012155603f0b22ad30cad752600aadfcb695265519a" dependencies = [ "bitflags", ] [[package]] name = "rstest" -version = "0.10.0" +version = "0.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "041bb0202c14f6a158bbbf086afb03d0c6e975c2dec7d4912f8061ed44f290af" +checksum = "b939295f93cb1d12bc1a83cf9ee963199b133fb8a79832dd51b68bb9f59a04dc" +dependencies = [ + "async-std", + "futures", + "futures-timer", + "rstest_macros", + "rustc_version", +] + +[[package]] +name = "rstest_macros" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"f78aba848123782ba59340928ec7d876ebe745aa0365d6af8a630f19a5c16116" dependencies = [ "cfg-if", "proc-macro2", "quote", "rustc_version", - "syn", + "syn 1.0.109", ] [[package]] @@ -443,42 +844,44 @@ dependencies = [ [[package]] name = "rustc_version" -version = "0.3.3" +version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f0dfe2087c51c460008730de8b57e6a320782fbfb312e1f4d520e6c6fae155ee" +checksum = "bfa0f585226d2e68097d4f95d113b15b83a82e819ab25717ec0590d9584ef366" dependencies = [ "semver", ] [[package]] -name = "rustversion" -version = "1.0.5" +name = "rustix" +version = "0.37.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61b3909d758bb75c79f23d4736fac9433868679d3ad2ea7a61e3c25cfda9a088" +checksum = "4d69718bf81c6127a49dc64e44a742e8bb9213c0ff8869a22c308f84c1d4ab06" +dependencies = [ + "bitflags", + "errno", + "io-lifetimes", + "libc", + "linux-raw-sys", + "windows-sys", +] + +[[package]] +name = "rustversion" +version = "1.0.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7ffc183a10b4478d04cbbbfc96d0873219d962dd5accaff2ffbd4ceb7df837f4" [[package]] name = "scopeguard" -version = "1.1.0" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" [[package]] name = "semver" -version = "0.11.0" +version = "1.0.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f301af10236f6df4160f7c3f04eec6dbc70ace82d23326abad5edee88801c6b6" -dependencies = [ - "semver-parser", -] - -[[package]] -name = "semver-parser" -version = "0.10.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "00b0bef5b7f9e0df16536d3961cfb6e84331c065b4066afb39768d0e319411f7" -dependencies = [ - "pest", -] +checksum = 
"ad977052201c6de01a8ef2aa3378c4bd23217a056337d1d6da40468d267a4fb0" [[package]] name = "signal-hook" @@ -493,18 +896,37 @@ dependencies = [ [[package]] name = "signal-hook-registry" -version = "1.4.0" +version = "1.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e51e73328dc4ac0c7ccbda3a494dfa03df1de2f46018127f60c693f2648455b0" +checksum = "d8229b473baa5980ac72ef434c4415e70c4b5e71b423043adb4ba059f89c99a1" dependencies = [ "libc", ] [[package]] -name = "smallvec" -version = "1.7.0" +name = "slab" +version = "0.4.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ecab6c735a6bb4139c0caafd0cc3635748bbb3acf4550e8138122099251f309" +checksum = "8f92a496fb766b417c996b9c5e57daf2f7ad3b0bebe1ccfca4856390e3d3bb67" +dependencies = [ + "autocfg", +] + +[[package]] +name = "smallvec" +version = "1.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "942b4a808e05215192e39f4ab80813e599068285906cc91aa64f923db842bd5a" + +[[package]] +name = "socket2" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "64a4a911eed85daf18834cfaa86a79b7d266ff93ff5ba14005426219480ed662" +dependencies = [ + "libc", + "winapi 0.3.9", +] [[package]] name = "strsim" @@ -514,9 +936,9 @@ checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a" [[package]] name = "structopt" -version = "0.3.25" +version = "0.3.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "40b9788f4202aa75c240ecc9c15c65185e6a39ccdeb0fd5d008b98825464c87c" +checksum = "0c6b5c64445ba8094a6ab0c3cd2ad323e07171012d9c98b0b15651daf1787a10" dependencies = [ "clap", "lazy_static", @@ -533,25 +955,36 @@ dependencies = [ "proc-macro-error", "proc-macro2", "quote", - "syn", + "syn 1.0.109", ] [[package]] name = "syn" -version = "1.0.82" +version = "1.0.109" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"8daf5dd0bb60cbd4137b1b587d2fc0ae729bc07cf01cd70b36a1ed5ade3b9d59" +checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" dependencies = [ "proc-macro2", "quote", - "unicode-xid", + "unicode-ident", +] + +[[package]] +name = "syn" +version = "2.0.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7303ef2c05cd654186cb250d29049a24840ca25d2747c25c0381c8d9e2f582e8" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", ] [[package]] name = "tabular" -version = "0.1.4" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e7e35bee02dcefe64a74065b6b869d241eab1a02fea0d65e6074ce4e51894c3b" +checksum = "d9a2882c514780a1973df90de9d68adcd8871bacc9a6331c3f28e6d2ff91a3d1" dependencies = [ "unicode-width", ] @@ -566,28 +999,28 @@ dependencies = [ ] [[package]] -name = "ucd-trie" -version = "0.1.3" +name = "unicode-ident" +version = "1.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "56dee185309b50d1f11bfedef0fe6d036842e3fb77413abef29f8f8d1c5d4c1c" +checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" [[package]] name = "unicode-segmentation" -version = "1.8.0" +version = "1.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8895849a949e7845e06bd6dc1aa51731a103c42707010a5b591c0038fb73385b" +checksum = "1dd624098567895118886609431a7c3b8f516e41d30e0643f03d94592a147e36" [[package]] name = "unicode-width" -version = "0.1.9" +version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3ed742d4ea2bd1176e236172c8429aaf54486e7ac098db29ffe6529e0ce50973" +checksum = "e51733f11c9c4f72aa0c160008246859e340b00807569a0da0e7a1079b27ba85" [[package]] -name = "unicode-xid" -version = "0.2.2" +name = "value-bag" +version = "1.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"8ccb82d61f80a663efe1f787a51b16b5a51e3314d6ac365b08639f52387b33f3" +checksum = "d92ccd67fb88503048c01b59152a04effd0782d035a83a6d256ce6085f08f4a3" [[package]] name = "vec_map" @@ -597,9 +1030,91 @@ checksum = "f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191" [[package]] name = "version_check" -version = "0.9.3" +version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5fecdca9a5291cc2b8dcf7dc02453fee791a280f3743cb0905f8822ae463b3fe" +checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" + +[[package]] +name = "waker-fn" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d5b2c62b4012a3e1eca5a7e077d13b3bf498c4073e33ccd58626607748ceeca" + +[[package]] +name = "wasm-bindgen" +version = "0.2.87" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7706a72ab36d8cb1f80ffbf0e071533974a60d0a308d01a5d0375bf60499a342" +dependencies = [ + "cfg-if", + "wasm-bindgen-macro", +] + +[[package]] +name = "wasm-bindgen-backend" +version = "0.2.87" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5ef2b6d3c510e9625e5fe6f509ab07d66a760f0885d858736483c32ed7809abd" +dependencies = [ + "bumpalo", + "log", + "once_cell", + "proc-macro2", + "quote", + "syn 2.0.37", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-futures" +version = "0.4.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c02dbc21516f9f1f04f187958890d7e6026df8d16540b7ad9492bc34a67cea03" +dependencies = [ + "cfg-if", + "js-sys", + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.87" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dee495e55982a3bd48105a7b947fd2a9b4a8ae3010041b9e0faab3f9cd028f1d" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.87" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "54681b18a46765f095758388f2d0cf16eb8d4169b639ab575a8f5693af210c7b" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.37", + "wasm-bindgen-backend", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.87" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca6ad05a4870b2bf5fe995117d3728437bd27d7cd5f06f13c17443ef369775a1" + +[[package]] +name = "web-sys" +version = "0.3.64" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b85cbef8c220a6abc02aefd892dfc0fc23afb1c6a426316ec33253a3877249b" +dependencies = [ + "js-sys", + "wasm-bindgen", +] [[package]] name = "winapi" @@ -634,3 +1149,69 @@ name = "winapi-x86_64-pc-windows-gnu" version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + +[[package]] +name = "windows-sys" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-targets" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" + +[[package]] +name = "windows_i686_gnu" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" + +[[package]] +name = "windows_i686_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" diff --git a/src/rust/rust-stream/Cargo.toml b/src/rust/rust-stream/Cargo.toml index 8ac456f..2478518 100644 --- a/src/rust/rust-stream/Cargo.toml +++ b/src/rust/rust-stream/Cargo.toml @@ -1,25 +1,25 @@ [package] name = "rust-stream" -version = "4.0.0" +version = "5.0.0" authors = ["Wei-Chen Lin "] edition = "2018" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -num-traits = "0.2.14" -structopt = "0.3.13" -tabular = "0.1.4" -rayon = "1.5.1" -crossbeam = "0.8.1" -num_cpus = "1.13.0" -rustversion = "1.0" -libc = "0.2.97" +num-traits = "0.2.15" +structopt = "0.3.26" +tabular = "0.2.0" +rayon = "1.5.3" +crossbeam = "0.8.2" +num_cpus = "1.13.1" +rustversion = "1.0.9" +libc = "0.2.134" core_affinity = "0.5.10" colour = "0.6.0" [dev-dependencies] -rstest = "0.10.0" +rstest = "0.13.0" [build-dependencies] rustversion 
= "1.0" diff --git a/src/rust/rust-stream/rustfmt.toml b/src/rust/rust-stream/rustfmt.toml index aa2f0e9..66b6235 100644 --- a/src/rust/rust-stream/rustfmt.toml +++ b/src/rust/rust-stream/rustfmt.toml @@ -54,7 +54,7 @@ use_field_init_shorthand = false force_explicit_abi = true condense_wildcard_suffixes = false color = "Auto" -required_version = "1.4.38" +required_version = "1.6.0" unstable_features = false disable_all_formatting = false skip_children = false diff --git a/src/rust/rust-stream/src/lib.rs b/src/rust/rust-stream/src/lib.rs index 3ac72c3..41ac0c2 100644 --- a/src/rust/rust-stream/src/lib.rs +++ b/src/rust/rust-stream/src/lib.rs @@ -174,7 +174,7 @@ where StreamData: RustStream { ); } - stream.init_arrays(); + let init = stream.run_init_arrays(); let tabulate = |xs: &Vec, name: &str, t_size: usize| -> Vec<(&str, String)> { let tail = &xs[1..]; // tail only @@ -235,10 +235,47 @@ where StreamData: RustStream { }; }; + let show_setup = |init: Duration, read: Duration| { + let setup = vec![ + ("Init", init.as_secs_f64(), 3 * array_bytes), + ("Read", read.as_secs_f64(), 3 * array_bytes), + ]; + if option.csv { + tabulate_all( + setup + .iter() + .map(|(name, elapsed, t_size)| { + vec![ + ("phase", name.to_string()), + ("n_elements", option.arraysize.to_string()), + ("sizeof", t_size.to_string()), + ( + if option.mibibytes { "max_mibytes_per_sec" } else { "max_mbytes_per_sec" }, + (mega_scale * (*t_size as f64) / elapsed).to_string(), + ), + ("runtime", elapsed.to_string()), + ] + }) + .collect::>(), + ); + } else { + for (name, elapsed, t_size) in setup { + println!( + "{}: {:.5} s (={:.5} {})", + name, + elapsed, + mega_scale * (t_size as f64) / elapsed, + if option.mibibytes { "MiBytes/sec" } else { "MBytes/sec" } + ); + } + } + }; + let solutions_correct = match benchmark { Benchmark::All => { let (results, sum) = stream.run_all(option.numtimes); - stream.read_arrays(); + let read = stream.run_read_arrays(); + show_setup(init, read); let correct = 
check_solution(benchmark, option.numtimes, &stream, Some(sum)); tabulate_all(vec![ tabulate(&results.copy, "Copy", 2 * array_bytes), @@ -251,14 +288,16 @@ where StreamData: RustStream { } Benchmark::NStream => { let results = stream.run_nstream(option.numtimes); - stream.read_arrays(); + let read = stream.run_read_arrays(); + show_setup(init, read); let correct = check_solution(benchmark, option.numtimes, &stream, None); tabulate_all(vec![tabulate(&results, "Nstream", 4 * array_bytes)]); correct } Benchmark::Triad => { let results = stream.run_triad(option.numtimes); - stream.read_arrays(); + let read = stream.run_read_arrays(); + show_setup(init, read); let correct = check_solution(benchmark, option.numtimes, &stream, None); let total_bytes = 3 * array_bytes * option.numtimes; let bandwidth = giga_scale * (total_bytes as f64 / results.as_secs_f64()); diff --git a/src/rust/rust-stream/src/stream.rs b/src/rust/rust-stream/src/stream.rs index 560c6f1..86de56b 100644 --- a/src/rust/rust-stream/src/stream.rs +++ b/src/rust/rust-stream/src/stream.rs @@ -132,6 +132,18 @@ pub trait RustStream { fn nstream(&mut self); fn dot(&mut self) -> T; + fn run_init_arrays(&mut self) -> Duration { + timed(|| { + self.init_arrays(); + }) + } + + fn run_read_arrays(&mut self) -> Duration { + timed(|| { + self.read_arrays(); + }) + } + fn run_all(&mut self, n: usize) -> (AllTiming>, T) { let mut timings: AllTiming> = AllTiming { copy: vec![Duration::default(); n], diff --git a/src/rust/rust-stream/tests/integration_test.rs b/src/rust/rust-stream/tests/integration_test.rs index 8031a79..0170546 100644 --- a/src/rust/rust-stream/tests/integration_test.rs +++ b/src/rust/rust-stream/tests/integration_test.rs @@ -2,10 +2,10 @@ use rstest::rstest; #[rstest] fn test_main( - #[values(0, 1, 2, 3, 4)] device: usize, // - #[values("", "--pin")] pin: &str, // - #[values("", "--malloc")] malloc: &str, // - #[values("", "--init")] init: &str, // + #[values(0, 1, 2, 3, 4)] device: usize, // + 
#[values("", "--pin")] pin: &str, // + #[values("", "--malloc")] malloc: &str, // + #[values("", "--init")] init: &str, // #[values("", "--triad-only", "--nstream-only")] option: &str, // ) { let line = format!( diff --git a/src/scala/scala-stream/.bsp/sbt.json b/src/scala/scala-stream/.bsp/sbt.json deleted file mode 100644 index 2e1edb1..0000000 --- a/src/scala/scala-stream/.bsp/sbt.json +++ /dev/null @@ -1 +0,0 @@ -{"name":"sbt","version":"1.5.2","bspVersion":"2.0.0-M5","languages":["scala"],"argv":["/usr/lib/jvm/java-11-openjdk-11.0.11.0.9-2.fc33.x86_64/bin/java","-Xms100m","-Xmx100m","-classpath","/home/tom/.local/share/JetBrains/Toolbox/apps/IDEA-U/ch-0/211.7142.45.plugins/Scala/launcher/sbt-launch.jar","xsbt.boot.Boot","-bsp","--sbt-launch-jar=/home/tom/.local/share/JetBrains/Toolbox/apps/IDEA-U/ch-0/211.7142.45.plugins/Scala/launcher/sbt-launch.jar"]} \ No newline at end of file diff --git a/src/scala/scala-stream/.gitignore b/src/scala/scala-stream/.gitignore index 2f7896d..ee5cda2 100644 --- a/src/scala/scala-stream/.gitignore +++ b/src/scala/scala-stream/.gitignore @@ -1 +1,2 @@ target/ +.bsp/ diff --git a/src/scala/scala-stream/.scalafmt.conf b/src/scala/scala-stream/.scalafmt.conf index 8c7d0c8..5d87df3 100644 --- a/src/scala/scala-stream/.scalafmt.conf +++ b/src/scala/scala-stream/.scalafmt.conf @@ -1,4 +1,4 @@ -version = "3.0.0-RC2" +version = "3.7.14" runner.dialect = scala3 style = defaultWithAlign diff --git a/src/scala/scala-stream/build.sbt b/src/scala/scala-stream/build.sbt index 49164f6..2513b53 100644 --- a/src/scala/scala-stream/build.sbt +++ b/src/scala/scala-stream/build.sbt @@ -3,14 +3,19 @@ lazy val mainCls = Some("scalastream.App") lazy val root = (project in file(".")) .enablePlugins(NativeImagePlugin) .settings( - scalaVersion := "3.0.0", - version := "4.0", + scalaVersion := "3.3.1", + version := "5.0", organization := "uk.ac.bristol.uob-hpc", organizationName := "University of Bristol", Compile / mainClass := mainCls, assembly / 
mainClass := mainCls, scalacOptions ~= filterConsoleScalacOptions, assembly / assemblyJarName := "scala-stream.jar", + assembly / assemblyMergeStrategy := { + case PathList("module-info.class") => MergeStrategy.discard + case PathList("META-INF", "versions", xs @ _, "module-info.class") => MergeStrategy.discard + case x => (ThisBuild / assemblyMergeStrategy).value(x) + }, nativeImageOptions := Seq( "--no-fallback", "-H:ReflectionConfigurationFiles=../../reflect-config.json" @@ -22,8 +27,8 @@ lazy val root = (project in file(".")) // Lazy val implementation in Scala 3 triggers an exception in nativeImage, use 2_13 for arg parsing for now otherwise we can't get to the benchmarking part ("com.github.scopt" %% "scopt" % "4.0.1").cross(CrossVersion.for3Use2_13), // par also uses lazy val at some point, so it doesn't work in nativeImage - "org.scala-lang.modules" %% "scala-parallel-collections" % "1.0.3", - "net.openhft" % "affinity" % "3.21ea1", - "org.slf4j" % "slf4j-simple" % "1.7.30" // for affinity + "org.scala-lang.modules" %% "scala-parallel-collections" % "1.0.4", + "net.openhft" % "affinity" % "3.23.2", + "org.slf4j" % "slf4j-simple" % "2.0.5" // for affinity ) ) diff --git a/src/scala/scala-stream/project/build.properties b/src/scala/scala-stream/project/build.properties index 19479ba..875b706 100644 --- a/src/scala/scala-stream/project/build.properties +++ b/src/scala/scala-stream/project/build.properties @@ -1 +1 @@ -sbt.version=1.5.2 +sbt.version=1.9.2 diff --git a/src/scala/scala-stream/project/plugins.sbt b/src/scala/scala-stream/project/plugins.sbt index 2c82902..35a00f0 100644 --- a/src/scala/scala-stream/project/plugins.sbt +++ b/src/scala/scala-stream/project/plugins.sbt @@ -1,6 +1,6 @@ addSbtPlugin("com.timushev.sbt" % "sbt-updates" % "0.5.3") -addSbtPlugin("io.github.davidgregory084" % "sbt-tpolecat" % "0.1.17") +addSbtPlugin("io.github.davidgregory084" % "sbt-tpolecat" % "0.1.20") addSbtPlugin("org.scalameta" % "sbt-native-image" % "0.3.0") 
-addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.15.0") +addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "2.1.3") addSbtPlugin("ch.epfl.scala" % "sbt-scalafix" % "0.9.27") -addSbtPlugin("org.scalameta" % "sbt-scalafmt" % "2.4.2") +addSbtPlugin("org.scalameta" % "sbt-scalafmt" % "2.4.3") diff --git a/src/scala/scala-stream/src/main/scala/scalastream/ScalaStream.scala b/src/scala/scala-stream/src/main/scala/scalastream/ScalaStream.scala index 9c011a6..8f247b6 100644 --- a/src/scala/scala-stream/src/main/scala/scalastream/ScalaStream.scala +++ b/src/scala/scala-stream/src/main/scala/scalastream/ScalaStream.scala @@ -14,6 +14,7 @@ transparent trait ScalaStream[@specialized(Float, Double) A]: def config: Config[A] def initArrays(): Unit + def readArrays(): Unit = () def copy(): Unit def mul(): Unit def add(): Unit @@ -27,6 +28,8 @@ transparent trait ScalaStream[@specialized(Float, Double) A]: val end = System.nanoTime() FiniteDuration(end - start, TimeUnit.NANOSECONDS) -> r + inline def runInitArrays(): FiniteDuration = timed(initArrays())._1 + inline def runReadArrays(): FiniteDuration = timed(readArrays())._1 inline def runAll(times: Int)(using Fractional[A]): (Timings[Vector[FiniteDuration]], A) = val copy = ArrayBuffer.fill[FiniteDuration](times)(Duration.Zero) val mul = ArrayBuffer.fill[FiniteDuration](times)(Duration.Zero) @@ -62,7 +65,6 @@ transparent trait ScalaStream[@specialized(Float, Double) A]: def data(): Data[A] - trait Fractional[@specialized(Double, Float) A]: def toFractional(f: Float): A def toFractional(f: Double): A @@ -77,13 +79,13 @@ trait Fractional[@specialized(Double, Float) A]: extension (x: Int) inline def fractional = toFractional(x.toFloat) extension (x: Long) inline def fractional = toFractional(x.toDouble) extension (x: A) - inline def +(y: A) = add(x, y) - inline def -(y: A) = sub(x, y) - inline def *(y: A) = mul(x, y) - inline def /(y: A) = div(x, y) - inline def >(y: A) = compare(x, y) > 0 - inline def <(y: A) = compare(x, y) < 0 - 
inline def abs_ = abs(x) + inline def +(y: A) = add(x, y) + inline def -(y: A) = sub(x, y) + inline def *(y: A) = mul(x, y) + inline def /(y: A) = div(x, y) + inline def >(y: A) = compare(x, y) > 0 + inline def <(y: A) = compare(x, y) < 0 + inline def abs_ = abs(x) end Fractional given FloatFractional: Fractional[Float] with @@ -108,7 +110,7 @@ given DoubleFractional: Fractional[Double] with object App: - final val Version: String = "4.0" + final val Version: String = "5.0" case class Config[@specialized(Double, Float) A]( options: Options, @@ -204,7 +206,7 @@ object App: validateXs("c", vec.c, goldC) dotSum.foreach { sum => - val goldSum = (goldA * goldB) * (config.options.arraysize).fractional + val goldSum = (goldA * goldB) * config.options.arraysize.fractional val error = ((sum - goldSum) / goldSum).abs_ if error > 1.fractional / 100000000.fractional then Console.err.println( @@ -238,10 +240,10 @@ object App: ) println(s"Running ${config.benchmark match { - case Benchmark.All => "kernels" - case Benchmark.Triad => "triad" - case Benchmark.NStream => "nstream" - }} ${opt.numtimes} times") + case Benchmark.All => "kernels" + case Benchmark.Triad => "triad" + case Benchmark.NStream => "nstream" + }} ${opt.numtimes} times") if config.benchmark == Benchmark.Triad then println(s"Number of elements: ${opt.arraysize}") @@ -288,11 +290,38 @@ object App: println(header.map(_._1.padTo(padding, ' ')).mkString(sep)) println(rows.map(_.map(_._2.padTo(padding, ' ')).mkString(sep)).mkString("\n")) + def showInit(init: FiniteDuration, read: FiniteDuration): Unit = { + val setup = + Vector(("Init", init.seconds, 3 * arrayBytes), ("Read", read.seconds, 3 * arrayBytes)) + if opt.csv then + tabulate( + setup.map((name, elapsed, totalBytes) => + Vector( + "phase" -> name, + "n_elements" -> opt.arraysize.toString, + "sizeof" -> arrayBytes.toString, + s"max_m${if opt.mibibytes then "i" else ""}bytes_per_sec" -> + (megaScale * totalBytes.toDouble / elapsed).toString, + "runtime" -> 
elapsed.toString + ) + ): _* + ) + else + for (name, elapsed, totalBytes) <- setup do + println( + f"$name: $elapsed%.5f s (=${megaScale * totalBytes.toDouble / elapsed}%.5f M${ + if opt.mibibytes then "i" else "" + }Bytes/sec)" + ) + } + val stream = mkStream(config) - stream.initArrays() + val init = stream.runInitArrays() config.benchmark match case Benchmark.All => val (results, sum) = stream.runAll(opt.numtimes) + val read = stream.runReadArrays() + showInit(init, read) validate(stream.data(), config, Some(sum)) tabulate( mkRow(results.copy, "Copy", 2 * arrayBytes), @@ -303,10 +332,14 @@ object App: ) case Benchmark.NStream => val result = stream.runNStream(opt.numtimes) + val read = stream.runReadArrays() + showInit(init, read) validate(stream.data(), config) tabulate(mkRow(result, "Nstream", 4 * arrayBytes)) case Benchmark.Triad => - val results = stream.runTriad(opt.numtimes) + val results = stream.runTriad(opt.numtimes) + val read = stream.runReadArrays() + showInit(init, read) val totalBytes = 3 * arrayBytes * opt.numtimes val bandwidth = megaScale * (totalBytes / results.seconds) println(f"Runtime (seconds): ${results.seconds}%.5f") diff --git a/src/std-data/STDDataStream.cpp b/src/std-data/STDDataStream.cpp index 343e247..a234d61 100644 --- a/src/std-data/STDDataStream.cpp +++ b/src/std-data/STDDataStream.cpp @@ -6,64 +6,76 @@ #include "STDDataStream.h" -#include -#include -#include - -// There are three execution policies: -// auto exe_policy = std::execution::seq; -// auto exe_policy = std::execution::par; -auto exe_policy = std::execution::par_unseq; - - template STDDataStream::STDDataStream(const int ARRAY_SIZE, int device) - noexcept : array_size{ARRAY_SIZE}, a(array_size), b(array_size), c(array_size) + noexcept : array_size{ARRAY_SIZE}, + a(alloc_raw(ARRAY_SIZE)), b(alloc_raw(ARRAY_SIZE)), c(alloc_raw(ARRAY_SIZE)) { + std::cout << "Backing storage typeid: " << typeid(a).name() << std::endl; +#ifdef USE_ONEDPL + std::cout << "Using oneDPL backend: 
"; +#if ONEDPL_USE_DPCPP_BACKEND + std::cout << "SYCL USM (device=" << exe_policy.queue().get_device().get_info() << ")"; +#elif ONEDPL_USE_TBB_BACKEND + std::cout << "TBB " TBB_VERSION_STRING; +#elif ONEDPL_USE_OPENMP_BACKEND + std::cout << "OpenMP"; +#else + std::cout << "Default"; +#endif + std::cout << std::endl; +#endif +} + +template +STDDataStream::~STDDataStream() { + dealloc_raw(a); + dealloc_raw(b); + dealloc_raw(c); } template void STDDataStream::init_arrays(T initA, T initB, T initC) { - std::fill(exe_policy, a.begin(), a.end(), initA); - std::fill(exe_policy, b.begin(), b.end(), initB); - std::fill(exe_policy, c.begin(), c.end(), initC); + std::fill(exe_policy, a, a + array_size, initA); + std::fill(exe_policy, b, b + array_size, initB); + std::fill(exe_policy, c, c + array_size, initC); } template void STDDataStream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) { - h_a = a; - h_b = b; - h_c = c; + std::copy(a, a + array_size, h_a.begin()); + std::copy(b, b + array_size, h_b.begin()); + std::copy(c, c + array_size, h_c.begin()); } template void STDDataStream::copy() { // c[i] = a[i] - std::copy(exe_policy, a.begin(), a.end(), c.begin()); + std::copy(exe_policy, a, a + array_size, c); } template void STDDataStream::mul() { // b[i] = scalar * c[i]; - std::transform(exe_policy, c.begin(), c.end(), b.begin(), [scalar = startScalar](T ci){ return scalar*ci; }); + std::transform(exe_policy, c, c + array_size, b, [scalar = startScalar](T ci){ return scalar*ci; }); } template void STDDataStream::add() { // c[i] = a[i] + b[i]; - std::transform(exe_policy, a.begin(), a.end(), b.begin(), c.begin(), std::plus()); + std::transform(exe_policy, a, a + array_size, b, c, std::plus()); } template void STDDataStream::triad() { // a[i] = b[i] + scalar * c[i]; - std::transform(exe_policy, b.begin(), b.end(), c.begin(), a.begin(), [scalar = startScalar](T bi, T ci){ return bi+scalar*ci; }); + std::transform(exe_policy, b, b + array_size, c, a, [scalar = 
startScalar](T bi, T ci){ return bi+scalar*ci; }); } template @@ -73,8 +85,8 @@ void STDDataStream::nstream() // Need to do in two stages with C++11 STL. // 1: a[i] += b[i] // 2: a[i] += scalar * c[i]; - std::transform(exe_policy, a.begin(), a.end(), b.begin(), a.begin(), [](T ai, T bi){ return ai + bi; }); - std::transform(exe_policy, a.begin(), a.end(), c.begin(), a.begin(), [scalar = startScalar](T ai, T ci){ return ai + scalar*ci; }); + std::transform(exe_policy, a, a + array_size, b, a, [](T ai, T bi){ return ai + bi; }); + std::transform(exe_policy, a, a + array_size, c, a, [scalar = startScalar](T ai, T ci){ return ai + scalar*ci; }); } @@ -82,7 +94,7 @@ template T STDDataStream::dot() { // sum = 0; sum += a[i]*b[i]; return sum; - return std::transform_reduce(exe_policy, a.begin(), a.end(), b.begin(), 0.0); + return std::transform_reduce(exe_policy, a, a + array_size, b, T{}); } void listDevices(void) @@ -101,4 +113,3 @@ std::string getDeviceDriver(const int) } template class STDDataStream; template class STDDataStream; - diff --git a/src/std-data/STDDataStream.h b/src/std-data/STDDataStream.h index 741fd6c..65e1ace 100644 --- a/src/std-data/STDDataStream.h +++ b/src/std-data/STDDataStream.h @@ -5,6 +5,7 @@ // source code #pragma once +#include "dpl_shim.h" #include #include @@ -21,14 +22,11 @@ class STDDataStream : public Stream int array_size; // Device side pointers - std::vector a; - std::vector b; - std::vector c; - + T *a, *b, *c; public: STDDataStream(const int, int) noexcept; - ~STDDataStream() = default; + ~STDDataStream(); virtual void copy() override; virtual void add() override; diff --git a/src/std-data/model.cmake b/src/std-data/model.cmake index ef69f30..e9e7099 100644 --- a/src/std-data/model.cmake +++ b/src/std-data/model.cmake @@ -19,15 +19,35 @@ register_flag_optional(NVHPC_OFFLOAD ccall - Compile for all supported compute capabilities" "") +register_flag_optional(USE_TBB + "No-op if ONE_TBB_DIR is set. 
Link against an in-tree oneTBB via FetchContent_Declare, see top level CMakeLists.txt for details." + "OFF") + +register_flag_optional(USE_ONEDPL + "Link oneDPL which implements C++17 executor policies (via execution_policy_tag) for different backends. + + Possible values are: + OPENMP - Implements policies using OpenMP. + CMake will handle any flags needed to enable OpenMP if the compiler supports it. + TBB - Implements policies using TBB. + TBB must be linked via USE_TBB or be available in LD_LIBRARY_PATH. + DPCPP - Implements policies through SYCL2020. + This requires the DPC++ compiler (other SYCL compilers are untested), required SYCL flags are added automatically." + "OFF") + macro(setup) set(CMAKE_CXX_STANDARD 17) - if (NVHPC_OFFLOAD) set(NVHPC_FLAGS -stdpar -gpu=${NVHPC_OFFLOAD}) # propagate flags to linker so that it links with the gpu stuff as well register_append_cxx_flags(ANY ${NVHPC_FLAGS}) register_append_link_flags(${NVHPC_FLAGS}) endif () - - + if (USE_TBB) + register_link_library(TBB::tbb) + endif () + if (USE_ONEDPL) + register_definitions(USE_ONEDPL) + register_link_library(oneDPL) + endif () endmacro() diff --git a/src/std-indices/STDIndicesStream.cpp b/src/std-indices/STDIndicesStream.cpp index 2221f90..fc9f380 100644 --- a/src/std-indices/STDIndicesStream.cpp +++ b/src/std-indices/STDIndicesStream.cpp @@ -6,50 +6,66 @@ #include "STDIndicesStream.h" -#include -#include -#include - -// There are three execution policies: -// auto exe_policy = std::execution::seq; -// auto exe_policy = std::execution::par; -auto exe_policy = std::execution::par_unseq; - +#ifndef ALIGNMENT +#define ALIGNMENT (2*1024*1024) // 2MB +#endif template STDIndicesStream::STDIndicesStream(const int ARRAY_SIZE, int device) - noexcept : array_size{ARRAY_SIZE}, range(0, array_size), a(array_size), b(array_size), c(array_size) +noexcept : array_size{ARRAY_SIZE}, range(0, array_size), + a(alloc_raw(ARRAY_SIZE)), b(alloc_raw(ARRAY_SIZE)), c(alloc_raw(ARRAY_SIZE)) { + std::cout 
<< "Backing storage typeid: " << typeid(a).name() << std::endl; +#ifdef USE_ONEDPL + std::cout << "Using oneDPL backend: "; +#if ONEDPL_USE_DPCPP_BACKEND + std::cout << "SYCL USM (device=" << exe_policy.queue().get_device().get_info() << ")"; +#elif ONEDPL_USE_TBB_BACKEND + std::cout << "TBB " TBB_VERSION_STRING; +#elif ONEDPL_USE_OPENMP_BACKEND + std::cout << "OpenMP"; +#else + std::cout << "Default"; +#endif + std::cout << std::endl; +#endif +} + +template +STDIndicesStream::~STDIndicesStream() { + dealloc_raw(a); + dealloc_raw(b); + dealloc_raw(c); } template void STDIndicesStream::init_arrays(T initA, T initB, T initC) { - std::fill(exe_policy, a.begin(), a.end(), initA); - std::fill(exe_policy, b.begin(), b.end(), initB); - std::fill(exe_policy, c.begin(), c.end(), initC); + std::fill(exe_policy, a, a + array_size, initA); + std::fill(exe_policy, b, b + array_size, initB); + std::fill(exe_policy, c, c + array_size, initC); } template void STDIndicesStream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) { - h_a = a; - h_b = b; - h_c = c; + std::copy(a, a + array_size, h_a.begin()); + std::copy(b, b + array_size, h_b.begin()); + std::copy(c, c + array_size, h_c.begin()); } template void STDIndicesStream::copy() { // c[i] = a[i] - std::copy(exe_policy, a.begin(), a.end(), c.begin()); + std::copy(exe_policy, a, a + array_size, c); } template void STDIndicesStream::mul() { // b[i] = scalar * c[i]; - std::transform(exe_policy, range.begin(), range.end(), b.begin(), [&, scalar = startScalar](int i) { + std::transform(exe_policy, range.begin(), range.end(), b, [c = this->c, scalar = startScalar](int i) { return scalar * c[i]; }); } @@ -58,7 +74,7 @@ template void STDIndicesStream::add() { // c[i] = a[i] + b[i]; - std::transform(exe_policy, range.begin(), range.end(), c.begin(), [&](int i) { + std::transform(exe_policy, range.begin(), range.end(), c, [a = this->a, b = this->b](int i) { return a[i] + b[i]; }); } @@ -67,7 +83,7 @@ template void 
STDIndicesStream::triad() { // a[i] = b[i] + scalar * c[i]; - std::transform(exe_policy, range.begin(), range.end(), a.begin(), [&, scalar = startScalar](int i) { + std::transform(exe_policy, range.begin(), range.end(), a, [b = this->b, c = this->c, scalar = startScalar](int i) { return b[i] + scalar * c[i]; }); } @@ -79,7 +95,7 @@ void STDIndicesStream::nstream() // Need to do in two stages with C++11 STL. // 1: a[i] += b[i] // 2: a[i] += scalar * c[i]; - std::transform(exe_policy, range.begin(), range.end(), a.begin(), [&, scalar = startScalar](int i) { + std::transform(exe_policy, range.begin(), range.end(), a, [a = this->a, b = this->b, c = this->c, scalar = startScalar](int i) { return a[i] + b[i] + scalar * c[i]; }); } @@ -89,7 +105,7 @@ template T STDIndicesStream::dot() { // sum = 0; sum += a[i]*b[i]; return sum; - return std::transform_reduce(exe_policy, a.begin(), a.end(), b.begin(), 0.0); + return std::transform_reduce(exe_policy, a, a + array_size, b, T{}); } void listDevices(void) @@ -108,4 +124,3 @@ std::string getDeviceDriver(const int) } template class STDIndicesStream; template class STDIndicesStream; - diff --git a/src/std-indices/STDIndicesStream.h b/src/std-indices/STDIndicesStream.h index bc068aa..ffab910 100644 --- a/src/std-indices/STDIndicesStream.h +++ b/src/std-indices/STDIndicesStream.h @@ -5,6 +5,7 @@ // source code #pragma once +#include "dpl_shim.h" #include #include @@ -12,40 +13,57 @@ #define IMPLEMENTATION_STRING "STD (index-oriented)" - // A lightweight counting iterator which will be used by the STL algorithms // NB: C++ <= 17 doesn't have this built-in, and it's only added later in ranges-v3 (C++2a) which this // implementation doesn't target template class ranged { - N from, to; public: - ranged(N from, N to ): from(from), to(to) {} - class iterator { - N num; + class iterator { + friend class ranged; public: - using difference_type = N; - using value_type = N; - using pointer = const N*; - using reference = const N&; - using 
iterator_category = std::random_access_iterator_tag; - explicit iterator(N _num = 0) : num(_num) {} + using difference_type = N; + using value_type = N; + using pointer = const N*; + using reference = N; + using iterator_category = std::random_access_iterator_tag; - iterator& operator++() { num++; return *this; } - iterator operator++(int) { iterator retval = *this; ++(*this); return retval; } - iterator operator+(const value_type v) const { return iterator(num + v); } + // XXX This is not part of the iterator spec, it gets picked up by oneDPL if enabled. + // Without this, the DPL SYCL backend collects the iterator data on the host and copies to the device. + // This type is unused for any nother STL impl. + using is_passed_directly = std::true_type; - bool operator==(iterator other) const { return num == other.num; } - bool operator!=(iterator other) const { return *this != other; } - bool operator<(iterator other) const { return num < other.num; } + reference operator *() const { return i_; } + iterator &operator ++() { ++i_; return *this; } + iterator operator ++(int) { iterator copy(*this); ++i_; return copy; } - reference operator*() const { return num;} - difference_type operator-(const iterator &it) const { return num - it.num; } - value_type operator[](const difference_type &i) const { return num + i; } + iterator &operator --() { --i_; return *this; } + iterator operator --(int) { iterator copy(*this); --i_; return copy; } - }; - iterator begin() { return iterator(from); } - iterator end() { return iterator(to >= from? 
to+1 : to-1); } + iterator &operator +=(N by) { i_+=by; return *this; } + + value_type operator[](const difference_type &i) const { return i_ + i; } + + difference_type operator-(const iterator &it) const { return i_ - it.i_; } + iterator operator+(const value_type v) const { return iterator(i_ + v); } + + bool operator ==(const iterator &other) const { return i_ == other.i_; } + bool operator !=(const iterator &other) const { return i_ != other.i_; } + bool operator < (const iterator &other) const { return i_ < other.i_; } + + protected: + explicit iterator(N start) : i_ (start) {} + + private: + N i_; + }; + + [[nodiscard]] iterator begin() const { return begin_; } + [[nodiscard]] iterator end() const { return end_; } + ranged(N begin, N end) : begin_(begin), end_(end) {} +private: + iterator begin_; + iterator end_; }; template @@ -59,14 +77,11 @@ class STDIndicesStream : public Stream ranged range; // Device side pointers - std::vector a; - std::vector b; - std::vector c; - + T *a, *b, *c; public: STDIndicesStream(const int, int) noexcept; - ~STDIndicesStream() = default; + ~STDIndicesStream(); virtual void copy() override; virtual void add() override; diff --git a/src/std-indices/model.cmake b/src/std-indices/model.cmake index ef69f30..60ef575 100644 --- a/src/std-indices/model.cmake +++ b/src/std-indices/model.cmake @@ -19,15 +19,35 @@ register_flag_optional(NVHPC_OFFLOAD ccall - Compile for all supported compute capabilities" "") +register_flag_optional(USE_TBB + "Link against an in-tree oneTBB via FetchContent_Declare, see top level CMakeLists.txt for details." + "OFF") + +register_flag_optional(USE_ONEDPL + "Link oneDPL which implements C++17 executor policies (via execution_policy_tag) for different backends. + + Possible values are: + OPENMP - Implements policies using OpenMP. + CMake will handle any flags needed to enable OpenMP if the compiler supports it. + TBB - Implements policies using TBB. 
+ TBB must be linked via USE_TBB or be available in LD_LIBRARY_PATH. + DPCPP - Implements policies through SYCL2020. + This requires the DPC++ compiler (other SYCL compilers are untested), required SYCL flags are added automatically." + "OFF") + macro(setup) set(CMAKE_CXX_STANDARD 17) - if (NVHPC_OFFLOAD) set(NVHPC_FLAGS -stdpar -gpu=${NVHPC_OFFLOAD}) # propagate flags to linker so that it links with the gpu stuff as well register_append_cxx_flags(ANY ${NVHPC_FLAGS}) register_append_link_flags(${NVHPC_FLAGS}) endif () - - + if (USE_TBB) + register_link_library(TBB::tbb) + endif () + if (USE_ONEDPL) + register_definitions(USE_ONEDPL) + register_link_library(oneDPL) + endif () endmacro() diff --git a/src/std-ranges/STDRangesStream.cpp b/src/std-ranges/STDRangesStream.cpp index de61528..b29d0c4 100644 --- a/src/std-ranges/STDRangesStream.cpp +++ b/src/std-ranges/STDRangesStream.cpp @@ -5,25 +5,45 @@ // source code #include "STDRangesStream.hpp" - -#include -#include #include +#ifndef ALIGNMENT +#define ALIGNMENT (2*1024*1024) // 2MB +#endif + template STDRangesStream::STDRangesStream(const int ARRAY_SIZE, int device) - : array_size{ARRAY_SIZE} +noexcept : array_size{ARRAY_SIZE}, + a(alloc_raw(ARRAY_SIZE)), b(alloc_raw(ARRAY_SIZE)), c(alloc_raw(ARRAY_SIZE)) { - a = std::vector(array_size); - b = std::vector(array_size); - c = std::vector(array_size); + std::cout << "Backing storage typeid: " << typeid(a).name() << std::endl; +#ifdef USE_ONEDPL + std::cout << "Using oneDPL backend: "; +#if ONEDPL_USE_DPCPP_BACKEND + std::cout << "SYCL USM (device=" << exe_policy.queue().get_device().get_info() << ")"; +#elif ONEDPL_USE_TBB_BACKEND + std::cout << "TBB " TBB_VERSION_STRING; +#elif ONEDPL_USE_OPENMP_BACKEND + std::cout << "OpenMP"; +#else + std::cout << "Default"; +#endif + std::cout << std::endl; +#endif +} + +template +STDRangesStream::~STDRangesStream() { + dealloc_raw(a); + dealloc_raw(b); + dealloc_raw(c); } template void STDRangesStream::init_arrays(T initA, T initB, 
T initC) { std::for_each_n( - std::execution::par_unseq, + exe_policy, std::views::iota(0).begin(), array_size, // loop range [&] (int i) { a[i] = initA; @@ -37,16 +57,16 @@ template void STDRangesStream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) { // Element-wise copy. - h_a = a; - h_b = b; - h_c = c; + std::copy(a, a + array_size, h_a.begin()); + std::copy(b, b + array_size, h_b.begin()); + std::copy(c, c + array_size, h_c.begin()); } template void STDRangesStream::copy() { std::for_each_n( - std::execution::par_unseq, + exe_policy, std::views::iota(0).begin(), array_size, [&] (int i) { c[i] = a[i]; @@ -60,7 +80,7 @@ void STDRangesStream::mul() const T scalar = startScalar; std::for_each_n( - std::execution::par_unseq, + exe_policy, std::views::iota(0).begin(), array_size, [&] (int i) { b[i] = scalar * c[i]; @@ -72,7 +92,7 @@ template void STDRangesStream::add() { std::for_each_n( - std::execution::par_unseq, + exe_policy, std::views::iota(0).begin(), array_size, [&] (int i) { c[i] = a[i] + b[i]; @@ -86,7 +106,7 @@ void STDRangesStream::triad() const T scalar = startScalar; std::for_each_n( - std::execution::par_unseq, + exe_policy, std::views::iota(0).begin(), array_size, [&] (int i) { a[i] = b[i] + scalar * c[i]; @@ -100,7 +120,7 @@ void STDRangesStream::nstream() const T scalar = startScalar; std::for_each_n( - std::execution::par_unseq, + exe_policy, std::views::iota(0).begin(), array_size, [&] (int i) { a[i] += b[i] + scalar * c[i]; @@ -114,8 +134,8 @@ T STDRangesStream::dot() // sum += a[i] * b[i]; return std::transform_reduce( - std::execution::par_unseq, - a.begin(), a.end(), b.begin(), 0.0); + exe_policy, + a, a + array_size, b, T{}); } void listDevices(void) @@ -135,4 +155,3 @@ std::string getDeviceDriver(const int) template class STDRangesStream; template class STDRangesStream; - diff --git a/src/std-ranges/STDRangesStream.hpp b/src/std-ranges/STDRangesStream.hpp index 890e893..6e7c29c 100644 --- 
a/src/std-ranges/STDRangesStream.hpp +++ b/src/std-ranges/STDRangesStream.hpp @@ -5,10 +5,10 @@ // source code #pragma once +#include "dpl_shim.h" #include -#include - +#include #include "Stream.h" #define IMPLEMENTATION_STRING "STD C++ ranges" @@ -21,13 +21,11 @@ class STDRangesStream : public Stream int array_size; // Device side pointers - std::vector a; - std::vector b; - std::vector c; + T *a, *b, *c; public: - STDRangesStream(const int, int); - ~STDRangesStream() = default; + STDRangesStream(const int, int) noexcept; + ~STDRangesStream(); virtual void copy() override; virtual void add() override; diff --git a/src/std-ranges/model.cmake b/src/std-ranges/model.cmake index fd07387..8f73501 100644 --- a/src/std-ranges/model.cmake +++ b/src/std-ranges/model.cmake @@ -3,6 +3,22 @@ register_flag_optional(CMAKE_CXX_COMPILER "Any CXX compiler that is supported by CMake detection and supports C++20 Ranges" "c++") +register_flag_optional(USE_TBB + "No-op if ONE_TBB_DIR is set. Link against an in-tree oneTBB via FetchContent_Declare, see top level CMakeLists.txt for details." + "OFF") + +register_flag_optional(USE_ONEDPL + "Link oneDPL which implements C++17 executor policies (via execution_policy_tag) for different backends. + + Possible values are: + OPENMP - Implements policies using OpenMP. + CMake will handle any flags needed to enable OpenMP if the compiler supports it. + TBB - Implements policies using TBB. + TBB must be linked via USE_TBB or be available in LD_LIBRARY_PATH. + DPCPP - Implements policies through SYCL2020. + This requires the DPC++ compiler (other SYCL compilers are untested), required SYCL flags are added automatically." 
+ "OFF") + macro(setup) # TODO this needs to eventually be removed when CMake adds proper C++20 support or at least update the flag used here @@ -12,5 +28,19 @@ macro(setup) set(CMAKE_CXX_STANDARD_REQUIRED OFF) unset(CMAKE_CXX_STANDARD) # drop any existing standard we have set by default # and append our own: - register_append_cxx_flags(ANY -std=c++2a) + register_append_cxx_flags(ANY -std=c++20) + if (USE_TBB) + register_link_library(TBB::tbb) + endif () + if (USE_ONEDPL) + register_definitions(USE_ONEDPL) + register_link_library(oneDPL) + endif () +endmacro() + +macro(setup_target NAME) + if (USE_ONEDPL) + target_compile_features(${NAME} INTERFACE cxx_std_20) + target_compile_features(oneDPL INTERFACE cxx_std_20) + endif () endmacro() diff --git a/src/sycl/SYCLStream.cpp b/src/sycl/SYCLStream.cpp index 00c043f..512517b 100644 --- a/src/sycl/SYCLStream.cpp +++ b/src/sycl/SYCLStream.cpp @@ -191,7 +191,7 @@ T SYCLStream::dot() size_t li = item.get_local_id(0); size_t global_size = item.get_global_range()[0]; - wg_sum[li] = 0.0; + wg_sum[li] = {}; for (; i < N; i += global_size) wg_sum[li] += ka[i] * kb[i]; @@ -208,7 +208,7 @@ T SYCLStream::dot() }); }); - T sum = 0.0; + T sum{}; auto h_sum = d_sum->template get_access(); for (int i = 0; i < dot_num_groups; i++) { diff --git a/src/sycl/model.cmake b/src/sycl/model.cmake index e7b5a1c..6a517c1 100644 --- a/src/sycl/model.cmake +++ b/src/sycl/model.cmake @@ -6,14 +6,16 @@ register_flag_optional(CMAKE_CXX_COMPILER register_flag_required(SYCL_COMPILER "Compile using the specified SYCL compiler implementation Supported values are - ONEAPI-DPCPP - dpc++ that is part of an oneAPI Base Toolkit distribution (https://software.intel.com/content/www/us/en/develop/tools/oneapi/base-toolkit.html) + ONEAPI-ICPX - icpx as a standalone compiler + ONEAPI-Clang - oneAPI's Clang driver (enabled via `source /opt/intel/oneapi/setvars.sh --include-intel-llvm`) DPCPP - dpc++ as a standalone compiler (https://github.com/intel/llvm) HIPSYCL - 
hipSYCL compiler (https://github.com/illuhad/hipSYCL) COMPUTECPP - ComputeCpp compiler (https://developer.codeplay.com/products/computecpp/ce/home)") register_flag_optional(SYCL_COMPILER_DIR "Absolute path to the selected SYCL compiler directory, most are packaged differently so set the path according to `SYCL_COMPILER`: - ONEAPI-DPCPP - not required but `dpcpp` must be on PATH, load oneAPI as per documentation (i.e `source /opt/intel/oneapi/setvars.sh` first) + ONEAPI-ICPX - `icpx` must be used for OneAPI 2023 and later on releases (i.e `source /opt/intel/oneapi/setvars.sh` first) + ONEAPI-Clang - set to the directory that contains the Intel clang++ binary. HIPSYCL|DPCPP|COMPUTECPP - set to the root of the binary distribution that contains at least `bin/`, `include/`, and `lib/`" "") @@ -47,7 +49,8 @@ macro(setup) list(APPEND CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/cmake/Modules) set(ComputeCpp_DIR ${SYCL_COMPILER_DIR}) - setup_opencl_header_includes() + # don't point to the CL dir as the imports already have the CL prefix + set(OpenCL_INCLUDE_DIR "${CMAKE_SOURCE_DIR}") register_definitions(CL_TARGET_OPENCL_VERSION=220 _GLIBCXX_USE_CXX11_ABI=0) # ComputeCpp needs OpenCL @@ -59,12 +62,18 @@ macro(setup) elseif (${SYCL_COMPILER} STREQUAL "DPCPP") set(CMAKE_CXX_COMPILER ${SYCL_COMPILER_DIR}/bin/clang++) include_directories(${SYCL_COMPILER_DIR}/include/sycl) - register_definitions(CL_TARGET_OPENCL_VERSION=220) register_append_cxx_flags(ANY -fsycl) register_append_link_flags(-fsycl) - elseif (${SYCL_COMPILER} STREQUAL "ONEAPI-DPCPP") - set(CMAKE_CXX_COMPILER dpcpp) - register_definitions(CL_TARGET_OPENCL_VERSION=220) + elseif (${SYCL_COMPILER} STREQUAL "ONEAPI-ICPX") + set(CMAKE_CXX_COMPILER icpx) + set(CMAKE_C_COMPILER icx) + register_append_cxx_flags(ANY -fsycl) + register_append_link_flags(-fsycl) + elseif (${SYCL_COMPILER} STREQUAL "ONEAPI-Clang") + set(CMAKE_CXX_COMPILER clang++) + set(CMAKE_C_COMPILER clang) + register_append_cxx_flags(ANY -fsycl) + 
register_append_link_flags(-fsycl) else () message(FATAL_ERROR "SYCL_COMPILER=${SYCL_COMPILER} is unsupported") endif () diff --git a/src/sycl2020/SYCLStream2020.cpp b/src/sycl2020-acc/SYCLStream2020.cpp similarity index 93% rename from src/sycl2020/SYCLStream2020.cpp rename to src/sycl2020-acc/SYCLStream2020.cpp index 17a5ab5..0de24bb 100644 --- a/src/sycl2020/SYCLStream2020.cpp +++ b/src/sycl2020-acc/SYCLStream2020.cpp @@ -164,8 +164,13 @@ T SYCLStream::dot() sycl::accessor kb {d_b, cgh, sycl::read_only}; cgh.parallel_for(sycl::range<1>{array_size}, - // Reduction object, to perform summation - initialises the result to zero - sycl::reduction(d_sum, cgh, std::plus(), sycl::property::reduction::initialize_to_identity{}), + // Reduction object, to perform summation - initialises the result to zero + // hipSYCL doesn't support the initialize_to_identity property yet +#if defined(__HIPSYCL__) || defined(__OPENSYCL__) + sycl::reduction(d_sum. template get_access(cgh), sycl::plus()), +#else + sycl::reduction(d_sum, cgh, sycl::plus(), sycl::property::reduction::initialize_to_identity{}), +#endif [=](sycl::id<1> idx, auto& sum) { sum += ka[idx] * kb[idx]; diff --git a/src/sycl2020/SYCLStream2020.h b/src/sycl2020-acc/SYCLStream2020.h similarity index 95% rename from src/sycl2020/SYCLStream2020.h rename to src/sycl2020-acc/SYCLStream2020.h index 7481d16..caaeae9 100644 --- a/src/sycl2020/SYCLStream2020.h +++ b/src/sycl2020-acc/SYCLStream2020.h @@ -14,7 +14,7 @@ #include -#define IMPLEMENTATION_STRING "SYCL 2020" +#define IMPLEMENTATION_STRING "SYCL2020 accessors" template class SYCLStream : public Stream diff --git a/src/sycl2020-acc/model.cmake b/src/sycl2020-acc/model.cmake new file mode 100644 index 0000000..0cd8c92 --- /dev/null +++ b/src/sycl2020-acc/model.cmake @@ -0,0 +1,91 @@ + +register_flag_optional(CMAKE_CXX_COMPILER + "Any CXX compiler that is supported by CMake detection, this is used for host compilation when required by the SYCL compiler" + "c++") + 
+register_flag_required(SYCL_COMPILER + "Compile using the specified SYCL compiler implementation + Supported values are + ONEAPI-ICPX - icpx as a standalone compiler + ONEAPI-Clang - oneAPI's Clang driver (enabled via `source /opt/intel/oneapi/setvars.sh --include-intel-llvm`) + DPCPP - dpc++ as a standalone compiler (https://github.com/intel/llvm) + HIPSYCL - hipSYCL compiler (https://github.com/illuhad/hipSYCL) + COMPUTECPP - ComputeCpp compiler (https://developer.codeplay.com/products/computecpp/ce/home)") + +register_flag_optional(SYCL_COMPILER_DIR + "Absolute path to the selected SYCL compiler directory, most are packaged differently so set the path according to `SYCL_COMPILER`: + ONEAPI-ICPX - `icpx` must be used for OneAPI 2023 and later on releases (i.e `source /opt/intel/oneapi/setvars.sh` first) + ONEAPI-Clang - set to the directory that contains the Intel clang++ binary. + HIPSYCL|DPCPP|COMPUTECPP - set to the root of the binary distribution that contains at least `bin/`, `include/`, and `lib/`" + "") + +macro(setup) + set(CMAKE_CXX_STANDARD 17) + + + if (${SYCL_COMPILER} STREQUAL "HIPSYCL") + + + set(hipSYCL_DIR ${SYCL_COMPILER_DIR}/lib/cmake/hipSYCL) + + if (NOT EXISTS "${hipSYCL_DIR}") + message(WARNING "Falling back to hipSYCL < 0.9.0 CMake structure") + set(hipSYCL_DIR ${SYCL_COMPILER_DIR}/lib/cmake) + endif () + if (NOT EXISTS "${hipSYCL_DIR}") + message(FATAL_ERROR "Can't find the appropriate CMake definitions for hipSYCL") + endif () + + # register_definitions(_GLIBCXX_USE_CXX11_ABI=0) + find_package(hipSYCL CONFIG REQUIRED) + message(STATUS "ok") + + elseif (${SYCL_COMPILER} STREQUAL "COMPUTECPP") + + list(APPEND CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/cmake/Modules) + set(ComputeCpp_DIR ${SYCL_COMPILER_DIR}) + + # don't point to the CL dir as the imports already have the CL prefix + set(OpenCL_INCLUDE_DIR "${CMAKE_SOURCE_DIR}") + + register_definitions(CL_TARGET_OPENCL_VERSION=220 _GLIBCXX_USE_CXX11_ABI=0) + # ComputeCpp needs OpenCL + 
find_package(ComputeCpp REQUIRED) + + # this must come after FindComputeCpp (!) + set(COMPUTECPP_USER_FLAGS -O3 -no-serial-memop) + + elseif (${SYCL_COMPILER} STREQUAL "DPCPP") + set(CMAKE_CXX_COMPILER ${SYCL_COMPILER_DIR}/bin/clang++) + include_directories(${SYCL_COMPILER_DIR}/include/sycl) + register_append_cxx_flags(ANY -fsycl) + register_append_link_flags(-fsycl) + elseif (${SYCL_COMPILER} STREQUAL "ONEAPI-ICPX") + set(CMAKE_CXX_COMPILER icpx) + set(CMAKE_C_COMPILER icx) + register_append_cxx_flags(ANY -fsycl) + register_append_link_flags(-fsycl) + elseif (${SYCL_COMPILER} STREQUAL "ONEAPI-Clang") + set(CMAKE_CXX_COMPILER clang++) + set(CMAKE_C_COMPILER clang) + register_append_cxx_flags(ANY -fsycl) + register_append_link_flags(-fsycl) + else () + message(FATAL_ERROR "SYCL_COMPILER=${SYCL_COMPILER} is unsupported") + endif () + +endmacro() + + +macro(setup_target NAME) + if ( + (${SYCL_COMPILER} STREQUAL "COMPUTECPP") OR + (${SYCL_COMPILER} STREQUAL "HIPSYCL")) + # so ComputeCpp and hipSYCL has this weird (and bad) CMake usage where they append their + # own custom integration header flags AFTER the target has been specified + # hence this macro here + add_sycl_to_target( + TARGET ${NAME} + SOURCES ${IMPL_SOURCES}) + endif () +endmacro() diff --git a/src/sycl2020-usm/SYCLStream2020.cpp b/src/sycl2020-usm/SYCLStream2020.cpp new file mode 100644 index 0000000..21a8a47 --- /dev/null +++ b/src/sycl2020-usm/SYCLStream2020.cpp @@ -0,0 +1,269 @@ + +// Copyright (c) 2015-23 Tom Deakin, Simon McIntosh-Smith, and Tom Lin +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#include "SYCLStream2020.h" + +#include + +// Cache list of devices +bool cached = false; +std::vector devices; +void getDeviceList(void); + +template +SYCLStream::SYCLStream(const size_t ARRAY_SIZE, const int device_index) +: array_size {ARRAY_SIZE} +{ + if (!cached) + getDeviceList(); + + if (device_index >= devices.size()) 
+ throw std::runtime_error("Invalid device index"); + + sycl::device dev = devices[device_index]; + + // Print out device information + std::cout << "Using SYCL device " << getDeviceName(device_index) << std::endl; + std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl; + + // Check device can support FP64 if needed + if (sizeof(T) == sizeof(double)) + { + if (!dev.has(sycl::aspect::fp64)) + { + throw std::runtime_error("Device does not support double precision, please use --float"); + } + } + + queue = std::make_unique(dev, sycl::async_handler{[&](sycl::exception_list l) + { + bool error = false; + for(auto e: l) + { + try + { + std::rethrow_exception(e); + } + catch (sycl::exception e) + { + std::cout << e.what(); + error = true; + } + } + if(error) + { + throw std::runtime_error("SYCL errors detected"); + } + }}); + + a = sycl::malloc_shared(array_size, *queue); + b = sycl::malloc_shared(array_size, *queue); + c = sycl::malloc_shared(array_size, *queue); + sum = sycl::malloc_shared(1, *queue); + + // No longer need list of devices + devices.clear(); + cached = true; + + +} + +template +SYCLStream::~SYCLStream() { + sycl::free(a, *queue); + sycl::free(b, *queue); + sycl::free(c, *queue); + sycl::free(sum, *queue); +} + +template +void SYCLStream::copy() +{ + queue->submit([&](sycl::handler &cgh) + { + cgh.parallel_for(sycl::range<1>{array_size}, [=, c = this->c, a = this->a](sycl::id<1> idx) + { + c[idx] = a[idx]; + }); + }); + queue->wait(); +} + +template +void SYCLStream::mul() +{ + const T scalar = startScalar; + queue->submit([&](sycl::handler &cgh) + { + cgh.parallel_for(sycl::range<1>{array_size}, [=, b = this->b, c = this->c](sycl::id<1> idx) + { + b[idx] = scalar * c[idx]; + }); + }); + queue->wait(); +} + +template +void SYCLStream::add() +{ + queue->submit([&](sycl::handler &cgh) + { + cgh.parallel_for(sycl::range<1>{array_size}, [=, c = this->c, a = this->a, b = this->b](sycl::id<1> idx) + { + c[idx] = a[idx] + b[idx]; + }); + }); + 
queue->wait(); +} + +template +void SYCLStream::triad() +{ + const T scalar = startScalar; + queue->submit([&](sycl::handler &cgh) + { + cgh.parallel_for(sycl::range<1>{array_size}, [=, a = this->a, b = this->b, c = this->c](sycl::id<1> idx) + { + a[idx] = b[idx] + scalar * c[idx]; + }); + }); + queue->wait(); +} + +template +void SYCLStream::nstream() +{ + const T scalar = startScalar; + + queue->submit([&](sycl::handler &cgh) + { + cgh.parallel_for(sycl::range<1>{array_size}, [=, a = this->a, b = this->b, c = this->c](sycl::id<1> idx) + { + a[idx] += b[idx] + scalar * c[idx]; + }); + }); + queue->wait(); +} + +template +T SYCLStream::dot() +{ + queue->submit([&](sycl::handler &cgh) + { + cgh.parallel_for(sycl::range<1>{array_size}, + // Reduction object, to perform summation - initialises the result to zero + // hipSYCL doesn't support the initialize_to_identity property yet +#if defined(__HIPSYCL__) || defined(__OPENSYCL__) + sycl::reduction(sum, sycl::plus()), +#else + sycl::reduction(sum, sycl::plus(), sycl::property::reduction::initialize_to_identity{}), +#endif + [a = this->a, b = this->b](sycl::id<1> idx, auto& sum) + { + sum += a[idx] * b[idx]; + }); + + }); + queue->wait(); + return *sum; +} + +template +void SYCLStream::init_arrays(T initA, T initB, T initC) +{ + queue->submit([&](sycl::handler &cgh) + { + cgh.parallel_for(sycl::range<1>{array_size}, [=, a = this->a, b = this->b, c = this->c](sycl::id<1> idx) + { + a[idx] = initA; + b[idx] = initB; + c[idx] = initC; + }); + }); + + queue->wait(); +} + +template +void SYCLStream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) +{ + for (int i = 0; i < array_size; i++) + { + h_a[i] = a[i]; + h_b[i] = b[i]; + h_c[i] = c[i]; + } +} + +void getDeviceList(void) +{ + // Ask SYCL runtime for all devices in system + devices = sycl::device::get_devices(); + cached = true; +} + +void listDevices(void) +{ + getDeviceList(); + + // Print device names + if (devices.size() == 0) + { + std::cerr << "No 
devices found." << std::endl; + } + else + { + std::cout << std::endl; + std::cout << "Devices:" << std::endl; + for (int i = 0; i < devices.size(); i++) + { + std::cout << i << ": " << getDeviceName(i) << std::endl; + } + std::cout << std::endl; + } +} + +std::string getDeviceName(const int device) +{ + if (!cached) + getDeviceList(); + + std::string name; + + if (device < devices.size()) + { + name = devices[device].get_info(); + } + else + { + throw std::runtime_error("Error asking for name for non-existant device"); + } + + return name; +} + +std::string getDeviceDriver(const int device) +{ + if (!cached) + getDeviceList(); + + std::string driver; + + if (device < devices.size()) + { + driver = devices[device].get_info(); + } + else + { + throw std::runtime_error("Error asking for driver for non-existant device"); + } + + return driver; +} + +template class SYCLStream; +template class SYCLStream; diff --git a/src/sycl2020-usm/SYCLStream2020.h b/src/sycl2020-usm/SYCLStream2020.h new file mode 100644 index 0000000..0b2dc0d --- /dev/null +++ b/src/sycl2020-usm/SYCLStream2020.h @@ -0,0 +1,54 @@ + +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#pragma once + +#include +#include + +#include "Stream.h" + +#include + +#define IMPLEMENTATION_STRING "SYCL2020 USM" + +template +class SYCLStream : public Stream +{ + protected: + // Size of arrays + size_t array_size; + + // SYCL objects + // Queue is a pointer because we allow device selection + std::unique_ptr queue; + + // Buffers + T *a{}; + T *b{}; + T *c{}; + T *sum{}; + + public: + + SYCLStream(const size_t, const int); + ~SYCLStream(); + + virtual void copy() override; + virtual void add() override; + virtual void mul() override; + virtual void triad() override; + virtual void nstream() override; + virtual T dot() override; + + virtual void init_arrays(T initA, T initB, T 
initC) override; + virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; + +}; + +// Populate the devices list +void getDeviceList(void); diff --git a/src/sycl2020/model.cmake b/src/sycl2020-usm/model.cmake similarity index 74% rename from src/sycl2020/model.cmake rename to src/sycl2020-usm/model.cmake index e7b5a1c..81ad9d7 100644 --- a/src/sycl2020/model.cmake +++ b/src/sycl2020-usm/model.cmake @@ -6,20 +6,19 @@ register_flag_optional(CMAKE_CXX_COMPILER register_flag_required(SYCL_COMPILER "Compile using the specified SYCL compiler implementation Supported values are - ONEAPI-DPCPP - dpc++ that is part of an oneAPI Base Toolkit distribution (https://software.intel.com/content/www/us/en/develop/tools/oneapi/base-toolkit.html) + ONEAPI-ICPX - icpx as a standalone compiler + ONEAPI-Clang - oneAPI's Clang driver (enabled via `source /opt/intel/oneapi/setvars.sh --include-intel-llvm`) DPCPP - dpc++ as a standalone compiler (https://github.com/intel/llvm) HIPSYCL - hipSYCL compiler (https://github.com/illuhad/hipSYCL) COMPUTECPP - ComputeCpp compiler (https://developer.codeplay.com/products/computecpp/ce/home)") register_flag_optional(SYCL_COMPILER_DIR "Absolute path to the selected SYCL compiler directory, most are packaged differently so set the path according to `SYCL_COMPILER`: - ONEAPI-DPCPP - not required but `dpcpp` must be on PATH, load oneAPI as per documentation (i.e `source /opt/intel/oneapi/setvars.sh` first) + ONEAPI-ICPX - `icpx` must be used for OneAPI 2023 and later on releases (i.e `source /opt/intel/oneapi/setvars.sh` first) + ONEAPI-Clang - set to the directory that contains the Intel clang++ binary. 
HIPSYCL|DPCPP|COMPUTECPP - set to the root of the binary distribution that contains at least `bin/`, `include/`, and `lib/`" "") -register_flag_optional(OpenCL_LIBRARY - "[ComputeCpp only] Path to OpenCL library, usually called libOpenCL.so" - "${OpenCL_LIBRARY}") macro(setup) set(CMAKE_CXX_STANDARD 17) @@ -47,7 +46,8 @@ macro(setup) list(APPEND CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/cmake/Modules) set(ComputeCpp_DIR ${SYCL_COMPILER_DIR}) - setup_opencl_header_includes() + # don't point to the CL dir as the imports already have the CL prefix + set(OpenCL_INCLUDE_DIR "${CMAKE_SOURCE_DIR}") register_definitions(CL_TARGET_OPENCL_VERSION=220 _GLIBCXX_USE_CXX11_ABI=0) # ComputeCpp needs OpenCL @@ -59,12 +59,18 @@ macro(setup) elseif (${SYCL_COMPILER} STREQUAL "DPCPP") set(CMAKE_CXX_COMPILER ${SYCL_COMPILER_DIR}/bin/clang++) include_directories(${SYCL_COMPILER_DIR}/include/sycl) - register_definitions(CL_TARGET_OPENCL_VERSION=220) register_append_cxx_flags(ANY -fsycl) register_append_link_flags(-fsycl) - elseif (${SYCL_COMPILER} STREQUAL "ONEAPI-DPCPP") - set(CMAKE_CXX_COMPILER dpcpp) - register_definitions(CL_TARGET_OPENCL_VERSION=220) + elseif (${SYCL_COMPILER} STREQUAL "ONEAPI-ICPX") + set(CMAKE_CXX_COMPILER icpx) + set(CMAKE_C_COMPILER icx) + register_append_cxx_flags(ANY -fsycl) + register_append_link_flags(-fsycl) + elseif (${SYCL_COMPILER} STREQUAL "ONEAPI-Clang") + set(CMAKE_CXX_COMPILER clang++) + set(CMAKE_C_COMPILER clang) + register_append_cxx_flags(ANY -fsycl) + register_append_link_flags(-fsycl) else () message(FATAL_ERROR "SYCL_COMPILER=${SYCL_COMPILER} is unsupported") endif () diff --git a/src/tbb/TBBStream.cpp b/src/tbb/TBBStream.cpp index 9c34a50..c5e9d90 100644 --- a/src/tbb/TBBStream.cpp +++ b/src/tbb/TBBStream.cpp @@ -5,15 +5,37 @@ // source code #include "TBBStream.hpp" +#include + +#ifndef ALIGNMENT +#define ALIGNMENT (2*1024*1024) // 2MB +#endif + +#ifdef USE_VECTOR +#define BEGIN(x) (x).begin() +#define END(x) (x).end() +#else +#define BEGIN(x) 
(x) +#define END(x) ((x) + array_size) +#endif template TBBStream::TBBStream(const int ARRAY_SIZE, int device) - : partitioner(), range(0, ARRAY_SIZE), a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE) + : partitioner(), range(0, ARRAY_SIZE), +#ifdef USE_VECTOR + a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE) +#else + array_size(ARRAY_SIZE), + a((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)), + b((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)), + c((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)) +#endif { if(device != 0){ throw std::runtime_error("Device != 0 is not supported by TBB"); } std::cout << "Using TBB partitioner: " PARTITIONER_NAME << std::endl; + std::cout << "Backing storage typeid: " << typeid(a).name() << std::endl; } @@ -35,9 +57,9 @@ template void TBBStream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) { // Element-wise copy. - h_a = a; - h_b = b; - h_c = c; + std::copy(BEGIN(a), END(a), h_a.begin()); + std::copy(BEGIN(b), END(b), h_b.begin()); + std::copy(BEGIN(c), END(c), h_c.begin()); } template @@ -132,3 +154,5 @@ std::string getDeviceDriver(const int) template class TBBStream; template class TBBStream; +#undef BEGIN +#undef END diff --git a/src/tbb/TBBStream.hpp b/src/tbb/TBBStream.hpp index 90763a9..2744afc 100644 --- a/src/tbb/TBBStream.hpp +++ b/src/tbb/TBBStream.hpp @@ -40,10 +40,15 @@ class TBBStream : public Stream tbb_partitioner partitioner; tbb::blocked_range range; // Device side pointers - std::vector a; - std::vector b; - std::vector c; - +#ifdef USE_VECTOR + std::vector a, b, c; +#else + size_t array_size; + T *a, *b, *c; +#endif + + + public: TBBStream(const int, int); ~TBBStream() = default; diff --git a/src/tbb/model.cmake b/src/tbb/model.cmake index e4d6bac..1cbd7fb 100644 --- a/src/tbb/model.cmake +++ b/src/tbb/model.cmake @@ -1,7 +1,7 @@ register_flag_optional(ONE_TBB_DIR "Absolute path to oneTBB (with header `onetbb/tbb.h`) distribution, the directory should contain at least `include/` 
and `lib/. - If unspecified, the system TBB (with header `tbb/tbb.h`) will be used via CMake's find_package(TBB)." + If unspecified, the system TBB (with header `tbb/tbb.h`) will be used via CMake's find_package(TBB)." "") @@ -15,15 +15,28 @@ register_flag_optional(PARTITIONER See https://spec.oneapi.com/versions/latest/elements/oneTBB/source/algorithms.html#partitioners for more details." "AUTO") +register_flag_optional(USE_VECTOR + "Whether to use std::vector for storage or use aligned_alloc. C++ vectors are *zero* initialised where as aligned_alloc is uninitialised before first use." + "OFF") + +register_flag_optional(USE_TBB + "No-op if ONE_TBB_DIR is set. Link against an in-tree oneTBB via FetchContent_Declare, see top level CMakeLists.txt for details." + "OFF") + macro(setup) if(ONE_TBB_DIR) set(TBB_ROOT "${ONE_TBB_DIR}") # see https://github.com/Kitware/VTK/blob/0a31a9a3c1531ae238ac96a372fec4be42282863/CMake/FindTBB.cmake#L34 # docs on Intel's website refers to TBB_DIR which is not correct endif() - + if (NOT USE_TBB) + # Only find TBB when we're not building in-tree + find_package(TBB REQUIRED) + endif() # see https://github.com/oneapi-src/oneTBB/blob/master/cmake/README.md#tbbconfig---integration-of-binary-packages - find_package(TBB REQUIRED) register_link_library(TBB::tbb) register_definitions(PARTITIONER_${PARTITIONER}) + if(USE_VECTOR) + register_definitions(USE_VECTOR) + endif() endmacro() diff --git a/src/thrust/ThrustStream.h b/src/thrust/ThrustStream.h index f87ace7..a2a4b72 100644 --- a/src/thrust/ThrustStream.h +++ b/src/thrust/ThrustStream.h @@ -8,7 +8,11 @@ #include #include +#if defined(MANAGED) +#include +#else #include +#endif #include "Stream.h" @@ -21,9 +25,15 @@ class ThrustStream : public Stream // Size of arrays int array_size; + #if defined(MANAGED) + thrust::universtal_vector a; + thrust::universtal_vector b; + thrust::universtal_vector c; + #else thrust::device_vector a; thrust::device_vector b; thrust::device_vector c; + #endif 
public: ThrustStream(const int, int); diff --git a/src/thrust/model.cmake b/src/thrust/model.cmake index 0c286c2..6b82ef5 100644 --- a/src/thrust/model.cmake +++ b/src/thrust/model.cmake @@ -18,6 +18,9 @@ register_flag_optional(BACKEND " "CUDA") + register_flag_optional(MANAGED "Enabled managed memory mode." + "OFF") + register_flag_optional(CMAKE_CUDA_COMPILER "[THRUST_IMPL==CUDA] Path to the CUDA nvcc compiler" "") @@ -34,17 +37,21 @@ register_flag_optional(CUDA_EXTRA_FLAGS macro(setup) set(CMAKE_CXX_STANDARD 14) + if (MANAGED) + register_definitions(MANAGED) + endif () if (${THRUST_IMPL} STREQUAL "CUDA") # see CUDA.cmake, we're only adding a few Thrust related libraries here if (POLICY CMP0104) - cmake_policy(SET CMP0104 OLD) + cmake_policy(SET CMP0104 NEW) endif () + set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH}) # add -forward-unknown-to-host-compiler for compatibility reasons - set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "--expt-extended-lambda -forward-unknown-to-host-compiler -arch=${CUDA_ARCH}" ${CUDA_EXTRA_FLAGS}) + set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "--expt-extended-lambda " ${CUDA_EXTRA_FLAGS}) enable_language(CUDA) # CMake defaults to -O2 for CUDA at Release, let's wipe that and use the global RELEASE_FLAG # appended later @@ -53,7 +60,11 @@ macro(setup) message(STATUS "NVCC flags: ${CMAKE_CUDA_FLAGS} ${CMAKE_CUDA_FLAGS_${BUILD_TYPE}}") + # XXX NVHPC <= 21.9 has cub-config in `Linux_x86_64/21.9/cuda/11.4/include/cub/cmake` + # XXX NVHPC >= 22.3 has cub-config in `Linux_x86_64/22.3/cuda/11.6/lib64/cmake/cub/` + # same thing for thrust if (SDK_DIR) + list(APPEND CMAKE_PREFIX_PATH ${SDK_DIR}) find_package(CUB REQUIRED CONFIG PATHS ${SDK_DIR}/cub) find_package(Thrust REQUIRED CONFIG PATHS ${SDK_DIR}/thrust) else () @@ -64,9 +75,11 @@ macro(setup) message(STATUS "Using Thrust backend: ${BACKEND}") # this creates the interface that we can link to - thrust_create_target(Thrust HOST CPP DEVICE ${BACKEND}) + thrust_create_target(Thrust${BACKEND} + HOST CPP + 
DEVICE ${BACKEND}) - register_link_library(Thrust) + register_link_library(Thrust${BACKEND}) elseif (${THRUST_IMPL} STREQUAL "ROCM") if (SDK_DIR) find_package(rocprim REQUIRED CONFIG PATHS ${SDK_DIR}/rocprim) @@ -88,4 +101,4 @@ macro(setup) endmacro() - \ No newline at end of file +