From a35c7b4bea7d5cb81e172abb2c4c988f4aac392e Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 16 Feb 2022 14:33:17 +0000 Subject: [PATCH 1/9] Fix CUDA memory check for large array sizes Closes #123 --- src/cuda/CUDAStream.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cuda/CUDAStream.cu b/src/cuda/CUDAStream.cu index b467d00..778a044 100644 --- a/src/cuda/CUDAStream.cu +++ b/src/cuda/CUDAStream.cu @@ -51,7 +51,7 @@ CUDAStream::CUDAStream(const int ARRAY_SIZE, const int device_index) // Check buffers fit on the device cudaDeviceProp props; cudaGetDeviceProperties(&props, 0); - if (props.totalGlobalMem < 3*ARRAY_SIZE*sizeof(T)) + if (props.totalGlobalMem < std::size_t{3}*ARRAY_SIZE*sizeof(T)) throw std::runtime_error("Device does not have enough memory for all 3 buffers"); // Create device buffers From 5645b0290d13d401016386d8d9f01ea9567286a9 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 16 Feb 2022 14:36:20 +0000 Subject: [PATCH 2/9] update changelog --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 903cb02..2958793 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,7 +2,7 @@ All notable changes to this project will be documented in this file. ## Unreleased -- None +- Fix CUDA memory limit check. 
## [v4.0] - 2021-12-22 From e77a34158ce3ee00c11c66d0bf0a0e05c0c3ea7b Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 16 Feb 2022 14:37:58 +0000 Subject: [PATCH 3/9] fix memory limit check for HIP --- src/hip/HIPStream.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hip/HIPStream.cpp b/src/hip/HIPStream.cpp index fbc3b71..6aed1ee 100644 --- a/src/hip/HIPStream.cpp +++ b/src/hip/HIPStream.cpp @@ -54,7 +54,7 @@ HIPStream::HIPStream(const int ARRAY_SIZE, const int device_index) // Check buffers fit on the device hipDeviceProp_t props; hipGetDeviceProperties(&props, 0); - if (props.totalGlobalMem < 3*ARRAY_SIZE*sizeof(T)) + if (props.totalGlobalMem < std::size_t{3}*ARRAY_SIZE*sizeof(T)) throw std::runtime_error("Device does not have enough memory for all 3 buffers"); // Create device buffers From 7b2bd5427c7fbbe56620eb97d87c1c5f6f047ea0 Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Thu, 31 Mar 2022 14:50:10 +0100 Subject: [PATCH 4/9] Fix missing counting iterator operators for stdpar --- src/std-indices/STDIndicesStream.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/std-indices/STDIndicesStream.h b/src/std-indices/STDIndicesStream.h index bc068aa..26c7cb0 100644 --- a/src/std-indices/STDIndicesStream.h +++ b/src/std-indices/STDIndicesStream.h @@ -34,6 +34,7 @@ public: iterator& operator++() { num++; return *this; } iterator operator++(int) { iterator retval = *this; ++(*this); return retval; } iterator operator+(const value_type v) const { return iterator(num + v); } + iterator operator+=(int x) { iterator retval = *this; this->num+=x; return retval; } bool operator==(iterator other) const { return num == other.num; } bool operator!=(iterator other) const { return *this != other; } From 6185d3aca6e89b064b599fa2f83e19272a7a0e13 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Fri, 1 Apr 2022 10:51:24 +0100 Subject: [PATCH 5/9] Use long double for check solution in case of very large problem sizes --- CHANGELOG.md | 1 + src/main.cpp | 
10 +++++----- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2958793..cc135f9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,7 @@ All notable changes to this project will be documented in this file. ## Unreleased - Fix CUDA memory limit check. +- Use long double for `check_solution` in case of large problem size. ## [v4.0] - 2021-12-22 diff --git a/src/main.cpp b/src/main.cpp index 3035da0..c9d7694 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -487,15 +487,15 @@ void check_solution(const unsigned int ntimes, std::vector<T>& a, std::vector<T> goldSum = goldA * goldB * ARRAY_SIZE; // Calculate the average error - double errA = std::accumulate(a.begin(), a.end(), 0.0, [&](double sum, const T val){ return sum + fabs(val - goldA); }); + long double errA = std::accumulate(a.begin(), a.end(), 0.0, [&](double sum, const T val){ return sum + fabs(val - goldA); }); errA /= a.size(); - double errB = std::accumulate(b.begin(), b.end(), 0.0, [&](double sum, const T val){ return sum + fabs(val - goldB); }); + long double errB = std::accumulate(b.begin(), b.end(), 0.0, [&](double sum, const T val){ return sum + fabs(val - goldB); }); errB /= b.size(); - double errC = std::accumulate(c.begin(), c.end(), 0.0, [&](double sum, const T val){ return sum + fabs(val - goldC); }); + long double errC = std::accumulate(c.begin(), c.end(), 0.0, [&](double sum, const T val){ return sum + fabs(val - goldC); }); errC /= c.size(); - double errSum = fabs((sum - goldSum)/goldSum); + long double errSum = fabs((sum - goldSum)/goldSum); - double epsi = std::numeric_limits<T>::epsilon() * 100.0; + long double epsi = std::numeric_limits<T>::epsilon() * 100.0; if (errA > epsi) std::cerr From fdb0ef8af846af647018c480e5e413f330532b63 Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Thu, 7 Apr 2022 23:22:54 +0100 Subject: [PATCH 6/9] Bump CI NVHPC version --- src/ci-prepare-bionic.sh | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff
--git a/src/ci-prepare-bionic.sh b/src/ci-prepare-bionic.sh index 656d338..0684f35 100755 --- a/src/ci-prepare-bionic.sh +++ b/src/ci-prepare-bionic.sh @@ -133,21 +133,26 @@ setup_aocc() { } setup_nvhpc() { - echo "Preparing Nvidia HPC SDK" - local tarball="nvhpc.tar.gz" -# local url="http://localhost:8000/nvhpc_2021_219_Linux_x86_64_cuda_11.4.tar.gz" - local url="https://developer.download.nvidia.com/hpc-sdk/21.9/nvhpc_2021_219_Linux_x86_64_cuda_11.4.tar.gz" + echo "Preparing Nvidia HPC SDK" + local nvhpc_ver="22.3" + local nvhpc_release="2022_223" + local cuda_ver="11.6" + + local tarball="nvhpc_$nvhpc_ver.tar.gz" + + local url="https://developer.download.nvidia.com/hpc-sdk/$nvhpc_ver/nvhpc_${nvhpc_release}_Linux_x86_64_cuda_$cuda_ver.tar.gz" get_and_untar "$tarball" "$url" - local sdk_dir="$PWD/nvhpc_2021_219_Linux_x86_64_cuda_11.4/install_components/Linux_x86_64/21.9" + local sdk_dir="$PWD/nvhpc_${nvhpc_release}_Linux_x86_64_cuda_$cuda_ver/install_components/Linux_x86_64/$nvhpc_ver" local bin_dir="$sdk_dir/compilers/bin" "$bin_dir/makelocalrc" "$bin_dir" -x export_var NVHPC_SDK_DIR "$sdk_dir" - export_var NVHPC_CUDA_DIR "$sdk_dir/cuda/11.4" + export_var NVHPC_CUDA_DIR "$sdk_dir/cuda/$cuda_ver" export_var NVHPC_NVCXX "$bin_dir/nvc++" - export_var NVHPC_NVCC "$sdk_dir/cuda/11.4/bin/nvcc" + export_var NVHPC_NVCC "$bin_dir/nvcc" +# export_var NVHPC_NVCC "$sdk_dir/cuda/$cuda_ver/bin/nvcc" echo "Installed CUDA versions:" ls "$sdk_dir/cuda" From b27def135e9a1eb46e53c9d45396dea9077be204 Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Fri, 8 Apr 2022 05:34:15 +0100 Subject: [PATCH 7/9] Sync CUDA version with CI runner --- src/ci-prepare-bionic.sh | 1 + src/ci-test-compile.sh | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/ci-prepare-bionic.sh b/src/ci-prepare-bionic.sh index 0684f35..78bbd33 100755 --- a/src/ci-prepare-bionic.sh +++ b/src/ci-prepare-bionic.sh @@ -152,6 +152,7 @@ setup_nvhpc() { export_var NVHPC_NVCXX "$bin_dir/nvc++" export_var 
NVHPC_NVCC "$bin_dir/nvcc" + export_var NVHPC_CUDA_VER "$cuda_ver" # export_var NVHPC_NVCC "$sdk_dir/cuda/$cuda_ver/bin/nvcc" echo "Installed CUDA versions:" diff --git a/src/ci-test-compile.sh b/src/ci-test-compile.sh index 9388643..7e17379 100755 --- a/src/ci-test-compile.sh +++ b/src/ci-test-compile.sh @@ -122,7 +122,7 @@ run_build() { AMD_ARCH="gfx_903" NV_ARCH="sm_70" -NV_ARCH_CCXY="cuda11.4,cc80" +NV_ARCH_CCXY="cuda${NVHPC_CUDA_VER:?},cc80" build_gcc() { local name="gcc_build" From 0f264081d75ddf315ecff461af17c8db6e3d4b78 Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Fri, 8 Apr 2022 19:43:15 +0100 Subject: [PATCH 8/9] Fix Thrust/CUB path --- src/ci-test-compile.sh | 8 ++++---- src/thrust/model.cmake | 3 +++ 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/ci-test-compile.sh b/src/ci-test-compile.sh index 7e17379..a7c5bab 100755 --- a/src/ci-test-compile.sh +++ b/src/ci-test-compile.sh @@ -175,9 +175,9 @@ build_gcc() { local current=$("$CMAKE_BIN" --version | head -n 1 | cut -d ' ' -f3) local required="3.15.0" if [ "$(printf '%s\n' "$required" "$current" | sort -V | head -n1)" = "$required" ]; then - run_build $name "${GCC_CXX:?}" thrust "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/include -DTHRUST_IMPL=CUDA -DBACKEND=CUDA" - run_build $name "${GCC_CXX:?}" thrust "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/include -DTHRUST_IMPL=CUDA -DBACKEND=OMP" - run_build $name "${GCC_CXX:?}" thrust "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/include -DTHRUST_IMPL=CUDA -DBACKEND=CPP" + run_build $name "${GCC_CXX:?}" thrust "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/lib64/cmake -DTHRUST_IMPL=CUDA -DBACKEND=CUDA" + run_build $name "${GCC_CXX:?}" thrust "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/lib64/cmake -DTHRUST_IMPL=CUDA 
-DBACKEND=OMP" + run_build $name "${GCC_CXX:?}" thrust "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/lib64/cmake -DTHRUST_IMPL=CUDA -DBACKEND=CPP" # FIXME CUDA Thrust + TBB throws the following error: # /usr/lib/gcc/x86_64-linux-gnu/9/include/avx512fintrin.h(9146): error: identifier "__builtin_ia32_rndscaless_round" is undefined @@ -187,7 +187,7 @@ build_gcc() { # /usr/lib/gcc/x86_64-linux-gnu/9/include/avx512dqintrin.h(1365): error: identifier "__builtin_ia32_fpclassss" is undefined # /usr/lib/gcc/x86_64-linux-gnu/9/include/avx512dqintrin.h(1372): error: identifier "__builtin_ia32_fpclasssd" is undefined - # run_build $name "${GCC_CXX:?}" THRUST "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/include -DTHRUST_IMPL=CUDA -DBACKEND=TBB" + # run_build $name "${GCC_CXX:?}" THRUST "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/lib64/cmake -DTHRUST_IMPL=CUDA -DBACKEND=TBB" else echo "CMake version ${current} < ${required}, skipping Thrust models" fi diff --git a/src/thrust/model.cmake b/src/thrust/model.cmake index 0c286c2..2d687c7 100644 --- a/src/thrust/model.cmake +++ b/src/thrust/model.cmake @@ -53,6 +53,9 @@ macro(setup) message(STATUS "NVCC flags: ${CMAKE_CUDA_FLAGS} ${CMAKE_CUDA_FLAGS_${BUILD_TYPE}}") + # XXX NVHPC <= 21.9 has cub-config in `Linux_x86_64/21.9/cuda/11.4/include/cub/cmake` + # XXX NVHPC >= 22.3 has cub-config in `Linux_x86_64/22.3/cuda/11.6/lib64/cmake/cub/` + # same thing for thrust if (SDK_DIR) find_package(CUB REQUIRED CONFIG PATHS ${SDK_DIR}/cub) find_package(Thrust REQUIRED CONFIG PATHS ${SDK_DIR}/thrust) From 1d9cde42b00a42428b7e3b1043c0c1acc0af2b22 Mon Sep 17 00:00:00 2001 From: NoseKnowsAll Date: Wed, 20 Jul 2022 18:10:15 -0500 Subject: [PATCH 9/9] Reflect updated model options in README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index df95582..7be3550 
100644 --- a/README.md +++ b/README.md @@ -100,7 +100,7 @@ The source for each model's implementations are located in `./src/`. Currently available models are: ``` -omp;ocl;std;std20;hip;cuda;kokkos;sycl;sycl2020;acc;raja;tbb;thrust +omp;ocl;std-data;std-indices;std-ranges;hip;cuda;kokkos;sycl;sycl2020;acc;raja;tbb;thrust ``` #### Overriding default flags