Merge branch 'develop' into main
This commit is contained in:
commit
57c8003621
@ -2,8 +2,13 @@
|
|||||||
All notable changes to this project will be documented in this file.
|
All notable changes to this project will be documented in this file.
|
||||||
|
|
||||||
## Unreleased
|
## Unreleased
|
||||||
|
### Added
|
||||||
|
- Ability to build Kokkos and RAJA versions against existing packages.
|
||||||
|
|
||||||
### Changed
|
### Changed
|
||||||
- RAJA CUDA CMake build issues resolved.
|
- RAJA CUDA CMake build issues resolved.
|
||||||
|
- Fix CUDA memory limit check.
|
||||||
|
- Use long double for `check_solution` in case of large problem size.
|
||||||
|
|
||||||
## [v4.0] - 2021-12-22
|
## [v4.0] - 2021-12-22
|
||||||
|
|
||||||
|
|||||||
@ -100,7 +100,7 @@ The source for each model's implementations are located in `./src/<model>`.
|
|||||||
|
|
||||||
Currently available models are:
|
Currently available models are:
|
||||||
```
|
```
|
||||||
omp;ocl;std;std20;hip;cuda;kokkos;sycl;sycl2020;acc;raja;tbb;thrust
|
omp;ocl;std-data;std-indices;std-ranges;hip;cuda;kokkos;sycl;sycl2020;acc;raja;tbb;thrust
|
||||||
```
|
```
|
||||||
|
|
||||||
#### Overriding default flags
|
#### Overriding default flags
|
||||||
|
|||||||
@ -133,21 +133,27 @@ setup_aocc() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
setup_nvhpc() {
|
setup_nvhpc() {
|
||||||
echo "Preparing Nvidia HPC SDK"
|
echo "Preparing Nvidia HPC SDK"
|
||||||
local tarball="nvhpc.tar.gz"
|
local nvhpc_ver="22.3"
|
||||||
# local url="http://localhost:8000/nvhpc_2021_219_Linux_x86_64_cuda_11.4.tar.gz"
|
local nvhpc_release="2022_223"
|
||||||
local url="https://developer.download.nvidia.com/hpc-sdk/21.9/nvhpc_2021_219_Linux_x86_64_cuda_11.4.tar.gz"
|
local cuda_ver="11.6"
|
||||||
|
|
||||||
|
local tarball="nvhpc_$nvhpc_ver.tar.gz"
|
||||||
|
|
||||||
|
local url="https://developer.download.nvidia.com/hpc-sdk/$nvhpc_ver/nvhpc_${nvhpc_release}_Linux_x86_64_cuda_$cuda_ver.tar.gz"
|
||||||
get_and_untar "$tarball" "$url"
|
get_and_untar "$tarball" "$url"
|
||||||
|
|
||||||
local sdk_dir="$PWD/nvhpc_2021_219_Linux_x86_64_cuda_11.4/install_components/Linux_x86_64/21.9"
|
local sdk_dir="$PWD/nvhpc_${nvhpc_release}_Linux_x86_64_cuda_$cuda_ver/install_components/Linux_x86_64/$nvhpc_ver"
|
||||||
local bin_dir="$sdk_dir/compilers/bin"
|
local bin_dir="$sdk_dir/compilers/bin"
|
||||||
"$bin_dir/makelocalrc" "$bin_dir" -x
|
"$bin_dir/makelocalrc" "$bin_dir" -x
|
||||||
|
|
||||||
export_var NVHPC_SDK_DIR "$sdk_dir"
|
export_var NVHPC_SDK_DIR "$sdk_dir"
|
||||||
export_var NVHPC_CUDA_DIR "$sdk_dir/cuda/11.4"
|
export_var NVHPC_CUDA_DIR "$sdk_dir/cuda/$cuda_ver"
|
||||||
|
|
||||||
export_var NVHPC_NVCXX "$bin_dir/nvc++"
|
export_var NVHPC_NVCXX "$bin_dir/nvc++"
|
||||||
export_var NVHPC_NVCC "$sdk_dir/cuda/11.4/bin/nvcc"
|
export_var NVHPC_NVCC "$bin_dir/nvcc"
|
||||||
|
export_var NVHPC_CUDA_VER "$cuda_ver"
|
||||||
|
# export_var NVHPC_NVCC "$sdk_dir/cuda/$cuda_ver/bin/nvcc"
|
||||||
|
|
||||||
echo "Installed CUDA versions:"
|
echo "Installed CUDA versions:"
|
||||||
ls "$sdk_dir/cuda"
|
ls "$sdk_dir/cuda"
|
||||||
|
|||||||
@ -122,7 +122,7 @@ run_build() {
|
|||||||
|
|
||||||
AMD_ARCH="gfx_903"
|
AMD_ARCH="gfx_903"
|
||||||
NV_ARCH="sm_70"
|
NV_ARCH="sm_70"
|
||||||
NV_ARCH_CCXY="cuda11.4,cc80"
|
NV_ARCH_CCXY="cuda${NVHPC_CUDA_VER:?},cc80"
|
||||||
|
|
||||||
build_gcc() {
|
build_gcc() {
|
||||||
local name="gcc_build"
|
local name="gcc_build"
|
||||||
@ -175,9 +175,9 @@ build_gcc() {
|
|||||||
local current=$("$CMAKE_BIN" --version | head -n 1 | cut -d ' ' -f3)
|
local current=$("$CMAKE_BIN" --version | head -n 1 | cut -d ' ' -f3)
|
||||||
local required="3.15.0"
|
local required="3.15.0"
|
||||||
if [ "$(printf '%s\n' "$required" "$current" | sort -V | head -n1)" = "$required" ]; then
|
if [ "$(printf '%s\n' "$required" "$current" | sort -V | head -n1)" = "$required" ]; then
|
||||||
run_build $name "${GCC_CXX:?}" thrust "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/include -DTHRUST_IMPL=CUDA -DBACKEND=CUDA"
|
run_build $name "${GCC_CXX:?}" thrust "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/lib64/cmake -DTHRUST_IMPL=CUDA -DBACKEND=CUDA"
|
||||||
run_build $name "${GCC_CXX:?}" thrust "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/include -DTHRUST_IMPL=CUDA -DBACKEND=OMP"
|
run_build $name "${GCC_CXX:?}" thrust "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/lib64/cmake -DTHRUST_IMPL=CUDA -DBACKEND=OMP"
|
||||||
run_build $name "${GCC_CXX:?}" thrust "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/include -DTHRUST_IMPL=CUDA -DBACKEND=CPP"
|
run_build $name "${GCC_CXX:?}" thrust "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/lib64/cmake -DTHRUST_IMPL=CUDA -DBACKEND=CPP"
|
||||||
|
|
||||||
# FIXME CUDA Thrust + TBB throws the following error:
|
# FIXME CUDA Thrust + TBB throws the following error:
|
||||||
# /usr/lib/gcc/x86_64-linux-gnu/9/include/avx512fintrin.h(9146): error: identifier "__builtin_ia32_rndscaless_round" is undefined
|
# /usr/lib/gcc/x86_64-linux-gnu/9/include/avx512fintrin.h(9146): error: identifier "__builtin_ia32_rndscaless_round" is undefined
|
||||||
@ -187,7 +187,7 @@ build_gcc() {
|
|||||||
# /usr/lib/gcc/x86_64-linux-gnu/9/include/avx512dqintrin.h(1365): error: identifier "__builtin_ia32_fpclassss" is undefined
|
# /usr/lib/gcc/x86_64-linux-gnu/9/include/avx512dqintrin.h(1365): error: identifier "__builtin_ia32_fpclassss" is undefined
|
||||||
# /usr/lib/gcc/x86_64-linux-gnu/9/include/avx512dqintrin.h(1372): error: identifier "__builtin_ia32_fpclasssd" is undefined
|
# /usr/lib/gcc/x86_64-linux-gnu/9/include/avx512dqintrin.h(1372): error: identifier "__builtin_ia32_fpclasssd" is undefined
|
||||||
|
|
||||||
# run_build $name "${GCC_CXX:?}" THRUST "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/include -DTHRUST_IMPL=CUDA -DBACKEND=TBB"
|
# run_build $name "${GCC_CXX:?}" THRUST "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/lib64/cmake -DTHRUST_IMPL=CUDA -DBACKEND=TBB"
|
||||||
else
|
else
|
||||||
echo "CMake version ${current} < ${required}, skipping Thrust models"
|
echo "CMake version ${current} < ${required}, skipping Thrust models"
|
||||||
fi
|
fi
|
||||||
|
|||||||
@ -51,7 +51,7 @@ CUDAStream<T>::CUDAStream(const int ARRAY_SIZE, const int device_index)
|
|||||||
// Check buffers fit on the device
|
// Check buffers fit on the device
|
||||||
cudaDeviceProp props;
|
cudaDeviceProp props;
|
||||||
cudaGetDeviceProperties(&props, 0);
|
cudaGetDeviceProperties(&props, 0);
|
||||||
if (props.totalGlobalMem < 3*ARRAY_SIZE*sizeof(T))
|
if (props.totalGlobalMem < std::size_t{3}*ARRAY_SIZE*sizeof(T))
|
||||||
throw std::runtime_error("Device does not have enough memory for all 3 buffers");
|
throw std::runtime_error("Device does not have enough memory for all 3 buffers");
|
||||||
|
|
||||||
// Create device buffers
|
// Create device buffers
|
||||||
|
|||||||
@ -54,7 +54,7 @@ HIPStream<T>::HIPStream(const int ARRAY_SIZE, const int device_index)
|
|||||||
// Check buffers fit on the device
|
// Check buffers fit on the device
|
||||||
hipDeviceProp_t props;
|
hipDeviceProp_t props;
|
||||||
hipGetDeviceProperties(&props, 0);
|
hipGetDeviceProperties(&props, 0);
|
||||||
if (props.totalGlobalMem < 3*ARRAY_SIZE*sizeof(T))
|
if (props.totalGlobalMem < std::size_t{3}*ARRAY_SIZE*sizeof(T))
|
||||||
throw std::runtime_error("Device does not have enough memory for all 3 buffers");
|
throw std::runtime_error("Device does not have enough memory for all 3 buffers");
|
||||||
|
|
||||||
// Create device buffers
|
// Create device buffers
|
||||||
|
|||||||
10
src/main.cpp
10
src/main.cpp
@ -487,15 +487,15 @@ void check_solution(const unsigned int ntimes, std::vector<T>& a, std::vector<T>
|
|||||||
goldSum = goldA * goldB * ARRAY_SIZE;
|
goldSum = goldA * goldB * ARRAY_SIZE;
|
||||||
|
|
||||||
// Calculate the average error
|
// Calculate the average error
|
||||||
double errA = std::accumulate(a.begin(), a.end(), 0.0, [&](double sum, const T val){ return sum + fabs(val - goldA); });
|
long double errA = std::accumulate(a.begin(), a.end(), 0.0, [&](double sum, const T val){ return sum + fabs(val - goldA); });
|
||||||
errA /= a.size();
|
errA /= a.size();
|
||||||
double errB = std::accumulate(b.begin(), b.end(), 0.0, [&](double sum, const T val){ return sum + fabs(val - goldB); });
|
long double errB = std::accumulate(b.begin(), b.end(), 0.0, [&](double sum, const T val){ return sum + fabs(val - goldB); });
|
||||||
errB /= b.size();
|
errB /= b.size();
|
||||||
double errC = std::accumulate(c.begin(), c.end(), 0.0, [&](double sum, const T val){ return sum + fabs(val - goldC); });
|
long double errC = std::accumulate(c.begin(), c.end(), 0.0, [&](double sum, const T val){ return sum + fabs(val - goldC); });
|
||||||
errC /= c.size();
|
errC /= c.size();
|
||||||
double errSum = fabs((sum - goldSum)/goldSum);
|
long double errSum = fabs((sum - goldSum)/goldSum);
|
||||||
|
|
||||||
double epsi = std::numeric_limits<T>::epsilon() * 100.0;
|
long double epsi = std::numeric_limits<T>::epsilon() * 100.0;
|
||||||
|
|
||||||
if (errA > epsi)
|
if (errA > epsi)
|
||||||
std::cerr
|
std::cerr
|
||||||
|
|||||||
@ -34,6 +34,7 @@ public:
|
|||||||
iterator& operator++() { num++; return *this; }
|
// Prefix increment: step the stored value forward by one and hand back this iterator.
iterator& operator++() {
  ++num;
  return *this;
}
|
||||||
iterator operator++(int) { iterator retval = *this; ++(*this); return retval; }
|
// Postfix increment: advance this iterator via the prefix form, but return a
// copy taken before the advance.
iterator operator++(int) {
  iterator before = *this;
  operator++();
  return before;
}
|
||||||
iterator operator+(const value_type v) const { return iterator(num + v); }
|
// Offset: build a new iterator whose stored value is this one's plus v;
// this iterator is left unchanged.
iterator operator+(const value_type v) const {
  return iterator(this->num + v);
}
|
||||||
|
// Compound advance: move this iterator forward by x in place and return a
// reference to it. The earlier body returned a by-value copy made *before* the
// advance, so the value of `(it += n)` was the old position; returning *this
// matches the usual compound-assignment convention and what random-access
// iterator algorithms expect from `r += n`.
iterator& operator+=(int x) {
  num += x;
  return *this;
}
|
||||||
|
|
||||||
bool operator==(iterator other) const { return num == other.num; }
|
// Two iterators are equal exactly when their stored values are equal.
bool operator==(iterator other) const {
  return this->num == other.num;
}
|
||||||
bool operator!=(iterator other) const { return *this != other; }
|
// Inequality is the negation of operator==. The earlier body evaluated
// `*this != other`, which re-entered this same operator and recursed without
// end instead of comparing anything.
bool operator!=(iterator other) const {
  return !(*this == other);
}
|
||||||
|
|||||||
@ -53,6 +53,9 @@ macro(setup)
|
|||||||
message(STATUS "NVCC flags: ${CMAKE_CUDA_FLAGS} ${CMAKE_CUDA_FLAGS_${BUILD_TYPE}}")
|
message(STATUS "NVCC flags: ${CMAKE_CUDA_FLAGS} ${CMAKE_CUDA_FLAGS_${BUILD_TYPE}}")
|
||||||
|
|
||||||
|
|
||||||
|
# XXX NVHPC <= 21.9 has cub-config in `Linux_x86_64/21.9/cuda/11.4/include/cub/cmake`
|
||||||
|
# XXX NVHPC >= 22.3 has cub-config in `Linux_x86_64/22.3/cuda/11.6/lib64/cmake/cub/`
|
||||||
|
# same thing for thrust
|
||||||
if (SDK_DIR)
|
if (SDK_DIR)
|
||||||
find_package(CUB REQUIRED CONFIG PATHS ${SDK_DIR}/cub)
|
find_package(CUB REQUIRED CONFIG PATHS ${SDK_DIR}/cub)
|
||||||
find_package(Thrust REQUIRED CONFIG PATHS ${SDK_DIR}/thrust)
|
find_package(Thrust REQUIRED CONFIG PATHS ${SDK_DIR}/thrust)
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user