diff --git a/CMakeLists.txt b/CMakeLists.txt index 797a9c0..42abfd8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -113,6 +113,7 @@ register_model(ACC ACC ACCStream.cpp) # defining RAJA collides with the RAJA namespace so USE_RAJA register_model(RAJA USE_RAJA RAJAStream.cpp) register_model(TBB TBB TBBStream.cpp) +register_model(THRUST THRUST ThrustStream.cu) # Thrust uses cu, even for rocThrust set(USAGE ON CACHE BOOL "Whether to print all custom flags for the selected model") diff --git a/THRUST.cmake b/THRUST.cmake new file mode 100644 index 0000000..0c286c2 --- /dev/null +++ b/THRUST.cmake @@ -0,0 +1,91 @@ + +register_flag_optional(THRUST_IMPL + "Which Thrust implementation to use, supported options include: + - CUDA (via https://github.com/NVIDIA/thrust) + - ROCM (via https://github.com/ROCmSoftwarePlatform/rocThrust) + " + "CUDA") + +register_flag_optional(SDK_DIR + "Path to the selected Thrust implementation (e.g `/opt/nvidia/hpc_sdk/Linux_x86_64/21.9/cuda/include` for NVHPC, `/opt/rocm` for ROCm)" + "") + +register_flag_optional(BACKEND + "[THRUST_IMPL==CUDA] CUDA's Thrust implementation supports the following backends: + - CUDA + - OMP + - TBB + " + "CUDA") + +register_flag_optional(CMAKE_CUDA_COMPILER + "[THRUST_IMPL==CUDA] Path to the CUDA nvcc compiler" + "") + +# XXX we may want to drop this eventually and use CMAKE_CUDA_ARCHITECTURES directly +register_flag_optional(CUDA_ARCH + "[THRUST_IMPL==CUDA] Nvidia architecture, will be passed in via `-arch=` (e.g `sm_70`) for nvcc" + "") + +register_flag_optional(CUDA_EXTRA_FLAGS + "[THRUST_IMPL==CUDA] Additional CUDA flags passed to nvcc, this is appended after `CUDA_ARCH`" + "") + + +macro(setup) + set(CMAKE_CXX_STANDARD 14) + + if (${THRUST_IMPL} STREQUAL "CUDA") + + # see CUDA.cmake, we're only adding a few Thrust related libraries here + + if (POLICY CMP0104) + cmake_policy(SET CMP0104 OLD) + endif () + + # add -forward-unknown-to-host-compiler for compatibility reasons + set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "--expt-extended-lambda -forward-unknown-to-host-compiler -arch=${CUDA_ARCH}" ${CUDA_EXTRA_FLAGS}) + enable_language(CUDA) + # CMake defaults to -O2 for CUDA at Release, let's wipe that and use the global RELEASE_FLAG + # appended later + wipe_gcc_style_optimisation_flags(CMAKE_CUDA_FLAGS_${BUILD_TYPE}) + + message(STATUS "NVCC flags: ${CMAKE_CUDA_FLAGS} ${CMAKE_CUDA_FLAGS_${BUILD_TYPE}}") + + + if (SDK_DIR) + find_package(CUB REQUIRED CONFIG PATHS ${SDK_DIR}/cub) + find_package(Thrust REQUIRED CONFIG PATHS ${SDK_DIR}/thrust) + else () + find_package(CUB REQUIRED CONFIG) + find_package(Thrust REQUIRED CONFIG) + endif () + + message(STATUS "Using Thrust backend: ${BACKEND}") + + # this creates the interface that we can link to + thrust_create_target(Thrust HOST CPP DEVICE ${BACKEND}) + + register_link_library(Thrust) + elseif (${THRUST_IMPL} STREQUAL "ROCM") + if (SDK_DIR) + find_package(rocprim REQUIRED CONFIG PATHS ${SDK_DIR}/rocprim) + find_package(rocthrust REQUIRED CONFIG PATHS ${SDK_DIR}/rocthrust) + else () + find_package(rocprim REQUIRED CONFIG) + find_package(rocthrust REQUIRED CONFIG) + endif () + + # for HIP we treat *.cu files as CXX otherwise CMake doesn't compile them + set_source_files_properties(${IMPL_SOURCES} PROPERTIES LANGUAGE CXX) + + register_link_library(roc::rocthrust) + else () + message(FATAL_ERROR "Unsupported THRUST_IMPL provided: ${THRUST_IMPL}") + endif () + + +endmacro() + + + \ No newline at end of file diff --git a/ThrustStream.cu b/ThrustStream.cu new file mode 100644 index 0000000..3a57ab0 --- /dev/null +++ b/ThrustStream.cu @@ -0,0 +1,235 @@ +// Copyright (c) 2020 Tom Deakin +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#include "ThrustStream.h" +#include +#include +#include +#include + +static inline void synchronise() +{ +// rocThrust doesn't synchronise between thrust calls +#if defined(THRUST_DEVICE_SYSTEM_HIP) && THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_HIP + hipDeviceSynchronize(); +#endif +} + +template +ThrustStream::ThrustStream(const int ARRAY_SIZE, int device) + : array_size{ARRAY_SIZE}, a(array_size), b(array_size), c(array_size) { + std::cout << "Using CUDA device: " << getDeviceName(device) << std::endl; + std::cout << "Driver: " << getDeviceDriver(device) << std::endl; + std::cout << "Thrust version: " << THRUST_VERSION << std::endl; + +#if THRUST_DEVICE_SYSTEM == 0 + // as per Thrust docs, 0 is reserved for undefined backend + std::cout << "Thrust backend: undefined" << std::endl; +#elif THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA + std::cout << "Thrust backend: CUDA" << std::endl; +#elif THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_OMP + std::cout << "Thrust backend: OMP" << std::endl; +#elif THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_TBB + std::cout << "Thrust backend: TBB" << std::endl; +#elif THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CPP + std::cout << "Thrust backend: CPP" << std::endl; +#elif THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_TBB + std::cout << "Thrust backend: TBB" << std::endl; +#else + +#if defined(THRUST_DEVICE_SYSTEM_HIP) && THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_HIP + std::cout << "Thrust backend: HIP" << std::endl; +#else + std::cout << "Thrust backend: " << THRUST_DEVICE_SYSTEM << "(unknown)" << std::endl; +#endif + +#endif + +} + +template +void ThrustStream::init_arrays(T initA, T initB, T initC) +{ + thrust::fill(a.begin(), a.end(), initA); + thrust::fill(b.begin(), b.end(), initB); + thrust::fill(c.begin(), c.end(), initC); + synchronise(); +} + +template +void ThrustStream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) +{ + thrust::copy(a.begin(), a.end(), h_a.begin()); + thrust::copy(b.begin(), b.end(), h_b.begin()); + thrust::copy(c.begin(), c.end(), h_c.begin()); +} + +template +void ThrustStream::copy() +{ + thrust::copy(a.begin(), a.end(),c.begin()); + synchronise(); +} + +template +void ThrustStream::mul() +{ + const T scalar = startScalar; + thrust::transform( + c.begin(), + c.end(), + b.begin(), + [=] __device__ __host__ (const T &ci){ + return ci * scalar; + } + ); + synchronise(); +} + +template +void ThrustStream::add() +{ + thrust::transform( + thrust::make_zip_iterator(thrust::make_tuple(a.begin(), b.begin())), + thrust::make_zip_iterator(thrust::make_tuple(a.end(), b.end())), + c.begin(), + thrust::make_zip_function( + [] __device__ __host__ (const T& ai, const T& bi){ + return ai + bi; + }) + ); + synchronise(); +} + +template +void ThrustStream::triad() +{ + const T scalar = startScalar; + thrust::transform( + thrust::make_zip_iterator(thrust::make_tuple(b.begin(), c.begin())), + thrust::make_zip_iterator(thrust::make_tuple(b.end(), c.end())), + a.begin(), + thrust::make_zip_function( + [=] __device__ __host__ (const T& bi, const T& ci){ + return bi + scalar * ci; + }) + ); + synchronise(); +} + +template +void ThrustStream::nstream() +{ + const T scalar = startScalar; + thrust::transform( + thrust::make_zip_iterator(thrust::make_tuple(a.begin(), b.begin(), c.begin())), + thrust::make_zip_iterator(thrust::make_tuple(a.end(), b.end(), c.end())), + a.begin(), + thrust::make_zip_function( + [=] __device__ __host__ (const T& ai, const T& bi, const T& ci){ + return ai + bi + scalar * ci; + }) + ); + synchronise(); +} + +template +T ThrustStream::dot() +{ + return thrust::inner_product(a.begin(), a.end(), b.begin(), T{}); +} + +#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA || \ + (defined(THRUST_DEVICE_SYSTEM_HIP) && THRUST_DEVICE_SYSTEM_HIP == THRUST_DEVICE_SYSTEM) + +#ifdef __NVCC__ +#define IMPL_FN__(fn) cuda ## fn +#define IMPL_TYPE__(tpe) cuda ## tpe +#elif defined(__HIP_PLATFORM_HCC__) +#define IMPL_FN__(fn) hip ## fn +#define IMPL_TYPE__(tpe) hip ## tpe ## _t +#else +# error Unsupported compiler for Thrust +#endif + +void check_error(void) +{ + IMPL_FN__(Error_t) err = IMPL_FN__(GetLastError()); + if (err != IMPL_FN__(Success)) + { + std::cerr << "Error: " << IMPL_FN__(GetErrorString(err)) << std::endl; + exit(err); + } +} + +void listDevices(void) +{ + // Get number of devices + int count; + IMPL_FN__(GetDeviceCount(&count)); + check_error(); + + // Print device names + if (count == 0) + { + std::cerr << "No devices found." << std::endl; + } + else + { + std::cout << std::endl; + std::cout << "Devices:" << std::endl; + for (int i = 0; i < count; i++) + { + std::cout << i << ": " << getDeviceName(i) << std::endl; + } + std::cout << std::endl; + } +} + +std::string getDeviceName(const int device) +{ + IMPL_TYPE__(DeviceProp) props = {}; + IMPL_FN__(GetDeviceProperties(&props, device)); + check_error(); + return std::string(props.name); +} + + +std::string getDeviceDriver(const int device) +{ + IMPL_FN__(SetDevice(device)); + check_error(); + int driver; + IMPL_FN__(DriverGetVersion(&driver)); + check_error(); + return std::to_string(driver); +} + +#undef IMPL_FN__ +#undef IMPL_TPE__ + +#else + +void listDevices(void) +{ + std::cout << "0: CPU" << std::endl; +} + +std::string getDeviceName(const int) +{ + return std::string("(device name unavailable)"); +} + +std::string getDeviceDriver(const int) +{ + return std::string("(device driver unavailable)"); +} + +#endif + +template class ThrustStream; +template class ThrustStream; + diff --git a/ThrustStream.h b/ThrustStream.h new file mode 100644 index 0000000..f87ace7 --- /dev/null +++ b/ThrustStream.h @@ -0,0 +1,43 @@ +// Copyright (c) 2020 Tom Deakin +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#pragma once + +#include +#include +#include + +#include "Stream.h" + +#define IMPLEMENTATION_STRING "Thrust" + +template +class ThrustStream : public Stream +{ + protected: + // Size of arrays + int array_size; + + thrust::device_vector a; + thrust::device_vector b; + thrust::device_vector c; + + public: + ThrustStream(const int, int); + ~ThrustStream() = default; + + virtual void copy() override; + virtual void add() override; + virtual void mul() override; + virtual void triad() override; + virtual void nstream() override; + virtual T dot() override; + + virtual void init_arrays(T initA, T initB, T initC) override; + virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; + +}; + diff --git a/ci-prepare-bionic.sh b/ci-prepare-bionic.sh index fa3b2d2..7294905 100755 --- a/ci-prepare-bionic.sh +++ b/ci-prepare-bionic.sh @@ -135,17 +135,20 @@ setup_aocc() { setup_nvhpc() { echo "Preparing Nvidia HPC SDK" local tarball="nvhpc.tar.gz" -# local url="http://localhost:8000/nvhpc_2021_215_Linux_x86_64_cuda_11.3.tar.gz" - local url="https://developer.download.nvidia.com/hpc-sdk/21.5/nvhpc_2021_215_Linux_x86_64_cuda_11.3.tar.gz" +# local url="http://localhost:8000/nvhpc_2021_219_Linux_x86_64_cuda_11.4.tar.gz" + local url="https://developer.download.nvidia.com/hpc-sdk/21.9/nvhpc_2021_219_Linux_x86_64_cuda_11.4.tar.gz" get_and_untar "$tarball" "$url" - local sdk_dir="$PWD/nvhpc_2021_215_Linux_x86_64_cuda_11.3/install_components/Linux_x86_64/21.5" + local sdk_dir="$PWD/nvhpc_2021_219_Linux_x86_64_cuda_11.4/install_components/Linux_x86_64/21.9" local bin_dir="$sdk_dir/compilers/bin" "$bin_dir/makelocalrc" "$bin_dir" -x + export_var NVHPC_SDK_DIR "$sdk_dir" + export_var NVHPC_CUDA_DIR "$sdk_dir/cuda/11.4" + export_var NVHPC_NVCXX "$bin_dir/nvc++" - export_var NVHPC_NVCC "$sdk_dir/cuda/11.3/bin/nvcc" - export_var NVHPC_CUDA_DIR "$sdk_dir/cuda/11.3" + export_var NVHPC_NVCC "$sdk_dir/cuda/11.4/bin/nvcc" + echo "Installed CUDA versions:" ls "$sdk_dir/cuda" verify_bin_exists "$NVHPC_NVCXX" @@ -249,10 +252,11 @@ setup_clang_gcc() { setup_rocm() { wget -q -O - "https://repo.radeon.com/rocm/rocm.gpg.key" | sudo apt-key add - - echo 'deb [arch=amd64] https://repo.radeon.com/rocm/apt/debian/ xenial main' | sudo tee /etc/apt/sources.list.d/rocm.list + echo 'deb [arch=amd64] https://repo.radeon.com/rocm/apt/4.5 ubuntu main' | sudo tee /etc/apt/sources.list.d/rocm.list sudo apt-get update -qq - sudo apt-get install -y -qq rocm-dev + sudo apt-get install -y -qq rocm-dev rocthrust-dev export_var ROCM_PATH "/opt/rocm" + export_var PATH "$ROCM_PATH/bin:$PATH" # ROCm needs this for many of their libraries' CMake build to work export_var HIP_CXX "$ROCM_PATH/bin/hipcc" verify_bin_exists "$HIP_CXX" "$HIP_CXX" --version diff --git a/ci-test-compile.sh b/ci-test-compile.sh index 00ca718..a61834c 100755 --- a/ci-test-compile.sh +++ b/ci-test-compile.sh @@ -57,7 +57,7 @@ run_build() { local cmake_code=$? "$CMAKE_BIN" --build "$build" -j "$(nproc)" &>>"$log" - "$CMAKE_BIN" --build "$build" --target install -j "$(nproc)" &>>"$log" + "$CMAKE_BIN" --build "$build" --target install -j "$(nproc)" &>>"$log" local cmake_code=$? set -e @@ -92,11 +92,11 @@ run_build() { # GCC_CXX="/usr/bin/g++" # CLANG_CXX="/usr/bin/clang++" -# NVSDK="/home/tom/Downloads/nvhpc_2021_212_Linux_x86_64_cuda_11.2/install_components/Linux_x86_64/21.2/" -# NVHPC_NVCXX="$NVSDK/compilers/bin/nvc++" -# NVHPC_NVCC="$NVSDK/cuda/11.2/bin/nvcc" -# NVHPC_CUDA_DIR="$NVSDK/cuda/11.2" -# "$NVSDK/compilers/bin/makelocalrc" "$NVSDK/compilers/bin/" -x +# NVHPC_SDK_DIR="/home/tom/Downloads/nvhpc_2021_212_Linux_x86_64_cuda_11.2/install_components/Linux_x86_64/21.2/" +# NVHPC_NVCXX="$NVHPC_SDK_DIR/compilers/bin/nvc++" +# NVHPC_NVCC="$NVHPC_SDK_DIR/cuda/11.2/bin/nvcc" +# NVHPC_CUDA_DIR="$NVHPC_SDK_DIR/cuda/11.2" +# "$NVHPC_SDK_DIR/compilers/bin/makelocalrc" "$NVHPC_SDK_DIR/compilers/bin/" -x # AOCC_CXX="/opt/AMD/aocc-compiler-2.3.0/bin/clang++" # AOMP_CXX="/usr/lib/aomp/bin/clang++" @@ -124,7 +124,7 @@ run_build() { AMD_ARCH="gfx_903" NV_ARCH="sm_70" -NV_ARCH_CCXY="cuda11.3,cc80" +NV_ARCH_CCXY="cuda11.4,cc80" build_gcc() { local name="gcc_build" @@ -171,6 +171,28 @@ build_gcc() { # -DCUDA_TOOLKIT_ROOT_DIR=${NVHPC_CUDA_DIR:?} \ # -DCUDA_ARCH=$NV_ARCH" + + # CMake >= 3.15 only due to Nvidia's Thrust CMake requirements + local current=$("$CMAKE_BIN" --version | head -n 1 | cut -d ' ' -f3) + local required="3.15.0" + if [ "$(printf '%s\n' "$required" "$current" | sort -V | head -n1)" = "$required" ]; then + run_build $name "${GCC_CXX:?}" THRUST "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/include -DTHRUST_IMPL=CUDA -DBACKEND=CUDA" + run_build $name "${GCC_CXX:?}" THRUST "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/include -DTHRUST_IMPL=CUDA -DBACKEND=OMP" + run_build $name "${GCC_CXX:?}" THRUST "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/include -DTHRUST_IMPL=CUDA -DBACKEND=CPP" + + # FIXME CUDA Thrust + TBB throws the following error: + # /usr/lib/gcc/x86_64-linux-gnu/9/include/avx512fintrin.h(9146): error: identifier "__builtin_ia32_rndscaless_round" is undefined + # /usr/lib/gcc/x86_64-linux-gnu/9/include/avx512fintrin.h(9155): error: identifier "__builtin_ia32_rndscalesd_round" is undefined + # /usr/lib/gcc/x86_64-linux-gnu/9/include/avx512fintrin.h(14797): error: identifier "__builtin_ia32_rndscaless_round" is undefined + # /usr/lib/gcc/x86_64-linux-gnu/9/include/avx512fintrin.h(14806): error: identifier "__builtin_ia32_rndscalesd_round" is undefined + # /usr/lib/gcc/x86_64-linux-gnu/9/include/avx512dqintrin.h(1365): error: identifier "__builtin_ia32_fpclassss" is undefined + # /usr/lib/gcc/x86_64-linux-gnu/9/include/avx512dqintrin.h(1372): error: identifier "__builtin_ia32_fpclasssd" is undefined + + # run_build $name "${GCC_CXX:?}" THRUST "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/include -DTHRUST_IMPL=CUDA -DBACKEND=TBB" + else + echo "CMake version ${current} < ${required}, skipping Thrust models" + fi + } build_clang() { @@ -193,7 +215,7 @@ build_clang() { run_build $name "${CLANG_CXX:?}" OCL "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}" run_build $name "${CLANG_CXX:?}" STD "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" # run_build $name "${LANG_CXX:?}" STD20 "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" # not yet supported - + run_build $name "${CLANG_CXX:?}" TBB "$cxx -DONE_TBB_DIR=$TBB_LIB" run_build $name "${CLANG_CXX:?}" TBB "$cxx" # build TBB again with the system TBB @@ -219,7 +241,11 @@ build_aomp() { } build_hip() { - run_build hip_build "${HIP_CXX:?}" HIP "-DCMAKE_CXX_COMPILER=${HIP_CXX:?}" + local name="hip_build" + + run_build $name "${HIP_CXX:?}" HIP "-DCMAKE_CXX_COMPILER=${HIP_CXX:?}" + + run_build $name "${GCC_CXX:?}" THRUST "-DCMAKE_CXX_COMPILER=${HIP_CXX:?} -DSDK_DIR=$ROCM_PATH -DTHRUST_IMPL=ROCM" } build_icpx() { diff --git a/main.cpp b/main.cpp index de301ce..2791bdc 100644 --- a/main.cpp +++ b/main.cpp @@ -27,6 +27,8 @@ #include "STD20Stream.hpp" #elif defined(TBB) #include "TBBStream.hpp" +#elif defined(THRUST) +#include "ThrustStream.h" #elif defined(HIP) #include "HIPStream.h" #elif defined(HC) @@ -272,6 +274,10 @@ void run() // Use the C++20 implementation stream = new TBBStream(ARRAY_SIZE, deviceIndex); +#elif defined(THRUST) + // Use the Thrust implementation + stream = new ThrustStream(ARRAY_SIZE, deviceIndex); + #elif defined(ACC) // Use the OpenACC implementation stream = new ACCStream(ARRAY_SIZE, deviceIndex);