diff --git a/CMakeLists.txt b/CMakeLists.txt index 797a9c0..42abfd8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -113,6 +113,7 @@ register_model(ACC ACC ACCStream.cpp) # defining RAJA collides with the RAJA namespace so USE_RAJA register_model(RAJA USE_RAJA RAJAStream.cpp) register_model(TBB TBB TBBStream.cpp) +register_model(THRUST THRUST ThrustStream.cu) # Thrust uses cu, even for rocThrust set(USAGE ON CACHE BOOL "Whether to print all custom flags for the selected model") diff --git a/THRUST.cmake b/THRUST.cmake new file mode 100644 index 0000000..1b94bf5 --- /dev/null +++ b/THRUST.cmake @@ -0,0 +1,87 @@ + +register_flag_optional(THRUST_IMPL + "Which Thrust implementation to use, supported options include: + - CUDA (via https://github.com/NVIDIA/thrust) + - ROCM (VIA https://github.com/ROCmSoftwarePlatform/rocThrust) + " + "CUDA") + +register_flag_optional(SDK_DIR + "Path to the selected Thrust implementation (e.g `/opt/nvidia/hpc_sdk/Linux_x86_64/21.9/cuda/include/thrust/` for NVHPC, `/opt/rocm` for ROCm)" + "") + +register_flag_optional(BACKEND + "[THRUST_IMPL==CUDA] CUDA's Thrust implementation supports the following backends: + - CUDA + - OMP + - TBB + " + "CUDA") + +register_flag_optional(CMAKE_CUDA_COMPILER + "[THRUST_IMPL==CUDA] Path to the CUDA nvcc compiler" + "") + +# XXX we may want to drop this eventually and use CMAKE_CUDA_ARCHITECTURES directly +register_flag_optional(CUDA_ARCH + "[THRUST_IMPL==CUDA] Nvidia architecture, will be passed in via `-arch=` (e.g `sm_70`) for nvcc" + "") + +register_flag_optional(CUDA_EXTRA_FLAGS + "[THRUST_IMPL==CUDA] Additional CUDA flags passed to nvcc, this is appended after `CUDA_ARCH`" + "") + + +macro(setup) + set(CMAKE_CXX_STANDARD 14) + + if (${THRUST_IMPL} STREQUAL "CUDA") + + # see CUDA.cmake, we're only adding a few Thrust related libraries here + + if (POLICY CMP0104) + cmake_policy(SET CMP0104 OLD) + endif () + + # add -forward-unknown-to-host-compiler for compatibility reasons + set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "--expt-extended-lambda -forward-unknown-to-host-compiler -arch=${CUDA_ARCH}" ${CUDA_EXTRA_FLAGS}) + enable_language(CUDA) + # CMake defaults to -O2 for CUDA at Release, let's wipe that and use the global RELEASE_FLAG + # appended later + wipe_gcc_style_optimisation_flags(CMAKE_CUDA_FLAGS_${BUILD_TYPE}) + + message(STATUS "NVCC flags: ${CMAKE_CUDA_FLAGS} ${CMAKE_CUDA_FLAGS_${BUILD_TYPE}}") + + + if (SDK_DIR) + find_package(CUB REQUIRED CONFIG PATHS ${SDK_DIR}/cub) + find_package(Thrust REQUIRED CONFIG PATHS ${SDK_DIR}/thrust) + else () + find_package(CUB REQUIRED CONFIG) + find_package(Thrust REQUIRED CONFIG) + endif () + + message(STATUS "Using Thrust backend: ${BACKEND}") + + # this creates the interface that we can link to + thrust_create_target(Thrust HOST CPP DEVICE ${BACKEND}) + + register_link_library(Thrust) + elseif (${THRUST_IMPL} STREQUAL "ROCM") + if (SDK_DIR) + find_package(rocprim REQUIRED CONFIG PATHS ${SDK_DIR}/rocprim) + find_package(rocthrust REQUIRED CONFIG PATHS ${SDK_DIR}/rocthrust) + else () + find_package(rocprim REQUIRED CONFIG) + find_package(rocthrust REQUIRED CONFIG) + endif () + register_link_library(roc::rocthrust) + else () + message(FATAL_ERROR "Unsupported THRUST_IMPL provided: ${THRUST_IMPL}") + endif () + + +endmacro() + + + \ No newline at end of file diff --git a/ThrustStream.cu b/ThrustStream.cu new file mode 100644 index 0000000..3a57ab0 --- /dev/null +++ b/ThrustStream.cu @@ -0,0 +1,235 @@ +// Copyright (c) 2020 Tom Deakin +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#include "ThrustStream.h" +#include +#include +#include +#include + +static inline void synchronise() +{ +// rocThrust doesn't synchronise between thrust calls +#if defined(THRUST_DEVICE_SYSTEM_HIP) && THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_HIP + hipDeviceSynchronize(); +#endif +} + +template +ThrustStream::ThrustStream(const int ARRAY_SIZE, int device) + : array_size{ARRAY_SIZE}, a(array_size), b(array_size), c(array_size) { + std::cout << "Using CUDA device: " << getDeviceName(device) << std::endl; + std::cout << "Driver: " << getDeviceDriver(device) << std::endl; + std::cout << "Thrust version: " << THRUST_VERSION << std::endl; + +#if THRUST_DEVICE_SYSTEM == 0 + // as per Thrust docs, 0 is reserved for undefined backend + std::cout << "Thrust backend: undefined" << std::endl; +#elif THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA + std::cout << "Thrust backend: CUDA" << std::endl; +#elif THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_OMP + std::cout << "Thrust backend: OMP" << std::endl; +#elif THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_TBB + std::cout << "Thrust backend: TBB" << std::endl; +#elif THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CPP + std::cout << "Thrust backend: CPP" << std::endl; +#elif THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_TBB + std::cout << "Thrust backend: TBB" << std::endl; +#else + +#if defined(THRUST_DEVICE_SYSTEM_HIP) && THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_HIP + std::cout << "Thrust backend: HIP" << std::endl; +#else + std::cout << "Thrust backend: " << THRUST_DEVICE_SYSTEM << "(unknown)" << std::endl; +#endif + +#endif + +} + +template +void ThrustStream::init_arrays(T initA, T initB, T initC) +{ + thrust::fill(a.begin(), a.end(), initA); + thrust::fill(b.begin(), b.end(), initB); + thrust::fill(c.begin(), c.end(), initC); + synchronise(); +} + +template +void ThrustStream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) +{ + thrust::copy(a.begin(), a.end(), h_a.begin()); + thrust::copy(b.begin(), b.end(), h_b.begin()); + thrust::copy(c.begin(), c.end(), h_c.begin()); +} + +template +void ThrustStream::copy() +{ + thrust::copy(a.begin(), a.end(),c.begin()); + synchronise(); +} + +template +void ThrustStream::mul() +{ + const T scalar = startScalar; + thrust::transform( + c.begin(), + c.end(), + b.begin(), + [=] __device__ __host__ (const T &ci){ + return ci * scalar; + } + ); + synchronise(); +} + +template +void ThrustStream::add() +{ + thrust::transform( + thrust::make_zip_iterator(thrust::make_tuple(a.begin(), b.begin())), + thrust::make_zip_iterator(thrust::make_tuple(a.end(), b.end())), + c.begin(), + thrust::make_zip_function( + [] __device__ __host__ (const T& ai, const T& bi){ + return ai + bi; + }) + ); + synchronise(); +} + +template +void ThrustStream::triad() +{ + const T scalar = startScalar; + thrust::transform( + thrust::make_zip_iterator(thrust::make_tuple(b.begin(), c.begin())), + thrust::make_zip_iterator(thrust::make_tuple(b.end(), c.end())), + a.begin(), + thrust::make_zip_function( + [=] __device__ __host__ (const T& bi, const T& ci){ + return bi + scalar * ci; + }) + ); + synchronise(); +} + +template +void ThrustStream::nstream() +{ + const T scalar = startScalar; + thrust::transform( + thrust::make_zip_iterator(thrust::make_tuple(a.begin(), b.begin(), c.begin())), + thrust::make_zip_iterator(thrust::make_tuple(a.end(), b.end(), c.end())), + a.begin(), + thrust::make_zip_function( + [=] __device__ __host__ (const T& ai, const T& bi, const T& ci){ + return ai + bi + scalar * ci; + }) + ); + synchronise(); +} + +template +T ThrustStream::dot() +{ + return thrust::inner_product(a.begin(), a.end(), b.begin(), T{}); +} + +#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA || \ + (defined(THRUST_DEVICE_SYSTEM_HIP) && THRUST_DEVICE_SYSTEM_HIP == THRUST_DEVICE_SYSTEM) + +#ifdef __NVCC__ +#define IMPL_FN__(fn) cuda ## fn +#define IMPL_TYPE__(tpe) cuda ## tpe +#elif defined(__HIP_PLATFORM_HCC__) +#define IMPL_FN__(fn) hip ## fn +#define IMPL_TYPE__(tpe) hip ## tpe ## _t +#else +# error Unsupported compiler for Thrust +#endif + +void check_error(void) +{ + IMPL_FN__(Error_t) err = IMPL_FN__(GetLastError()); + if (err != IMPL_FN__(Success)) + { + std::cerr << "Error: " << IMPL_FN__(GetErrorString(err)) << std::endl; + exit(err); + } +} + +void listDevices(void) +{ + // Get number of devices + int count; + IMPL_FN__(GetDeviceCount(&count)); + check_error(); + + // Print device names + if (count == 0) + { + std::cerr << "No devices found." << std::endl; + } + else + { + std::cout << std::endl; + std::cout << "Devices:" << std::endl; + for (int i = 0; i < count; i++) + { + std::cout << i << ": " << getDeviceName(i) << std::endl; + } + std::cout << std::endl; + } +} + +std::string getDeviceName(const int device) +{ + IMPL_TYPE__(DeviceProp) props = {}; + IMPL_FN__(GetDeviceProperties(&props, device)); + check_error(); + return std::string(props.name); +} + + +std::string getDeviceDriver(const int device) +{ + IMPL_FN__(SetDevice(device)); + check_error(); + int driver; + IMPL_FN__(DriverGetVersion(&driver)); + check_error(); + return std::to_string(driver); +} + +#undef IMPL_FN__ +#undef IMPL_TPE__ + +#else + +void listDevices(void) +{ + std::cout << "0: CPU" << std::endl; +} + +std::string getDeviceName(const int) +{ + return std::string("(device name unavailable)"); +} + +std::string getDeviceDriver(const int) +{ + return std::string("(device driver unavailable)"); +} + +#endif + +template class ThrustStream; +template class ThrustStream; + diff --git a/ThrustStream.h b/ThrustStream.h new file mode 100644 index 0000000..f87ace7 --- /dev/null +++ b/ThrustStream.h @@ -0,0 +1,43 @@ +// Copyright (c) 2020 Tom Deakin +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#pragma once + +#include +#include +#include + +#include "Stream.h" + +#define IMPLEMENTATION_STRING "Thrust" + +template +class ThrustStream : public Stream +{ + protected: + // Size of arrays + int array_size; + + thrust::device_vector a; + thrust::device_vector b; + thrust::device_vector c; + + public: + ThrustStream(const int, int); + ~ThrustStream() = default; + + virtual void copy() override; + virtual void add() override; + virtual void mul() override; + virtual void triad() override; + virtual void nstream() override; + virtual T dot() override; + + virtual void init_arrays(T initA, T initB, T initC) override; + virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; + +}; + diff --git a/ci-test-compile.sh b/ci-test-compile.sh index 00ca718..0a162f2 100755 --- a/ci-test-compile.sh +++ b/ci-test-compile.sh @@ -171,6 +171,11 @@ build_gcc() { # -DCUDA_TOOLKIT_ROOT_DIR=${NVHPC_CUDA_DIR:?} \ # -DCUDA_ARCH=$NV_ARCH" + run_build $name "${GCC_CXX:?}" THRUST "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVSDK/cuda/include/thrust -DTHRUST_IMPL=CUDA -DBACKEND=CUDA" + run_build $name "${GCC_CXX:?}" THRUST "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVSDK/cuda/include/thrust -DTHRUST_IMPL=CUDA -DBACKEND=OMP" + run_build $name "${GCC_CXX:?}" THRUST "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVSDK/cuda/include/thrust -DTHRUST_IMPL=CUDA -DBACKEND=TBB" + run_build $name "${GCC_CXX:?}" THRUST "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVSDK/cuda/include/thrust -DTHRUST_IMPL=CUDA -DBACKEND=CPP" + } build_clang() { @@ -220,6 +225,8 @@ build_aomp() { build_hip() { run_build hip_build "${HIP_CXX:?}" HIP "-DCMAKE_CXX_COMPILER=${HIP_CXX:?}" + + run_build $name "${GCC_CXX:?}" THRUST "$cxx -DCMAKE_CXX_COMPILER=${HIP_CXX:?} -DSDK_DIR=$ROCM_PATH -DTHRUST_IMPL=ROCM" } build_icpx() { diff --git a/main.cpp b/main.cpp index de301ce..2791bdc 100644 --- a/main.cpp +++ b/main.cpp @@ -27,6 +27,8 @@ #include "STD20Stream.hpp" #elif defined(TBB) #include "TBBStream.hpp" +#elif defined(THRUST) +#include "ThrustStream.h" #elif defined(HIP) #include "HIPStream.h" #elif defined(HC) @@ -272,6 +274,10 @@ void run() // Use the C++20 implementation stream = new TBBStream(ARRAY_SIZE, deviceIndex); +#elif defined(THRUST) + // Use the Thrust implementation + stream = new ThrustStream(ARRAY_SIZE, deviceIndex); + #elif defined(ACC) // Use the OpenACC implementation stream = new ACCStream(ARRAY_SIZE, deviceIndex);