Initial Thrust implementation

This commit is contained in:
Tom Lin 2021-11-11 23:11:04 +00:00
parent 8f9ca7baa7
commit a66696d971
6 changed files with 379 additions and 0 deletions

View File

@ -113,6 +113,7 @@ register_model(ACC ACC ACCStream.cpp)
# defining RAJA collides with the RAJA namespace so USE_RAJA # defining RAJA collides with the RAJA namespace so USE_RAJA
register_model(RAJA USE_RAJA RAJAStream.cpp) register_model(RAJA USE_RAJA RAJAStream.cpp)
register_model(TBB TBB TBBStream.cpp) register_model(TBB TBB TBBStream.cpp)
register_model(THRUST THRUST ThrustStream.cu) # Thrust uses cu, even for rocThrust
set(USAGE ON CACHE BOOL "Whether to print all custom flags for the selected model") set(USAGE ON CACHE BOOL "Whether to print all custom flags for the selected model")

87
THRUST.cmake Normal file
View File

@ -0,0 +1,87 @@
# Build-time options for the Thrust model.
# register_flag_optional(NAME DESCRIPTION DEFAULT) is provided by the
# surrounding build infrastructure; these only declare cache variables.

# Selects which Thrust distribution the build targets.
register_flag_optional(THRUST_IMPL
"Which Thrust implementation to use, supported options include:
- CUDA (via https://github.com/NVIDIA/thrust)
- ROCM (via https://github.com/ROCmSoftwarePlatform/rocThrust)
"
"CUDA")

# Root of the chosen SDK; used as a CONFIG search path in setup().
register_flag_optional(SDK_DIR
"Path to the selected Thrust implementation (e.g `/opt/nvidia/hpc_sdk/Linux_x86_64/21.9/cuda/include/thrust/` for NVHPC, `/opt/rocm` for ROCm)"
"")

# Device backend handed to thrust_create_target(); only meaningful for CUDA Thrust.
register_flag_optional(BACKEND
"[THRUST_IMPL==CUDA] CUDA's Thrust implementation supports the following backends:
- CUDA
- OMP
- TBB
"
"CUDA")

register_flag_optional(CMAKE_CUDA_COMPILER
"[THRUST_IMPL==CUDA] Path to the CUDA nvcc compiler"
"")

# XXX we may want to drop this eventually and use CMAKE_CUDA_ARCHITECTURES directly
register_flag_optional(CUDA_ARCH
"[THRUST_IMPL==CUDA] Nvidia architecture, will be passed in via `-arch=` (e.g `sm_70`) for nvcc"
"")

register_flag_optional(CUDA_EXTRA_FLAGS
"[THRUST_IMPL==CUDA] Additional CUDA flags passed to nvcc, this is appended after `CUDA_ARCH`"
"")
# Configures the build for the selected Thrust implementation:
#  - THRUST_IMPL=CUDA: enables the CUDA language, locates CUB + Thrust, and
#    creates/links the Thrust interface target for the chosen BACKEND.
#  - THRUST_IMPL=ROCM: locates rocPRIM + rocThrust and links roc::rocthrust.
# Anything else is a hard configure error.
macro(setup)
    # Thrust's zip_function and the extended device lambdas need at least C++14
    set(CMAKE_CXX_STANDARD 14)

    # Quoted comparison: an empty/unset THRUST_IMPL would otherwise expand to
    # `if( STREQUAL "CUDA")` and abort with a syntax error instead of the
    # friendly FATAL_ERROR below.
    if ("${THRUST_IMPL}" STREQUAL "CUDA")
        # see CUDA.cmake, we're only adding a few Thrust related libraries here

        # CMP0104 OLD keeps the architecture flowing through `-arch=` (CUDA_ARCH)
        # rather than requiring CMAKE_CUDA_ARCHITECTURES
        if (POLICY CMP0104)
            cmake_policy(SET CMP0104 OLD)
        endif ()

        # CMAKE_CUDA_FLAGS must be a single space-separated string; building it
        # with set(var a b c) produces a ;-list whose semicolons leak onto the
        # nvcc command line, so join the pieces explicitly.
        # -forward-unknown-to-host-compiler is added for compatibility reasons.
        string(JOIN " " CMAKE_CUDA_FLAGS
                ${CMAKE_CUDA_FLAGS}
                "--expt-extended-lambda -forward-unknown-to-host-compiler -arch=${CUDA_ARCH}"
                ${CUDA_EXTRA_FLAGS})

        enable_language(CUDA)

        # CMake defaults to -O2 for CUDA at Release, let's wipe that and use
        # the global RELEASE_FLAG appended later
        wipe_gcc_style_optimisation_flags(CMAKE_CUDA_FLAGS_${BUILD_TYPE})

        message(STATUS "NVCC flags: ${CMAKE_CUDA_FLAGS} ${CMAKE_CUDA_FLAGS_${BUILD_TYPE}}")

        # Prefer the SDK's bundled packages when a root was given
        if (SDK_DIR)
            find_package(CUB REQUIRED CONFIG PATHS ${SDK_DIR}/cub)
            find_package(Thrust REQUIRED CONFIG PATHS ${SDK_DIR}/thrust)
        else ()
            find_package(CUB REQUIRED CONFIG)
            find_package(Thrust REQUIRED CONFIG)
        endif ()

        message(STATUS "Using Thrust backend: ${BACKEND}")

        # this creates the interface target that we can link to
        thrust_create_target(Thrust HOST CPP DEVICE ${BACKEND})
        register_link_library(Thrust)
    elseif ("${THRUST_IMPL}" STREQUAL "ROCM")
        if (SDK_DIR)
            find_package(rocprim REQUIRED CONFIG PATHS ${SDK_DIR}/rocprim)
            find_package(rocthrust REQUIRED CONFIG PATHS ${SDK_DIR}/rocthrust)
        else ()
            find_package(rocprim REQUIRED CONFIG)
            find_package(rocthrust REQUIRED CONFIG)
        endif ()
        register_link_library(roc::rocthrust)
    else ()
        message(FATAL_ERROR "Unsupported THRUST_IMPL provided: ${THRUST_IMPL}")
    endif ()
endmacro()

235
ThrustStream.cu Normal file
View File

@ -0,0 +1,235 @@
// Copyright (c) 2020 Tom Deakin
// University of Bristol HPC
//
// For full license terms please see the LICENSE file distributed with this
// source code
#include "ThrustStream.h"
#include <thrust/inner_product.h>
#include <thrust/device_vector.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/zip_function.h>
// Block the host until all outstanding device work completes.
// Only HIP needs this: CUDA Thrust's blocking algorithms synchronise
// before returning, so the body compiles away for every other backend.
static inline void synchronise()
{
// rocThrust doesn't synchronise between thrust calls
#if defined(THRUST_DEVICE_SYSTEM_HIP) && THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_HIP
hipDeviceSynchronize();
#endif
}
// Allocates the three device buffers and reports which device/backend the
// run will use. `device` is forwarded to the device-query helpers below.
// Fix: the original had the THRUST_DEVICE_SYSTEM_TBB branch twice; the second
// copy was unreachable dead code and has been removed.
template <class T>
ThrustStream<T>::ThrustStream(const int ARRAY_SIZE, int device)
    : array_size{ARRAY_SIZE}, a(array_size), b(array_size), c(array_size) {
  std::cout << "Using CUDA device: " << getDeviceName(device) << std::endl;
  std::cout << "Driver: " << getDeviceDriver(device) << std::endl;
  std::cout << "Thrust version: " << THRUST_VERSION << std::endl;
#if THRUST_DEVICE_SYSTEM == 0
  // as per Thrust docs, 0 is reserved for undefined backend
  std::cout << "Thrust backend: undefined" << std::endl;
#elif THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
  std::cout << "Thrust backend: CUDA" << std::endl;
#elif THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_OMP
  std::cout << "Thrust backend: OMP" << std::endl;
#elif THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_TBB
  std::cout << "Thrust backend: TBB" << std::endl;
#elif THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CPP
  std::cout << "Thrust backend: CPP" << std::endl;
#else
// HIP is not one of upstream Thrust's system macros, so probe for it last
#if defined(THRUST_DEVICE_SYSTEM_HIP) && THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_HIP
  std::cout << "Thrust backend: HIP" << std::endl;
#else
  std::cout << "Thrust backend: " << THRUST_DEVICE_SYSTEM << "(unknown)" << std::endl;
#endif
#endif
}
// Seed each device buffer with its benchmark starting value.
template <class T>
void ThrustStream<T>::init_arrays(T initA, T initB, T initC)
{
  thrust::fill_n(a.begin(), a.size(), initA);
  thrust::fill_n(b.begin(), b.size(), initB);
  thrust::fill_n(c.begin(), c.size(), initC);
  synchronise();
}
// Copy the device buffers back into the caller's host vectors.
template <class T>
void ThrustStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
{
  thrust::copy_n(a.begin(), a.size(), h_a.begin());
  thrust::copy_n(b.begin(), b.size(), h_b.begin());
  thrust::copy_n(c.begin(), c.size(), h_c.begin());
}
// STREAM "copy" kernel: c[i] = a[i]
template <class T>
void ThrustStream<T>::copy()
{
  thrust::copy_n(a.begin(), a.size(), c.begin());
  synchronise();
}
// STREAM "mul" kernel: b[i] = scalar * c[i]
template <class T>
void ThrustStream<T>::mul()
{
  const T scalar = startScalar;
  thrust::transform(c.begin(), c.end(), b.begin(),
      [=] __device__ __host__ (const T &x) { return x * scalar; });
  synchronise();
}
// STREAM "add" kernel: c[i] = a[i] + b[i]
template <class T>
void ThrustStream<T>::add()
{
  auto first = thrust::make_zip_iterator(thrust::make_tuple(a.begin(), b.begin()));
  auto last  = thrust::make_zip_iterator(thrust::make_tuple(a.end(), b.end()));
  thrust::transform(first, last, c.begin(),
      thrust::make_zip_function(
          [] __device__ __host__ (const T &x, const T &y) { return x + y; }));
  synchronise();
}
// STREAM "triad" kernel: a[i] = b[i] + scalar * c[i]
template <class T>
void ThrustStream<T>::triad()
{
  const T scalar = startScalar;
  auto first = thrust::make_zip_iterator(thrust::make_tuple(b.begin(), c.begin()));
  auto last  = thrust::make_zip_iterator(thrust::make_tuple(b.end(), c.end()));
  thrust::transform(first, last, a.begin(),
      thrust::make_zip_function(
          [=] __device__ __host__ (const T &x, const T &y) { return x + scalar * y; }));
  synchronise();
}
// "nstream" kernel: a[i] = a[i] + b[i] + scalar * c[i]
template <class T>
void ThrustStream<T>::nstream()
{
  const T scalar = startScalar;
  auto first = thrust::make_zip_iterator(thrust::make_tuple(a.begin(), b.begin(), c.begin()));
  auto last  = thrust::make_zip_iterator(thrust::make_tuple(a.end(), b.end(), c.end()));
  thrust::transform(first, last, a.begin(),
      thrust::make_zip_function(
          [=] __device__ __host__ (const T &x, const T &y, const T &z) {
            return x + y + scalar * z;
          }));
  synchronise();
}
// "dot" kernel: sum over a[i] * b[i]. inner_product returns the reduction
// to the host, so the call completes before returning — no explicit
// synchronise() is required here.
template <class T>
T ThrustStream<T>::dot()
{
  const T zero{};
  return thrust::inner_product(a.begin(), a.end(), b.begin(), zero);
}
// Device-query helpers are only meaningful on the CUDA or HIP backends;
// host backends (OMP/TBB/CPP) use the stub implementations further down.
#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA || \
(defined(THRUST_DEVICE_SYSTEM_HIP) && THRUST_DEVICE_SYSTEM_HIP == THRUST_DEVICE_SYSTEM)
// Map the shared code below onto the vendor runtime API:
// IMPL_FN__(GetLastError) -> cudaGetLastError / hipGetLastError, etc.
#ifdef __NVCC__
#define IMPL_FN__(fn) cuda ## fn
#define IMPL_TYPE__(tpe) cuda ## tpe
#elif defined(__HIP_PLATFORM_HCC__)
// HIP type names carry a _t suffix (e.g. hipDeviceProp_t)
#define IMPL_FN__(fn) hip ## fn
#define IMPL_TYPE__(tpe) hip ## tpe ## _t
#else
# error Unsupported compiler for Thrust
#endif
void check_error(void)
{
IMPL_FN__(Error_t) err = IMPL_FN__(GetLastError());
if (err != IMPL_FN__(Success))
{
std::cerr << "Error: " << IMPL_FN__(GetErrorString(err)) << std::endl;
exit(err);
}
}
// Print "index: name" for every device the runtime can see, or a message
// on stderr when none are available.
void listDevices(void)
{
  int count;
  IMPL_FN__(GetDeviceCount(&count));
  check_error();

  if (count == 0)
  {
    std::cerr << "No devices found." << std::endl;
    return;
  }

  std::cout << std::endl;
  std::cout << "Devices:" << std::endl;
  for (int d = 0; d < count; ++d)
  {
    std::cout << d << ": " << getDeviceName(d) << std::endl;
  }
  std::cout << std::endl;
}
std::string getDeviceName(const int device)
{
IMPL_TYPE__(DeviceProp) props = {};
IMPL_FN__(GetDeviceProperties(&props, device));
check_error();
return std::string(props.name);
}
std::string getDeviceDriver(const int device)
{
IMPL_FN__(SetDevice(device));
check_error();
int driver;
IMPL_FN__(DriverGetVersion(&driver));
check_error();
return std::to_string(driver);
}
#undef IMPL_FN__
// fix: was `#undef IMPL_TPE__` (typo), which silently left IMPL_TYPE__
// defined for the rest of the translation unit
#undef IMPL_TYPE__
#else
// Host-only backends expose no enumerable devices; report the CPU as
// the single pseudo-device.
void listDevices(void)
{
  std::cout << "0: CPU" << std::endl;
}
// No queryable device name on host-only backends; the index is ignored.
std::string getDeviceName(const int)
{
  return "(device name unavailable)";
}
// No driver to report on host-only backends; the index is ignored.
std::string getDeviceDriver(const int)
{
  return "(device driver unavailable)";
}
#endif
template class ThrustStream<float>;
template class ThrustStream<double>;

43
ThrustStream.h Normal file
View File

@ -0,0 +1,43 @@
// Copyright (c) 2020 Tom Deakin
// University of Bristol HPC
//
// For full license terms please see the LICENSE file distributed with this
// source code
#pragma once
#include <iostream>
#include <vector>
#include <thrust/device_vector.h>
#include "Stream.h"
#define IMPLEMENTATION_STRING "Thrust"
// Thrust implementation of the Stream<T> benchmark interface: the three
// working arrays live in thrust::device_vector buffers and each kernel is
// expressed as a Thrust algorithm (see ThrustStream.cu).
template <class T>
class ThrustStream : public Stream<T>
{
protected:
// Size of arrays
int array_size;
// Benchmark working buffers; allocated on the device selected by the
// Thrust backend at construction time
thrust::device_vector<T> a;
thrust::device_vector<T> b;
thrust::device_vector<T> c;
public:
// (array size, device index)
ThrustStream(const int, int);
~ThrustStream() = default;
virtual void copy() override;
virtual void add() override;
virtual void mul() override;
virtual void triad() override;
virtual void nstream() override;
virtual T dot() override;
virtual void init_arrays(T initA, T initB, T initC) override;
virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
};

View File

@ -171,6 +171,11 @@ build_gcc() {
# -DCUDA_TOOLKIT_ROOT_DIR=${NVHPC_CUDA_DIR:?} \ # -DCUDA_TOOLKIT_ROOT_DIR=${NVHPC_CUDA_DIR:?} \
# -DCUDA_ARCH=$NV_ARCH" # -DCUDA_ARCH=$NV_ARCH"
run_build $name "${GCC_CXX:?}" THRUST "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVSDK/cuda/include/thrust -DTHRUST_IMPL=CUDA -DBACKEND=CUDA"
run_build $name "${GCC_CXX:?}" THRUST "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVSDK/cuda/include/thrust -DTHRUST_IMPL=CUDA -DBACKEND=OMP"
run_build $name "${GCC_CXX:?}" THRUST "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVSDK/cuda/include/thrust -DTHRUST_IMPL=CUDA -DBACKEND=TBB"
run_build $name "${GCC_CXX:?}" THRUST "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVSDK/cuda/include/thrust -DTHRUST_IMPL=CUDA -DBACKEND=CPP"
} }
build_clang() { build_clang() {
@ -220,6 +225,8 @@ build_aomp() {
build_hip() { build_hip() {
run_build hip_build "${HIP_CXX:?}" HIP "-DCMAKE_CXX_COMPILER=${HIP_CXX:?}" run_build hip_build "${HIP_CXX:?}" HIP "-DCMAKE_CXX_COMPILER=${HIP_CXX:?}"
run_build $name "${GCC_CXX:?}" THRUST "$cxx -DCMAKE_CXX_COMPILER=${HIP_CXX:?} -DSDK_DIR=$ROCM_PATH -DTHRUST_IMPL=ROCM"
} }
build_icpx() { build_icpx() {

View File

@ -27,6 +27,8 @@
#include "STD20Stream.hpp" #include "STD20Stream.hpp"
#elif defined(TBB) #elif defined(TBB)
#include "TBBStream.hpp" #include "TBBStream.hpp"
#elif defined(THRUST)
#include "ThrustStream.h"
#elif defined(HIP) #elif defined(HIP)
#include "HIPStream.h" #include "HIPStream.h"
#elif defined(HC) #elif defined(HC)
@ -272,6 +274,10 @@ void run()
// Use the C++20 implementation // Use the C++20 implementation
stream = new TBBStream<T>(ARRAY_SIZE, deviceIndex); stream = new TBBStream<T>(ARRAY_SIZE, deviceIndex);
#elif defined(THRUST)
// Use the Thrust implementation
stream = new ThrustStream<T>(ARRAY_SIZE, deviceIndex);
#elif defined(ACC) #elif defined(ACC)
// Use the OpenACC implementation // Use the OpenACC implementation
stream = new ACCStream<T>(ARRAY_SIZE, deviceIndex); stream = new ACCStream<T>(ARRAY_SIZE, deviceIndex);