Initial Thrust implementation
This commit is contained in:
parent
8f9ca7baa7
commit
a66696d971
@ -113,6 +113,7 @@ register_model(ACC ACC ACCStream.cpp)
|
||||
# defining RAJA collides with the RAJA namespace so USE_RAJA
|
||||
register_model(RAJA USE_RAJA RAJAStream.cpp)
|
||||
register_model(TBB TBB TBBStream.cpp)
|
||||
register_model(THRUST THRUST ThrustStream.cu) # Thrust uses cu, even for rocThrust
|
||||
|
||||
|
||||
set(USAGE ON CACHE BOOL "Whether to print all custom flags for the selected model")
|
||||
|
||||
87
THRUST.cmake
Normal file
87
THRUST.cmake
Normal file
@ -0,0 +1,87 @@
|
||||
|
||||
register_flag_optional(THRUST_IMPL
|
||||
"Which Thrust implementation to use, supported options include:
|
||||
- CUDA (via https://github.com/NVIDIA/thrust)
|
||||
- ROCM (VIA https://github.com/ROCmSoftwarePlatform/rocThrust)
|
||||
"
|
||||
"CUDA")
|
||||
|
||||
register_flag_optional(SDK_DIR
|
||||
"Path to the selected Thrust implementation (e.g `/opt/nvidia/hpc_sdk/Linux_x86_64/21.9/cuda/include/thrust/` for NVHPC, `/opt/rocm` for ROCm)"
|
||||
"")
|
||||
|
||||
register_flag_optional(BACKEND
|
||||
"[THRUST_IMPL==CUDA] CUDA's Thrust implementation supports the following backends:
|
||||
- CUDA
|
||||
- OMP
|
||||
- TBB
|
||||
"
|
||||
"CUDA")
|
||||
|
||||
register_flag_optional(CMAKE_CUDA_COMPILER
|
||||
"[THRUST_IMPL==CUDA] Path to the CUDA nvcc compiler"
|
||||
"")
|
||||
|
||||
# XXX we may want to drop this eventually and use CMAKE_CUDA_ARCHITECTURES directly
|
||||
register_flag_optional(CUDA_ARCH
|
||||
"[THRUST_IMPL==CUDA] Nvidia architecture, will be passed in via `-arch=` (e.g `sm_70`) for nvcc"
|
||||
"")
|
||||
|
||||
register_flag_optional(CUDA_EXTRA_FLAGS
|
||||
"[THRUST_IMPL==CUDA] Additional CUDA flags passed to nvcc, this is appended after `CUDA_ARCH`"
|
||||
"")
|
||||
|
||||
|
||||
macro(setup)
|
||||
set(CMAKE_CXX_STANDARD 14)
|
||||
|
||||
if (${THRUST_IMPL} STREQUAL "CUDA")
|
||||
|
||||
# see CUDA.cmake, we're only adding a few Thrust related libraries here
|
||||
|
||||
if (POLICY CMP0104)
|
||||
cmake_policy(SET CMP0104 OLD)
|
||||
endif ()
|
||||
|
||||
# add -forward-unknown-to-host-compiler for compatibility reasons
|
||||
set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "--expt-extended-lambda -forward-unknown-to-host-compiler -arch=${CUDA_ARCH}" ${CUDA_EXTRA_FLAGS})
|
||||
enable_language(CUDA)
|
||||
# CMake defaults to -O2 for CUDA at Release, let's wipe that and use the global RELEASE_FLAG
|
||||
# appended later
|
||||
wipe_gcc_style_optimisation_flags(CMAKE_CUDA_FLAGS_${BUILD_TYPE})
|
||||
|
||||
message(STATUS "NVCC flags: ${CMAKE_CUDA_FLAGS} ${CMAKE_CUDA_FLAGS_${BUILD_TYPE}}")
|
||||
|
||||
|
||||
if (SDK_DIR)
|
||||
find_package(CUB REQUIRED CONFIG PATHS ${SDK_DIR}/cub)
|
||||
find_package(Thrust REQUIRED CONFIG PATHS ${SDK_DIR}/thrust)
|
||||
else ()
|
||||
find_package(CUB REQUIRED CONFIG)
|
||||
find_package(Thrust REQUIRED CONFIG)
|
||||
endif ()
|
||||
|
||||
message(STATUS "Using Thrust backend: ${BACKEND}")
|
||||
|
||||
# this creates the interface that we can link to
|
||||
thrust_create_target(Thrust HOST CPP DEVICE ${BACKEND})
|
||||
|
||||
register_link_library(Thrust)
|
||||
elseif (${THRUST_IMPL} STREQUAL "ROCM")
|
||||
if (SDK_DIR)
|
||||
find_package(rocprim REQUIRED CONFIG PATHS ${SDK_DIR}/rocprim)
|
||||
find_package(rocthrust REQUIRED CONFIG PATHS ${SDK_DIR}/rocthrust)
|
||||
else ()
|
||||
find_package(rocprim REQUIRED CONFIG)
|
||||
find_package(rocthrust REQUIRED CONFIG)
|
||||
endif ()
|
||||
register_link_library(roc::rocthrust)
|
||||
else ()
|
||||
message(FATAL_ERROR "Unsupported THRUST_IMPL provided: ${THRUST_IMPL}")
|
||||
endif ()
|
||||
|
||||
|
||||
endmacro()
|
||||
|
||||
|
||||
|
||||
235
ThrustStream.cu
Normal file
235
ThrustStream.cu
Normal file
@ -0,0 +1,235 @@
|
||||
// Copyright (c) 2020 Tom Deakin
|
||||
// University of Bristol HPC
|
||||
//
|
||||
// For full license terms please see the LICENSE file distributed with this
|
||||
// source code
|
||||
|
||||
#include "ThrustStream.h"
|
||||
#include <thrust/inner_product.h>
|
||||
#include <thrust/device_vector.h>
|
||||
#include <thrust/iterator/zip_iterator.h>
|
||||
#include <thrust/zip_function.h>
|
||||
|
||||
static inline void synchronise()
|
||||
{
|
||||
// rocThrust doesn't synchronise between thrust calls
|
||||
#if defined(THRUST_DEVICE_SYSTEM_HIP) && THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_HIP
|
||||
hipDeviceSynchronize();
|
||||
#endif
|
||||
}
|
||||
|
||||
template <class T>
|
||||
ThrustStream<T>::ThrustStream(const int ARRAY_SIZE, int device)
|
||||
: array_size{ARRAY_SIZE}, a(array_size), b(array_size), c(array_size) {
|
||||
std::cout << "Using CUDA device: " << getDeviceName(device) << std::endl;
|
||||
std::cout << "Driver: " << getDeviceDriver(device) << std::endl;
|
||||
std::cout << "Thrust version: " << THRUST_VERSION << std::endl;
|
||||
|
||||
#if THRUST_DEVICE_SYSTEM == 0
|
||||
// as per Thrust docs, 0 is reserved for undefined backend
|
||||
std::cout << "Thrust backend: undefined" << std::endl;
|
||||
#elif THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
|
||||
std::cout << "Thrust backend: CUDA" << std::endl;
|
||||
#elif THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_OMP
|
||||
std::cout << "Thrust backend: OMP" << std::endl;
|
||||
#elif THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_TBB
|
||||
std::cout << "Thrust backend: TBB" << std::endl;
|
||||
#elif THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CPP
|
||||
std::cout << "Thrust backend: CPP" << std::endl;
|
||||
#elif THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_TBB
|
||||
std::cout << "Thrust backend: TBB" << std::endl;
|
||||
#else
|
||||
|
||||
#if defined(THRUST_DEVICE_SYSTEM_HIP) && THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_HIP
|
||||
std::cout << "Thrust backend: HIP" << std::endl;
|
||||
#else
|
||||
std::cout << "Thrust backend: " << THRUST_DEVICE_SYSTEM << "(unknown)" << std::endl;
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void ThrustStream<T>::init_arrays(T initA, T initB, T initC)
|
||||
{
|
||||
thrust::fill(a.begin(), a.end(), initA);
|
||||
thrust::fill(b.begin(), b.end(), initB);
|
||||
thrust::fill(c.begin(), c.end(), initC);
|
||||
synchronise();
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void ThrustStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
|
||||
{
|
||||
thrust::copy(a.begin(), a.end(), h_a.begin());
|
||||
thrust::copy(b.begin(), b.end(), h_b.begin());
|
||||
thrust::copy(c.begin(), c.end(), h_c.begin());
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void ThrustStream<T>::copy()
|
||||
{
|
||||
thrust::copy(a.begin(), a.end(),c.begin());
|
||||
synchronise();
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void ThrustStream<T>::mul()
|
||||
{
|
||||
const T scalar = startScalar;
|
||||
thrust::transform(
|
||||
c.begin(),
|
||||
c.end(),
|
||||
b.begin(),
|
||||
[=] __device__ __host__ (const T &ci){
|
||||
return ci * scalar;
|
||||
}
|
||||
);
|
||||
synchronise();
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void ThrustStream<T>::add()
|
||||
{
|
||||
thrust::transform(
|
||||
thrust::make_zip_iterator(thrust::make_tuple(a.begin(), b.begin())),
|
||||
thrust::make_zip_iterator(thrust::make_tuple(a.end(), b.end())),
|
||||
c.begin(),
|
||||
thrust::make_zip_function(
|
||||
[] __device__ __host__ (const T& ai, const T& bi){
|
||||
return ai + bi;
|
||||
})
|
||||
);
|
||||
synchronise();
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void ThrustStream<T>::triad()
|
||||
{
|
||||
const T scalar = startScalar;
|
||||
thrust::transform(
|
||||
thrust::make_zip_iterator(thrust::make_tuple(b.begin(), c.begin())),
|
||||
thrust::make_zip_iterator(thrust::make_tuple(b.end(), c.end())),
|
||||
a.begin(),
|
||||
thrust::make_zip_function(
|
||||
[=] __device__ __host__ (const T& bi, const T& ci){
|
||||
return bi + scalar * ci;
|
||||
})
|
||||
);
|
||||
synchronise();
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void ThrustStream<T>::nstream()
|
||||
{
|
||||
const T scalar = startScalar;
|
||||
thrust::transform(
|
||||
thrust::make_zip_iterator(thrust::make_tuple(a.begin(), b.begin(), c.begin())),
|
||||
thrust::make_zip_iterator(thrust::make_tuple(a.end(), b.end(), c.end())),
|
||||
a.begin(),
|
||||
thrust::make_zip_function(
|
||||
[=] __device__ __host__ (const T& ai, const T& bi, const T& ci){
|
||||
return ai + bi + scalar * ci;
|
||||
})
|
||||
);
|
||||
synchronise();
|
||||
}
|
||||
|
||||
template <class T>
|
||||
T ThrustStream<T>::dot()
|
||||
{
|
||||
return thrust::inner_product(a.begin(), a.end(), b.begin(), T{});
|
||||
}
|
||||
|
||||
#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA || \
|
||||
(defined(THRUST_DEVICE_SYSTEM_HIP) && THRUST_DEVICE_SYSTEM_HIP == THRUST_DEVICE_SYSTEM)
|
||||
|
||||
#ifdef __NVCC__
|
||||
#define IMPL_FN__(fn) cuda ## fn
|
||||
#define IMPL_TYPE__(tpe) cuda ## tpe
|
||||
#elif defined(__HIP_PLATFORM_HCC__)
|
||||
#define IMPL_FN__(fn) hip ## fn
|
||||
#define IMPL_TYPE__(tpe) hip ## tpe ## _t
|
||||
#else
|
||||
# error Unsupported compiler for Thrust
|
||||
#endif
|
||||
|
||||
void check_error(void)
|
||||
{
|
||||
IMPL_FN__(Error_t) err = IMPL_FN__(GetLastError());
|
||||
if (err != IMPL_FN__(Success))
|
||||
{
|
||||
std::cerr << "Error: " << IMPL_FN__(GetErrorString(err)) << std::endl;
|
||||
exit(err);
|
||||
}
|
||||
}
|
||||
|
||||
void listDevices(void)
|
||||
{
|
||||
// Get number of devices
|
||||
int count;
|
||||
IMPL_FN__(GetDeviceCount(&count));
|
||||
check_error();
|
||||
|
||||
// Print device names
|
||||
if (count == 0)
|
||||
{
|
||||
std::cerr << "No devices found." << std::endl;
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cout << std::endl;
|
||||
std::cout << "Devices:" << std::endl;
|
||||
for (int i = 0; i < count; i++)
|
||||
{
|
||||
std::cout << i << ": " << getDeviceName(i) << std::endl;
|
||||
}
|
||||
std::cout << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
std::string getDeviceName(const int device)
|
||||
{
|
||||
IMPL_TYPE__(DeviceProp) props = {};
|
||||
IMPL_FN__(GetDeviceProperties(&props, device));
|
||||
check_error();
|
||||
return std::string(props.name);
|
||||
}
|
||||
|
||||
|
||||
std::string getDeviceDriver(const int device)
|
||||
{
|
||||
IMPL_FN__(SetDevice(device));
|
||||
check_error();
|
||||
int driver;
|
||||
IMPL_FN__(DriverGetVersion(&driver));
|
||||
check_error();
|
||||
return std::to_string(driver);
|
||||
}
|
||||
|
||||
#undef IMPL_FN__
|
||||
#undef IMPL_TPE__
|
||||
|
||||
#else
|
||||
|
||||
void listDevices(void)
|
||||
{
|
||||
std::cout << "0: CPU" << std::endl;
|
||||
}
|
||||
|
||||
std::string getDeviceName(const int)
|
||||
{
|
||||
return std::string("(device name unavailable)");
|
||||
}
|
||||
|
||||
std::string getDeviceDriver(const int)
|
||||
{
|
||||
return std::string("(device driver unavailable)");
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
template class ThrustStream<float>;
|
||||
template class ThrustStream<double>;
|
||||
|
||||
43
ThrustStream.h
Normal file
43
ThrustStream.h
Normal file
@ -0,0 +1,43 @@
|
||||
// Copyright (c) 2020 Tom Deakin
|
||||
// University of Bristol HPC
|
||||
//
|
||||
// For full license terms please see the LICENSE file distributed with this
|
||||
// source code
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <thrust/device_vector.h>
|
||||
|
||||
#include "Stream.h"
|
||||
|
||||
#define IMPLEMENTATION_STRING "Thrust"
|
||||
|
||||
template <class T>
|
||||
class ThrustStream : public Stream<T>
|
||||
{
|
||||
protected:
|
||||
// Size of arrays
|
||||
int array_size;
|
||||
|
||||
thrust::device_vector<T> a;
|
||||
thrust::device_vector<T> b;
|
||||
thrust::device_vector<T> c;
|
||||
|
||||
public:
|
||||
ThrustStream(const int, int);
|
||||
~ThrustStream() = default;
|
||||
|
||||
virtual void copy() override;
|
||||
virtual void add() override;
|
||||
virtual void mul() override;
|
||||
virtual void triad() override;
|
||||
virtual void nstream() override;
|
||||
virtual T dot() override;
|
||||
|
||||
virtual void init_arrays(T initA, T initB, T initC) override;
|
||||
virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
|
||||
|
||||
};
|
||||
|
||||
@ -171,6 +171,11 @@ build_gcc() {
|
||||
# -DCUDA_TOOLKIT_ROOT_DIR=${NVHPC_CUDA_DIR:?} \
|
||||
# -DCUDA_ARCH=$NV_ARCH"
|
||||
|
||||
run_build $name "${GCC_CXX:?}" THRUST "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVSDK/cuda/include/thrust -DTHRUST_IMPL=CUDA -DBACKEND=CUDA"
|
||||
run_build $name "${GCC_CXX:?}" THRUST "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVSDK/cuda/include/thrust -DTHRUST_IMPL=CUDA -DBACKEND=OMP"
|
||||
run_build $name "${GCC_CXX:?}" THRUST "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVSDK/cuda/include/thrust -DTHRUST_IMPL=CUDA -DBACKEND=TBB"
|
||||
run_build $name "${GCC_CXX:?}" THRUST "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVSDK/cuda/include/thrust -DTHRUST_IMPL=CUDA -DBACKEND=CPP"
|
||||
|
||||
}
|
||||
|
||||
build_clang() {
|
||||
@ -220,6 +225,8 @@ build_aomp() {
|
||||
|
||||
build_hip() {
|
||||
run_build hip_build "${HIP_CXX:?}" HIP "-DCMAKE_CXX_COMPILER=${HIP_CXX:?}"
|
||||
|
||||
run_build $name "${GCC_CXX:?}" THRUST "$cxx -DCMAKE_CXX_COMPILER=${HIP_CXX:?} -DSDK_DIR=$ROCM_PATH -DTHRUST_IMPL=ROCM"
|
||||
}
|
||||
|
||||
build_icpx() {
|
||||
|
||||
6
main.cpp
6
main.cpp
@ -27,6 +27,8 @@
|
||||
#include "STD20Stream.hpp"
|
||||
#elif defined(TBB)
|
||||
#include "TBBStream.hpp"
|
||||
#elif defined(THRUST)
|
||||
#include "ThrustStream.h"
|
||||
#elif defined(HIP)
|
||||
#include "HIPStream.h"
|
||||
#elif defined(HC)
|
||||
@ -272,6 +274,10 @@ void run()
|
||||
// Use the C++20 implementation
|
||||
stream = new TBBStream<T>(ARRAY_SIZE, deviceIndex);
|
||||
|
||||
#elif defined(THRUST)
|
||||
// Use the Thrust implementation
|
||||
stream = new ThrustStream<T>(ARRAY_SIZE, deviceIndex);
|
||||
|
||||
#elif defined(ACC)
|
||||
// Use the OpenACC implementation
|
||||
stream = new ACCStream<T>(ARRAY_SIZE, deviceIndex);
|
||||
|
||||
Loading…
Reference in New Issue
Block a user