Merge pull request #111 from UoB-HPC/thrust

Thrust Implementation
This commit is contained in:
Tom Deakin 2021-11-25 13:17:48 +00:00 committed by GitHub
commit 53547ff664
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 422 additions and 16 deletions

View File

@ -113,6 +113,7 @@ register_model(ACC ACC ACCStream.cpp)
# defining RAJA collides with the RAJA namespace so USE_RAJA
register_model(RAJA USE_RAJA RAJAStream.cpp)
register_model(TBB TBB TBBStream.cpp)
register_model(THRUST THRUST ThrustStream.cu) # Thrust uses cu, even for rocThrust
set(USAGE ON CACHE BOOL "Whether to print all custom flags for the selected model")

91
THRUST.cmake Normal file
View File

@ -0,0 +1,91 @@
# --- User-facing configuration flags for the Thrust model ---

# Selects which vendor's Thrust fork the build targets; consumed by setup().
register_flag_optional(THRUST_IMPL
"Which Thrust implementation to use, supported options include:
- CUDA (via https://github.com/NVIDIA/thrust)
- ROCM (via https://github.com/ROCmSoftwarePlatform/rocThrust)
"
"CUDA")

# Optional install root; forwarded to find_package(... PATHS ${SDK_DIR}/...)
# so CMake can locate CUB/Thrust (CUDA) or rocprim/rocthrust (ROCM).
register_flag_optional(SDK_DIR
"Path to the selected Thrust implementation (e.g `/opt/nvidia/hpc_sdk/Linux_x86_64/21.9/cuda/include` for NVHPC, `/opt/rocm` for ROCm)"
"")

# Device backend handed to thrust_create_target; only used when
# THRUST_IMPL=CUDA.
# NOTE(review): the CI driver elsewhere in this change also passes
# BACKEND=CPP, which this help text does not list — confirm and document.
register_flag_optional(BACKEND
"[THRUST_IMPL==CUDA] CUDA's Thrust implementation supports the following backends:
- CUDA
- OMP
- TBB
"
"CUDA")

# nvcc to use when enable_language(CUDA) runs in setup().
register_flag_optional(CMAKE_CUDA_COMPILER
"[THRUST_IMPL==CUDA] Path to the CUDA nvcc compiler"
"")

# XXX we may want to drop this eventually and use CMAKE_CUDA_ARCHITECTURES directly
register_flag_optional(CUDA_ARCH
"[THRUST_IMPL==CUDA] Nvidia architecture, will be passed in via `-arch=` (e.g `sm_70`) for nvcc"
"")

register_flag_optional(CUDA_EXTRA_FLAGS
"[THRUST_IMPL==CUDA] Additional CUDA flags passed to nvcc, this is appended after `CUDA_ARCH`"
"")
# Configures the build for the selected Thrust implementation.
# Kept as a macro on purpose: enable_language() and the CMAKE_* variables it
# sets must take effect in the caller's scope.
macro(setup)
    set(CMAKE_CXX_STANDARD 14)
    # Quote the expansion: with `if (${THRUST_IMPL} STREQUAL ...)` an empty or
    # variable-valued THRUST_IMPL would be re-dereferenced (CMP0054 footgun).
    if ("${THRUST_IMPL}" STREQUAL "CUDA")

        # see CUDA.cmake, we're only adding a few Thrust related libraries here

        # CMP0104 OLD stops CMake from insisting on CMAKE_CUDA_ARCHITECTURES;
        # the architecture is passed manually via -arch=${CUDA_ARCH} below.
        if (POLICY CMP0104)
            cmake_policy(SET CMP0104 OLD)
        endif ()

        # add -forward-unknown-to-host-compiler for compatibility reasons.
        # Append as ONE string: the previous list-style
        #   set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "..." ${CUDA_EXTRA_FLAGS})
        # joined elements with semicolons and corrupted the nvcc command line
        # whenever CUDA_EXTRA_FLAGS was non-empty.
        string(REPLACE ";" " " THRUST_CUDA_EXTRA_FLAGS "${CUDA_EXTRA_FLAGS}")
        string(APPEND CMAKE_CUDA_FLAGS " --expt-extended-lambda -forward-unknown-to-host-compiler -arch=${CUDA_ARCH} ${THRUST_CUDA_EXTRA_FLAGS}")
        enable_language(CUDA)
        # CMake defaults to -O2 for CUDA at Release, let's wipe that and use the global RELEASE_FLAG
        # appended later
        wipe_gcc_style_optimisation_flags(CMAKE_CUDA_FLAGS_${BUILD_TYPE})

        message(STATUS "NVCC flags: ${CMAKE_CUDA_FLAGS} ${CMAKE_CUDA_FLAGS_${BUILD_TYPE}}")

        if (SDK_DIR)
            find_package(CUB REQUIRED CONFIG PATHS ${SDK_DIR}/cub)
            find_package(Thrust REQUIRED CONFIG PATHS ${SDK_DIR}/thrust)
        else ()
            find_package(CUB REQUIRED CONFIG)
            find_package(Thrust REQUIRED CONFIG)
        endif ()

        message(STATUS "Using Thrust backend: ${BACKEND}")
        # this creates the interface target that we can link to
        thrust_create_target(Thrust HOST CPP DEVICE ${BACKEND})
        register_link_library(Thrust)
    elseif ("${THRUST_IMPL}" STREQUAL "ROCM")
        if (SDK_DIR)
            find_package(rocprim REQUIRED CONFIG PATHS ${SDK_DIR}/rocprim)
            find_package(rocthrust REQUIRED CONFIG PATHS ${SDK_DIR}/rocthrust)
        else ()
            find_package(rocprim REQUIRED CONFIG)
            find_package(rocthrust REQUIRED CONFIG)
        endif ()
        # for HIP we treat *.cu files as CXX otherwise CMake doesn't compile them
        set_source_files_properties(${IMPL_SOURCES} PROPERTIES LANGUAGE CXX)
        register_link_library(roc::rocthrust)
    else ()
        message(FATAL_ERROR "Unsupported THRUST_IMPL provided: ${THRUST_IMPL}")
    endif ()
endmacro()

235
ThrustStream.cu Normal file
View File

@ -0,0 +1,235 @@
// Copyright (c) 2020 Tom Deakin
// University of Bristol HPC
//
// For full license terms please see the LICENSE file distributed with this
// source code
#include "ThrustStream.h"
#include <thrust/inner_product.h>
#include <thrust/device_vector.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/zip_function.h>
// Barrier used after each kernel so timing measures completed work.
static inline void synchronise()
{
// rocThrust doesn't synchronise between thrust calls
// NOTE(review): compiles to a no-op on non-HIP backends — confirm that the
// Thrust calls used here block (or are synchronised by the runtime) on the
// CUDA backend as well.
#if defined(THRUST_DEVICE_SYSTEM_HIP) && THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_HIP
hipDeviceSynchronize();
#endif
}
// Allocates the three device buffers and reports device/backend information.
template <class T>
ThrustStream<T>::ThrustStream(const int ARRAY_SIZE, int device)
  : array_size{ARRAY_SIZE}, a(array_size), b(array_size), c(array_size) {
  std::cout << "Using CUDA device: " << getDeviceName(device) << std::endl;
  std::cout << "Driver: " << getDeviceDriver(device) << std::endl;
  std::cout << "Thrust version: " << THRUST_VERSION << std::endl;
// Report the backend Thrust was compiled against. Each backend is tested
// exactly once; the original chain checked THRUST_DEVICE_SYSTEM_TBB twice,
// and the second branch was unreachable dead code — removed.
#if THRUST_DEVICE_SYSTEM == 0
  // as per Thrust docs, 0 is reserved for undefined backend
  std::cout << "Thrust backend: undefined" << std::endl;
#elif THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
  std::cout << "Thrust backend: CUDA" << std::endl;
#elif THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_OMP
  std::cout << "Thrust backend: OMP" << std::endl;
#elif THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_TBB
  std::cout << "Thrust backend: TBB" << std::endl;
#elif THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CPP
  std::cout << "Thrust backend: CPP" << std::endl;
#else
  #if defined(THRUST_DEVICE_SYSTEM_HIP) && THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_HIP
  std::cout << "Thrust backend: HIP" << std::endl;
  #else
  std::cout << "Thrust backend: " << THRUST_DEVICE_SYSTEM << "(unknown)" << std::endl;
  #endif
#endif
}
// Seed every element of the three device buffers with its starting value.
template <class T>
void ThrustStream<T>::init_arrays(T initA, T initB, T initC)
{
  thrust::fill_n(a.begin(), a.size(), initA);
  thrust::fill_n(b.begin(), b.size(), initB);
  thrust::fill_n(c.begin(), c.size(), initC);
  synchronise();
}
// Copy each device buffer back into its host-side mirror for verification.
template <class T>
void ThrustStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
{
  thrust::copy_n(a.cbegin(), a.size(), h_a.begin());
  thrust::copy_n(b.cbegin(), b.size(), h_b.begin());
  thrust::copy_n(c.cbegin(), c.size(), h_c.begin());
}
// Stream "copy" kernel: c[i] = a[i].
template <class T>
void ThrustStream<T>::copy()
{
  thrust::copy(a.cbegin(), a.cend(), c.begin());
  synchronise();
}
// Stream "mul" kernel: b[i] = scalar * c[i].
template <class T>
void ThrustStream<T>::mul()
{
  const T alpha = startScalar;
  // Extended host/device lambda named for readability before being passed in.
  auto scale = [=] __device__ __host__ (const T& ci) {
    return ci * alpha;
  };
  thrust::transform(c.begin(), c.end(), b.begin(), scale);
  synchronise();
}
// Stream "add" kernel: c[i] = a[i] + b[i].
template <class T>
void ThrustStream<T>::add()
{
  // Zip a and b so the binary operation sees one fused input range.
  auto first = thrust::make_zip_iterator(thrust::make_tuple(a.begin(), b.begin()));
  auto last  = thrust::make_zip_iterator(thrust::make_tuple(a.end(), b.end()));
  auto sum = [] __device__ __host__ (const T& ai, const T& bi) {
    return ai + bi;
  };
  thrust::transform(first, last, c.begin(), thrust::make_zip_function(sum));
  synchronise();
}
// Stream "triad" kernel: a[i] = b[i] + scalar * c[i].
template <class T>
void ThrustStream<T>::triad()
{
  const T alpha = startScalar;
  // Zip b and c so the binary operation sees one fused input range.
  auto first = thrust::make_zip_iterator(thrust::make_tuple(b.begin(), c.begin()));
  auto last  = thrust::make_zip_iterator(thrust::make_tuple(b.end(), c.end()));
  auto triad_op = [=] __device__ __host__ (const T& bi, const T& ci) {
    return bi + alpha * ci;
  };
  thrust::transform(first, last, a.begin(), thrust::make_zip_function(triad_op));
  synchronise();
}
// Stream "nstream" kernel: a[i] = a[i] + b[i] + scalar * c[i].
template <class T>
void ThrustStream<T>::nstream()
{
  const T alpha = startScalar;
  // Zip all three buffers; the output overwrites a in place.
  auto first = thrust::make_zip_iterator(thrust::make_tuple(a.begin(), b.begin(), c.begin()));
  auto last  = thrust::make_zip_iterator(thrust::make_tuple(a.end(), b.end(), c.end()));
  auto nstream_op = [=] __device__ __host__ (const T& ai, const T& bi, const T& ci) {
    return ai + bi + alpha * ci;
  };
  thrust::transform(first, last, a.begin(), thrust::make_zip_function(nstream_op));
  synchronise();
}
// Stream "dot" kernel: returns sum(a[i] * b[i]) starting from T{}.
// inner_product returns the reduced value to the host, so no explicit
// synchronise() is needed here.
template <class T>
T ThrustStream<T>::dot()
{
return thrust::inner_product(a.begin(), a.end(), b.begin(), T{});
}
#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA || \
  (defined(THRUST_DEVICE_SYSTEM_HIP) && THRUST_DEVICE_SYSTEM_HIP == THRUST_DEVICE_SYSTEM)

// Map the platform runtime API onto common macros so the device-query helpers
// below are written once for both CUDA (cudaFoo / cudaBar) and
// HIP (hipFoo / hipBar_t).
#ifdef __NVCC__
#define IMPL_FN__(fn) cuda ## fn
#define IMPL_TYPE__(tpe) cuda ## tpe
#elif defined(__HIP_PLATFORM_HCC__)
#define IMPL_FN__(fn) hip ## fn
#define IMPL_TYPE__(tpe) hip ## tpe ## _t
#else
# error Unsupported compiler for Thrust
#endif

// Exits the process (with the error code) if the last runtime call failed.
void check_error(void)
{
  IMPL_FN__(Error_t) err = IMPL_FN__(GetLastError());
  if (err != IMPL_FN__(Success))
  {
    std::cerr << "Error: " << IMPL_FN__(GetErrorString(err)) << std::endl;
    exit(err);
  }
}

// Prints the index and name of every visible device, or a message if none.
void listDevices(void)
{
  // Get number of devices
  int count;
  IMPL_FN__(GetDeviceCount(&count));
  check_error();

  // Print device names
  if (count == 0)
  {
    std::cerr << "No devices found." << std::endl;
  }
  else
  {
    std::cout << std::endl;
    std::cout << "Devices:" << std::endl;
    for (int i = 0; i < count; i++)
    {
      std::cout << i << ": " << getDeviceName(i) << std::endl;
    }
    std::cout << std::endl;
  }
}

// Human-readable name of the given device.
std::string getDeviceName(const int device)
{
  IMPL_TYPE__(DeviceProp) props = {};
  IMPL_FN__(GetDeviceProperties(&props, device));
  check_error();
  return std::string(props.name);
}

// Driver version for the given device; note this also makes it the active
// device via SetDevice.
std::string getDeviceDriver(const int device)
{
  IMPL_FN__(SetDevice(device));
  check_error();
  int driver;
  IMPL_FN__(DriverGetVersion(&driver));
  check_error();
  return std::to_string(driver);
}

#undef IMPL_FN__
// Fix: the original `#undef IMPL_TPE__` misspelled the macro name, leaking
// IMPL_TYPE__ into the rest of the translation unit.
#undef IMPL_TYPE__

#else

// Host-only fallback (CPP/OMP/TBB backends): no device runtime to query.
void listDevices(void)
{
  std::cout << "0: CPU" << std::endl;
}

std::string getDeviceName(const int)
{
  return std::string("(device name unavailable)");
}

std::string getDeviceDriver(const int)
{
  return std::string("(device driver unavailable)");
}

#endif
// Explicit instantiations for the two element types the benchmark builds.
template class ThrustStream<float>;
template class ThrustStream<double>;

43
ThrustStream.h Normal file
View File

@ -0,0 +1,43 @@
// Copyright (c) 2020 Tom Deakin
// University of Bristol HPC
//
// For full license terms please see the LICENSE file distributed with this
// source code

#pragma once

#include <iostream>
#include <vector>
#include <thrust/device_vector.h>
#include "Stream.h"

#define IMPLEMENTATION_STRING "Thrust"

// Stream implementation backed by thrust::device_vector storage and Thrust
// algorithms; the kernels themselves live in ThrustStream.cu.
template <class T>
class ThrustStream : public Stream<T>
{
protected:
// Size of arrays
int array_size;
// The three benchmark buffers, allocated on the Thrust device backend.
thrust::device_vector<T> a;
thrust::device_vector<T> b;
thrust::device_vector<T> c;

public:
// (array size, device index)
ThrustStream(const int, int);
~ThrustStream() = default;

virtual void copy() override;
virtual void add() override;
virtual void mul() override;
virtual void triad() override;
virtual void nstream() override;
virtual T dot() override;

virtual void init_arrays(T initA, T initB, T initC) override;
// Copies device buffers into the given host vectors.
virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
};

View File

@ -135,17 +135,20 @@ setup_aocc() {
setup_nvhpc() {
echo "Preparing Nvidia HPC SDK"
local tarball="nvhpc.tar.gz"
# local url="http://localhost:8000/nvhpc_2021_215_Linux_x86_64_cuda_11.3.tar.gz"
local url="https://developer.download.nvidia.com/hpc-sdk/21.5/nvhpc_2021_215_Linux_x86_64_cuda_11.3.tar.gz"
# local url="http://localhost:8000/nvhpc_2021_219_Linux_x86_64_cuda_11.4.tar.gz"
local url="https://developer.download.nvidia.com/hpc-sdk/21.9/nvhpc_2021_219_Linux_x86_64_cuda_11.4.tar.gz"
get_and_untar "$tarball" "$url"
local sdk_dir="$PWD/nvhpc_2021_215_Linux_x86_64_cuda_11.3/install_components/Linux_x86_64/21.5"
local sdk_dir="$PWD/nvhpc_2021_219_Linux_x86_64_cuda_11.4/install_components/Linux_x86_64/21.9"
local bin_dir="$sdk_dir/compilers/bin"
"$bin_dir/makelocalrc" "$bin_dir" -x
export_var NVHPC_SDK_DIR "$sdk_dir"
export_var NVHPC_CUDA_DIR "$sdk_dir/cuda/11.4"
export_var NVHPC_NVCXX "$bin_dir/nvc++"
export_var NVHPC_NVCC "$sdk_dir/cuda/11.3/bin/nvcc"
export_var NVHPC_CUDA_DIR "$sdk_dir/cuda/11.3"
export_var NVHPC_NVCC "$sdk_dir/cuda/11.4/bin/nvcc"
echo "Installed CUDA versions:"
ls "$sdk_dir/cuda"
verify_bin_exists "$NVHPC_NVCXX"
@ -249,10 +252,11 @@ setup_clang_gcc() {
setup_rocm() {
wget -q -O - "https://repo.radeon.com/rocm/rocm.gpg.key" | sudo apt-key add -
echo 'deb [arch=amd64] https://repo.radeon.com/rocm/apt/debian/ xenial main' | sudo tee /etc/apt/sources.list.d/rocm.list
echo 'deb [arch=amd64] https://repo.radeon.com/rocm/apt/4.5 ubuntu main' | sudo tee /etc/apt/sources.list.d/rocm.list
sudo apt-get update -qq
sudo apt-get install -y -qq rocm-dev
sudo apt-get install -y -qq rocm-dev rocthrust-dev
export_var ROCM_PATH "/opt/rocm"
export_var PATH "$ROCM_PATH/bin:$PATH" # ROCm needs this for many of their libraries' CMake build to work
export_var HIP_CXX "$ROCM_PATH/bin/hipcc"
verify_bin_exists "$HIP_CXX"
"$HIP_CXX" --version

View File

@ -57,7 +57,7 @@ run_build() {
local cmake_code=$?
"$CMAKE_BIN" --build "$build" -j "$(nproc)" &>>"$log"
"$CMAKE_BIN" --build "$build" --target install -j "$(nproc)" &>>"$log"
"$CMAKE_BIN" --build "$build" --target install -j "$(nproc)" &>>"$log"
local cmake_code=$?
set -e
@ -92,11 +92,11 @@ run_build() {
# GCC_CXX="/usr/bin/g++"
# CLANG_CXX="/usr/bin/clang++"
# NVSDK="/home/tom/Downloads/nvhpc_2021_212_Linux_x86_64_cuda_11.2/install_components/Linux_x86_64/21.2/"
# NVHPC_NVCXX="$NVSDK/compilers/bin/nvc++"
# NVHPC_NVCC="$NVSDK/cuda/11.2/bin/nvcc"
# NVHPC_CUDA_DIR="$NVSDK/cuda/11.2"
# "$NVSDK/compilers/bin/makelocalrc" "$NVSDK/compilers/bin/" -x
# NVHPC_SDK_DIR="/home/tom/Downloads/nvhpc_2021_212_Linux_x86_64_cuda_11.2/install_components/Linux_x86_64/21.2/"
# NVHPC_NVCXX="$NVHPC_SDK_DIR/compilers/bin/nvc++"
# NVHPC_NVCC="$NVHPC_SDK_DIR/cuda/11.2/bin/nvcc"
# NVHPC_CUDA_DIR="$NVHPC_SDK_DIR/cuda/11.2"
# "$NVHPC_SDK_DIR/compilers/bin/makelocalrc" "$NVHPC_SDK_DIR/compilers/bin/" -x
# AOCC_CXX="/opt/AMD/aocc-compiler-2.3.0/bin/clang++"
# AOMP_CXX="/usr/lib/aomp/bin/clang++"
@ -124,7 +124,7 @@ run_build() {
AMD_ARCH="gfx_903"
NV_ARCH="sm_70"
NV_ARCH_CCXY="cuda11.3,cc80"
NV_ARCH_CCXY="cuda11.4,cc80"
build_gcc() {
local name="gcc_build"
@ -171,6 +171,28 @@ build_gcc() {
# -DCUDA_TOOLKIT_ROOT_DIR=${NVHPC_CUDA_DIR:?} \
# -DCUDA_ARCH=$NV_ARCH"
# CMake >= 3.15 only due to Nvidia's Thrust CMake requirements
local current=$("$CMAKE_BIN" --version | head -n 1 | cut -d ' ' -f3)
local required="3.15.0"
if [ "$(printf '%s\n' "$required" "$current" | sort -V | head -n1)" = "$required" ]; then
run_build $name "${GCC_CXX:?}" THRUST "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/include -DTHRUST_IMPL=CUDA -DBACKEND=CUDA"
run_build $name "${GCC_CXX:?}" THRUST "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/include -DTHRUST_IMPL=CUDA -DBACKEND=OMP"
run_build $name "${GCC_CXX:?}" THRUST "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/include -DTHRUST_IMPL=CUDA -DBACKEND=CPP"
# FIXME CUDA Thrust + TBB throws the following error:
# /usr/lib/gcc/x86_64-linux-gnu/9/include/avx512fintrin.h(9146): error: identifier "__builtin_ia32_rndscaless_round" is undefined
# /usr/lib/gcc/x86_64-linux-gnu/9/include/avx512fintrin.h(9155): error: identifier "__builtin_ia32_rndscalesd_round" is undefined
# /usr/lib/gcc/x86_64-linux-gnu/9/include/avx512fintrin.h(14797): error: identifier "__builtin_ia32_rndscaless_round" is undefined
# /usr/lib/gcc/x86_64-linux-gnu/9/include/avx512fintrin.h(14806): error: identifier "__builtin_ia32_rndscalesd_round" is undefined
# /usr/lib/gcc/x86_64-linux-gnu/9/include/avx512dqintrin.h(1365): error: identifier "__builtin_ia32_fpclassss" is undefined
# /usr/lib/gcc/x86_64-linux-gnu/9/include/avx512dqintrin.h(1372): error: identifier "__builtin_ia32_fpclasssd" is undefined
# run_build $name "${GCC_CXX:?}" THRUST "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/include -DTHRUST_IMPL=CUDA -DBACKEND=TBB"
else
echo "CMake version ${current} < ${required}, skipping Thrust models"
fi
}
build_clang() {
@ -193,7 +215,7 @@ build_clang() {
run_build $name "${CLANG_CXX:?}" OCL "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}"
run_build $name "${CLANG_CXX:?}" STD "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}"
# run_build $name "${LANG_CXX:?}" STD20 "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" # not yet supported
run_build $name "${CLANG_CXX:?}" TBB "$cxx -DONE_TBB_DIR=$TBB_LIB"
run_build $name "${CLANG_CXX:?}" TBB "$cxx" # build TBB again with the system TBB
@ -219,7 +241,11 @@ build_aomp() {
}
build_hip() {
run_build hip_build "${HIP_CXX:?}" HIP "-DCMAKE_CXX_COMPILER=${HIP_CXX:?}"
local name="hip_build"
run_build $name "${HIP_CXX:?}" HIP "-DCMAKE_CXX_COMPILER=${HIP_CXX:?}"
run_build $name "${GCC_CXX:?}" THRUST "-DCMAKE_CXX_COMPILER=${HIP_CXX:?} -DSDK_DIR=$ROCM_PATH -DTHRUST_IMPL=ROCM"
}
build_icpx() {

View File

@ -27,6 +27,8 @@
#include "STD20Stream.hpp"
#elif defined(TBB)
#include "TBBStream.hpp"
#elif defined(THRUST)
#include "ThrustStream.h"
#elif defined(HIP)
#include "HIPStream.h"
#elif defined(HC)
@ -272,6 +274,10 @@ void run()
// Use the C++20 implementation
stream = new TBBStream<T>(ARRAY_SIZE, deviceIndex);
#elif defined(THRUST)
// Use the Thrust implementation
stream = new ThrustStream<T>(ARRAY_SIZE, deviceIndex);
#elif defined(ACC)
// Use the OpenACC implementation
stream = new ACCStream<T>(ARRAY_SIZE, deviceIndex);