Initial Thrust implementation

Tom Lin 2021-11-11 23:11:04 +00:00
parent 8f9ca7baa7
commit a66696d971
6 changed files with 379 additions and 0 deletions

@@ -113,6 +113,7 @@ register_model(ACC ACC ACCStream.cpp)
# defining RAJA collides with the RAJA namespace so USE_RAJA
register_model(RAJA USE_RAJA RAJAStream.cpp)
register_model(TBB TBB TBBStream.cpp)
register_model(THRUST THRUST ThrustStream.cu) # Thrust sources use .cu, even for rocThrust
set(USAGE ON CACHE BOOL "Whether to print all custom flags for the selected model")

THRUST.cmake (new file, 87 lines)

@@ -0,0 +1,87 @@
register_flag_optional(THRUST_IMPL
"Which Thrust implementation to use, supported options include:
- CUDA (via https://github.com/NVIDIA/thrust)
- ROCM (VIA https://github.com/ROCmSoftwarePlatform/rocThrust)
"
"CUDA")
register_flag_optional(SDK_DIR
"Path to the selected Thrust implementation (e.g `/opt/nvidia/hpc_sdk/Linux_x86_64/21.9/cuda/include/thrust/` for NVHPC, `/opt/rocm` for ROCm)"
"")
register_flag_optional(BACKEND
"[THRUST_IMPL==CUDA] CUDA's Thrust implementation supports the following backends:
- CUDA
- OMP
- TBB
"
"CUDA")
register_flag_optional(CMAKE_CUDA_COMPILER
"[THRUST_IMPL==CUDA] Path to the CUDA nvcc compiler"
"")
# XXX we may want to drop this eventually and use CMAKE_CUDA_ARCHITECTURES directly
register_flag_optional(CUDA_ARCH
"[THRUST_IMPL==CUDA] Nvidia architecture, will be passed in via `-arch=` (e.g `sm_70`) for nvcc"
"")
register_flag_optional(CUDA_EXTRA_FLAGS
"[THRUST_IMPL==CUDA] Additional CUDA flags passed to nvcc, this is appended after `CUDA_ARCH`"
"")
macro(setup)
set(CMAKE_CXX_STANDARD 14)
if (${THRUST_IMPL} STREQUAL "CUDA")
# see CUDA.cmake, we're only adding a few Thrust related libraries here
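# with CMP0104 set to NEW, CMake would require CMAKE_CUDA_ARCHITECTURES to be set;
# use the OLD behaviour since the architecture is passed manually via -arch=${CUDA_ARCH} below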
if (POLICY CMP0104)
cmake_policy(SET CMP0104 OLD)
endif ()
# -forward-unknown-to-host-compiler lets nvcc pass flags it does not recognise on to the host compiler
set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "--expt-extended-lambda -forward-unknown-to-host-compiler -arch=${CUDA_ARCH}" ${CUDA_EXTRA_FLAGS})
enable_language(CUDA)
# CMake defaults to -O2 for CUDA at Release; wipe that and use the global RELEASE_FLAG,
# which is appended later
wipe_gcc_style_optimisation_flags(CMAKE_CUDA_FLAGS_${BUILD_TYPE})
message(STATUS "NVCC flags: ${CMAKE_CUDA_FLAGS} ${CMAKE_CUDA_FLAGS_${BUILD_TYPE}}")
if (SDK_DIR)
find_package(CUB REQUIRED CONFIG PATHS ${SDK_DIR}/cub)
find_package(Thrust REQUIRED CONFIG PATHS ${SDK_DIR}/thrust)
else ()
find_package(CUB REQUIRED CONFIG)
find_package(Thrust REQUIRED CONFIG)
endif ()
message(STATUS "Using Thrust backend: ${BACKEND}")
# this creates the interface that we can link to
thrust_create_target(Thrust HOST CPP DEVICE ${BACKEND})
register_link_library(Thrust)
elseif (${THRUST_IMPL} STREQUAL "ROCM")
if (SDK_DIR)
find_package(rocprim REQUIRED CONFIG PATHS ${SDK_DIR}/rocprim)
find_package(rocthrust REQUIRED CONFIG PATHS ${SDK_DIR}/rocthrust)
else ()
find_package(rocprim REQUIRED CONFIG)
find_package(rocthrust REQUIRED CONFIG)
endif ()
register_link_library(roc::rocthrust)
else ()
message(FATAL_ERROR "Unsupported THRUST_IMPL provided: ${THRUST_IMPL}")
endif ()
endmacro()

ThrustStream.cu (new file, 235 lines)

@@ -0,0 +1,235 @@
// Copyright (c) 2020 Tom Deakin
// University of Bristol HPC
//
// For full license terms please see the LICENSE file distributed with this
// source code
#include "ThrustStream.h"
#include <thrust/inner_product.h>
#include <thrust/device_vector.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/zip_function.h>
static inline void synchronise()
{
// rocThrust doesn't synchronise between thrust calls
#if defined(THRUST_DEVICE_SYSTEM_HIP) && THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_HIP
hipDeviceSynchronize();
#endif
}
template <class T>
ThrustStream<T>::ThrustStream(const int ARRAY_SIZE, int device)
: array_size{ARRAY_SIZE}, a(array_size), b(array_size), c(array_size) {
std::cout << "Using CUDA device: " << getDeviceName(device) << std::endl;
std::cout << "Driver: " << getDeviceDriver(device) << std::endl;
std::cout << "Thrust version: " << THRUST_VERSION << std::endl;
#if THRUST_DEVICE_SYSTEM == 0
// as per Thrust docs, 0 is reserved for undefined backend
std::cout << "Thrust backend: undefined" << std::endl;
#elif THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
std::cout << "Thrust backend: CUDA" << std::endl;
#elif THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_OMP
std::cout << "Thrust backend: OMP" << std::endl;
#elif THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_TBB
std::cout << "Thrust backend: TBB" << std::endl;
#elif THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CPP
std::cout << "Thrust backend: CPP" << std::endl;
#else
#if defined(THRUST_DEVICE_SYSTEM_HIP) && THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_HIP
std::cout << "Thrust backend: HIP" << std::endl;
#else
std::cout << "Thrust backend: " << THRUST_DEVICE_SYSTEM << "(unknown)" << std::endl;
#endif
#endif
}
template <class T>
void ThrustStream<T>::init_arrays(T initA, T initB, T initC)
{
thrust::fill(a.begin(), a.end(), initA);
thrust::fill(b.begin(), b.end(), initB);
thrust::fill(c.begin(), c.end(), initC);
synchronise();
}
template <class T>
void ThrustStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
{
thrust::copy(a.begin(), a.end(), h_a.begin());
thrust::copy(b.begin(), b.end(), h_b.begin());
thrust::copy(c.begin(), c.end(), h_c.begin());
}
template <class T>
void ThrustStream<T>::copy()
{
thrust::copy(a.begin(), a.end(), c.begin());
synchronise();
}
template <class T>
void ThrustStream<T>::mul()
{
const T scalar = startScalar;
thrust::transform(
c.begin(),
c.end(),
b.begin(),
[=] __device__ __host__ (const T &ci){
return ci * scalar;
}
);
synchronise();
}
template <class T>
void ThrustStream<T>::add()
{
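// zip the a and b ranges into one range of tuples; make_zip_function unpacks
// each tuple into separate (ai, bi) arguments for the lambda below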
thrust::transform(
thrust::make_zip_iterator(thrust::make_tuple(a.begin(), b.begin())),
thrust::make_zip_iterator(thrust::make_tuple(a.end(), b.end())),
c.begin(),
thrust::make_zip_function(
[] __device__ __host__ (const T& ai, const T& bi){
return ai + bi;
})
);
synchronise();
}
template <class T>
void ThrustStream<T>::triad()
{
const T scalar = startScalar;
thrust::transform(
thrust::make_zip_iterator(thrust::make_tuple(b.begin(), c.begin())),
thrust::make_zip_iterator(thrust::make_tuple(b.end(), c.end())),
a.begin(),
thrust::make_zip_function(
[=] __device__ __host__ (const T& bi, const T& ci){
return bi + scalar * ci;
})
);
synchronise();
}
template <class T>
void ThrustStream<T>::nstream()
{
const T scalar = startScalar;
thrust::transform(
thrust::make_zip_iterator(thrust::make_tuple(a.begin(), b.begin(), c.begin())),
thrust::make_zip_iterator(thrust::make_tuple(a.end(), b.end(), c.end())),
a.begin(),
thrust::make_zip_function(
[=] __device__ __host__ (const T& ai, const T& bi, const T& ci){
return ai + bi + scalar * ci;
})
);
synchronise();
}
template <class T>
T ThrustStream<T>::dot()
{
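// inner_product reduces sum(a[i] * b[i]) starting from the T{} initial value;
// the result is returned to the host, so no explicit synchronise() is needed here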
return thrust::inner_product(a.begin(), a.end(), b.begin(), T{});
}
#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA || \
(defined(THRUST_DEVICE_SYSTEM_HIP) && THRUST_DEVICE_SYSTEM_HIP == THRUST_DEVICE_SYSTEM)
#ifdef __NVCC__
#define IMPL_FN__(fn) cuda ## fn
#define IMPL_TYPE__(tpe) cuda ## tpe
#elif defined(__HIP_PLATFORM_HCC__)
#define IMPL_FN__(fn) hip ## fn
#define IMPL_TYPE__(tpe) hip ## tpe ## _t
#else
# error Unsupported compiler for Thrust
#endif
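// e.g. IMPL_FN__(GetLastError()) expands to cudaGetLastError() under nvcc and to
// hipGetLastError() under HIP; IMPL_TYPE__(DeviceProp) becomes cudaDeviceProp / hipDeviceProp_t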
void check_error(void)
{
IMPL_FN__(Error_t) err = IMPL_FN__(GetLastError());
if (err != IMPL_FN__(Success))
{
std::cerr << "Error: " << IMPL_FN__(GetErrorString(err)) << std::endl;
exit(err);
}
}
void listDevices(void)
{
// Get number of devices
int count;
IMPL_FN__(GetDeviceCount(&count));
check_error();
// Print device names
if (count == 0)
{
std::cerr << "No devices found." << std::endl;
}
else
{
std::cout << std::endl;
std::cout << "Devices:" << std::endl;
for (int i = 0; i < count; i++)
{
std::cout << i << ": " << getDeviceName(i) << std::endl;
}
std::cout << std::endl;
}
}
std::string getDeviceName(const int device)
{
IMPL_TYPE__(DeviceProp) props = {};
IMPL_FN__(GetDeviceProperties(&props, device));
check_error();
return std::string(props.name);
}
std::string getDeviceDriver(const int device)
{
IMPL_FN__(SetDevice(device));
check_error();
int driver;
IMPL_FN__(DriverGetVersion(&driver));
check_error();
return std::to_string(driver);
}
#undef IMPL_FN__
#undef IMPL_TYPE__
#else
void listDevices(void)
{
std::cout << "0: CPU" << std::endl;
}
std::string getDeviceName(const int)
{
return std::string("(device name unavailable)");
}
std::string getDeviceDriver(const int)
{
return std::string("(device driver unavailable)");
}
#endif
template class ThrustStream<float>;
template class ThrustStream<double>;

ThrustStream.h (new file, 43 lines)

@@ -0,0 +1,43 @@
// Copyright (c) 2020 Tom Deakin
// University of Bristol HPC
//
// For full license terms please see the LICENSE file distributed with this
// source code
#pragma once
#include <iostream>
#include <vector>
#include <thrust/device_vector.h>
#include "Stream.h"
#define IMPLEMENTATION_STRING "Thrust"
template <class T>
class ThrustStream : public Stream<T>
{
protected:
// Size of arrays
int array_size;
thrust::device_vector<T> a;
thrust::device_vector<T> b;
thrust::device_vector<T> c;
public:
ThrustStream(const int, int);
~ThrustStream() = default;
virtual void copy() override;
virtual void add() override;
virtual void mul() override;
virtual void triad() override;
virtual void nstream() override;
virtual T dot() override;
virtual void init_arrays(T initA, T initB, T initC) override;
virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
};
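A minimal driver for this class might look like the sketch below. This is illustrative only and not part of the commit; it assumes the ThrustStream.h above and the repository's Stream.h base header (where the startScalar used by mul/triad comes from), and the array size, fill values, and the sketch_main.cu name are arbitrary.
// sketch_main.cu -- hypothetical driver, built with the same toolchain as ThrustStream.cu
#include <iostream>
#include <vector>
#include "ThrustStream.h"
int main()
{
  const int ARRAY_SIZE = 1 << 20;            // arbitrary size for the sketch
  ThrustStream<float> stream(ARRAY_SIZE, 0); // device 0
  stream.init_arrays(0.1f, 0.2f, 0.0f);      // fill a, b, c
  stream.copy();                             // c = a
  stream.mul();                              // b = scalar * c
  stream.add();                              // c = a + b
  stream.triad();                            // a = b + scalar * c
  float sum = stream.dot();                  // sum of a[i] * b[i]
  std::vector<float> a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE);
  stream.read_arrays(a, b, c);               // copy device vectors back to the host
  std::cout << "dot = " << sum << std::endl;
  return 0;
}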

@@ -171,6 +171,11 @@ build_gcc() {
# -DCUDA_TOOLKIT_ROOT_DIR=${NVHPC_CUDA_DIR:?} \
# -DCUDA_ARCH=$NV_ARCH"
run_build $name "${GCC_CXX:?}" THRUST "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVSDK/cuda/include/thrust -DTHRUST_IMPL=CUDA -DBACKEND=CUDA"
run_build $name "${GCC_CXX:?}" THRUST "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVSDK/cuda/include/thrust -DTHRUST_IMPL=CUDA -DBACKEND=OMP"
run_build $name "${GCC_CXX:?}" THRUST "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVSDK/cuda/include/thrust -DTHRUST_IMPL=CUDA -DBACKEND=TBB"
run_build $name "${GCC_CXX:?}" THRUST "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVSDK/cuda/include/thrust -DTHRUST_IMPL=CUDA -DBACKEND=CPP"
}
build_clang() {
@@ -220,6 +225,8 @@ build_aomp() {
build_hip() {
run_build hip_build "${HIP_CXX:?}" HIP "-DCMAKE_CXX_COMPILER=${HIP_CXX:?}"
run_build $name "${GCC_CXX:?}" THRUST "$cxx -DCMAKE_CXX_COMPILER=${HIP_CXX:?} -DSDK_DIR=$ROCM_PATH -DTHRUST_IMPL=ROCM"
}
build_icpx() {

@@ -27,6 +27,8 @@
#include "STD20Stream.hpp"
#elif defined(TBB)
#include "TBBStream.hpp"
#elif defined(THRUST)
#include "ThrustStream.h"
#elif defined(HIP)
#include "HIPStream.h"
#elif defined(HC)
@@ -272,6 +274,10 @@ void run()
// Use the TBB implementation
stream = new TBBStream<T>(ARRAY_SIZE, deviceIndex);
#elif defined(THRUST)
// Use the Thrust implementation
stream = new ThrustStream<T>(ARRAY_SIZE, deviceIndex);
#elif defined(ACC)
// Use the OpenACC implementation
stream = new ACCStream<T>(ARRAY_SIZE, deviceIndex);