Merge pull request #111 from UoB-HPC/thrust

Thrust Implementation
Tom Deakin 2021-11-25 13:17:48 +00:00 committed by GitHub
commit 53547ff664
7 changed files with 422 additions and 16 deletions


@@ -113,6 +113,7 @@ register_model(ACC ACC ACCStream.cpp)
 # defining RAJA collides with the RAJA namespace so USE_RAJA
 register_model(RAJA USE_RAJA RAJAStream.cpp)
 register_model(TBB TBB TBBStream.cpp)
+register_model(THRUST THRUST ThrustStream.cu) # Thrust uses cu, even for rocThrust
 set(USAGE ON CACHE BOOL "Whether to print all custom flags for the selected model")

THRUST.cmake (new file)

@@ -0,0 +1,91 @@
register_flag_optional(THRUST_IMPL
"Which Thrust implementation to use, supported options include:
- CUDA (via https://github.com/NVIDIA/thrust)
- ROCM (via https://github.com/ROCmSoftwarePlatform/rocThrust)
"
"CUDA")
register_flag_optional(SDK_DIR
"Path to the selected Thrust implementation (e.g `/opt/nvidia/hpc_sdk/Linux_x86_64/21.9/cuda/include` for NVHPC, `/opt/rocm` for ROCm)"
"")
register_flag_optional(BACKEND
"[THRUST_IMPL==CUDA] CUDA's Thrust implementation supports the following backends:
- CUDA
- OMP
- TBB
"
"CUDA")
register_flag_optional(CMAKE_CUDA_COMPILER
"[THRUST_IMPL==CUDA] Path to the CUDA nvcc compiler"
"")
# XXX we may want to drop this eventually and use CMAKE_CUDA_ARCHITECTURES directly
register_flag_optional(CUDA_ARCH
"[THRUST_IMPL==CUDA] Nvidia architecture, will be passed in via `-arch=` (e.g `sm_70`) for nvcc"
"")
register_flag_optional(CUDA_EXTRA_FLAGS
"[THRUST_IMPL==CUDA] Additional CUDA flags passed to nvcc, this is appended after `CUDA_ARCH`"
"")
macro(setup)
set(CMAKE_CXX_STANDARD 14)
if (${THRUST_IMPL} STREQUAL "CUDA")
# see CUDA.cmake, we're only adding a few Thrust related libraries here
if (POLICY CMP0104)
cmake_policy(SET CMP0104 OLD)
endif ()
# add -forward-unknown-to-host-compiler for compatibility reasons
set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "--expt-extended-lambda -forward-unknown-to-host-compiler -arch=${CUDA_ARCH}" ${CUDA_EXTRA_FLAGS})
enable_language(CUDA)
# CMake defaults to -O2 for CUDA at Release, let's wipe that and use the global RELEASE_FLAG
# appended later
wipe_gcc_style_optimisation_flags(CMAKE_CUDA_FLAGS_${BUILD_TYPE})
message(STATUS "NVCC flags: ${CMAKE_CUDA_FLAGS} ${CMAKE_CUDA_FLAGS_${BUILD_TYPE}}")
if (SDK_DIR)
find_package(CUB REQUIRED CONFIG PATHS ${SDK_DIR}/cub)
find_package(Thrust REQUIRED CONFIG PATHS ${SDK_DIR}/thrust)
else ()
find_package(CUB REQUIRED CONFIG)
find_package(Thrust REQUIRED CONFIG)
endif ()
message(STATUS "Using Thrust backend: ${BACKEND}")
# this creates the interface that we can link to
thrust_create_target(Thrust HOST CPP DEVICE ${BACKEND})
register_link_library(Thrust)
elseif (${THRUST_IMPL} STREQUAL "ROCM")
if (SDK_DIR)
find_package(rocprim REQUIRED CONFIG PATHS ${SDK_DIR}/rocprim)
find_package(rocthrust REQUIRED CONFIG PATHS ${SDK_DIR}/rocthrust)
else ()
find_package(rocprim REQUIRED CONFIG)
find_package(rocthrust REQUIRED CONFIG)
endif ()
# for HIP we treat *.cu files as CXX otherwise CMake doesn't compile them
set_source_files_properties(${IMPL_SOURCES} PROPERTIES LANGUAGE CXX)
register_link_library(roc::rocthrust)
else ()
message(FATAL_ERROR "Unsupported THRUST_IMPL provided: ${THRUST_IMPL}")
endif ()
endmacro()
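
Note (not part of this commit): with `thrust_create_target(Thrust HOST CPP DEVICE ${BACKEND})`, plain Thrust calls in the model source dispatch to whichever device backend was selected at configure time, and an explicit execution policy can still override that for a single call. A minimal sketch of both cases; the array size and fill values are illustrative:

// Minimal sketch: configure-time backend vs. an explicit per-call execution policy.
#include <vector>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/fill.h>

int main()
{
  // Dispatches to whatever DEVICE backend thrust_create_target was configured with (CUDA/OMP/TBB/CPP).
  thrust::device_vector<double> d(1024);
  thrust::fill(d.begin(), d.end(), 0.1);

  // An explicit policy forces host execution for this one call, regardless of the configured backend.
  std::vector<double> h(1024);
  thrust::fill(thrust::host, h.begin(), h.end(), 0.2);
  return 0;
}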

ThrustStream.cu (new file)

@@ -0,0 +1,235 @@
// Copyright (c) 2020 Tom Deakin
// University of Bristol HPC
//
// For full license terms please see the LICENSE file distributed with this
// source code
#include "ThrustStream.h"
#include <thrust/inner_product.h>
#include <thrust/device_vector.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/zip_function.h>
static inline void synchronise()
{
// rocThrust doesn't synchronise between thrust calls
#if defined(THRUST_DEVICE_SYSTEM_HIP) && THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_HIP
hipDeviceSynchronize();
#endif
}
template <class T>
ThrustStream<T>::ThrustStream(const int ARRAY_SIZE, int device)
: array_size{ARRAY_SIZE}, a(array_size), b(array_size), c(array_size) {
std::cout << "Using CUDA device: " << getDeviceName(device) << std::endl;
std::cout << "Driver: " << getDeviceDriver(device) << std::endl;
std::cout << "Thrust version: " << THRUST_VERSION << std::endl;
#if THRUST_DEVICE_SYSTEM == 0
// as per Thrust docs, 0 is reserved for undefined backend
std::cout << "Thrust backend: undefined" << std::endl;
#elif THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
std::cout << "Thrust backend: CUDA" << std::endl;
#elif THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_OMP
std::cout << "Thrust backend: OMP" << std::endl;
#elif THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_TBB
std::cout << "Thrust backend: TBB" << std::endl;
#elif THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CPP
std::cout << "Thrust backend: CPP" << std::endl;
#else
#if defined(THRUST_DEVICE_SYSTEM_HIP) && THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_HIP
std::cout << "Thrust backend: HIP" << std::endl;
#else
std::cout << "Thrust backend: " << THRUST_DEVICE_SYSTEM << "(unknown)" << std::endl;
#endif
#endif
}
template <class T>
void ThrustStream<T>::init_arrays(T initA, T initB, T initC)
{
thrust::fill(a.begin(), a.end(), initA);
thrust::fill(b.begin(), b.end(), initB);
thrust::fill(c.begin(), c.end(), initC);
synchronise();
}
template <class T>
void ThrustStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
{
thrust::copy(a.begin(), a.end(), h_a.begin());
thrust::copy(b.begin(), b.end(), h_b.begin());
thrust::copy(c.begin(), c.end(), h_c.begin());
}
template <class T>
void ThrustStream<T>::copy()
{
thrust::copy(a.begin(), a.end(),c.begin());
synchronise();
}
template <class T>
void ThrustStream<T>::mul()
{
const T scalar = startScalar;
thrust::transform(
c.begin(),
c.end(),
b.begin(),
[=] __device__ __host__ (const T &ci){
return ci * scalar;
}
);
synchronise();
}
template <class T>
void ThrustStream<T>::add()
{
thrust::transform(
thrust::make_zip_iterator(thrust::make_tuple(a.begin(), b.begin())),
thrust::make_zip_iterator(thrust::make_tuple(a.end(), b.end())),
c.begin(),
thrust::make_zip_function(
[] __device__ __host__ (const T& ai, const T& bi){
return ai + bi;
})
);
synchronise();
}
template <class T>
void ThrustStream<T>::triad()
{
const T scalar = startScalar;
thrust::transform(
thrust::make_zip_iterator(thrust::make_tuple(b.begin(), c.begin())),
thrust::make_zip_iterator(thrust::make_tuple(b.end(), c.end())),
a.begin(),
thrust::make_zip_function(
[=] __device__ __host__ (const T& bi, const T& ci){
return bi + scalar * ci;
})
);
synchronise();
}
template <class T>
void ThrustStream<T>::nstream()
{
const T scalar = startScalar;
thrust::transform(
thrust::make_zip_iterator(thrust::make_tuple(a.begin(), b.begin(), c.begin())),
thrust::make_zip_iterator(thrust::make_tuple(a.end(), b.end(), c.end())),
a.begin(),
thrust::make_zip_function(
[=] __device__ __host__ (const T& ai, const T& bi, const T& ci){
return ai + bi + scalar * ci;
})
);
synchronise();
}
template <class T>
T ThrustStream<T>::dot()
{
return thrust::inner_product(a.begin(), a.end(), b.begin(), T{});
}
#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA || \
(defined(THRUST_DEVICE_SYSTEM_HIP) && THRUST_DEVICE_SYSTEM_HIP == THRUST_DEVICE_SYSTEM)
#ifdef __NVCC__
#define IMPL_FN__(fn) cuda ## fn
#define IMPL_TYPE__(tpe) cuda ## tpe
#elif defined(__HIP_PLATFORM_HCC__)
#define IMPL_FN__(fn) hip ## fn
#define IMPL_TYPE__(tpe) hip ## tpe ## _t
#else
# error Unsupported compiler for Thrust
#endif
void check_error(void)
{
IMPL_FN__(Error_t) err = IMPL_FN__(GetLastError());
if (err != IMPL_FN__(Success))
{
std::cerr << "Error: " << IMPL_FN__(GetErrorString(err)) << std::endl;
exit(err);
}
}
void listDevices(void)
{
// Get number of devices
int count;
IMPL_FN__(GetDeviceCount(&count));
check_error();
// Print device names
if (count == 0)
{
std::cerr << "No devices found." << std::endl;
}
else
{
std::cout << std::endl;
std::cout << "Devices:" << std::endl;
for (int i = 0; i < count; i++)
{
std::cout << i << ": " << getDeviceName(i) << std::endl;
}
std::cout << std::endl;
}
}
std::string getDeviceName(const int device)
{
IMPL_TYPE__(DeviceProp) props = {};
IMPL_FN__(GetDeviceProperties(&props, device));
check_error();
return std::string(props.name);
}
std::string getDeviceDriver(const int device)
{
IMPL_FN__(SetDevice(device));
check_error();
int driver;
IMPL_FN__(DriverGetVersion(&driver));
check_error();
return std::to_string(driver);
}
#undef IMPL_FN__
#undef IMPL_TYPE__
#else
void listDevices(void)
{
std::cout << "0: CPU" << std::endl;
}
std::string getDeviceName(const int)
{
return std::string("(device name unavailable)");
}
std::string getDeviceDriver(const int)
{
return std::string("(device driver unavailable)");
}
#endif
template class ThrustStream<float>;
template class ThrustStream<double>;
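
Aside (not part of this commit): the add, triad and nstream kernels above share one idiom, zip the input ranges and unpack the tuple with `thrust::make_zip_function`, while dot maps straight onto `thrust::inner_product`. A self-contained sketch of those two patterns; sizes and values are illustrative, and it assumes nvcc with `--expt-extended-lambda` as set up in THRUST.cmake:

// Standalone sketch of the zip_iterator + zip_function and inner_product patterns used above.
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/inner_product.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/transform.h>
#include <thrust/tuple.h>
#include <thrust/zip_function.h>

int main()
{
  const int n = 1 << 20;
  const double scalar = 0.4;
  thrust::device_vector<double> a(n, 0.1), b(n, 0.2), c(n, 0.3);

  // triad-style kernel, a[i] = b[i] + scalar * c[i]: zip the inputs, unpack with zip_function
  thrust::transform(
      thrust::make_zip_iterator(thrust::make_tuple(b.begin(), c.begin())),
      thrust::make_zip_iterator(thrust::make_tuple(b.end(), c.end())),
      a.begin(),
      thrust::make_zip_function(
          [=] __device__ __host__ (double bi, double ci) { return bi + scalar * ci; }));

  // dot-style reduction, sum over a[i] * b[i], as in ThrustStream::dot
  double sum = thrust::inner_product(a.begin(), a.end(), b.begin(), 0.0);
  std::cout << "sum = " << sum << std::endl;
  return 0;
}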

ThrustStream.h (new file)

@@ -0,0 +1,43 @@
// Copyright (c) 2020 Tom Deakin
// University of Bristol HPC
//
// For full license terms please see the LICENSE file distributed with this
// source code
#pragma once
#include <iostream>
#include <vector>
#include <thrust/device_vector.h>
#include "Stream.h"
#define IMPLEMENTATION_STRING "Thrust"
template <class T>
class ThrustStream : public Stream<T>
{
protected:
// Size of arrays
int array_size;
thrust::device_vector<T> a;
thrust::device_vector<T> b;
thrust::device_vector<T> c;
public:
ThrustStream(const int, int);
~ThrustStream() = default;
virtual void copy() override;
virtual void add() override;
virtual void mul() override;
virtual void triad() override;
virtual void nstream() override;
virtual T dot() override;
virtual void init_arrays(T initA, T initB, T initC) override;
virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
};
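
Because ThrustStream<T> implements the existing Stream<T> interface, the driver only ever touches the base class. A hedged usage sketch, not the real driver in main.cpp (which also times each kernel); the size and start values are illustrative, and it assumes Stream.h declares a virtual destructor as the existing models do:

// Hedged sketch of driving ThrustStream<T> through the Stream<T> interface.
#include <iostream>
#include <memory>
#include <vector>
#include "ThrustStream.h"

int main()
{
  const int ARRAY_SIZE = 1 << 25;  // illustrative
  const int deviceIndex = 0;       // illustrative

  std::unique_ptr<Stream<double>> stream(new ThrustStream<double>(ARRAY_SIZE, deviceIndex));
  stream->init_arrays(0.1, 0.2, 0.0);

  // one sweep over the five kernels plus the reduction
  stream->copy();
  stream->mul();
  stream->add();
  stream->triad();
  stream->nstream();
  double sum = stream->dot();

  std::vector<double> a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE);
  stream->read_arrays(a, b, c);
  std::cout << "dot = " << sum << std::endl;
  return 0;
}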


@@ -135,17 +135,20 @@ setup_aocc() {
 setup_nvhpc() {
 echo "Preparing Nvidia HPC SDK"
 local tarball="nvhpc.tar.gz"
-# local url="http://localhost:8000/nvhpc_2021_215_Linux_x86_64_cuda_11.3.tar.gz"
+# local url="http://localhost:8000/nvhpc_2021_219_Linux_x86_64_cuda_11.4.tar.gz"
-local url="https://developer.download.nvidia.com/hpc-sdk/21.5/nvhpc_2021_215_Linux_x86_64_cuda_11.3.tar.gz"
+local url="https://developer.download.nvidia.com/hpc-sdk/21.9/nvhpc_2021_219_Linux_x86_64_cuda_11.4.tar.gz"
 get_and_untar "$tarball" "$url"
-local sdk_dir="$PWD/nvhpc_2021_215_Linux_x86_64_cuda_11.3/install_components/Linux_x86_64/21.5"
+local sdk_dir="$PWD/nvhpc_2021_219_Linux_x86_64_cuda_11.4/install_components/Linux_x86_64/21.9"
 local bin_dir="$sdk_dir/compilers/bin"
 "$bin_dir/makelocalrc" "$bin_dir" -x
+export_var NVHPC_SDK_DIR "$sdk_dir"
+export_var NVHPC_CUDA_DIR "$sdk_dir/cuda/11.4"
 export_var NVHPC_NVCXX "$bin_dir/nvc++"
-export_var NVHPC_NVCC "$sdk_dir/cuda/11.3/bin/nvcc"
+export_var NVHPC_NVCC "$sdk_dir/cuda/11.4/bin/nvcc"
-export_var NVHPC_CUDA_DIR "$sdk_dir/cuda/11.3"
 echo "Installed CUDA versions:"
 ls "$sdk_dir/cuda"
 verify_bin_exists "$NVHPC_NVCXX"
@@ -249,10 +252,11 @@ setup_clang_gcc() {
 setup_rocm() {
 wget -q -O - "https://repo.radeon.com/rocm/rocm.gpg.key" | sudo apt-key add -
-echo 'deb [arch=amd64] https://repo.radeon.com/rocm/apt/debian/ xenial main' | sudo tee /etc/apt/sources.list.d/rocm.list
+echo 'deb [arch=amd64] https://repo.radeon.com/rocm/apt/4.5 ubuntu main' | sudo tee /etc/apt/sources.list.d/rocm.list
 sudo apt-get update -qq
-sudo apt-get install -y -qq rocm-dev
+sudo apt-get install -y -qq rocm-dev rocthrust-dev
 export_var ROCM_PATH "/opt/rocm"
+export_var PATH "$ROCM_PATH/bin:$PATH" # ROCm needs this for many of their libraries' CMake build to work
 export_var HIP_CXX "$ROCM_PATH/bin/hipcc"
 verify_bin_exists "$HIP_CXX"
 "$HIP_CXX" --version


@@ -57,7 +57,7 @@ run_build() {
 local cmake_code=$?
 "$CMAKE_BIN" --build "$build" -j "$(nproc)" &>>"$log"
 "$CMAKE_BIN" --build "$build" --target install -j "$(nproc)" &>>"$log"
 local cmake_code=$?
 set -e
@@ -92,11 +92,11 @@ run_build() {
 # GCC_CXX="/usr/bin/g++"
 # CLANG_CXX="/usr/bin/clang++"
-# NVSDK="/home/tom/Downloads/nvhpc_2021_212_Linux_x86_64_cuda_11.2/install_components/Linux_x86_64/21.2/"
+# NVHPC_SDK_DIR="/home/tom/Downloads/nvhpc_2021_212_Linux_x86_64_cuda_11.2/install_components/Linux_x86_64/21.2/"
-# NVHPC_NVCXX="$NVSDK/compilers/bin/nvc++"
+# NVHPC_NVCXX="$NVHPC_SDK_DIR/compilers/bin/nvc++"
-# NVHPC_NVCC="$NVSDK/cuda/11.2/bin/nvcc"
+# NVHPC_NVCC="$NVHPC_SDK_DIR/cuda/11.2/bin/nvcc"
-# NVHPC_CUDA_DIR="$NVSDK/cuda/11.2"
+# NVHPC_CUDA_DIR="$NVHPC_SDK_DIR/cuda/11.2"
-# "$NVSDK/compilers/bin/makelocalrc" "$NVSDK/compilers/bin/" -x
+# "$NVHPC_SDK_DIR/compilers/bin/makelocalrc" "$NVHPC_SDK_DIR/compilers/bin/" -x
 # AOCC_CXX="/opt/AMD/aocc-compiler-2.3.0/bin/clang++"
 # AOMP_CXX="/usr/lib/aomp/bin/clang++"
@@ -124,7 +124,7 @@ run_build() {
 AMD_ARCH="gfx_903"
 NV_ARCH="sm_70"
-NV_ARCH_CCXY="cuda11.3,cc80"
+NV_ARCH_CCXY="cuda11.4,cc80"
 build_gcc() {
 local name="gcc_build"
@@ -171,6 +171,28 @@ build_gcc() {
 # -DCUDA_TOOLKIT_ROOT_DIR=${NVHPC_CUDA_DIR:?} \
 # -DCUDA_ARCH=$NV_ARCH"
+# CMake >= 3.15 only due to Nvidia's Thrust CMake requirements
+local current=$("$CMAKE_BIN" --version | head -n 1 | cut -d ' ' -f3)
+local required="3.15.0"
+if [ "$(printf '%s\n' "$required" "$current" | sort -V | head -n1)" = "$required" ]; then
+run_build $name "${GCC_CXX:?}" THRUST "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/include -DTHRUST_IMPL=CUDA -DBACKEND=CUDA"
+run_build $name "${GCC_CXX:?}" THRUST "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/include -DTHRUST_IMPL=CUDA -DBACKEND=OMP"
+run_build $name "${GCC_CXX:?}" THRUST "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/include -DTHRUST_IMPL=CUDA -DBACKEND=CPP"
+# FIXME CUDA Thrust + TBB throws the following error:
+# /usr/lib/gcc/x86_64-linux-gnu/9/include/avx512fintrin.h(9146): error: identifier "__builtin_ia32_rndscaless_round" is undefined
+# /usr/lib/gcc/x86_64-linux-gnu/9/include/avx512fintrin.h(9155): error: identifier "__builtin_ia32_rndscalesd_round" is undefined
+# /usr/lib/gcc/x86_64-linux-gnu/9/include/avx512fintrin.h(14797): error: identifier "__builtin_ia32_rndscaless_round" is undefined
+# /usr/lib/gcc/x86_64-linux-gnu/9/include/avx512fintrin.h(14806): error: identifier "__builtin_ia32_rndscalesd_round" is undefined
+# /usr/lib/gcc/x86_64-linux-gnu/9/include/avx512dqintrin.h(1365): error: identifier "__builtin_ia32_fpclassss" is undefined
+# /usr/lib/gcc/x86_64-linux-gnu/9/include/avx512dqintrin.h(1372): error: identifier "__builtin_ia32_fpclasssd" is undefined
+# run_build $name "${GCC_CXX:?}" THRUST "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/include -DTHRUST_IMPL=CUDA -DBACKEND=TBB"
+else
+echo "CMake version ${current} < ${required}, skipping Thrust models"
+fi
 }
 build_clang() {
@@ -193,7 +215,7 @@ build_clang() {
 run_build $name "${CLANG_CXX:?}" OCL "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}"
 run_build $name "${CLANG_CXX:?}" STD "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}"
 # run_build $name "${LANG_CXX:?}" STD20 "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" # not yet supported
 run_build $name "${CLANG_CXX:?}" TBB "$cxx -DONE_TBB_DIR=$TBB_LIB"
 run_build $name "${CLANG_CXX:?}" TBB "$cxx" # build TBB again with the system TBB
@@ -219,7 +241,11 @@ build_aomp() {
 }
 build_hip() {
-run_build hip_build "${HIP_CXX:?}" HIP "-DCMAKE_CXX_COMPILER=${HIP_CXX:?}"
+local name="hip_build"
+run_build $name "${HIP_CXX:?}" HIP "-DCMAKE_CXX_COMPILER=${HIP_CXX:?}"
+run_build $name "${GCC_CXX:?}" THRUST "-DCMAKE_CXX_COMPILER=${HIP_CXX:?} -DSDK_DIR=$ROCM_PATH -DTHRUST_IMPL=ROCM"
 }
 build_icpx() {


@@ -27,6 +27,8 @@
 #include "STD20Stream.hpp"
 #elif defined(TBB)
 #include "TBBStream.hpp"
+#elif defined(THRUST)
+#include "ThrustStream.h"
 #elif defined(HIP)
 #include "HIPStream.h"
 #elif defined(HC)
@@ -272,6 +274,10 @@ void run()
 // Use the C++20 implementation
 stream = new TBBStream<T>(ARRAY_SIZE, deviceIndex);
+#elif defined(THRUST)
+// Use the Thrust implementation
+stream = new ThrustStream<T>(ARRAY_SIZE, deviceIndex);
 #elif defined(ACC)
 // Use the OpenACC implementation
 stream = new ACCStream<T>(ARRAY_SIZE, deviceIndex);