diff --git a/.gitignore b/.gitignore index c3ea1da..31af301 100644 --- a/.gitignore +++ b/.gitignore @@ -8,6 +8,7 @@ kokkos-stream std-stream sycl-stream hip-stream +tbb-stream *.o *.bc diff --git a/CHANGELOG.md b/CHANGELOG.md index 19c2a6d..5d209e2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,8 @@ All notable changes to this project will be documented in this file. - Support for CUDA Managed Memory and Page Fault memory. - Added nstream kernel from PRK with associate command line option. - CMake build system added for all models. +- SYCL device check for FP64 support. +- New implementation using TBB. ### Changed - Default branch renamed from `master` to `main`. @@ -29,6 +31,7 @@ All notable changes to this project will be documented in this file. - Unified run function in driver code to reduce code duplication, output should be uneffected. - Normalise sum result by expected value to help false negative errors. - HC version deprecated and moved to a legacy directory. +- Update RAJA to v0.13.0 (with code changes, as this is a source-incompatible update). ### Removed - Pre-building of kernels in SYCL version to ensure compatibility with SYCL 1.2.1. 
diff --git a/CMakeLists.txt b/CMakeLists.txt index 17669a3..797a9c0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -112,6 +112,7 @@ register_model(SYCL SYCL SYCLStream.cpp) register_model(ACC ACC ACCStream.cpp) # defining RAJA collides with the RAJA namespace so USE_RAJA register_model(RAJA USE_RAJA RAJAStream.cpp) +register_model(TBB TBB TBBStream.cpp) set(USAGE ON CACHE BOOL "Whether to print all custom flags for the selected model") @@ -188,3 +189,5 @@ target_link_options(${EXE_NAME} PUBLIC ${LINK_FLAGS} ${CXX_EXTRA_LINK_FLAGS}) if (COMMAND setup_target) setup_target(${EXE_NAME}) endif () + +install (TARGETS ${EXE_NAME} DESTINATION bin) \ No newline at end of file diff --git a/OMPStream.cpp b/OMPStream.cpp index 8063987..0cd8035 100644 --- a/OMPStream.cpp +++ b/OMPStream.cpp @@ -5,6 +5,7 @@ // For full license terms please see the LICENSE file distributed with this // source code +#include // For aligned_alloc #include "OMPStream.h" #ifndef ALIGNMENT diff --git a/RAJAStream.cpp b/RAJAStream.cpp index 44db5ed..d271ea4 100644 --- a/RAJAStream.cpp +++ b/RAJAStream.cpp @@ -5,6 +5,7 @@ // For full license terms please see the LICENSE file distributed with this // source code +#include // For aligned_alloc #include #include "RAJAStream.hpp" diff --git a/README.md b/README.md index 553f6e2..bb993f2 100644 --- a/README.md +++ b/README.md @@ -19,6 +19,7 @@ Currently implemented are: - Kokkos - RAJA - SYCL + - TBB This code was previously called GPU-STREAM. 
@@ -90,7 +91,7 @@ For example: Use this for linking extra libraries (e.g `-lmylib`, or simply `mylib`) -- CXX_EXTRA_LINKER_FLAGS: Append to linker flags (i.e GCC's `-Wl` or equivalent) --- Available models: OMP;OCL;STD;STD20;HIP;CUDA;KOKKOS;SYCL;ACC;RAJA +-- Available models: OMP;OCL;STD;STD20;HIP;CUDA;KOKKOS;SYCL;ACC;RAJA;TBB -- Selected model : OCL -- Supported flags: diff --git a/SYCLStream.cpp b/SYCLStream.cpp index 49ad3ac..00c043f 100644 --- a/SYCLStream.cpp +++ b/SYCLStream.cpp @@ -28,6 +28,14 @@ SYCLStream::SYCLStream(const int ARRAY_SIZE, const int device_index) throw std::runtime_error("Invalid device index"); device dev = devices[device_index]; + // Check device can support FP64 if needed + if (sizeof(T) == sizeof(double)) + { + if (dev.get_info().size() == 0) { + throw std::runtime_error("Device does not support double precision, please use --float"); + } + } + // Determine sensible dot kernel NDRange configuration if (dev.is_cpu()) { diff --git a/TBB.cmake b/TBB.cmake new file mode 100644 index 0000000..e4d6bac --- /dev/null +++ b/TBB.cmake @@ -0,0 +1,29 @@ + +register_flag_optional(ONE_TBB_DIR + "Absolute path to oneTBB (with header `oneapi/tbb.h`) distribution; the directory should contain at least `include/` and `lib/`. + If unspecified, the system TBB (with header `tbb/tbb.h`) will be used via CMake's find_package(TBB)." + "") + + +register_flag_optional(PARTITIONER + "Partitioner specifies how a loop template should partition its work among threads. + Possible values are: + AUTO - Optimize range subdivision based on work-stealing events. + AFFINITY - Proportional splitting that optimizes for cache affinity. + STATIC - Distribute work uniformly with no additional load balancing. + SIMPLE - Recursively split its range until it cannot be further subdivided. + See https://spec.oneapi.com/versions/latest/elements/oneTBB/source/algorithms.html#partitioners for more details." 
+ "AUTO") + +macro(setup) + if(ONE_TBB_DIR) + set(TBB_ROOT "${ONE_TBB_DIR}") # see https://github.com/Kitware/VTK/blob/0a31a9a3c1531ae238ac96a372fec4be42282863/CMake/FindTBB.cmake#L34 + # docs on Intel's website refers to TBB_DIR which is not correct + endif() + + + # see https://github.com/oneapi-src/oneTBB/blob/master/cmake/README.md#tbbconfig---integration-of-binary-packages + find_package(TBB REQUIRED) + register_link_library(TBB::tbb) + register_definitions(PARTITIONER_${PARTITIONER}) +endmacro() diff --git a/TBB.make b/TBB.make new file mode 100644 index 0000000..c224a5a --- /dev/null +++ b/TBB.make @@ -0,0 +1,56 @@ + +ifndef COMPILER +define compiler_help +Set COMPILER to change flags (defaulting to GNU). +Available compilers are: + GNU INTEL INTEL_LEGACY + +endef +$(info $(compiler_help)) +COMPILER=GNU +endif + + +CXX_GNU = g++ +CXX_INTEL = icpx +CXX_INTEL_LEGACY = icpc +CXX = $(COMPILER_$(COMPILER)) + +CXXFLAGS_GNU = -march=native +CXXFLAGS_INTEL = -march=native +CXXFLAGS_INTEL_LEGACY = -qopt-streaming-stores=always + +CXX = $(CXX_$(COMPILER)) +CXXFLAGS = -std=c++11 -O3 $(CXXFLAGS_$(COMPILER)) + + + +ifndef PARTITIONER +define partitioner_help +Set PARTITIONER to select TBB's partitioner. +Partitioner specifies how a loop template should partition its work among threads. + +Available options: + AUTO - Optimize range subdivision based on work-stealing events. + AFFINITY - Proportional splitting that optimizes for cache affinity. + STATIC - Distribute work uniformly with no additional load balancing. + SIMPLE - Recursively split its range until it cannot be further subdivided. + +See https://spec.oneapi.com/versions/latest/elements/oneTBB/source/algorithms.html#partitioners +for more details. 
+ +endef +$(info $(partitioner_help)) +PARTITIONER=AUTO +endif + +PARTITIONER_MODE = -DPARTITIONER_$(PARTITIONER) + + +tbb-stream: main.cpp TBBStream.cpp + $(CXX) -DTBB $(PARTITIONER_MODE) $(CXXFLAGS) $^ $(EXTRA_FLAGS) -I$(TBB_DIR)/include -Wl,-rpath,$(TBB_DIR)/lib/intel64/gcc4.8 $(TBB_DIR)/lib/intel64/gcc4.8/libtbb.so -o $@ + +.PHONY: clean +clean: + rm -f tbb-stream + diff --git a/TBBStream.cpp b/TBBStream.cpp new file mode 100644 index 0000000..9c34a50 --- /dev/null +++ b/TBBStream.cpp @@ -0,0 +1,134 @@ +// Copyright (c) 2020 Tom Deakin +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#include "TBBStream.hpp" + +template +TBBStream::TBBStream(const int ARRAY_SIZE, int device) + : partitioner(), range(0, ARRAY_SIZE), a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE) +{ + if(device != 0){ + throw std::runtime_error("Device != 0 is not supported by TBB"); + } + std::cout << "Using TBB partitioner: " PARTITIONER_NAME << std::endl; +} + + +template +void TBBStream::init_arrays(T initA, T initB, T initC) +{ + + tbb::parallel_for(range, [&](const tbb::blocked_range& r) { + for (size_t i = r.begin(); i < r.end(); ++i) { + a[i] = initA; + b[i] = initB; + c[i] = initC; + } + }, partitioner); + +} + +template +void TBBStream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) +{ + // Element-wise copy. 
+ h_a = a; + h_b = b; + h_c = c; +} + +template +void TBBStream::copy() +{ + tbb::parallel_for(range, [&](const tbb::blocked_range& r) { + for (size_t i = r.begin(); i < r.end(); ++i) { + c[i] = a[i]; + } + }, partitioner); +} + +template +void TBBStream::mul() +{ + const T scalar = startScalar; + + tbb::parallel_for(range, [&](const tbb::blocked_range& r) { + for (size_t i = r.begin(); i < r.end(); ++i) { + b[i] = scalar * c[i]; + } + }, partitioner); + +} + +template +void TBBStream::add() +{ + + tbb::parallel_for(range, [&](const tbb::blocked_range& r) { + for (size_t i = r.begin(); i < r.end(); ++i) { + c[i] = a[i] + b[i]; + } + }, partitioner); + +} + +template +void TBBStream::triad() +{ + const T scalar = startScalar; + + tbb::parallel_for(range, [&](const tbb::blocked_range& r) { + for (size_t i = r.begin(); i < r.end(); ++i) { + a[i] = b[i] + scalar * c[i]; + } + }, partitioner); + +} + +template +void TBBStream::nstream() +{ + const T scalar = startScalar; + + tbb::parallel_for(range, [&](const tbb::blocked_range& r) { + for (size_t i = r.begin(); i < r.end(); ++i) { + a[i] += b[i] + scalar * c[i]; + } + }, partitioner); + +} + +template +T TBBStream::dot() +{ + // sum += a[i] * b[i]; + return + tbb::parallel_reduce(range, T{}, [&](const tbb::blocked_range& r, T acc) { + for (size_t i = r.begin(); i < r.end(); ++i) { + acc += a[i] * b[i]; + } + return acc; + }, std::plus(), partitioner); +} + +void listDevices(void) +{ + std::cout << "Listing devices is not supported by TBB" << std::endl; +} + +std::string getDeviceName(const int device) +{ + return std::string("Device name unavailable"); +} + +std::string getDeviceDriver(const int) +{ + return std::string("Device driver unavailable"); +} + +template class TBBStream; +template class TBBStream; + diff --git a/TBBStream.hpp b/TBBStream.hpp new file mode 100644 index 0000000..90763a9 --- /dev/null +++ b/TBBStream.hpp @@ -0,0 +1,62 @@ +// Copyright (c) 2020 Tom Deakin +// University of Bristol HPC +// +// For 
full license terms please see the LICENSE file distributed with this +// source code + +#pragma once + +#include +#include +#include "tbb/tbb.h" +#include "Stream.h" + +#define IMPLEMENTATION_STRING "TBB" + +#if defined(PARTITIONER_AUTO) +using tbb_partitioner = tbb::auto_partitioner; +#define PARTITIONER_NAME "auto_partitioner" +#elif defined(PARTITIONER_AFFINITY) +using tbb_partitioner = tbb::affinity_partitioner; +#define PARTITIONER_NAME "affinity_partitioner" +#elif defined(PARTITIONER_STATIC) +using tbb_partitioner = tbb::static_partitioner; +#define PARTITIONER_NAME "static_partitioner" +#elif defined(PARTITIONER_SIMPLE) +using tbb_partitioner = tbb::simple_partitioner; +#define PARTITIONER_NAME "simple_partitioner" +#else +// default to auto +using tbb_partitioner = tbb::auto_partitioner; +#define PARTITIONER_NAME "auto_partitioner" +#endif + + +template +class TBBStream : public Stream +{ + protected: + + tbb_partitioner partitioner; + tbb::blocked_range range; + // Arrays operated on by the kernels + std::vector a; + std::vector b; + std::vector c; + + public: + TBBStream(const int, int); + ~TBBStream() = default; + + virtual void copy() override; + virtual void add() override; + virtual void mul() override; + virtual void triad() override; + virtual void nstream() override; + virtual T dot() override; + + virtual void init_arrays(T initA, T initB, T initC) override; + virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; + +}; + diff --git a/ci-prepare-bionic.sh b/ci-prepare-bionic.sh index d8ae312..fb69c05 100755 --- a/ci-prepare-bionic.sh +++ b/ci-prepare-bionic.sh @@ -208,6 +208,20 @@ setup_raja() { check_size } +setup_tbb() { + echo "Preparing TBB" + local tbb_ver="2021.2.0" + local tarball="oneapi-tbb-$tbb_ver-lin.tgz" + + local url="https://github.com/oneapi-src/oneTBB/releases/download/v$tbb_ver/oneapi-tbb-$tbb_ver-lin.tgz" + # local url="http://localhost:8000/oneapi-tbb-$tbb_ver-lin.tgz" + + get_and_untar "$tarball" "$url" + 
export_var TBB_LIB "$PWD/oneapi-tbb-$tbb_ver" + verify_dir_exists "$TBB_LIB" + check_size +} + setup_clang_gcc() { echo "deb http://archive.ubuntu.com/ubuntu focal main universe" | sudo tee -a /etc/apt/sources.list @@ -240,8 +254,7 @@ setup_rocm() { echo 'deb [arch=amd64] https://repo.radeon.com/rocm/apt/debian/ xenial main' | sudo tee /etc/apt/sources.list.d/rocm.list sudo apt-get update -qq sudo apt-get install -y -qq rocm-dev - # AMD needs this rocm_path thing exported... - export_var ROCM_PATH "/opt/rocm-4.1.0" + export_var ROCM_PATH "/opt/rocm" export_var HIP_CXX "$ROCM_PATH/bin/hipcc" verify_bin_exists "$HIP_CXX" "$HIP_CXX" --version @@ -355,6 +368,7 @@ if [ "$PARALLEL" = true ]; then setup_dpcpp & setup_kokkos & setup_raja & + setup_tbb & wait else setup_cmake @@ -365,6 +379,7 @@ else setup_dpcpp setup_kokkos setup_raja + setup_tbb # these need apt setup_clang_gcc setup_rocm diff --git a/ci-test-compile.sh b/ci-test-compile.sh index 46046c4..456f836 100755 --- a/ci-test-compile.sh +++ b/ci-test-compile.sh @@ -44,21 +44,26 @@ run_build() { rm -rf "$build" set +e + local install_dir="$build/install" # shellcheck disable=SC2086 "$CMAKE_BIN" -B"$build" -H. \ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_VERBOSE_MAKEFILE=ON \ + -DCMAKE_INSTALL_PREFIX="$install_dir" \ -DMODEL="$model" $flags &>>"$log" local model_lower=$(echo "$model" | awk '{print tolower($0)}') local cmake_code=$? "$CMAKE_BIN" --build "$build" -j "$(nproc)" &>>"$log" + "$CMAKE_BIN" --build "$build" --target install -j "$(nproc)" &>>"$log" local cmake_code=$? 
set -e local bin="./$build/$model_lower-stream" + local installed_bin="./$install_dir/bin/$model_lower-stream" + echo "Checking for final executable: $bin" if [[ -f "$bin" ]]; then echo "$(tput setaf 2)[PASS!]($model->$build)$(tput sgr0): -DMODEL=$model $flags" @@ -66,6 +71,11 @@ run_build() { cat "$log" | sed '/^--/d' | grep -i "/bin/nvcc" | sed 's/^/ /' cat "$log" | sed '/^--/d' | grep -i "$grep_kw" | sed 's/^/ /' cat "$log" | sed '/^--/d' | grep -i "warning" | sed "s/.*/ $(tput setaf 3)&$(tput sgr0)/" + if [[ ! -f "$installed_bin" ]]; then + echo "$(tput setaf 1)[ERR!] looking for $installed_bin from --target install but it's not there!$(tput sgr0)" + cat "$log" + exit 1 + fi else echo "$(tput setaf 1)[FAIL!]($model->$build)$(tput sgr0): -DMODEL=$model $flags" echo " $(tput setaf 1)CMake exited with code $cmake_code, see full build log at $log, reproduced below:$(tput sgr0)" @@ -76,38 +86,40 @@ run_build() { } ### -#KOKKOS_SRC="/home/tom/Downloads/kokkos-3.3.00" -#RAJA_SRC="/home/tom/Downloads/RAJA-v0.13.0" -# -#GCC_CXX="/usr/bin/g++" -#CLANG_CXX="/usr/bin/clang++" -# -#NVSDK="/home/tom/Downloads/nvhpc_2021_212_Linux_x86_64_cuda_11.2/install_components/Linux_x86_64/21.2/" -#NVHPC_NVCXX="$NVSDK/compilers/bin/nvc++" -#NVHPC_NVCC="$NVSDK/cuda/11.2/bin/nvcc" -#NVHPC_CUDA_DIR="$NVSDK/cuda/11.2" -#"$NVSDK/compilers/bin/makelocalrc" "$NVSDK/compilers/bin/" -x -# -#AOCC_CXX="/opt/AMD/aocc-compiler-2.3.0/bin/clang++" -#AOMP_CXX="/usr/lib/aomp/bin/clang++" -#OCL_LIB="/home/tom/Downloads/oclcpuexp-2020.11.11.0.04_rel/x64/libOpenCL.so" -# -## AMD needs this rocm_path thing exported... 
-#export ROCM_PATH="/opt/rocm-4.0.0" -#HIP_CXX="/opt/rocm-4.0.0/bin/hipcc" -#COMPUTECPP_DIR="/home/tom/Desktop/computecpp_archive/ComputeCpp-CE-2.3.0-x86_64-linux-gnu" -#DPCPP_DIR="/home/tom/Downloads/dpcpp_compiler" -#HIPSYCL_DIR="/opt/hipsycl/cff515c/" -# -#ICPX_CXX="/opt/intel/oneapi/compiler/2021.1.2/linux/bin/icpx" -#ICPC_CXX="/opt/intel/oneapi/compiler/2021.1.2/linux/bin/intel64/icpc" -# -#GCC_STD_PAR_LIB="tbb" -#CLANG_STD_PAR_LIB="tbb" -#GCC_OMP_OFFLOAD_AMD=false -#GCC_OMP_OFFLOAD_NVIDIA=true -#CLANG_OMP_OFFLOAD_AMD=false -#CLANG_OMP_OFFLOAD_NVIDIA=false +# KOKKOS_SRC="/home/tom/Downloads/kokkos-3.3.00" +# RAJA_SRC="/home/tom/Downloads/RAJA-v0.13.0" + +# GCC_CXX="/usr/bin/g++" +# CLANG_CXX="/usr/bin/clang++" + +# NVSDK="/home/tom/Downloads/nvhpc_2021_212_Linux_x86_64_cuda_11.2/install_components/Linux_x86_64/21.2/" +# NVHPC_NVCXX="$NVSDK/compilers/bin/nvc++" +# NVHPC_NVCC="$NVSDK/cuda/11.2/bin/nvcc" +# NVHPC_CUDA_DIR="$NVSDK/cuda/11.2" +# "$NVSDK/compilers/bin/makelocalrc" "$NVSDK/compilers/bin/" -x + +# AOCC_CXX="/opt/AMD/aocc-compiler-2.3.0/bin/clang++" +# AOMP_CXX="/usr/lib/aomp/bin/clang++" +# OCL_LIB="/home/tom/Downloads/oclcpuexp-2020.11.11.0.04_rel/x64/libOpenCL.so" + +# # AMD needs this rocm_path thing exported... 
+# export ROCM_PATH="/opt/rocm-4.0.0" +# HIP_CXX="/opt/rocm-4.0.0/bin/hipcc" +# COMPUTECPP_DIR="/home/tom/Desktop/computecpp_archive/ComputeCpp-CE-2.3.0-x86_64-linux-gnu" +# DPCPP_DIR="/home/tom/Downloads/dpcpp_compiler" +# HIPSYCL_DIR="/opt/hipsycl/cff515c/" + +# ICPX_CXX="/opt/intel/oneapi/compiler/2021.1.2/linux/bin/icpx" +# ICPC_CXX="/opt/intel/oneapi/compiler/2021.1.2/linux/bin/intel64/icpc" + +# TBB_LIB="/home/tom/Downloads/oneapi-tbb-2021.1.1/" + +# GCC_STD_PAR_LIB="tbb" +# CLANG_STD_PAR_LIB="tbb" +# GCC_OMP_OFFLOAD_AMD=false +# GCC_OMP_OFFLOAD_NVIDIA=true +# CLANG_OMP_OFFLOAD_AMD=false +# CLANG_OMP_OFFLOAD_NVIDIA=false ### AMD_ARCH="gfx_903" @@ -129,6 +141,9 @@ build_gcc() { run_build $name "${GCC_CXX:?}" STD "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}" run_build $name "${GCC_CXX:?}" STD20 "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}" + run_build $name "${GCC_CXX:?}" TBB "$cxx -DONE_TBB_DIR=$TBB_LIB" + run_build $name "${GCC_CXX:?}" TBB "$cxx" # build TBB again with the system TBB + if [ "${GCC_OMP_OFFLOAD_AMD:-false}" != "false" ]; then run_build "amd_$name" "${GCC_CXX:?}" ACC "$cxx -DCXX_EXTRA_FLAGS=-foffload=amdgcn-amdhsa" run_build "amd_$name" "${GCC_CXX:?}" OMP "$cxx -DOFFLOAD=AMD:$AMD_ARCH" @@ -146,11 +161,15 @@ build_gcc() { run_build "cuda_$name" "${GCC_CXX:?}" KOKKOS "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON" run_build $name "${GCC_CXX:?}" OCL "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}" run_build $name "${GCC_CXX:?}" RAJA "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?}" - run_build "cuda_$name" "${GCC_CXX:?}" RAJA "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?} \ - -DENABLE_CUDA=ON \ - -DTARGET=NVIDIA \ - -DCUDA_TOOLKIT_ROOT_DIR=${NVHPC_CUDA_DIR:?} \ - -DCUDA_ARCH=$NV_ARCH" + +# FIXME fails due to https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100102 +# FIXME we also got https://github.com/NVIDIA/nccl/issues/494 + +# run_build "cuda_$name" "${GCC_CXX:?}" RAJA "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?} \ +# -DENABLE_CUDA=ON \ +# -DTARGET=NVIDIA \ +# 
-DCUDA_TOOLKIT_ROOT_DIR=${NVHPC_CUDA_DIR:?} \ +# -DCUDA_ARCH=$NV_ARCH" } @@ -174,6 +193,10 @@ build_clang() { run_build $name "${CLANG_CXX:?}" OCL "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}" run_build $name "${CLANG_CXX:?}" STD "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" # run_build $name "${LANG_CXX:?}" STD20 "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" # not yet supported + + run_build $name "${CLANG_CXX:?}" TBB "$cxx -DONE_TBB_DIR=$TBB_LIB" + run_build $name "${CLANG_CXX:?}" TBB "$cxx" # build TBB again with the system TBB + run_build $name "${CLANG_CXX:?}" RAJA "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?}" # no clang /w RAJA+cuda because it needs nvcc which needs gcc } diff --git a/main.cpp b/main.cpp index e78d7a1..de301ce 100644 --- a/main.cpp +++ b/main.cpp @@ -25,6 +25,8 @@ #include "STDStream.h" #elif defined(STD20) #include "STD20Stream.hpp" +#elif defined(TBB) +#include "TBBStream.hpp" #elif defined(HIP) #include "HIPStream.h" #elif defined(HC) @@ -266,6 +268,10 @@ void run() // Use the C++20 implementation stream = new STD20Stream(ARRAY_SIZE, deviceIndex); +#elif defined(TBB) + // Use the TBB implementation + stream = new TBBStream(ARRAY_SIZE, deviceIndex); + #elif defined(ACC) // Use the OpenACC implementation stream = new ACCStream(ARRAY_SIZE, deviceIndex);