Merge branch 'main' into rust

Tom Lin 2021-06-10 05:37:03 +01:00
commit c70a5da45b
14 changed files with 383 additions and 40 deletions

.gitignore

@@ -8,6 +8,7 @@ kokkos-stream
std-stream
sycl-stream
hip-stream
tbb-stream
*.o
*.bc

CHANGELOG.md

@@ -13,6 +13,8 @@ All notable changes to this project will be documented in this file.
- Support for CUDA Managed Memory and Page Fault memory.
- Added nstream kernel from PRK with associated command line option.
- CMake build system added for all models.
- SYCL device check for FP64 support.
- New implementation using TBB.
### Changed
- Default branch renamed from `master` to `main`.
@@ -29,6 +31,7 @@ All notable changes to this project will be documented in this file.
- Unified run function in driver code to reduce code duplication; output should be unaffected.
- Normalise sum result by expected value to help avoid false-negative errors.
- HC version deprecated and moved to a legacy directory.
- Update RAJA to v0.13.0 (with code changes, as this is a source-incompatible update).
### Removed
- Pre-building of kernels in SYCL version to ensure compatibility with SYCL 1.2.1.

CMakeLists.txt

@@ -112,6 +112,7 @@ register_model(SYCL SYCL SYCLStream.cpp)
register_model(ACC ACC ACCStream.cpp)
# defining RAJA collides with the RAJA namespace so USE_RAJA
register_model(RAJA USE_RAJA RAJAStream.cpp)
register_model(TBB TBB TBBStream.cpp)
set(USAGE ON CACHE BOOL "Whether to print all custom flags for the selected model")
@@ -188,3 +189,5 @@ target_link_options(${EXE_NAME} PUBLIC ${LINK_FLAGS} ${CXX_EXTRA_LINK_FLAGS})
if (COMMAND setup_target)
setup_target(${EXE_NAME})
endif ()
install (TARGETS ${EXE_NAME} DESTINATION bin)

OMPStream.cpp

@@ -5,6 +5,7 @@
// For full license terms please see the LICENSE file distributed with this
// source code
#include <cstdlib> // For aligned_alloc
#include "OMPStream.h"
#ifndef ALIGNMENT

RAJAStream.cpp

@@ -5,6 +5,7 @@
// For full license terms please see the LICENSE file distributed with this
// source code
#include <cstdlib> // For aligned_alloc
#include <stdexcept>
#include "RAJAStream.hpp"

README.md

@@ -19,6 +19,7 @@ Currently implemented are:
- Kokkos
- RAJA
- SYCL
- TBB
This code was previously called GPU-STREAM.
@@ -90,7 +91,7 @@ For example:
Use this for linking extra libraries (e.g. `-lmylib`, or simply `mylib`)
-- CXX_EXTRA_LINKER_FLAGS:
Append to linker flags (i.e. GCC's `-Wl` or equivalent)
-- Available models: OMP;OCL;STD;STD20;HIP;CUDA;KOKKOS;SYCL;ACC;RAJA
-- Available models: OMP;OCL;STD;STD20;HIP;CUDA;KOKKOS;SYCL;ACC;RAJA;TBB
-- Selected model : OCL
-- Supported flags:

SYCLStream.cpp

@@ -28,6 +28,14 @@ SYCLStream<T>::SYCLStream(const int ARRAY_SIZE, const int device_index)
throw std::runtime_error("Invalid device index");
device dev = devices[device_index];
// Check device can support FP64 if needed
if (sizeof(T) == sizeof(double))
{
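// An empty double_fp_config list means the device reports no FP64 capabilities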
if (dev.get_info<info::device::double_fp_config>().size() == 0) {
throw std::runtime_error("Device does not support double precision, please use --float");
}
}
// Determine sensible dot kernel NDRange configuration
if (dev.is_cpu())
{

TBB.cmake (new file, 29 lines)

@@ -0,0 +1,29 @@
register_flag_optional(ONE_TBB_DIR
"Absolute path to oneTBB (with header `onetbb/tbb.h`) distribution; the directory should contain at least `include/` and `lib/`.
If unspecified, the system TBB (with header `tbb/tbb.h`) will be used via CMake's find_package(TBB)."
"")
register_flag_optional(PARTITIONER
"Partitioner specifies how a loop template should partition its work among threads.
Possible values are:
AUTO - Optimize range subdivision based on work-stealing events.
AFFINITY - Proportional splitting that optimizes for cache affinity.
STATIC - Distribute work uniformly with no additional load balancing.
SIMPLE - Recursively split its range until it cannot be further subdivided.
See https://spec.oneapi.com/versions/latest/elements/oneTBB/source/algorithms.html#partitioners for more details."
"AUTO")
macro(setup)
if(ONE_TBB_DIR)
set(TBB_ROOT "${ONE_TBB_DIR}") # see https://github.com/Kitware/VTK/blob/0a31a9a3c1531ae238ac96a372fec4be42282863/CMake/FindTBB.cmake#L34
# docs on Intel's website refer to TBB_DIR, which is not correct
endif()
# see https://github.com/oneapi-src/oneTBB/blob/master/cmake/README.md#tbbconfig---integration-of-binary-packages
find_package(TBB REQUIRED)
register_link_library(TBB::tbb)
register_definitions(PARTITIONER_${PARTITIONER})
endmacro()

TBB.make (new file, 56 lines)

@@ -0,0 +1,56 @@
ifndef COMPILER
define compiler_help
Set COMPILER to change flags (defaulting to GNU).
Available compilers are:
GNU INTEL INTEL_LEGACY
endef
$(info $(compiler_help))
COMPILER=GNU
endif
CXX_GNU = g++
CXX_INTEL = icpx
CXX_INTEL_LEGACY = icpc
CXXFLAGS_GNU = -march=native
CXXFLAGS_INTEL = -march=native
CXXFLAGS_INTEL_LEGACY = -qopt-streaming-stores=always
CXX = $(CXX_$(COMPILER))
CXXFLAGS = -std=c++11 -O3 $(CXXFLAGS_$(COMPILER))
ifndef PARTITIONER
define partitioner_help
Set PARTITIONER to select TBB's partitioner.
Partitioner specifies how a loop template should partition its work among threads.
Available options:
AUTO - Optimize range subdivision based on work-stealing events.
AFFINITY - Proportional splitting that optimizes for cache affinity.
STATIC - Distribute work uniformly with no additional load balancing.
SIMPLE - Recursively split its range until it cannot be further subdivided.
See https://spec.oneapi.com/versions/latest/elements/oneTBB/source/algorithms.html#partitioners
for more details.
endef
$(info $(partitioner_help))
PARTITIONER=AUTO
endif
PARTITIONER_MODE = -DPARTITIONER_$(PARTITIONER)
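# Link against the TBB shared library under $(TBB_DIR) and embed an rpath so the binary can find libtbb.so at run time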
tbb-stream: main.cpp TBBStream.cpp
$(CXX) -DTBB $(PARTITIONER_MODE) $(CXXFLAGS) $^ $(EXTRA_FLAGS) -I$(TBB_DIR)/include -Wl,-rpath,$(TBB_DIR)/lib/intel64/gcc4.8 $(TBB_DIR)/lib/intel64/gcc4.8/libtbb.so -o $@
.PHONY: clean
clean:
rm -f tbb-stream

TBBStream.cpp (new file, 134 lines)

@@ -0,0 +1,134 @@
// Copyright (c) 2020 Tom Deakin
// University of Bristol HPC
//
// For full license terms please see the LICENSE file distributed with this
// source code
#include "TBBStream.hpp"
template <class T>
TBBStream<T>::TBBStream(const int ARRAY_SIZE, int device)
: partitioner(), range(0, ARRAY_SIZE), a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE)
{
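// TBB executes on the host, so only the default device index 0 is accepted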
if(device != 0){
throw std::runtime_error("Device != 0 is not supported by TBB");
}
std::cout << "Using TBB partitioner: " PARTITIONER_NAME << std::endl;
}
template <class T>
void TBBStream<T>::init_arrays(T initA, T initB, T initC)
{
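// Fill all three vectors in parallel; the chosen partitioner decides how the blocked_range is subdivided across tasks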
tbb::parallel_for(range, [&](const tbb::blocked_range<size_t>& r) {
for (size_t i = r.begin(); i < r.end(); ++i) {
a[i] = initA;
b[i] = initB;
c[i] = initC;
}
}, partitioner);
}
template <class T>
void TBBStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
{
// Element-wise copy.
h_a = a;
h_b = b;
h_c = c;
}
template <class T>
void TBBStream<T>::copy()
{
tbb::parallel_for(range, [&](const tbb::blocked_range<size_t>& r) {
for (size_t i = r.begin(); i < r.end(); ++i) {
c[i] = a[i];
}
}, partitioner);
}
template <class T>
void TBBStream<T>::mul()
{
const T scalar = startScalar;
tbb::parallel_for(range, [&](const tbb::blocked_range<size_t>& r) {
for (size_t i = r.begin(); i < r.end(); ++i) {
b[i] = scalar * c[i];
}
}, partitioner);
}
template <class T>
void TBBStream<T>::add()
{
tbb::parallel_for(range, [&](const tbb::blocked_range<size_t>& r) {
for (size_t i = r.begin(); i < r.end(); ++i) {
c[i] = a[i] + b[i];
}
}, partitioner);
}
template <class T>
void TBBStream<T>::triad()
{
const T scalar = startScalar;
tbb::parallel_for(range, [&](const tbb::blocked_range<size_t>& r) {
for (size_t i = r.begin(); i < r.end(); ++i) {
a[i] = b[i] + scalar * c[i];
}
}, partitioner);
}
template <class T>
void TBBStream<T>::nstream()
{
const T scalar = startScalar;
tbb::parallel_for(range, [&](const tbb::blocked_range<size_t>& r) {
for (size_t i = r.begin(); i < r.end(); ++i) {
a[i] += b[i] + scalar * c[i];
}
}, partitioner);
}
template <class T>
T TBBStream<T>::dot()
{
// sum += a[i] * b[i];
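// parallel_reduce: T{} is the identity, the lambda accumulates a partial dot product over each sub-range, and std::plus<T>() combines the partial results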
return
tbb::parallel_reduce(range, T{}, [&](const tbb::blocked_range<size_t>& r, T acc) {
for (size_t i = r.begin(); i < r.end(); ++i) {
acc += a[i] * b[i];
}
return acc;
}, std::plus<T>(), partitioner);
}
void listDevices(void)
{
std::cout << "Listing devices is not supported by TBB" << std::endl;
}
std::string getDeviceName(const int device)
{
return std::string("Device name unavailable");
}
std::string getDeviceDriver(const int)
{
return std::string("Device driver unavailable");
}
template class TBBStream<float>;
template class TBBStream<double>;

TBBStream.hpp (new file, 62 lines)

@@ -0,0 +1,62 @@
// Copyright (c) 2020 Tom Deakin
// University of Bristol HPC
//
// For full license terms please see the LICENSE file distributed with this
// source code
#pragma once
#include <iostream>
#include <vector>
#include "tbb/tbb.h"
#include "Stream.h"
#define IMPLEMENTATION_STRING "TBB"
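// Map the PARTITIONER_<NAME> compile definition (set by the build system) to a TBB partitioner type and a printable name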
#if defined(PARTITIONER_AUTO)
using tbb_partitioner = tbb::auto_partitioner;
#define PARTITIONER_NAME "auto_partitioner"
#elif defined(PARTITIONER_AFFINITY)
using tbb_partitioner = tbb::affinity_partitioner;
#define PARTITIONER_NAME "affinity_partitioner"
#elif defined(PARTITIONER_STATIC)
using tbb_partitioner = tbb::static_partitioner;
#define PARTITIONER_NAME "static_partitioner"
#elif defined(PARTITIONER_SIMPLE)
using tbb_partitioner = tbb::simple_partitioner;
#define PARTITIONER_NAME "simple_partitioner"
#else
// default to auto
using tbb_partitioner = tbb::auto_partitioner;
#define PARTITIONER_NAME "auto_partitioner"
#endif
template <class T>
class TBBStream : public Stream<T>
{
protected:
tbb_partitioner partitioner;
tbb::blocked_range<size_t> range;
// Device side arrays (plain std::vectors; TBB runs on the host)
std::vector<T> a;
std::vector<T> b;
std::vector<T> c;
public:
TBBStream(const int, int);
~TBBStream() = default;
virtual void copy() override;
virtual void add() override;
virtual void mul() override;
virtual void triad() override;
virtual void nstream() override;
virtual T dot() override;
virtual void init_arrays(T initA, T initB, T initC) override;
virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
};
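A note on the partitioner member above: tbb::affinity_partitioner only pays off when the same partitioner object is reused across successive loops over the same data, which is why it lives in the class rather than being constructed per call. A minimal standalone sketch of that reuse pattern (hypothetical names, not part of this commit):

#include <vector>
#include "tbb/tbb.h"

// Hypothetical example: one affinity_partitioner reused across repeated
// parallel_for calls, letting TBB replay the previous work placement.
static void scale(std::vector<double>& data, double factor, tbb::affinity_partitioner& ap)
{
  tbb::parallel_for(tbb::blocked_range<size_t>(0, data.size()),
    [&](const tbb::blocked_range<size_t>& r) {
      for (size_t i = r.begin(); i < r.end(); ++i)
        data[i] *= factor;
    }, ap); // pass the same partitioner object every iteration
}

int main()
{
  std::vector<double> data(1 << 20, 1.0);
  tbb::affinity_partitioner ap; // persists across iterations, like the class member above
  for (int iter = 0; iter < 100; ++iter)
    scale(data, 1.0001, ap);
}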


@@ -208,6 +208,20 @@ setup_raja() {
check_size
}
setup_tbb() {
echo "Preparing TBB"
local tbb_ver="2021.2.0"
local tarball="oneapi-tbb-$tbb_ver-lin.tgz"
local url="https://github.com/oneapi-src/oneTBB/releases/download/v$tbb_ver/oneapi-tbb-$tbb_ver-lin.tgz"
# local url="http://localhost:8000/oneapi-tbb-$tbb_ver-lin.tgz"
get_and_untar "$tarball" "$url"
export_var TBB_LIB "$PWD/oneapi-tbb-$tbb_ver"
verify_dir_exists "$TBB_LIB"
check_size
}
setup_clang_gcc() {
echo "deb http://archive.ubuntu.com/ubuntu focal main universe" | sudo tee -a /etc/apt/sources.list
@@ -240,8 +254,7 @@ setup_rocm() {
echo 'deb [arch=amd64] https://repo.radeon.com/rocm/apt/debian/ xenial main' | sudo tee /etc/apt/sources.list.d/rocm.list
sudo apt-get update -qq
sudo apt-get install -y -qq rocm-dev
# AMD needs this rocm_path thing exported...
export_var ROCM_PATH "/opt/rocm-4.1.0"
export_var ROCM_PATH "/opt/rocm"
export_var HIP_CXX "$ROCM_PATH/bin/hipcc"
verify_bin_exists "$HIP_CXX"
"$HIP_CXX" --version
@@ -355,6 +368,7 @@ if [ "$PARALLEL" = true ]; then
setup_dpcpp &
setup_kokkos &
setup_raja &
setup_tbb &
wait
else
setup_cmake
@@ -365,6 +379,7 @@ else
setup_dpcpp
setup_kokkos
setup_raja
setup_tbb
# these need apt
setup_clang_gcc
setup_rocm


@@ -44,21 +44,26 @@ run_build() {
rm -rf "$build"
set +e
local install_dir="$build/install"
# shellcheck disable=SC2086
"$CMAKE_BIN" -B"$build" -H. \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_VERBOSE_MAKEFILE=ON \
-DCMAKE_INSTALL_PREFIX="$install_dir" \
-DMODEL="$model" $flags &>>"$log"
local model_lower=$(echo "$model" | awk '{print tolower($0)}')
local cmake_code=$?
"$CMAKE_BIN" --build "$build" -j "$(nproc)" &>>"$log"
"$CMAKE_BIN" --build "$build" --target install -j "$(nproc)" &>>"$log"
local cmake_code=$?
set -e
local bin="./$build/$model_lower-stream"
local installed_bin="./$install_dir/bin/$model_lower-stream"
echo "Checking for final executable: $bin"
if [[ -f "$bin" ]]; then
echo "$(tput setaf 2)[PASS!]($model->$build)$(tput sgr0): -DMODEL=$model $flags"
@@ -66,6 +71,11 @@ run_build() {
cat "$log" | sed '/^--/d' | grep -i "/bin/nvcc" | sed 's/^/ /'
cat "$log" | sed '/^--/d' | grep -i "$grep_kw" | sed 's/^/ /'
cat "$log" | sed '/^--/d' | grep -i "warning" | sed "s/.*/ $(tput setaf 3)&$(tput sgr0)/"
if [[ ! -f "$installed_bin" ]]; then
echo "$(tput setaf 1)[ERR!] looking for $installed_bin from --target install but it's not there!$(tput sgr0)"
cat "$log"
exit 1
fi
else
echo "$(tput setaf 1)[FAIL!]($model->$build)$(tput sgr0): -DMODEL=$model $flags"
echo " $(tput setaf 1)CMake exited with code $cmake_code, see full build log at $log, reproduced below:$(tput sgr0)"
@@ -76,38 +86,40 @@ run_build() {
}
###
#KOKKOS_SRC="/home/tom/Downloads/kokkos-3.3.00"
#RAJA_SRC="/home/tom/Downloads/RAJA-v0.13.0"
#
#GCC_CXX="/usr/bin/g++"
#CLANG_CXX="/usr/bin/clang++"
#
#NVSDK="/home/tom/Downloads/nvhpc_2021_212_Linux_x86_64_cuda_11.2/install_components/Linux_x86_64/21.2/"
#NVHPC_NVCXX="$NVSDK/compilers/bin/nvc++"
#NVHPC_NVCC="$NVSDK/cuda/11.2/bin/nvcc"
#NVHPC_CUDA_DIR="$NVSDK/cuda/11.2"
#"$NVSDK/compilers/bin/makelocalrc" "$NVSDK/compilers/bin/" -x
#
#AOCC_CXX="/opt/AMD/aocc-compiler-2.3.0/bin/clang++"
#AOMP_CXX="/usr/lib/aomp/bin/clang++"
#OCL_LIB="/home/tom/Downloads/oclcpuexp-2020.11.11.0.04_rel/x64/libOpenCL.so"
#
## AMD needs this rocm_path thing exported...
#export ROCM_PATH="/opt/rocm-4.0.0"
#HIP_CXX="/opt/rocm-4.0.0/bin/hipcc"
#COMPUTECPP_DIR="/home/tom/Desktop/computecpp_archive/ComputeCpp-CE-2.3.0-x86_64-linux-gnu"
#DPCPP_DIR="/home/tom/Downloads/dpcpp_compiler"
#HIPSYCL_DIR="/opt/hipsycl/cff515c/"
#
#ICPX_CXX="/opt/intel/oneapi/compiler/2021.1.2/linux/bin/icpx"
#ICPC_CXX="/opt/intel/oneapi/compiler/2021.1.2/linux/bin/intel64/icpc"
#
#GCC_STD_PAR_LIB="tbb"
#CLANG_STD_PAR_LIB="tbb"
#GCC_OMP_OFFLOAD_AMD=false
#GCC_OMP_OFFLOAD_NVIDIA=true
#CLANG_OMP_OFFLOAD_AMD=false
#CLANG_OMP_OFFLOAD_NVIDIA=false
# KOKKOS_SRC="/home/tom/Downloads/kokkos-3.3.00"
# RAJA_SRC="/home/tom/Downloads/RAJA-v0.13.0"
# GCC_CXX="/usr/bin/g++"
# CLANG_CXX="/usr/bin/clang++"
# NVSDK="/home/tom/Downloads/nvhpc_2021_212_Linux_x86_64_cuda_11.2/install_components/Linux_x86_64/21.2/"
# NVHPC_NVCXX="$NVSDK/compilers/bin/nvc++"
# NVHPC_NVCC="$NVSDK/cuda/11.2/bin/nvcc"
# NVHPC_CUDA_DIR="$NVSDK/cuda/11.2"
# "$NVSDK/compilers/bin/makelocalrc" "$NVSDK/compilers/bin/" -x
# AOCC_CXX="/opt/AMD/aocc-compiler-2.3.0/bin/clang++"
# AOMP_CXX="/usr/lib/aomp/bin/clang++"
# OCL_LIB="/home/tom/Downloads/oclcpuexp-2020.11.11.0.04_rel/x64/libOpenCL.so"
# # AMD needs this rocm_path thing exported...
# export ROCM_PATH="/opt/rocm-4.0.0"
# HIP_CXX="/opt/rocm-4.0.0/bin/hipcc"
# COMPUTECPP_DIR="/home/tom/Desktop/computecpp_archive/ComputeCpp-CE-2.3.0-x86_64-linux-gnu"
# DPCPP_DIR="/home/tom/Downloads/dpcpp_compiler"
# HIPSYCL_DIR="/opt/hipsycl/cff515c/"
# ICPX_CXX="/opt/intel/oneapi/compiler/2021.1.2/linux/bin/icpx"
# ICPC_CXX="/opt/intel/oneapi/compiler/2021.1.2/linux/bin/intel64/icpc"
# TBB_LIB="/home/tom/Downloads/oneapi-tbb-2021.1.1/"
# GCC_STD_PAR_LIB="tbb"
# CLANG_STD_PAR_LIB="tbb"
# GCC_OMP_OFFLOAD_AMD=false
# GCC_OMP_OFFLOAD_NVIDIA=true
# CLANG_OMP_OFFLOAD_AMD=false
# CLANG_OMP_OFFLOAD_NVIDIA=false
###
AMD_ARCH="gfx_903"
@@ -129,6 +141,9 @@ build_gcc() {
run_build $name "${GCC_CXX:?}" STD "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}"
run_build $name "${GCC_CXX:?}" STD20 "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}"
run_build $name "${GCC_CXX:?}" TBB "$cxx -DONE_TBB_DIR=$TBB_LIB"
run_build $name "${GCC_CXX:?}" TBB "$cxx" # build TBB again with the system TBB
if [ "${GCC_OMP_OFFLOAD_AMD:-false}" != "false" ]; then
run_build "amd_$name" "${GCC_CXX:?}" ACC "$cxx -DCXX_EXTRA_FLAGS=-foffload=amdgcn-amdhsa"
run_build "amd_$name" "${GCC_CXX:?}" OMP "$cxx -DOFFLOAD=AMD:$AMD_ARCH"
@@ -146,11 +161,15 @@ build_gcc() {
run_build "cuda_$name" "${GCC_CXX:?}" KOKKOS "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON"
run_build $name "${GCC_CXX:?}" OCL "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}"
run_build $name "${GCC_CXX:?}" RAJA "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?}"
run_build "cuda_$name" "${GCC_CXX:?}" RAJA "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?} \
-DENABLE_CUDA=ON \
-DTARGET=NVIDIA \
-DCUDA_TOOLKIT_ROOT_DIR=${NVHPC_CUDA_DIR:?} \
-DCUDA_ARCH=$NV_ARCH"
# FIXME fails due to https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100102
# FIXME we also got https://github.com/NVIDIA/nccl/issues/494
# run_build "cuda_$name" "${GCC_CXX:?}" RAJA "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?} \
# -DENABLE_CUDA=ON \
# -DTARGET=NVIDIA \
# -DCUDA_TOOLKIT_ROOT_DIR=${NVHPC_CUDA_DIR:?} \
# -DCUDA_ARCH=$NV_ARCH"
}
@@ -174,6 +193,10 @@ build_clang() {
run_build $name "${CLANG_CXX:?}" OCL "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}"
run_build $name "${CLANG_CXX:?}" STD "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}"
# run_build $name "${LANG_CXX:?}" STD20 "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" # not yet supported
run_build $name "${CLANG_CXX:?}" TBB "$cxx -DONE_TBB_DIR=$TBB_LIB"
run_build $name "${CLANG_CXX:?}" TBB "$cxx" # build TBB again with the system TBB
run_build $name "${CLANG_CXX:?}" RAJA "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?}"
# no clang w/ RAJA+CUDA because it needs nvcc, which needs gcc
}

main.cpp

@@ -25,6 +25,8 @@
#include "STDStream.h"
#elif defined(STD20)
#include "STD20Stream.hpp"
#elif defined(TBB)
#include "TBBStream.hpp"
#elif defined(HIP)
#include "HIPStream.h"
#elif defined(HC)
@@ -266,6 +268,10 @@ void run()
// Use the C++20 implementation
stream = new STD20Stream<T>(ARRAY_SIZE, deviceIndex);
#elif defined(TBB)
// Use the TBB implementation
stream = new TBBStream<T>(ARRAY_SIZE, deviceIndex);
#elif defined(ACC)
// Use the OpenACC implementation
stream = new ACCStream<T>(ARRAY_SIZE, deviceIndex);