From 2eca3974e64c2517a6c35a8161d5574677dafc77 Mon Sep 17 00:00:00 2001
From: Tom Lin <tom91136@gmail.com>
Date: Wed, 21 Apr 2021 16:28:12 +0100
Subject: [PATCH 01/14] Disable CI for RAJA on gcc-10+CUDA due to ICE; update
 changelog to include RAJA 0.13.x

---
 CHANGELOG.md       |  1 +
 ci-test-compile.sh | 14 +++++++++-----
 2 files changed, 10 insertions(+), 5 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 19c2a6d..29702d7 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -29,6 +29,7 @@ All notable changes to this project will be documented in this file.
 - Unified run function in driver code to reduce code duplication, output should be uneffected.
 - Normalise sum result by expected value to help false negative errors.
 - HC version deprecated and moved to a legacy directory.
+- Update RAJA to v0.13.0 (w/ code changes as this is a source incompatible update).
 
 ### Removed
 - Pre-building of kernels in SYCL version to ensure compatibility with SYCL 1.2.1.
diff --git a/ci-test-compile.sh b/ci-test-compile.sh
index 46046c4..696f5cd 100755
--- a/ci-test-compile.sh
+++ b/ci-test-compile.sh
@@ -146,11 +146,15 @@ build_gcc() {
   run_build "cuda_$name" "${GCC_CXX:?}" KOKKOS "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON"
   run_build $name "${GCC_CXX:?}" OCL "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}"
   run_build $name "${GCC_CXX:?}" RAJA "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?}"
-  run_build "cuda_$name" "${GCC_CXX:?}" RAJA "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?} \
-  -DENABLE_CUDA=ON \
-  -DTARGET=NVIDIA \
-  -DCUDA_TOOLKIT_ROOT_DIR=${NVHPC_CUDA_DIR:?} \
-  -DCUDA_ARCH=$NV_ARCH"
+
+#  FIXME fails due to https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100102
+#  FIXME we also got https://github.com/NVIDIA/nccl/issues/494
+
+#  run_build "cuda_$name" "${GCC_CXX:?}" RAJA "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?} \
+#  -DENABLE_CUDA=ON \
+#  -DTARGET=NVIDIA \
+#  -DCUDA_TOOLKIT_ROOT_DIR=${NVHPC_CUDA_DIR:?} \
+#  -DCUDA_ARCH=$NV_ARCH"
 
 }
 

From 75a4394830d209614c3026ed4b20efd2bf6baffa Mon Sep 17 00:00:00 2001
From: Tobias Burnus <tobias@codesourcery.com>
Date: Mon, 19 Apr 2021 18:55:35 +0200
Subject: [PATCH 02/14] Include <cstdlib> for aligned_alloc

Silence "error: there are no arguments to 'aligned_alloc' that depend
on a template parameter, so a declaration of 'aligned_alloc' must be
available"

* OMPStream.cpp: #include <cstdlib>.
* RAJAStream.cpp: Likewise.
---
 OMPStream.cpp  | 1 +
 RAJAStream.cpp | 1 +
 2 files changed, 2 insertions(+)

diff --git a/OMPStream.cpp b/OMPStream.cpp
index 8063987..0cd8035 100644
--- a/OMPStream.cpp
+++ b/OMPStream.cpp
@@ -5,6 +5,7 @@
 // For full license terms please see the LICENSE file distributed with this
 // source code
 
+#include <cstdlib>  // For aligned_alloc
 #include "OMPStream.h"
 
 #ifndef ALIGNMENT
diff --git a/RAJAStream.cpp b/RAJAStream.cpp
index 44db5ed..d271ea4 100644
--- a/RAJAStream.cpp
+++ b/RAJAStream.cpp
@@ -5,6 +5,7 @@
 // For full license terms please see the LICENSE file distributed with this
 // source code
 
+#include <cstdlib>  // For aligned_alloc
 #include <stdexcept>
 #include "RAJAStream.hpp"
 

From cc16547e4defe8279ea8bf2f96b6f4f36bb61c7a Mon Sep 17 00:00:00 2001
From: Tom Lin <tom91136@gmail.com>
Date: Mon, 10 May 2021 17:50:36 +0100
Subject: [PATCH 03/14] Add install target for CMake

---
 CMakeLists.txt     |  2 ++
 ci-test-compile.sh | 10 ++++++++++
 2 files changed, 12 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 17669a3..d4a11cd 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -188,3 +188,5 @@ target_link_options(${EXE_NAME} PUBLIC ${LINK_FLAGS} ${CXX_EXTRA_LINK_FLAGS})
 if (COMMAND setup_target)
     setup_target(${EXE_NAME})
 endif ()
+
+install (TARGETS ${EXE_NAME} DESTINATION bin)
\ No newline at end of file
diff --git a/ci-test-compile.sh b/ci-test-compile.sh
index 696f5cd..1b5c1bb 100755
--- a/ci-test-compile.sh
+++ b/ci-test-compile.sh
@@ -44,21 +44,26 @@ run_build() {
 
   rm -rf "$build"
   set +e
+  local install_dir="$build/install"
 
   # shellcheck disable=SC2086
   "$CMAKE_BIN" -B"$build" -H. \
     -DCMAKE_BUILD_TYPE=Release \
     -DCMAKE_VERBOSE_MAKEFILE=ON \
+    -DCMAKE_INSTALL_PREFIX="$install_dir" \
     -DMODEL="$model" $flags &>>"$log"
   local model_lower=$(echo "$model" | awk '{print tolower($0)}')
 
   local cmake_code=$?
 
   "$CMAKE_BIN" --build "$build" -j "$(nproc)" &>>"$log"
+  "$CMAKE_BIN" --build "$build" --target install  -j "$(nproc)" &>>"$log"
   local cmake_code=$?
   set -e
 
   local bin="./$build/$model_lower-stream"
+  local installed_bin="./$install_dir/bin/$model_lower-stream"
+
   echo "Checking for final executable: $bin"
   if [[ -f "$bin" ]]; then
     echo "$(tput setaf 2)[PASS!]($model->$build)$(tput sgr0): -DMODEL=$model $flags"
@@ -66,6 +71,11 @@ run_build() {
     cat "$log" | sed '/^--/d' | grep -i "/bin/nvcc" | sed 's/^/    /'
     cat "$log" | sed '/^--/d' | grep -i "$grep_kw" | sed 's/^/    /'
     cat "$log" | sed '/^--/d' | grep -i "warning" | sed "s/.*/    $(tput setaf 3)&$(tput sgr0)/"
+    if [[ ! -f "$installed_bin" ]]; then
+      echo "$(tput setaf 1)[ERR!] looking for $installed_bin from --target install but it's not there!$(tput sgr0)"
+      cat "$log"
+      exit 1
+    fi
   else
     echo "$(tput setaf 1)[FAIL!]($model->$build)$(tput sgr0): -DMODEL=$model $flags"
     echo "      $(tput setaf 1)CMake exited with code $cmake_code, see full build log at $log, reproduced below:$(tput sgr0)"

From e20aecd845fd8736e2a7905f17483c104dd9651e Mon Sep 17 00:00:00 2001
From: Tom Deakin <tom.deakin@bristol.ac.uk>
Date: Mon, 17 May 2021 15:25:43 +0100
Subject: [PATCH 04/14] [SYCL 1.2.1] Add check for FP64 support

Fixes #98
---
 SYCLStream.cpp | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/SYCLStream.cpp b/SYCLStream.cpp
index 49ad3ac..00c043f 100644
--- a/SYCLStream.cpp
+++ b/SYCLStream.cpp
@@ -28,6 +28,14 @@ SYCLStream<T>::SYCLStream(const int ARRAY_SIZE, const int device_index)
     throw std::runtime_error("Invalid device index");
   device dev = devices[device_index];
 
+  // Check device can support FP64 if needed
+  if (sizeof(T) == sizeof(double))
+  {
+    if (dev.get_info<info::device::double_fp_config>().size() == 0) {
+      throw std::runtime_error("Device does not support double precision, please use --float");
+    }
+  }
+
   // Determine sensible dot kernel NDRange configuration
   if (dev.is_cpu())
   {

From 6581ee63b809f446a0e4ad3ec479e9b76e0c6591 Mon Sep 17 00:00:00 2001
From: Tom Deakin <tom.deakin@bristol.ac.uk>
Date: Mon, 17 May 2021 15:33:54 +0100
Subject: [PATCH 05/14] Update changelog

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 29702d7..3dbabed 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -13,6 +13,7 @@ All notable changes to this project will be documented in this file.
 - Support for CUDA Managed Memory and Page Fault memory.
 - Added nstream kernel from PRK with associate command line option.
 - CMake build system added for all models.
+- SYCL device check for FP64 support.
 
 ### Changed
 - Default branch renamed from `master` to `main`.

From 45ebd09ef27c4c8da2cc45c07e01070e53e1e07b Mon Sep 17 00:00:00 2001
From: Tom Lin <tom91136@gmail.com>
Date: Mon, 17 May 2021 20:00:00 +0100
Subject: [PATCH 06/14] Don't use hardcoded rocm path

---
 ci-prepare-bionic.sh | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/ci-prepare-bionic.sh b/ci-prepare-bionic.sh
index d8ae312..290e87b 100755
--- a/ci-prepare-bionic.sh
+++ b/ci-prepare-bionic.sh
@@ -240,8 +240,7 @@ setup_rocm() {
   echo 'deb [arch=amd64] https://repo.radeon.com/rocm/apt/debian/ xenial main' | sudo tee /etc/apt/sources.list.d/rocm.list
   sudo apt-get update -qq
   sudo apt-get install -y -qq rocm-dev
-  # AMD needs this rocm_path thing exported...
-  export_var ROCM_PATH "/opt/rocm-4.1.0"
+  export_var ROCM_PATH "/opt/rocm"
   export_var HIP_CXX "$ROCM_PATH/bin/hipcc"
   verify_bin_exists "$HIP_CXX"
   "$HIP_CXX" --version

From b772d00fe407bfb0499b05d79513bc7986dbff3a Mon Sep 17 00:00:00 2001
From: Tom Deakin <thomasdeakin@gmail.com>
Date: Tue, 18 May 2021 16:44:06 +0100
Subject: [PATCH 07/14] Revert "Add check for FP64 support"

---
 CHANGELOG.md   | 1 -
 SYCLStream.cpp | 8 --------
 2 files changed, 9 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 3dbabed..29702d7 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -13,7 +13,6 @@ All notable changes to this project will be documented in this file.
 - Support for CUDA Managed Memory and Page Fault memory.
 - Added nstream kernel from PRK with associate command line option.
 - CMake build system added for all models.
-- SYCL device check for FP64 support.
 
 ### Changed
 - Default branch renamed from `master` to `main`.
diff --git a/SYCLStream.cpp b/SYCLStream.cpp
index 00c043f..49ad3ac 100644
--- a/SYCLStream.cpp
+++ b/SYCLStream.cpp
@@ -28,14 +28,6 @@ SYCLStream<T>::SYCLStream(const int ARRAY_SIZE, const int device_index)
     throw std::runtime_error("Invalid device index");
   device dev = devices[device_index];
 
-  // Check device can support FP64 if needed
-  if (sizeof(T) == sizeof(double))
-  {
-    if (dev.get_info<info::device::double_fp_config>().size() == 0) {
-      throw std::runtime_error("Device does not support double precision, please use --float");
-    }
-  }
-
   // Determine sensible dot kernel NDRange configuration
   if (dev.is_cpu())
   {

From 82dedad6766d12909edce2437cd20f17ca47d1f8 Mon Sep 17 00:00:00 2001
From: Tom Deakin <tom.deakin@bristol.ac.uk>
Date: Mon, 17 May 2021 15:25:43 +0100
Subject: [PATCH 08/14] [SYCL 1.2.1] Add check for FP64 support

Fixes #98
---
 SYCLStream.cpp | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/SYCLStream.cpp b/SYCLStream.cpp
index 49ad3ac..00c043f 100644
--- a/SYCLStream.cpp
+++ b/SYCLStream.cpp
@@ -28,6 +28,14 @@ SYCLStream<T>::SYCLStream(const int ARRAY_SIZE, const int device_index)
     throw std::runtime_error("Invalid device index");
   device dev = devices[device_index];
 
+  // Check device can support FP64 if needed
+  if (sizeof(T) == sizeof(double))
+  {
+    if (dev.get_info<info::device::double_fp_config>().size() == 0) {
+      throw std::runtime_error("Device does not support double precision, please use --float");
+    }
+  }
+
   // Determine sensible dot kernel NDRange configuration
   if (dev.is_cpu())
   {

From 2ab68ab39e8604f67f5f98ec89d125654fabd0b4 Mon Sep 17 00:00:00 2001
From: Tom Deakin <tom.deakin@bristol.ac.uk>
Date: Mon, 17 May 2021 15:33:54 +0100
Subject: [PATCH 09/14] Update changelog

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 29702d7..3dbabed 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -13,6 +13,7 @@ All notable changes to this project will be documented in this file.
 - Support for CUDA Managed Memory and Page Fault memory.
 - Added nstream kernel from PRK with associate command line option.
 - CMake build system added for all models.
+- SYCL device check for FP64 support.
 
 ### Changed
 - Default branch renamed from `master` to `main`.

From 742f0629be70c143e05df4ecf73bcf71bba150a7 Mon Sep 17 00:00:00 2001
From: Tom Lin <tom91136@gmail.com>
Date: Thu, 27 May 2021 09:28:40 +0100
Subject: [PATCH 10/14] Initial TBB implementation

---
 .gitignore           |   1 +
 CMakeLists.txt       |   1 +
 README.md            |   3 +-
 TBB.cmake            |  10 +++
 TBB.make             |  28 ++++++++
 TBBStream.cpp        | 157 +++++++++++++++++++++++++++++++++++++++++++
 TBBStream.hpp        |  56 +++++++++++++++
 ci-prepare-bionic.sh |  16 +++++
 ci-test-compile.sh   |   5 +-
 main.cpp             |   6 ++
 10 files changed, 281 insertions(+), 2 deletions(-)
 create mode 100644 TBB.cmake
 create mode 100644 TBB.make
 create mode 100644 TBBStream.cpp
 create mode 100644 TBBStream.hpp

diff --git a/.gitignore b/.gitignore
index c3ea1da..31af301 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,6 +8,7 @@ kokkos-stream
 std-stream
 sycl-stream
 hip-stream
+tbb-stream
 
 *.o
 *.bc
diff --git a/CMakeLists.txt b/CMakeLists.txt
index d4a11cd..797a9c0 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -112,6 +112,7 @@ register_model(SYCL SYCL SYCLStream.cpp)
 register_model(ACC ACC ACCStream.cpp)
 # defining RAJA collides with the RAJA namespace so USE_RAJA
 register_model(RAJA USE_RAJA RAJAStream.cpp)
+register_model(TBB TBB TBBStream.cpp)
 
 
 set(USAGE ON CACHE BOOL "Whether to print all custom flags for the selected model")
diff --git a/README.md b/README.md
index 8ca7398..68908a3 100644
--- a/README.md
+++ b/README.md
@@ -19,6 +19,7 @@ Currently implemented are:
   - Kokkos
   - RAJA
   - SYCL
+  - TBB
 
 This code was previously called GPU-STREAM.
 
@@ -90,7 +91,7 @@ For example:
         Use this for linking extra libraries (e.g `-lmylib`, or simply `mylib`) 
 -- CXX_EXTRA_LINKER_FLAGS: 
         Append to linker flags (i.e GCC's `-Wl` or equivalent)
--- Available models:  OMP;OCL;STD;STD20;HIP;CUDA;KOKKOS;SYCL;ACC;RAJA
+-- Available models:  OMP;OCL;STD;STD20;HIP;CUDA;KOKKOS;SYCL;ACC;RAJA;TBB
 -- Selected model  :  OCL
 -- Supported flags:
 
diff --git a/TBB.cmake b/TBB.cmake
new file mode 100644
index 0000000..a92ea82
--- /dev/null
+++ b/TBB.cmake
@@ -0,0 +1,10 @@
+
+register_flag_required(TBB_DIR
+        "Absolute path to oneTBB distribution, the directory should contains at least `include/` and `lib/")
+
+macro(setup)
+    set(CMAKE_CXX_STANDARD 14) # we use auto in lambda parameters for the different partitioners
+    # see https://github.com/oneapi-src/oneTBB/blob/master/cmake/README.md#tbbconfig---integration-of-binary-packages
+    find_package(TBB REQUIRED)
+    register_link_library(TBB::tbb)
+endmacro()
diff --git a/TBB.make b/TBB.make
new file mode 100644
index 0000000..e3b5c86
--- /dev/null
+++ b/TBB.make
@@ -0,0 +1,28 @@
+
+ifndef COMPILER
+define compiler_help
+Set COMPILER to change flags (defaulting to GNU).
+Available compilers are:
+  GNU
+
+endef
+$(info $(compiler_help))
+COMPILER=GNU
+endif
+
+TBB_LIB=
+
+COMPILER_GNU = g++
+CXX = $(COMPILER_$(COMPILER))
+
+FLAGS_GNU = -O3 -std=c++14 -march=native
+CXXFLAGS = $(FLAGS_$(COMPILER))
+
+
+tbb-stream: main.cpp TBBStream.cpp
+	$(CXX) -DTBB $(CXXFLAGS) $^ $(EXTRA_FLAGS) -I$(TBB_DIR)/include -Wl,-rpath,$(TBB_DIR)/lib/intel64/gcc4.8 $(TBB_DIR)/lib/intel64/gcc4.8/libtbb.so -o $@
+
+.PHONY: clean
+clean:
+	rm -f tbb-stream
+
diff --git a/TBBStream.cpp b/TBBStream.cpp
new file mode 100644
index 0000000..4201796
--- /dev/null
+++ b/TBBStream.cpp
@@ -0,0 +1,157 @@
+// Copyright (c) 2020 Tom Deakin
+// University of Bristol HPC
+//
+// For full license terms please see the LICENSE file distributed with this
+// source code
+
+#include "TBBStream.hpp"
+#include "oneapi/tbb.h"
+
+template <class T>
+TBBStream<T>::TBBStream(const int ARRAY_SIZE, int device)
+ : partitioner(static_cast<Partitioner>(device)), range(0, ARRAY_SIZE), a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE)
+{
+  std::cout << "Using TBB partitioner: " << getDeviceName(device) << std::endl;
+}
+
+template <class T>
+template <typename U, typename F>
+U TBBStream<T>::with_partitioner(const F &f) 
+{
+  switch(partitioner){
+    case Partitioner::Auto:      return f(tbb::auto_partitioner{});
+    case Partitioner::Affinity:  { tbb::affinity_partitioner p; return f(p); }  //  parallel_* doesn't take const affinity_partitioner here
+    case Partitioner::Static:    return f(tbb::static_partitioner{});
+    case Partitioner::Simple:    return f(tbb::simple_partitioner{});
+    default:                     throw std::runtime_error("Error asking for name for non-existant device");
+  }
+}
+
+template <class T>
+template <typename F>
+void TBBStream<T>::parallel_for(const F &f) 
+{
+  // using size_t as per the range type (also used in the official documentation)
+  with_partitioner<std::nullptr_t>([&](auto &&p) { 
+    tbb::parallel_for(range, [&](const tbb::blocked_range<size_t>& r) {
+      for (size_t i = r.begin(); i < r.end(); ++i) { 
+        f(i);
+      }
+    }, p);
+    return nullptr; // what we really want here is std::monostate, but we don't want to be C++17 only so nullptr_t it is
+  });
+}
+
+template <class T>
+template <typename F, typename Op>
+T TBBStream<T>::parallel_reduce(T init, const Op &op, const F &f) 
+{
+  return with_partitioner<T>([&](auto &&p) {
+    return tbb::parallel_reduce(range, init, [&](const tbb::blocked_range<size_t>& r, T acc) {
+      for (size_t i = r.begin(); i < r.end(); ++i) { 
+        acc = op(acc, f(i));
+      }
+      return acc;
+    }, op, p);
+  });
+}
+
+template <class T>
+void TBBStream<T>::init_arrays(T initA, T initB, T initC)
+{
+
+  parallel_for([&](size_t i){ 
+    a[i] = initA;
+    b[i] = initB;
+    c[i] = initC;  
+  });
+
+}
+
+template <class T>
+void TBBStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
+{
+  // Element-wise copy.
+  h_a = a;
+  h_b = b;
+  h_c = c;
+}
+
+template <class T>
+void TBBStream<T>::copy()
+{
+  parallel_for([&](size_t i){ c[i] = a[i]; });
+}
+
+template <class T>
+void TBBStream<T>::mul()
+{
+  const T scalar = startScalar;
+  
+  parallel_for([&](size_t i){ b[i] = scalar * c[i]; });
+  
+}
+
+template <class T>
+void TBBStream<T>::add()
+{
+
+  parallel_for([&](size_t i){ c[i] = a[i] + b[i]; });
+
+}
+
+template <class T>
+void TBBStream<T>::triad()
+{
+  const T scalar = startScalar;
+
+  parallel_for([&](size_t i){ a[i] = b[i] + scalar * c[i]; });
+
+}
+
+template <class T>
+void TBBStream<T>::nstream()
+{
+  const T scalar = startScalar;
+
+  parallel_for([&](size_t i){ a[i] += b[i] + scalar * c[i]; });
+
+}
+
+template <class T>
+T TBBStream<T>::dot()
+{
+  // sum += a[i] * b[i];
+  return parallel_reduce(0.0, std::plus<T>(), [&](size_t i) { return a[i] * b[i]; });
+}
+
+void listDevices(void)
+{
+  std::cout 
+    << "[" << static_cast<int>(Partitioner::Auto) << "] auto partitioner\n" 
+    << "[" << static_cast<int>(Partitioner::Affinity) << "] affinity partitioner\n" 
+    << "[" << static_cast<int>(Partitioner::Static) << "] static partitioner\n" 
+    << "[" << static_cast<int>(Partitioner::Simple) << "] simple partitioner\n" 
+    << "See https://spec.oneapi.com/versions/latest/elements/oneTBB/source/algorithms.html#partitioners for more details" 
+    << std::endl;
+}
+
+std::string getDeviceName(const int device)
+{
+  switch(static_cast<Partitioner>(device)){
+    case Partitioner::Auto:      return "auto_partitioner";
+    case Partitioner::Affinity:  return "affinity_partitioner";
+    case Partitioner::Static:    return "static_partitioner";
+    case Partitioner::Simple:    return "simple_partitioner";
+    default:                     throw std::runtime_error("Error asking for name for non-existant device");
+  }
+}
+
+std::string getDeviceDriver(const int)
+{
+  return std::string("Device driver unavailable");
+}
+
+template class TBBStream<float>;
+template class TBBStream<double>;
+
diff --git a/TBBStream.hpp b/TBBStream.hpp
new file mode 100644
index 0000000..dc87ea6
--- /dev/null
+++ b/TBBStream.hpp
@@ -0,0 +1,56 @@
+// Copyright (c) 2020 Tom Deakin
+// University of Bristol HPC
+//
+// For full license terms please see the LICENSE file distributed with this
+// source code
+
+#pragma once
+
+#include <iostream>
+#include <vector>
+#include "oneapi/tbb.h"
+#include "Stream.h"
+
+#define IMPLEMENTATION_STRING "TBB"
+
+enum class Partitioner : int { Auto = 0, Affinity, Static, Simple};
+
+template <class T>
+class TBBStream : public Stream<T>
+{
+  protected:
+  
+
+    Partitioner partitioner;
+    tbb::blocked_range<size_t> range;
+    // Device side pointers
+    std::vector<T> a;
+    std::vector<T> b;
+    std::vector<T> c;
+    
+
+    template < typename U, typename F>
+    U with_partitioner(const F &f);
+ 
+    template <typename F>
+    void parallel_for(const F &f);
+
+    template <typename F, typename Op>
+    T parallel_reduce(T init, const Op &op, const F &f);
+
+  public:
+    TBBStream(const int, int);
+    ~TBBStream() = default;
+
+    virtual void copy() override;
+    virtual void add() override;
+    virtual void mul() override;
+    virtual void triad() override;
+    virtual void nstream() override;
+    virtual T dot() override;
+
+    virtual void init_arrays(T initA, T initB, T initC) override;
+    virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
+
+};
+
diff --git a/ci-prepare-bionic.sh b/ci-prepare-bionic.sh
index 290e87b..fb69c05 100755
--- a/ci-prepare-bionic.sh
+++ b/ci-prepare-bionic.sh
@@ -208,6 +208,20 @@ setup_raja() {
   check_size
 }
 
+setup_tbb() {
+  echo "Preparing TBB"
+  local tbb_ver="2021.2.0"
+  local tarball="oneapi-tbb-$tbb_ver-lin.tgz"
+
+  local url="https://github.com/oneapi-src/oneTBB/releases/download/v$tbb_ver/oneapi-tbb-$tbb_ver-lin.tgz"
+  # local url="http://localhost:8000/oneapi-tbb-$tbb_ver-lin.tgz"
+
+  get_and_untar "$tarball" "$url"
+  export_var TBB_LIB "$PWD/oneapi-tbb-$tbb_ver"
+  verify_dir_exists "$TBB_LIB"
+  check_size
+}
+
 setup_clang_gcc() {
 
   echo "deb http://archive.ubuntu.com/ubuntu focal main universe" | sudo tee -a /etc/apt/sources.list
@@ -354,6 +368,7 @@ if [ "$PARALLEL" = true ]; then
   setup_dpcpp &
   setup_kokkos &
   setup_raja &
+  setup_tbb &
   wait
 else
   setup_cmake
@@ -364,6 +379,7 @@ else
   setup_dpcpp
   setup_kokkos
   setup_raja
+  setup_tbb
   # these need apt
   setup_clang_gcc
   setup_rocm
diff --git a/ci-test-compile.sh b/ci-test-compile.sh
index 1b5c1bb..24a7091 100755
--- a/ci-test-compile.sh
+++ b/ci-test-compile.sh
@@ -112,6 +112,8 @@ run_build() {
 #ICPX_CXX="/opt/intel/oneapi/compiler/2021.1.2/linux/bin/icpx"
 #ICPC_CXX="/opt/intel/oneapi/compiler/2021.1.2/linux/bin/intel64/icpc"
 #
+#TBB_LIB="/home/tom/Downloads/oneapi-tbb-2021.1.1/"
+#
 #GCC_STD_PAR_LIB="tbb"
 #CLANG_STD_PAR_LIB="tbb"
 #GCC_OMP_OFFLOAD_AMD=false
@@ -138,7 +140,7 @@ build_gcc() {
   # some distributions like Ubuntu bionic implements std par with TBB, so conditionally link it here
   run_build $name "${GCC_CXX:?}" STD "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}"
   run_build $name "${GCC_CXX:?}" STD20 "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}"
-
+  run_build $name "${GCC_CXX:?}" TBB "$cxx -DTBB_DIR=$TBB_LIB"
   if [ "${GCC_OMP_OFFLOAD_AMD:-false}" != "false" ]; then
     run_build "amd_$name" "${GCC_CXX:?}" ACC "$cxx -DCXX_EXTRA_FLAGS=-foffload=amdgcn-amdhsa"
     run_build "amd_$name" "${GCC_CXX:?}" OMP "$cxx -DOFFLOAD=AMD:$AMD_ARCH"
@@ -188,6 +190,7 @@ build_clang() {
   run_build $name "${CLANG_CXX:?}" OCL "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}"
   run_build $name "${CLANG_CXX:?}" STD "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}"
   # run_build $name "${LANG_CXX:?}" STD20 "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" # not yet supported
+  run_build $name "${CLANG_CXX:?}" TBB "$cxx -DTBB_DIR=$TBB_LIB"
   run_build $name "${CLANG_CXX:?}" RAJA "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?}"
   # no clang /w RAJA+cuda because it needs nvcc which needs gcc
 }
diff --git a/main.cpp b/main.cpp
index e78d7a1..de301ce 100644
--- a/main.cpp
+++ b/main.cpp
@@ -25,6 +25,8 @@
 #include "STDStream.h"
 #elif defined(STD20)
 #include "STD20Stream.hpp"
+#elif defined(TBB)
+#include "TBBStream.hpp"
 #elif defined(HIP)
 #include "HIPStream.h"
 #elif defined(HC)
@@ -266,6 +268,10 @@ void run()
   // Use the C++20 implementation
   stream = new STD20Stream<T>(ARRAY_SIZE, deviceIndex);
 
+#elif defined(TBB)
+  // Use the TBB implementation
+  stream = new TBBStream<T>(ARRAY_SIZE, deviceIndex);
+
 #elif defined(ACC)
   // Use the OpenACC implementation
   stream = new ACCStream<T>(ARRAY_SIZE, deviceIndex);

From 7a130a59bc3c4b827cb1f5f95add54b0501110bd Mon Sep 17 00:00:00 2001
From: Tom Lin <tom91136@gmail.com>
Date: Thu, 27 May 2021 10:23:06 +0100
Subject: [PATCH 11/14] Don't tie implementation to oneTBB specific headers; fix
 wrong TBB_ROOT detection

---
 TBB.cmake          | 12 +++++--
 TBBStream.cpp      |  2 +-
 ci-test-compile.sh | 78 +++++++++++++++++++++++++---------------------
 3 files changed, 53 insertions(+), 39 deletions(-)

diff --git a/TBB.cmake b/TBB.cmake
index a92ea82..99e31f7 100644
--- a/TBB.cmake
+++ b/TBB.cmake
@@ -1,8 +1,16 @@
 
-register_flag_required(TBB_DIR
-        "Absolute path to oneTBB distribution, the directory should contains at least `include/` and `lib/")
+register_flag_optional(ONE_TBB_DIR
+        "Absolute path to oneTBB (with header `onetbb/tbb.h`) distribution, the directory should contain at least `include/` and `lib/.
+         If unspecified, the system TBB (with header `tbb/tbb.h`) will be used via CMake's find_package(TBB)." 
+        "")
 
 macro(setup)
+    if(ONE_TBB_DIR)
+        set(TBB_ROOT "${ONE_TBB_DIR}") # see https://github.com/Kitware/VTK/blob/0a31a9a3c1531ae238ac96a372fec4be42282863/CMake/FindTBB.cmake#L34
+        # docs on Intel's website refers to TBB_DIR which hasn't been correct for 6 years
+    endif()
+    
+
     set(CMAKE_CXX_STANDARD 14) # we use auto in lambda parameters for the different partitioners
     # see https://github.com/oneapi-src/oneTBB/blob/master/cmake/README.md#tbbconfig---integration-of-binary-packages
     find_package(TBB REQUIRED)
diff --git a/TBBStream.cpp b/TBBStream.cpp
index 4201796..1d09927 100644
--- a/TBBStream.cpp
+++ b/TBBStream.cpp
@@ -5,7 +5,7 @@
 // source code
 
 #include "TBBStream.hpp"
-#include "oneapi/tbb.h"
+#include "tbb/tbb.h"
 
 template <class T>
 TBBStream<T>::TBBStream(const int ARRAY_SIZE, int device)
diff --git a/ci-test-compile.sh b/ci-test-compile.sh
index 24a7091..456f836 100755
--- a/ci-test-compile.sh
+++ b/ci-test-compile.sh
@@ -86,40 +86,40 @@ run_build() {
 }
 
 ###
-#KOKKOS_SRC="/home/tom/Downloads/kokkos-3.3.00"
-#RAJA_SRC="/home/tom/Downloads/RAJA-v0.13.0"
-#
-#GCC_CXX="/usr/bin/g++"
-#CLANG_CXX="/usr/bin/clang++"
-#
-#NVSDK="/home/tom/Downloads/nvhpc_2021_212_Linux_x86_64_cuda_11.2/install_components/Linux_x86_64/21.2/"
-#NVHPC_NVCXX="$NVSDK/compilers/bin/nvc++"
-#NVHPC_NVCC="$NVSDK/cuda/11.2/bin/nvcc"
-#NVHPC_CUDA_DIR="$NVSDK/cuda/11.2"
-#"$NVSDK/compilers/bin/makelocalrc" "$NVSDK/compilers/bin/" -x
-#
-#AOCC_CXX="/opt/AMD/aocc-compiler-2.3.0/bin/clang++"
-#AOMP_CXX="/usr/lib/aomp/bin/clang++"
-#OCL_LIB="/home/tom/Downloads/oclcpuexp-2020.11.11.0.04_rel/x64/libOpenCL.so"
-#
-## AMD needs this rocm_path thing exported...
-#export ROCM_PATH="/opt/rocm-4.0.0"
-#HIP_CXX="/opt/rocm-4.0.0/bin/hipcc"
-#COMPUTECPP_DIR="/home/tom/Desktop/computecpp_archive/ComputeCpp-CE-2.3.0-x86_64-linux-gnu"
-#DPCPP_DIR="/home/tom/Downloads/dpcpp_compiler"
-#HIPSYCL_DIR="/opt/hipsycl/cff515c/"
-#
-#ICPX_CXX="/opt/intel/oneapi/compiler/2021.1.2/linux/bin/icpx"
-#ICPC_CXX="/opt/intel/oneapi/compiler/2021.1.2/linux/bin/intel64/icpc"
-#
-#TBB_LIB="/home/tom/Downloads/oneapi-tbb-2021.1.1/"
-#
-#GCC_STD_PAR_LIB="tbb"
-#CLANG_STD_PAR_LIB="tbb"
-#GCC_OMP_OFFLOAD_AMD=false
-#GCC_OMP_OFFLOAD_NVIDIA=true
-#CLANG_OMP_OFFLOAD_AMD=false
-#CLANG_OMP_OFFLOAD_NVIDIA=false
+# KOKKOS_SRC="/home/tom/Downloads/kokkos-3.3.00"
+# RAJA_SRC="/home/tom/Downloads/RAJA-v0.13.0"
+
+# GCC_CXX="/usr/bin/g++"
+# CLANG_CXX="/usr/bin/clang++"
+
+# NVSDK="/home/tom/Downloads/nvhpc_2021_212_Linux_x86_64_cuda_11.2/install_components/Linux_x86_64/21.2/"
+# NVHPC_NVCXX="$NVSDK/compilers/bin/nvc++"
+# NVHPC_NVCC="$NVSDK/cuda/11.2/bin/nvcc"
+# NVHPC_CUDA_DIR="$NVSDK/cuda/11.2"
+# "$NVSDK/compilers/bin/makelocalrc" "$NVSDK/compilers/bin/" -x
+
+# AOCC_CXX="/opt/AMD/aocc-compiler-2.3.0/bin/clang++"
+# AOMP_CXX="/usr/lib/aomp/bin/clang++"
+# OCL_LIB="/home/tom/Downloads/oclcpuexp-2020.11.11.0.04_rel/x64/libOpenCL.so"
+
+# # AMD needs this rocm_path thing exported...
+# export ROCM_PATH="/opt/rocm-4.0.0"
+# HIP_CXX="/opt/rocm-4.0.0/bin/hipcc"
+# COMPUTECPP_DIR="/home/tom/Desktop/computecpp_archive/ComputeCpp-CE-2.3.0-x86_64-linux-gnu"
+# DPCPP_DIR="/home/tom/Downloads/dpcpp_compiler"
+# HIPSYCL_DIR="/opt/hipsycl/cff515c/"
+
+# ICPX_CXX="/opt/intel/oneapi/compiler/2021.1.2/linux/bin/icpx"
+# ICPC_CXX="/opt/intel/oneapi/compiler/2021.1.2/linux/bin/intel64/icpc"
+
+# TBB_LIB="/home/tom/Downloads/oneapi-tbb-2021.1.1/"
+
+# GCC_STD_PAR_LIB="tbb"
+# CLANG_STD_PAR_LIB="tbb"
+# GCC_OMP_OFFLOAD_AMD=false
+# GCC_OMP_OFFLOAD_NVIDIA=true
+# CLANG_OMP_OFFLOAD_AMD=false
+# CLANG_OMP_OFFLOAD_NVIDIA=false
 ###
 
 AMD_ARCH="gfx_903"
@@ -140,7 +140,10 @@ build_gcc() {
   # some distributions like Ubuntu bionic implements std par with TBB, so conditionally link it here
   run_build $name "${GCC_CXX:?}" STD "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}"
   run_build $name "${GCC_CXX:?}" STD20 "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}"
-  run_build $name "${GCC_CXX:?}" TBB "$cxx -DTBB_DIR=$TBB_LIB"
+
+  run_build $name "${GCC_CXX:?}" TBB "$cxx -DONE_TBB_DIR=$TBB_LIB"
+  run_build $name "${GCC_CXX:?}" TBB "$cxx" # build TBB again with the system TBB
+
   if [ "${GCC_OMP_OFFLOAD_AMD:-false}" != "false" ]; then
     run_build "amd_$name" "${GCC_CXX:?}" ACC "$cxx -DCXX_EXTRA_FLAGS=-foffload=amdgcn-amdhsa"
     run_build "amd_$name" "${GCC_CXX:?}" OMP "$cxx -DOFFLOAD=AMD:$AMD_ARCH"
@@ -190,7 +193,10 @@ build_clang() {
   run_build $name "${CLANG_CXX:?}" OCL "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}"
   run_build $name "${CLANG_CXX:?}" STD "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}"
   # run_build $name "${LANG_CXX:?}" STD20 "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" # not yet supported
-  run_build $name "${CLANG_CXX:?}" TBB "$cxx -DTBB_DIR=$TBB_LIB"
+  
+  run_build $name "${CLANG_CXX:?}" TBB "$cxx -DONE_TBB_DIR=$TBB_LIB"
+  run_build $name "${CLANG_CXX:?}" TBB "$cxx" # build TBB again with the system TBB
+
   run_build $name "${CLANG_CXX:?}" RAJA "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?}"
   # no clang /w RAJA+cuda because it needs nvcc which needs gcc
 }

From 0867115d8dcc6d6c9b63b56dc3a124f230f626c5 Mon Sep 17 00:00:00 2001
From: Tom Lin <tom91136@gmail.com>
Date: Thu, 27 May 2021 10:51:45 +0100
Subject: [PATCH 12/14] Remove references to oneapi/tbb.h

---
 TBBStream.cpp | 1 -
 TBBStream.hpp | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/TBBStream.cpp b/TBBStream.cpp
index 1d09927..08b83c8 100644
--- a/TBBStream.cpp
+++ b/TBBStream.cpp
@@ -5,7 +5,6 @@
 // source code
 
 #include "TBBStream.hpp"
-#include "tbb/tbb.h"
 
 template <class T>
 TBBStream<T>::TBBStream(const int ARRAY_SIZE, int device)
diff --git a/TBBStream.hpp b/TBBStream.hpp
index dc87ea6..6ba9741 100644
--- a/TBBStream.hpp
+++ b/TBBStream.hpp
@@ -8,7 +8,7 @@
 
 #include <iostream>
 #include <vector>
-#include "oneapi/tbb.h"
+#include "tbb/tbb.h"
 #include "Stream.h"
 
 #define IMPLEMENTATION_STRING "TBB"

From 0e3727d8f853195cb6bc29307159749fa74d9947 Mon Sep 17 00:00:00 2001
From: Tom Lin <tom91136@gmail.com>
Date: Thu, 3 Jun 2021 13:43:12 +0100
Subject: [PATCH 13/14] Make partitioner a compile option; inline all
 abstractions; add Intel compilers for Make

---
 TBB.cmake     |  15 ++++++-
 TBB.make      |  40 ++++++++++++++---
 TBBStream.cpp | 116 ++++++++++++++++++++------------------------------
 TBBStream.hpp |  32 ++++++++------
 4 files changed, 113 insertions(+), 90 deletions(-)

diff --git a/TBB.cmake b/TBB.cmake
index 99e31f7..e4d6bac 100644
--- a/TBB.cmake
+++ b/TBB.cmake
@@ -4,15 +4,26 @@ register_flag_optional(ONE_TBB_DIR
          If unspecified, the system TBB (with header `tbb/tbb.h`) will be used via CMake's find_package(TBB)." 
         "")
 
+
+register_flag_optional(PARTITIONER
+        "Partitioner specifies how a loop template should partition its work among threads.
+         Possible values are:
+            AUTO     - Optimize range subdivision based on work-stealing events.
+            AFFINITY - Proportional splitting that optimizes for cache affinity.
+            STATIC   - Distribute work uniformly with no additional load balancing.
+            SIMPLE   - Recursively split its range until it cannot be further subdivided.
+            See https://spec.oneapi.com/versions/latest/elements/oneTBB/source/algorithms.html#partitioners for more details."
+        "AUTO")
+
 macro(setup)
     if(ONE_TBB_DIR)
         set(TBB_ROOT "${ONE_TBB_DIR}") # see https://github.com/Kitware/VTK/blob/0a31a9a3c1531ae238ac96a372fec4be42282863/CMake/FindTBB.cmake#L34
-        # docs on Intel's website refers to TBB_DIR which hasn't been correct for 6 years
+        # docs on Intel's website refer to TBB_DIR, which is not correct
     endif()
     
 
-    set(CMAKE_CXX_STANDARD 14) # we use auto in lambda parameters for the different partitioners
     # see https://github.com/oneapi-src/oneTBB/blob/master/cmake/README.md#tbbconfig---integration-of-binary-packages
     find_package(TBB REQUIRED)
     register_link_library(TBB::tbb)
+    register_definitions(PARTITIONER_${PARTITIONER})
 endmacro()
diff --git a/TBB.make b/TBB.make
index e3b5c86..c224a5a 100644
--- a/TBB.make
+++ b/TBB.make
@@ -3,24 +3,52 @@ ifndef COMPILER
 define compiler_help
 Set COMPILER to change flags (defaulting to GNU).
 Available compilers are:
-  GNU
+  GNU INTEL INTEL_LEGACY
 
 endef
 $(info $(compiler_help))
 COMPILER=GNU
 endif
 
-TBB_LIB=
 
-COMPILER_GNU = g++
+CXX_GNU          = g++
+CXX_INTEL        = icpx
+CXX_INTEL_LEGACY = icpc
 CXX = $(COMPILER_$(COMPILER))
 
-FLAGS_GNU = -O3 -std=c++14 -march=native
-CXXFLAGS = $(FLAGS_$(COMPILER))
+CXXFLAGS_GNU          = -march=native
+CXXFLAGS_INTEL        = -march=native
+CXXFLAGS_INTEL_LEGACY = -qopt-streaming-stores=always
+
+CXX = $(CXX_$(COMPILER))
+CXXFLAGS = -std=c++11 -O3 $(CXXFLAGS_$(COMPILER))
+
+
+
+ifndef PARTITIONER
+define partitioner_help
+Set PARTITIONER to select TBB's partitioner.
+Partitioner specifies how a loop template should partition its work among threads.
+
+Available options:
+  AUTO     - Optimize range subdivision based on work-stealing events.
+  AFFINITY - Proportional splitting that optimizes for cache affinity.
+  STATIC   - Distribute work uniformly with no additional load balancing.
+  SIMPLE   - Recursively split its range until it cannot be further subdivided.
+
+See https://spec.oneapi.com/versions/latest/elements/oneTBB/source/algorithms.html#partitioners
+for more details.
+
+endef
+$(info $(partitioner_help))
+PARTITIONER=AUTO
+endif
+
+PARTITIONER_MODE = -DPARTITIONER_$(PARTITIONER)
 
 
 tbb-stream: main.cpp TBBStream.cpp
-	$(CXX) -DTBB $(CXXFLAGS) $^ $(EXTRA_FLAGS) -I$(TBB_DIR)/include -Wl,-rpath,$(TBB_DIR)/lib/intel64/gcc4.8 $(TBB_DIR)/lib/intel64/gcc4.8/libtbb.so -o $@
+	$(CXX) -DTBB $(PARTITIONER_MODE) $(CXXFLAGS) $^ $(EXTRA_FLAGS) -I$(TBB_DIR)/include -Wl,-rpath,$(TBB_DIR)/lib/intel64/gcc4.8 $(TBB_DIR)/lib/intel64/gcc4.8/libtbb.so -o $@
 
 .PHONY: clean
 clean:
diff --git a/TBBStream.cpp b/TBBStream.cpp
index 08b83c8..9c34a50 100644
--- a/TBBStream.cpp
+++ b/TBBStream.cpp
@@ -8,62 +8,26 @@
 
 template <class T>
 TBBStream<T>::TBBStream(const int ARRAY_SIZE, int device)
- : partitioner(static_cast<Partitioner>(device)), range(0, ARRAY_SIZE), a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE)
+ : partitioner(), range(0, ARRAY_SIZE), a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE)
 {
-  std::cout << "Using TBB partitioner: " << getDeviceName(device) << std::endl;
-}
-
-template <class T>
-template <typename U, typename F>
-U TBBStream<T>::with_partitioner(const F &f) 
-{
-  switch(partitioner){
-    case Partitioner::Auto:      return f(tbb::auto_partitioner{});
-    case Partitioner::Affinity:  { tbb::affinity_partitioner p; return f(p); }  //  parallel_* doesn't take const affinity_partitioner here
-    case Partitioner::Static:    return f(tbb::static_partitioner{});
-    case Partitioner::Simple:    return f(tbb::simple_partitioner{});
-    default:                     throw std::runtime_error("Error asking for name for non-existant device");
+  if(device != 0){
+    throw std::runtime_error("Device != 0 is not supported by TBB");
   }
+  std::cout << "Using TBB partitioner: " PARTITIONER_NAME << std::endl;
 }
 
-template <class T>
-template <typename F>
-void TBBStream<T>::parallel_for(const F &f) 
-{
-  // using size_t as per the range type (also used in the official documentation)
-  with_partitioner<std::nullptr_t>([&](auto &&p) { 
-    tbb::parallel_for(range, [&](const tbb::blocked_range<size_t>& r) {
-      for (size_t i = r.begin(); i < r.end(); ++i) { 
-        f(i);
-      }
-    }, p);
-    return nullptr; // what we really want here is std::monostate, but we don't want to be C++17 only so nullptr_t it is
-  });
-}
-
-template <class T>
-template <typename F, typename Op>
-T TBBStream<T>::parallel_reduce(T init, const Op &op, const F &f) 
-{
-  return with_partitioner<T>([&](auto &&p) {
-    return tbb::parallel_reduce(range, init, [&](const tbb::blocked_range<size_t>& r, T acc) {
-      for (size_t i = r.begin(); i < r.end(); ++i) { 
-        acc = op(acc, f(i));
-      }
-      return acc;
-    }, op, p);
-  });
-}
 
 template <class T>
 void TBBStream<T>::init_arrays(T initA, T initB, T initC)
 {
 
-  parallel_for([&](size_t i){ 
-    a[i] = initA;
-    b[i] = initB;
-    c[i] = initC;  
-  });
+  tbb::parallel_for(range, [&](const tbb::blocked_range<size_t>& r) {
+    for (size_t i = r.begin(); i < r.end(); ++i) {
+      a[i] = initA;
+      b[i] = initB;
+      c[i] = initC;
+    }
+  }, partitioner);
 
 }
 
@@ -79,23 +43,35 @@ void TBBStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::ve
 template <class T>
 void TBBStream<T>::copy()
 {
-  parallel_for([&](size_t i){ c[i] = a[i]; });
+  tbb::parallel_for(range, [&](const tbb::blocked_range<size_t>& r) {
+    for (size_t i = r.begin(); i < r.end(); ++i) {
+       c[i] = a[i];
+    }
+  }, partitioner);
 }
 
 template <class T>
 void TBBStream<T>::mul()
 {
   const T scalar = startScalar;
-  
-  parallel_for([&](size_t i){ b[i] = scalar * c[i]; });
-  
+
+  tbb::parallel_for(range, [&](const tbb::blocked_range<size_t>& r) {
+    for (size_t i = r.begin(); i < r.end(); ++i) {
+       b[i] = scalar * c[i];
+    }
+  }, partitioner);
+
 }
 
 template <class T>
 void TBBStream<T>::add()
 {
 
-  parallel_for([&](size_t i){ c[i] = a[i] + b[i]; });
+  tbb::parallel_for(range, [&](const tbb::blocked_range<size_t>& r) {
+    for (size_t i = r.begin(); i < r.end(); ++i) {
+       c[i] = a[i] + b[i];
+    }
+  }, partitioner);
 
 }
 
@@ -104,7 +80,11 @@ void TBBStream<T>::triad()
 {
   const T scalar = startScalar;
 
-  parallel_for([&](size_t i){ a[i] = b[i] + scalar * c[i]; });
+  tbb::parallel_for(range, [&](const tbb::blocked_range<size_t>& r) {
+    for (size_t i = r.begin(); i < r.end(); ++i) {
+       a[i] = b[i] + scalar * c[i];
+    }
+  }, partitioner);
 
 }
 
@@ -113,7 +93,11 @@ void TBBStream<T>::nstream()
 {
   const T scalar = startScalar;
 
-  parallel_for([&](size_t i){ a[i] += b[i] + scalar * c[i]; });
+  tbb::parallel_for(range, [&](const tbb::blocked_range<size_t>& r) {
+    for (size_t i = r.begin(); i < r.end(); ++i) {
+       a[i] += b[i] + scalar * c[i];
+    }
+  }, partitioner);
 
 }
 
@@ -121,29 +105,23 @@ template <class T>
 T TBBStream<T>::dot()
 {
   // sum += a[i] * b[i];
-  return parallel_reduce(0.0, std::plus<T>(), [&](size_t i) { return a[i] * b[i]; });
+  return
+    tbb::parallel_reduce(range, T{}, [&](const tbb::blocked_range<size_t>& r, T acc) {
+      for (size_t i = r.begin(); i < r.end(); ++i) {
+        acc += a[i] * b[i];
+      }
+      return acc;
+    }, std::plus<T>(), partitioner);
 }
 
 void listDevices(void)
 {
-  std::cout 
-    << "[" << static_cast<int>(Partitioner::Auto) << "] auto partitioner\n" 
-    << "[" << static_cast<int>(Partitioner::Affinity) << "] affinity partitioner\n" 
-    << "[" << static_cast<int>(Partitioner::Static) << "] static partitioner\n" 
-    << "[" << static_cast<int>(Partitioner::Simple) << "] simple partitioner\n" 
-    << "See https://spec.oneapi.com/versions/latest/elements/oneTBB/source/algorithms.html#partitioners for more details" 
-    << std::endl;
+   std::cout << "Listing devices is not supported by TBB" << std::endl;
 }
 
 std::string getDeviceName(const int device)
 {
-  switch(static_cast<Partitioner>(device)){
-    case Partitioner::Auto:      return "auto_partitioner";
-    case Partitioner::Affinity:  return "affinity_partitioner";
-    case Partitioner::Static:    return "static_partitioner";
-    case Partitioner::Simple:    return "simple_partitioner";
-    default:                     throw std::runtime_error("Error asking for name for non-existant device");
-  }
+  return std::string("Device name unavailable");
 }
 
 std::string getDeviceDriver(const int)
diff --git a/TBBStream.hpp b/TBBStream.hpp
index 6ba9741..90763a9 100644
--- a/TBBStream.hpp
+++ b/TBBStream.hpp
@@ -13,31 +13,37 @@
 
 #define IMPLEMENTATION_STRING "TBB"
 
-enum class Partitioner : int { Auto = 0, Affinity, Static, Simple};
+#if defined(PARTITIONER_AUTO)
+using tbb_partitioner = tbb::auto_partitioner;
+#define PARTITIONER_NAME  "auto_partitioner"
+#elif defined(PARTITIONER_AFFINITY)
+using tbb_partitioner = tbb::affinity_partitioner;
+#define PARTITIONER_NAME  "affinity_partitioner"
+#elif defined(PARTITIONER_STATIC)
+using tbb_partitioner = tbb::static_partitioner;
+#define PARTITIONER_NAME  "static_partitioner"
+#elif defined(PARTITIONER_SIMPLE)
+using tbb_partitioner = tbb::simple_partitioner;
+#define PARTITIONER_NAME  "simple_partitioner"
+#else
+// default to auto
+using tbb_partitioner = tbb::auto_partitioner;
+#define PARTITIONER_NAME  "auto_partitioner"
+#endif
+
 
 template <class T>
 class TBBStream : public Stream<T>
 {
   protected:
   
-
-    Partitioner partitioner;
+    tbb_partitioner partitioner;
     tbb::blocked_range<size_t> range;
     // Device side pointers
     std::vector<T> a;
     std::vector<T> b;
     std::vector<T> c;
-    
-
-    template < typename U, typename F>
-    U with_partitioner(const F &f);
  
-    template <typename F>
-    void parallel_for(const F &f);
-
-    template <typename F, typename Op>
-    T parallel_reduce(T init, const Op &op, const F &f);
-
   public:
     TBBStream(const int, int);
     ~TBBStream() = default;

From 25e021caa351aba0d5bcf32cbf639d55370d41ee Mon Sep 17 00:00:00 2001
From: Tom Deakin <tom.deakin@bristol.ac.uk>
Date: Thu, 3 Jun 2021 16:08:14 +0100
Subject: [PATCH 14/14] Update CHANGELOG.md

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 3dbabed..5d209e2 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -14,6 +14,7 @@ All notable changes to this project will be documented in this file.
 - Added nstream kernel from PRK with associate command line option.
 - CMake build system added for all models.
 - SYCL device check for FP64 support.
+- New implementation using TBB.
 
 ### Changed
 - Default branch renamed from `master` to `main`.