From 6c47b22cc3e6731792364fba8f913e3ef898e44c Mon Sep 17 00:00:00 2001
From: Tom Lin <tom91136@gmail.com>
Date: Fri, 12 Nov 2021 12:38:54 +0000
Subject: [PATCH 1/9] Capture vectors by reference directly Add custom range
 implementation

---
 STDStream.cpp | 45 +++++++++++++++++++++++----------------------
 STDStream.h   | 39 +++++++++++++++++++++++++++++++++++----
 2 files changed, 58 insertions(+), 26 deletions(-)
diff --git a/STDStream.cpp b/STDStream.cpp
index 30ad420..536fd1c 100644
--- a/STDStream.cpp
+++ b/STDStream.cpp
@@ -14,62 +14,61 @@
 // auto exe_policy = std::execution::par;
 auto exe_policy = std::execution::par_unseq;
 
-template <class T>
-STDStream<T>::STDStream(const int ARRAY_SIZE, int device)
-  noexcept : array_size{ARRAY_SIZE}, a{new T[array_size]}, b{new T[array_size]}, c{new T[array_size]}
-{
-}
 
 template <class T>
-STDStream<T>::~STDStream()
+STDStream<T>::STDStream(const int ARRAY_SIZE, int device)
+  noexcept : array_size{ARRAY_SIZE}, range(0, array_size), a(array_size), b(array_size), c(array_size) 
 {
-  delete[] a;
-  delete[] b;
-  delete[] c;
 }
 
 template <class T>
 void STDStream<T>::init_arrays(T initA, T initB, T initC)
 {
-  std::fill(exe_policy, a, a+array_size, initA);
-  std::fill(exe_policy, b, b+array_size, initB);
-  std::fill(exe_policy, c, c+array_size, initC);
+  std::fill(exe_policy, a.begin(), a.end(), initA);
+  std::fill(exe_policy, b.begin(), b.end(), initB);
+  std::fill(exe_policy, c.begin(), c.end(), initC);
 }
 
 template <class T>
 void STDStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
 {
-  std::copy(exe_policy, a, a+array_size, h_a.data());
-  std::copy(exe_policy, b, b+array_size, h_b.data());
-  std::copy(exe_policy, c, c+array_size, h_c.data());
+  h_a = a;
+  h_b = b;
+  h_c = c;
 }
 
 template <class T>
 void STDStream<T>::copy()
 {
   // c[i] = a[i]
-  std::copy(exe_policy, a, a+array_size, c) ;
+  std::copy(exe_policy, a.begin(), a.end(), c.begin());
 }
 
 template <class T>
 void STDStream<T>::mul()
 {
   //  b[i] = scalar * c[i];
-  std::transform(exe_policy, c, c+array_size, b, [](T ci){ return startScalar*ci; });
+  std::transform(exe_policy, range.begin(), range.end(), b.begin(), [&, scalar = startScalar](int i) {
+    return scalar * c[i];
+  });
 }
 
 template <class T>
 void STDStream<T>::add()
 {
   //  c[i] = a[i] + b[i];
-  std::transform(exe_policy, a, a+array_size, b, c, std::plus<T>());
+  std::transform(exe_policy, range.begin(), range.end(), c.begin(), [&](int i) {
+    return a[i] + b[i];
+  });
 }
 
 template <class T>
 void STDStream<T>::triad()
 {
   //  a[i] = b[i] + scalar * c[i];
-  std::transform(exe_policy, b, b+array_size, c, a, [](T bi, T ci){ return bi+startScalar*ci; });
+  std::transform(exe_policy, range.begin(), range.end(), a.begin(), [&, scalar = startScalar](int i) {
+    return b[i] + scalar * c[i];
+  });
 }
 
 template <class T>
@@ -79,15 +78,17 @@ void STDStream<T>::nstream()
   //  Need to do in two stages with C++11 STL.
   //  1: a[i] += b[i]
   //  2: a[i] += scalar * c[i];
-  std::transform(exe_policy, a, a+array_size, b, a, [](T ai, T bi){ return ai + bi; });
-  std::transform(exe_policy, a, a+array_size, c, a, [](T ai, T ci){ return ai + startScalar*ci; });
+  std::transform(exe_policy, range.begin(), range.end(), a.begin(), [&, scalar = startScalar](int i) {
+    return a[i] + b[i] + scalar * c[i];
+  });
 }
+   
 
 template <class T>
 T STDStream<T>::dot()
 {
   // sum = 0; sum += a[i]*b[i]; return sum;
-  return std::transform_reduce(exe_policy, a, a+array_size, b, 0.0);
+  return std::transform_reduce(exe_policy, a.begin(), a.end(), b.begin(), 0.0);
 }
 
 void listDevices(void)
diff --git a/STDStream.h b/STDStream.h
index 2249812..b50bad1 100644
--- a/STDStream.h
+++ b/STDStream.h
@@ -11,6 +11,33 @@
 
 #define IMPLEMENTATION_STRING "STD"
 
+template <typename N = size_t>
+class ranged {
+  N from, to; 
+public:
+  ranged(N from, N to ): from(from), to(to) {}
+    class iterator {
+      N num;
+    public:
+      using difference_type = N;
+      using value_type = N;
+      using pointer = const N*;
+      using reference = N&;
+      using iterator_category = std::random_access_iterator_tag;
+      explicit iterator(N _num = 0) : num(_num) {}
+      iterator& operator++() { num++; return *this; }
+      iterator operator++(int) { iterator retval = *this; ++(*this); return retval; }
+      bool operator==(iterator other) const { return num == other.num; }
+      bool operator!=(iterator other) const { return *this != other; }
+      reference operator*() const { return num;}
+      difference_type operator-(const iterator &it) const { return num - it.num; }
+      value_type operator[](const difference_type &i) const { return num+i; }
+
+    };
+    iterator begin() { return iterator(from); }
+    iterator end() { return iterator(to >= from? to+1 : to-1); }
+};
+
 template <class T>
 class STDStream : public Stream<T>
 {
@@ -18,14 +45,18 @@ class STDStream : public Stream<T>
     // Size of arrays
     int array_size;
 
+    // induction range
+    ranged<int> range;
+
     // Device side pointers
-    T *a;
-    T *b;
-    T *c;
+    std::vector<T> a;
+    std::vector<T> b;
+    std::vector<T> c;
+
 
   public:
     STDStream(const int, int) noexcept;
-    ~STDStream();
+    ~STDStream() = default;
 
     virtual void copy() override;
     virtual void add() override;

From f9bba3c0de0035db88f0b6557435895938915af0 Mon Sep 17 00:00:00 2001
From: Tom Lin <tom91136@gmail.com>
Date: Wed, 15 Dec 2021 21:38:56 +0000
Subject: [PATCH 2/9] Split implementation for index/data oriented std variants
 Fix missing range iterator functions for std-indices Rename std20 to
 std-ranges

---
 CMakeLists.txt                                |  11 +-
 src/ci-test-compile.sh                        |  16 ++-
 src/main.cpp                                  |  26 +++--
 src/std-data/STDDataStream.cpp                | 103 ++++++++++++++++++
 src/std-data/STDDataStream.h                  |  42 +++++++
 src/{std => std-data}/model.cmake             |   0
 .../STDIndicesStream.cpp}                     |  24 ++--
 .../STDIndicesStream.h}                       |  21 ++--
 src/std-indices/model.cmake                   |  33 ++++++
 .../STDRangesStream.cpp}                      |  24 ++--
 .../STDRangesStream.hpp}                      |   8 +-
 src/{std20 => std-ranges}/model.cmake         |   0
 12 files changed, 251 insertions(+), 57 deletions(-)
 create mode 100644 src/std-data/STDDataStream.cpp
 create mode 100644 src/std-data/STDDataStream.h
 rename src/{std => std-data}/model.cmake (100%)
 rename src/{std/STDStream.cpp => std-indices/STDIndicesStream.cpp} (79%)
 rename src/{std/STDStream.h => std-indices/STDIndicesStream.h} (80%)
 create mode 100644 src/std-indices/model.cmake
 rename src/{std20/STD20Stream.cpp => std-ranges/STDRangesStream.cpp} (79%)
 rename src/{std20/STD20Stream.hpp => std-ranges/STDRangesStream.hpp} (83%)
 rename src/{std20 => std-ranges}/model.cmake (100%)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index ad12dbc..d47dcad 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -34,9 +34,9 @@ endmacro()
 #set(CUDA_ARCH sm_70)
 #set(BLT_DIR /home/tom/Downloads/blt-0.3.6/)
 
-#set(MODEL STD)
-#set(ARCH  cc70)
-#set(CXX_EXTRA_FLAGS -v)
+#set(MODEL std-data)
+#set(CMAKE_CXX_COMPILER /home/tom/Downloads/nvhpc_2021_219_Linux_x86_64_cuda_multi/install_components/Linux_x86_64/21.9/compilers/bin/nvc++)
+#set(NVHPC_OFFLOAD "cuda11.4,cc61")
 
 #set(MODEL CUDA)
 #set(ARCH  sm_70)
@@ -117,8 +117,9 @@ include(cmake/register_models.cmake)
 # register out models <model_name> <preprocessor_def_name> <source files...>
 register_model(omp OMP OMPStream.cpp)
 register_model(ocl OCL OCLStream.cpp)
-register_model(std STD STDStream.cpp)
-register_model(std20 STD20 STD20Stream.cpp)
+register_model(std-data STD_DATA STDDataStream.cpp)
+register_model(std-indices STD_INDICES STDIndicesStream.cpp)
+register_model(std-ranges STD_RANGES STDRangesStream.cpp)
 register_model(hip HIP HIPStream.cpp)
 register_model(cuda CUDA CUDAStream.cu)
 register_model(kokkos KOKKOS KokkosStream.cpp)
diff --git a/src/ci-test-compile.sh b/src/ci-test-compile.sh
index 3f54aaf..9388643 100755
--- a/src/ci-test-compile.sh
+++ b/src/ci-test-compile.sh
@@ -115,7 +115,7 @@ run_build() {
 # GCC_STD_PAR_LIB="tbb"
 # CLANG_STD_PAR_LIB="tbb"
 # GCC_OMP_OFFLOAD_AMD=false
-# GCC_OMP_OFFLOAD_NVIDIA=true
+# GCC_OMP_OFFLOAD_NVIDIA=false
 # CLANG_OMP_OFFLOAD_AMD=false
 # CLANG_OMP_OFFLOAD_NVIDIA=false
 ###
@@ -136,8 +136,9 @@ build_gcc() {
   fi
 
   # some distributions like Ubuntu bionic implements std par with TBB, so conditionally link it here
-  run_build $name "${GCC_CXX:?}" std "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}"
-  run_build $name "${GCC_CXX:?}" std20 "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}"
+  run_build $name "${GCC_CXX:?}" std-data "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}"
+  run_build $name "${GCC_CXX:?}" std-indices "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}"
+  run_build $name "${GCC_CXX:?}" std-ranges "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}"
 
   run_build $name "${GCC_CXX:?}" tbb "$cxx -DONE_TBB_DIR=$TBB_LIB"
   run_build $name "${GCC_CXX:?}" tbb "$cxx" # build TBB again with the system TBB
@@ -211,7 +212,8 @@ build_clang() {
   run_build $name "${CLANG_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DMEM=PAGEFAULT"
   run_build $name "${CLANG_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON"
   run_build $name "${CLANG_CXX:?}" ocl "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}"
-  run_build $name "${CLANG_CXX:?}" std "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}"
+  run_build $name "${CLANG_CXX:?}" std-data "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}"
+  run_build $name "${CLANG_CXX:?}" std-indices "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}"
   # run_build $name "${LANG_CXX:?}" std20 "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" # not yet supported
   run_build $name "${CLANG_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?}"
   run_build $name "${CLANG_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH"
@@ -219,7 +221,8 @@ build_clang() {
   run_build $name "${CLANG_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DMEM=PAGEFAULT"
   run_build $name "${CLANG_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON"
   run_build $name "${CLANG_CXX:?}" ocl "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}"
-  run_build $name "${CLANG_CXX:?}" std "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}"
+  run_build $name "${CLANG_CXX:?}" std-data "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}"
+  run_build $name "${CLANG_CXX:?}" std-indices "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}"
   # run_build $name "${LANG_CXX:?}" std20 "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" # not yet supported
 
   run_build $name "${CLANG_CXX:?}" tbb "$cxx -DONE_TBB_DIR=$TBB_LIB"
@@ -232,7 +235,8 @@ build_clang() {
 build_nvhpc() {
   local name="nvhpc_build"
   local cxx="-DCMAKE_CXX_COMPILER=${NVHPC_NVCXX:?}"
-  run_build $name "${NVHPC_NVCXX:?}" std "$cxx -DNVHPC_OFFLOAD=$NV_ARCH_CCXY"
+  run_build $name "${NVHPC_NVCXX:?}" std-data "$cxx -DNVHPC_OFFLOAD=$NV_ARCH_CCXY"
+  run_build $name "${NVHPC_NVCXX:?}" std-indices "$cxx -DNVHPC_OFFLOAD=$NV_ARCH_CCXY"
   run_build $name "${NVHPC_NVCXX:?}" acc "$cxx -DTARGET_DEVICE=gpu -DTARGET_PROCESSOR=px -DCUDA_ARCH=$NV_ARCH_CCXY"
   run_build $name "${NVHPC_NVCXX:?}" acc "$cxx -DTARGET_DEVICE=multicore -DTARGET_PROCESSOR=zen"
 }
diff --git a/src/main.cpp b/src/main.cpp
index 5a01b74..13a0021 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -21,10 +21,12 @@
 
 #if defined(CUDA)
 #include "CUDAStream.h"
-#elif defined(STD)
-#include "STDStream.h"
-#elif defined(STD20)
-#include "STD20Stream.hpp"
+#elif defined(STD_DATA)
+#include "STDDataStream.h"
+#elif defined(STD_INDICES)
+#include "STDIndicesStream.h"
+#elif defined(STD_RANGES)
+#include "STDRangesStream.hpp"
 #elif defined(TBB)
 #include "TBBStream.hpp"
 #elif defined(THRUST)
@@ -264,13 +266,17 @@ void run()
   // Use the Kokkos implementation
   stream = new KokkosStream<T>(ARRAY_SIZE, deviceIndex);
 
-#elif defined(STD)
-  // Use the STD implementation
-  stream = new STDStream<T>(ARRAY_SIZE, deviceIndex);
+#elif defined(STD_DATA)
+  // Use the C++ STD data-oriented implementation
+  stream = new STDDataStream<T>(ARRAY_SIZE, deviceIndex);
 
-#elif defined(STD20)
-  // Use the C++20 implementation
-  stream = new STD20Stream<T>(ARRAY_SIZE, deviceIndex);
+#elif defined(STD_INDICES)
+  // Use the C++ STD index-oriented implementation
+  stream = new STDIndicesStream<T>(ARRAY_SIZE, deviceIndex);
+
+#elif defined(STD_RANGES)
+  // Use the C++ STD ranges implementation
+  stream = new STDRangesStream<T>(ARRAY_SIZE, deviceIndex);
 
 #elif defined(TBB)
   // Use the C++20 implementation
diff --git a/src/std-data/STDDataStream.cpp b/src/std-data/STDDataStream.cpp
new file mode 100644
index 0000000..64a37a4
--- /dev/null
+++ b/src/std-data/STDDataStream.cpp
@@ -0,0 +1,103 @@
+// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+//
+// For full license terms please see the LICENSE file distributed with this
+// source code
+
+#include "STDDataStream.h"
+
+#include <algorithm>
+#include <execution>
+#include <numeric>
+
+// There are three execution policies:
+// auto exe_policy = std::execution::seq;
+// auto exe_policy = std::execution::par;
+auto exe_policy = std::execution::par_unseq;
+
+
+template <class T>
+STDDataStream<T>::STDDataStream(const int ARRAY_SIZE, int device)
+  noexcept : array_size{ARRAY_SIZE}, a(array_size), b(array_size), c(array_size)
+{
+}
+
+template <class T>
+void STDDataStream<T>::init_arrays(T initA, T initB, T initC)
+{
+  std::fill(exe_policy, a.begin(), a.end(), initA);
+  std::fill(exe_policy, b.begin(), b.end(), initB);
+  std::fill(exe_policy, c.begin(), c.end(), initC);
+}
+
+template <class T>
+void STDDataStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
+{
+  h_a = a;
+  h_b = b;
+  h_c = c;
+}
+
+template <class T>
+void STDDataStream<T>::copy()
+{
+  // c[i] = a[i]
+  std::copy(exe_policy, a.begin(), a.end(), c.begin());
+}
+
+template <class T>
+void STDDataStream<T>::mul()
+{
+  //  b[i] = scalar * c[i];
+  std::transform(exe_policy, c.begin(), c.end(), b.begin(), [scalar = startScalar](T ci){ return scalar*ci; });
+}
+
+template <class T>
+void STDDataStream<T>::add()
+{
+  //  c[i] = a[i] + b[i];
+  std::transform(exe_policy, a.begin(), a.end(), b.begin(), c.begin(), std::plus<T>());
+}
+
+template <class T>
+void STDDataStream<T>::triad()
+{
+  //  a[i] = b[i] + scalar * c[i];
+  std::transform(exe_policy, b.begin(), b.end(), c.begin(), a.begin(), [scalar = startScalar](T bi, T ci){ return bi+scalar*ci; });
+}
+
+template <class T>
+void STDDataStream<T>::nstream()
+{
+  //  a[i] += b[i] + scalar * c[i];
+  //  Need to do in two stages with C++11 STL.
+  //  1: a[i] += b[i]
+  //  2: a[i] += scalar * c[i];
+  std::transform(exe_policy, a.begin(), a.end(), b.begin(), a.begin(), [](T ai, T bi){ return ai + bi; });
+  std::transform(exe_policy, a.begin(), a.end(), c.begin(), a.begin(), [scalar = startScalar](T ai, T ci){ return ai + scalar*ci; });
+}
+   
+
+template <class T>
+T STDDataStream<T>::dot()
+{
+  // sum = 0; sum += a[i]*b[i]; return sum;
+  return std::transform_reduce(exe_policy, a.begin(), a.end(), b.begin(), 0.0);
+}
+
+void listDevices(void)
+{
+  std::cout << "Listing devices is not supported by the Parallel STL" << std::endl;
+}
+
+std::string getDeviceName(const int)
+{
+  return std::string("Device name unavailable");
+}
+
+std::string getDeviceDriver(const int)
+{
+  return std::string("Device driver unavailable");
+}
+template class STDDataStream<float>;
+template class STDDataStream<double>;
+
diff --git a/src/std-data/STDDataStream.h b/src/std-data/STDDataStream.h
new file mode 100644
index 0000000..7279070
--- /dev/null
+++ b/src/std-data/STDDataStream.h
@@ -0,0 +1,42 @@
+// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+//
+// For full license terms please see the LICENSE file distributed with this
+// source code
+
+#pragma once
+
+#include <iostream>
+#include <stdexcept>
+#include "Stream.h"
+
+#define IMPLEMENTATION_STRING "STD (data-oriented)"
+
+
+template <class T>
+class STDDataStream : public Stream<T>
+{
+  protected:
+    // Size of arrays
+    int array_size;
+
+    // Device side pointers
+    std::vector<T> a;
+    std::vector<T> b;
+    std::vector<T> c;
+
+
+  public:
+    STDDataStream(const int, int) noexcept;
+    ~STDDataStream() = default;
+
+    virtual void copy() override;
+    virtual void add() override;
+    virtual void mul() override;
+    virtual void triad() override;
+    virtual void nstream() override;
+    virtual T dot() override;
+
+    virtual void init_arrays(T initA, T initB, T initC) override;
+    virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
+};
+
diff --git a/src/std/model.cmake b/src/std-data/model.cmake
similarity index 100%
rename from src/std/model.cmake
rename to src/std-data/model.cmake
diff --git a/src/std/STDStream.cpp b/src/std-indices/STDIndicesStream.cpp
similarity index 79%
rename from src/std/STDStream.cpp
rename to src/std-indices/STDIndicesStream.cpp
index 536fd1c..e16fb91 100644
--- a/src/std/STDStream.cpp
+++ b/src/std-indices/STDIndicesStream.cpp
@@ -3,7 +3,7 @@
 // For full license terms please see the LICENSE file distributed with this
 // source code
 
-#include "STDStream.h"
+#include "STDIndicesStream.h"
 
 #include <algorithm>
 #include <execution>
@@ -16,13 +16,13 @@ auto exe_policy = std::execution::par_unseq;
 
 
 template <class T>
-STDStream<T>::STDStream(const int ARRAY_SIZE, int device)
+STDIndicesStream<T>::STDIndicesStream(const int ARRAY_SIZE, int device)
   noexcept : array_size{ARRAY_SIZE}, range(0, array_size), a(array_size), b(array_size), c(array_size) 
 {
 }
 
 template <class T>
-void STDStream<T>::init_arrays(T initA, T initB, T initC)
+void STDIndicesStream<T>::init_arrays(T initA, T initB, T initC)
 {
   std::fill(exe_policy, a.begin(), a.end(), initA);
   std::fill(exe_policy, b.begin(), b.end(), initB);
@@ -30,7 +30,7 @@ void STDStream<T>::init_arrays(T initA, T initB, T initC)
 }
 
 template <class T>
-void STDStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
+void STDIndicesStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
 {
   h_a = a;
   h_b = b;
@@ -38,14 +38,14 @@ void STDStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::ve
 }
 
 template <class T>
-void STDStream<T>::copy()
+void STDIndicesStream<T>::copy()
 {
   // c[i] = a[i]
   std::copy(exe_policy, a.begin(), a.end(), c.begin());
 }
 
 template <class T>
-void STDStream<T>::mul()
+void STDIndicesStream<T>::mul()
 {
   //  b[i] = scalar * c[i];
   std::transform(exe_policy, range.begin(), range.end(), b.begin(), [&, scalar = startScalar](int i) {
@@ -54,7 +54,7 @@ void STDStream<T>::mul()
 }
 
 template <class T>
-void STDStream<T>::add()
+void STDIndicesStream<T>::add()
 {
   //  c[i] = a[i] + b[i];
   std::transform(exe_policy, range.begin(), range.end(), c.begin(), [&](int i) {
@@ -63,7 +63,7 @@ void STDStream<T>::add()
 }
 
 template <class T>
-void STDStream<T>::triad()
+void STDIndicesStream<T>::triad()
 {
   //  a[i] = b[i] + scalar * c[i];
   std::transform(exe_policy, range.begin(), range.end(), a.begin(), [&, scalar = startScalar](int i) {
@@ -72,7 +72,7 @@ void STDStream<T>::triad()
 }
 
 template <class T>
-void STDStream<T>::nstream()
+void STDIndicesStream<T>::nstream()
 {
   //  a[i] += b[i] + scalar * c[i];
   //  Need to do in two stages with C++11 STL.
@@ -85,7 +85,7 @@ void STDStream<T>::nstream()
    
 
 template <class T>
-T STDStream<T>::dot()
+T STDIndicesStream<T>::dot()
 {
   // sum = 0; sum += a[i]*b[i]; return sum;
   return std::transform_reduce(exe_policy, a.begin(), a.end(), b.begin(), 0.0);
@@ -105,6 +105,6 @@ std::string getDeviceDriver(const int)
 {
   return std::string("Device driver unavailable");
 }
-template class STDStream<float>;
-template class STDStream<double>;
+template class STDIndicesStream<float>;
+template class STDIndicesStream<double>;
 
diff --git a/src/std/STDStream.h b/src/std-indices/STDIndicesStream.h
similarity index 80%
rename from src/std/STDStream.h
rename to src/std-indices/STDIndicesStream.h
index b50bad1..87592a5 100644
--- a/src/std/STDStream.h
+++ b/src/std-indices/STDIndicesStream.h
@@ -9,11 +9,11 @@
 #include <stdexcept>
 #include "Stream.h"
 
-#define IMPLEMENTATION_STRING "STD"
+#define IMPLEMENTATION_STRING "STD (index-oriented)"
 
-template <typename N = size_t>
+template <typename N>
 class ranged {
-  N from, to; 
+  N from, to;
 public:
   ranged(N from, N to ): from(from), to(to) {}
     class iterator {
@@ -22,16 +22,21 @@ public:
       using difference_type = N;
       using value_type = N;
       using pointer = const N*;
-      using reference = N&;
+      using reference = const N&;
       using iterator_category = std::random_access_iterator_tag;
       explicit iterator(N _num = 0) : num(_num) {}
+
       iterator& operator++() { num++; return *this; }
       iterator operator++(int) { iterator retval = *this; ++(*this); return retval; }
+      iterator operator+(const value_type v) const { return iterator(num + v); }
+
       bool operator==(iterator other) const { return num == other.num; }
       bool operator!=(iterator other) const { return *this != other; }
+      bool operator<(iterator other) const { return num < other.num; }
+
       reference operator*() const { return num;}
       difference_type operator-(const iterator &it) const { return num - it.num; }
-      value_type operator[](const difference_type &i) const { return num+i; }
+      value_type operator[](const difference_type &i) const { return num + i; }
 
     };
     iterator begin() { return iterator(from); }
@@ -39,7 +44,7 @@ public:
 };
 
 template <class T>
-class STDStream : public Stream<T>
+class STDIndicesStream : public Stream<T>
 {
   protected:
     // Size of arrays
@@ -55,8 +60,8 @@ class STDStream : public Stream<T>
 
 
   public:
-    STDStream(const int, int) noexcept;
-    ~STDStream() = default;
+    STDIndicesStream(const int, int) noexcept;
+    ~STDIndicesStream() = default;
 
     virtual void copy() override;
     virtual void add() override;
diff --git a/src/std-indices/model.cmake b/src/std-indices/model.cmake
new file mode 100644
index 0000000..ef69f30
--- /dev/null
+++ b/src/std-indices/model.cmake
@@ -0,0 +1,33 @@
+
+register_flag_optional(CMAKE_CXX_COMPILER
+        "Any CXX compiler that is supported by CMake detection"
+        "c++")
+
+register_flag_optional(NVHPC_OFFLOAD
+        "Enable offloading support (via the non-standard `-stdpar`) for the new NVHPC SDK.
+         The values are Nvidia architectures in ccXY format will be passed in via `-gpu=` (e.g `cc70`)
+
+         Possible values are:
+           cc35  - Compile for compute capability 3.5
+           cc50  - Compile for compute capability 5.0
+           cc60  - Compile for compute capability 6.0
+           cc62  - Compile for compute capability 6.2
+           cc70  - Compile for compute capability 7.0
+           cc72  - Compile for compute capability 7.2
+           cc75  - Compile for compute capability 7.5
+           cc80  - Compile for compute capability 8.0
+           ccall - Compile for all supported compute capabilities"
+        "")
+
+macro(setup)
+    set(CMAKE_CXX_STANDARD 17)
+
+    if (NVHPC_OFFLOAD)
+        set(NVHPC_FLAGS -stdpar -gpu=${NVHPC_OFFLOAD})
+        # propagate flags to linker so that it links with the gpu stuff as well
+        register_append_cxx_flags(ANY ${NVHPC_FLAGS})
+        register_append_link_flags(${NVHPC_FLAGS})
+    endif ()
+
+
+endmacro()
diff --git a/src/std20/STD20Stream.cpp b/src/std-ranges/STDRangesStream.cpp
similarity index 79%
rename from src/std20/STD20Stream.cpp
rename to src/std-ranges/STDRangesStream.cpp
index 8290033..de61528 100644
--- a/src/std20/STD20Stream.cpp
+++ b/src/std-ranges/STDRangesStream.cpp
@@ -4,14 +4,14 @@
 // For full license terms please see the LICENSE file distributed with this
 // source code
 
-#include "STD20Stream.hpp"
+#include "STDRangesStream.hpp"
 
 #include <algorithm>
 #include <execution>
 #include <ranges>
 
 template <class T>
-STD20Stream<T>::STD20Stream(const int ARRAY_SIZE, int device)
+STDRangesStream<T>::STDRangesStream(const int ARRAY_SIZE, int device)
  : array_size{ARRAY_SIZE}
 {
   a = std::vector<T>(array_size);
@@ -20,7 +20,7 @@ STD20Stream<T>::STD20Stream(const int ARRAY_SIZE, int device)
 }
 
 template <class T>
-void STD20Stream<T>::init_arrays(T initA, T initB, T initC)
+void STDRangesStream<T>::init_arrays(T initA, T initB, T initC)
 {
   std::for_each_n(
     std::execution::par_unseq,
@@ -34,7 +34,7 @@ void STD20Stream<T>::init_arrays(T initA, T initB, T initC)
 }
 
 template <class T>
-void STD20Stream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
+void STDRangesStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
 {
   // Element-wise copy.
   h_a = a;
@@ -43,7 +43,7 @@ void STD20Stream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::
 }
 
 template <class T>
-void STD20Stream<T>::copy()
+void STDRangesStream<T>::copy()
 {
   std::for_each_n(
     std::execution::par_unseq,
@@ -55,7 +55,7 @@ void STD20Stream<T>::copy()
 }
 
 template <class T>
-void STD20Stream<T>::mul()
+void STDRangesStream<T>::mul()
 {
   const T scalar = startScalar;
 
@@ -69,7 +69,7 @@ void STD20Stream<T>::mul()
 }
 
 template <class T>
-void STD20Stream<T>::add()
+void STDRangesStream<T>::add()
 {
   std::for_each_n(
     std::execution::par_unseq,
@@ -81,7 +81,7 @@ void STD20Stream<T>::add()
 }
 
 template <class T>
-void STD20Stream<T>::triad()
+void STDRangesStream<T>::triad()
 {
   const T scalar = startScalar;
 
@@ -95,7 +95,7 @@ void STD20Stream<T>::triad()
 }
 
 template <class T>
-void STD20Stream<T>::nstream()
+void STDRangesStream<T>::nstream()
 {
   const T scalar = startScalar;
 
@@ -109,7 +109,7 @@ void STD20Stream<T>::nstream()
 }
 
 template <class T>
-T STD20Stream<T>::dot()
+T STDRangesStream<T>::dot()
 {
   // sum += a[i] * b[i];
   return
@@ -133,6 +133,6 @@ std::string getDeviceDriver(const int)
   return std::string("Device driver unavailable");
 }
 
-template class STD20Stream<float>;
-template class STD20Stream<double>;
+template class STDRangesStream<float>;
+template class STDRangesStream<double>;
 
diff --git a/src/std20/STD20Stream.hpp b/src/std-ranges/STDRangesStream.hpp
similarity index 83%
rename from src/std20/STD20Stream.hpp
rename to src/std-ranges/STDRangesStream.hpp
index e5daa3c..890e893 100644
--- a/src/std20/STD20Stream.hpp
+++ b/src/std-ranges/STDRangesStream.hpp
@@ -11,10 +11,10 @@
 
 #include "Stream.h"
 
-#define IMPLEMENTATION_STRING "C++20"
+#define IMPLEMENTATION_STRING "STD C++ ranges"
 
 template <class T>
-class STD20Stream : public Stream<T>
+class STDRangesStream : public Stream<T>
 {
   protected:
     // Size of arrays
@@ -26,8 +26,8 @@ class STD20Stream : public Stream<T>
     std::vector<T> c;
 
   public:
-    STD20Stream(const int, int);
-    ~STD20Stream() = default;
+    STDRangesStream(const int, int);
+    ~STDRangesStream() = default;
 
     virtual void copy() override;
     virtual void add() override;
diff --git a/src/std20/model.cmake b/src/std-ranges/model.cmake
similarity index 100%
rename from src/std20/model.cmake
rename to src/std-ranges/model.cmake

From fbd2e1bdc1b040db645f771b5d0efa7c01e2741c Mon Sep 17 00:00:00 2001
From: Tom Lin <tom91136@gmail.com>
Date: Wed, 22 Dec 2021 09:10:41 +0000
Subject: [PATCH 3/9] Add comments for custom range iterator Remove debug
 leftovers from CMakeList Update copyright

---
 CMakeLists.txt                       | 40 +---------------------------
 src/std-data/STDDataStream.cpp       |  3 ++-
 src/std-data/STDDataStream.h         |  3 ++-
 src/std-indices/STDIndicesStream.cpp |  3 ++-
 src/std-indices/STDIndicesStream.h   |  7 ++++-
 5 files changed, 13 insertions(+), 43 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index d47dcad..3730602 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -2,6 +2,7 @@ cmake_minimum_required(VERSION 3.13 FATAL_ERROR)
 
 project(BabelStream VERSION 3.5 LANGUAGES CXX)
 
+# uncomment for debugging build issues:
 #set(CMAKE_VERBOSE_MAKEFILE ON)
 
 # some nicer defaults for standard C++
@@ -23,45 +24,6 @@ macro(setup_opencl_header_includes)
     endif ()
 endmacro()
 
-#set(MODEL SYCL)
-#set(SYCL_COMPILER COMPUTECPP)
-#set(SYCL_COMPILER_DIR /home/tom/Desktop/computecpp_archive/ComputeCpp-CE-2.3.0-x86_64-linux-gnu)
-#set(MODEL RAJA)
-#set(RAJA_IN_TREE /home/tom/Downloads/RAJA-v0.13.0/)
-#set(ENABLE_CUDA ON)
-#set(TARGET NVIDIA)
-#set(CUDA_TOOLKIT_ROOT_DIR /opt/cuda-11.2)
-#set(CUDA_ARCH sm_70)
-#set(BLT_DIR /home/tom/Downloads/blt-0.3.6/)
-
-#set(MODEL std-data)
-#set(CMAKE_CXX_COMPILER /home/tom/Downloads/nvhpc_2021_219_Linux_x86_64_cuda_multi/install_components/Linux_x86_64/21.9/compilers/bin/nvc++)
-#set(NVHPC_OFFLOAD "cuda11.4,cc61")
-
-#set(MODEL CUDA)
-#set(ARCH  sm_70)
-#set(CMAKE_CUDA_COMPILER /opt/cuda-11.2/bin/nvcc)
-
-#set(MODEL OCL)
-#set(OpenCL_LIBRARY /opt/rocm-4.0.0/opencl/lib/libOpenCL.so)
-#set(OpenCL_INCLUDE_DIR /opt/rocm-4.0.0/opencl/lib)
-#set(RELEASE_FLAGS -Ofast)
-#set(CXX_EXTRA_FLAGS -O2)
-
-#set(CMAKE_CXX_COMPILER /usr/lib/aomp/bin/clang++)
-#set(MODEL omp)
-##set(OFFLOAD "AMD:gfx803")
-#set(OFFLOAD "NVIDIA:sm_35")
-#set(CXX_EXTRA_FLAGS --cuda-path=/opt/cuda-10.2/)
-
-#set(OFFLOAD "AMD:_70")
-#set(CXX_EXTRA_FLAGS --cuda-path=/opt/cuda-10.2/ --gcc-toolchain=/home/tom/spack/opt/spack/linux-fedora33-zen2/gcc-10.2.1/gcc-8.3.0-latmjo2hl2yv53255xkwko7k3y7bx2vv)
-#set(CXX_EXTRA_LINKER_FLAGS  )
-#set(MODEL HIP)
-
-#set(MODEL KOKKOS)
-#set(KOKKOS_IN_TREE /home/tom/Downloads/kokkos-3.3.00/)
-
 # the final executable name
 set(EXE_NAME babelstream)
 
diff --git a/src/std-data/STDDataStream.cpp b/src/std-data/STDDataStream.cpp
index 64a37a4..a2628e1 100644
--- a/src/std-data/STDDataStream.cpp
+++ b/src/std-data/STDDataStream.cpp
@@ -1,4 +1,5 @@
-// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+// Copyright (c) 2020 Tom Deakin
+// University of Bristol HPC
 //
 // For full license terms please see the LICENSE file distributed with this
 // source code
diff --git a/src/std-data/STDDataStream.h b/src/std-data/STDDataStream.h
index 7279070..f8bc302 100644
--- a/src/std-data/STDDataStream.h
+++ b/src/std-data/STDDataStream.h
@@ -1,4 +1,5 @@
-// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+// Copyright (c) 2020 Tom Deakin
+// University of Bristol HPC
 //
 // For full license terms please see the LICENSE file distributed with this
 // source code
diff --git a/src/std-indices/STDIndicesStream.cpp b/src/std-indices/STDIndicesStream.cpp
index e16fb91..a88dd18 100644
--- a/src/std-indices/STDIndicesStream.cpp
+++ b/src/std-indices/STDIndicesStream.cpp
@@ -1,4 +1,5 @@
-// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+// Copyright (c) 2020 Tom Deakin
+// University of Bristol HPC
 //
 // For full license terms please see the LICENSE file distributed with this
 // source code
diff --git a/src/std-indices/STDIndicesStream.h b/src/std-indices/STDIndicesStream.h
index 87592a5..66c8bb0 100644
--- a/src/std-indices/STDIndicesStream.h
+++ b/src/std-indices/STDIndicesStream.h
@@ -1,4 +1,5 @@
-// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+// Copyright (c) 2020 Tom Deakin
+// University of Bristol HPC
 //
 // For full license terms please see the LICENSE file distributed with this
 // source code
@@ -11,6 +12,10 @@
 
 #define IMPLEMENTATION_STRING "STD (index-oriented)"
 
+
+// A lightweight counting iterator which will be used by the STL algorithms
+// NB: C++ <= 17 doesn't have this built-in, and it's only added later in ranges-v3 (C++2a) which this
+// implementation doesn't target
 template <typename N>
 class ranged {
   N from, to;

From 6ea09a9620674d1bf2626cacda582b7ec6256ee4 Mon Sep 17 00:00:00 2001
From: Tom Lin <tom91136@gmail.com>
Date: Wed, 22 Dec 2021 09:14:46 +0000
Subject: [PATCH 4/9] Add oneDPL support for std-data and std-indices

---
 CMakeLists.txt                       |   2 +-
 src/std-data/STDDataStream.cpp       |  41 +++++++----
 src/std-data/STDDataStream.h         |  52 ++++++++++++--
 src/std-data/model.cmake             |  50 +++++++++++++
 src/std-indices/STDIndicesStream.cpp |  51 +++++++++-----
 src/std-indices/STDIndicesStream.h   | 102 ++++++++++++++++++---------
 src/std-indices/model.cmake          |  50 +++++++++++++
 7 files changed, 274 insertions(+), 74 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3730602..edf05f5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -3,7 +3,7 @@ cmake_minimum_required(VERSION 3.13 FATAL_ERROR)
 project(BabelStream VERSION 3.5 LANGUAGES CXX)
 
 # uncomment for debugging build issues:
-#set(CMAKE_VERBOSE_MAKEFILE ON)
+set(CMAKE_VERBOSE_MAKEFILE ON)
 
 # some nicer defaults for standard C++
 set(CMAKE_CXX_EXTENSIONS OFF)
diff --git a/src/std-data/STDDataStream.cpp b/src/std-data/STDDataStream.cpp
index a2628e1..0be23a3 100644
--- a/src/std-data/STDDataStream.cpp
+++ b/src/std-data/STDDataStream.cpp
@@ -5,21 +5,31 @@
 // source code
 
 #include "STDDataStream.h"
-
-#include <algorithm>
-#include <execution>
-#include <numeric>
-
-// There are three execution policies:
-// auto exe_policy = std::execution::seq;
-// auto exe_policy = std::execution::par;
-auto exe_policy = std::execution::par_unseq;
-
+#include <iostream>
 
 template <class T>
-STDDataStream<T>::STDDataStream(const int ARRAY_SIZE, int device)
-  noexcept : array_size{ARRAY_SIZE}, a(array_size), b(array_size), c(array_size)
+STDDataStream<T>::STDDataStream(const int ARRAY_SIZE, int device) : array_size{ARRAY_SIZE},
+#if defined(ONEDPL_USE_DPCPP_BACKEND)
+    exe_policy(oneapi::dpl::execution::make_device_policy(cl::sycl::default_selector{})),
+    allocator(exe_policy.queue()),
+    a(array_size, allocator), b(array_size, allocator), c(array_size, allocator)
+#else
+    a(array_size), b(array_size),c(array_size)
+#endif
 {
+#if USE_ONEDPL
+  std::cout << "Using oneDPL backend: ";
+  #if defined(ONEDPL_USE_DPCPP_BACKEND)
+    std::cout << "SYCL USM (device=" << exe_policy.queue().get_device().get_info<sycl::info::device::name>() << ")";
+  #elif defined(ONEDPL_USE_TBB_BACKEND)
+    std::cout << "TBB";
+  #elif defined(ONEDPL_USE_OPENMP_BACKEND)
+    std::cout << "OpenMP";
+  #else
+    std::cout << "Default";
+  #endif
+  std::cout << std::endl;
+#endif
 }
 
 template <class T>
@@ -33,9 +43,10 @@ void STDDataStream<T>::init_arrays(T initA, T initB, T initC)
 template <class T>
 void STDDataStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
 {
-  h_a = a;
-  h_b = b;
-  h_c = c;
+  // operator = is deleted because h_* vectors may have different allocator type compared to ours
+  std::copy(a.begin(), a.end(), h_a.begin());
+  std::copy(b.begin(), b.end(), h_b.begin());
+  std::copy(c.begin(), c.end(), h_c.begin());
 }
 
 template <class T>
diff --git a/src/std-data/STDDataStream.h b/src/std-data/STDDataStream.h
index f8bc302..c9a1f43 100644
--- a/src/std-data/STDDataStream.h
+++ b/src/std-data/STDDataStream.h
@@ -6,12 +6,35 @@
 
 #pragma once
 
-#include <iostream>
-#include <stdexcept>
 #include "Stream.h"
 
 #define IMPLEMENTATION_STRING "STD (data-oriented)"
 
+#ifdef USE_ONEDPL
+    #define PSTL_USAGE_WARNINGS 1
+
+    #include <oneapi/dpl/execution>
+    #include <oneapi/dpl/iterator>
+    #include <oneapi/dpl/algorithm>
+    #include <oneapi/dpl/memory>
+    #include <oneapi/dpl/numeric>
+
+    #ifdef ONEDPL_USE_DPCPP_BACKEND
+        #include <CL/sycl.hpp>
+    #endif
+#else
+    #include <algorithm>
+    #include <execution>
+    #include <numeric>
+
+    #if defined(ONEDPL_USE_DPCPP_BACKEND) || \
+        defined(ONEDPL_USE_TBB_BACKEND)   || \
+        defined(ONEDPL_USE_OPENMP_BACKEND)
+        #error oneDPL missing (ONEDPL_VERSION_MAJOR not defined) but backend (ONEDPL_USE_*_BACKEND) specified
+    #endif
+
+#endif
+
 
 template <class T>
 class STDDataStream : public Stream<T>
@@ -20,14 +43,31 @@ class STDDataStream : public Stream<T>
     // Size of arrays
     int array_size;
 
+#if defined(ONEDPL_USE_DPCPP_BACKEND)
+    // SYCL oneDPL backend
+    using ExecutionPolicy = oneapi::dpl::execution::device_policy<>;
+    using Allocator = sycl::usm_allocator<T, sycl::usm::alloc::shared>;
+#elif defined(USE_ONEDPL)
+    // every other non-SYCL oneDPL backend (i.e TBB, OMP)
+    using ExecutionPolicy = decltype(oneapi::dpl::execution::par_unseq);
+    using Allocator = std::allocator<T>;
+#else
+    // normal std execution policies
+    using ExecutionPolicy = decltype(std::execution::par_unseq);
+    using Allocator = std::allocator<T>;
+#endif
+
+    ExecutionPolicy exe_policy{};
+    Allocator allocator;
+
     // Device side pointers
-    std::vector<T> a;
-    std::vector<T> b;
-    std::vector<T> c;
+    std::vector<T, Allocator> a;
+    std::vector<T, Allocator> b;
+    std::vector<T, Allocator> c;
 
 
   public:
-    STDDataStream(const int, int) noexcept;
+    STDDataStream(const int, int);
     ~STDDataStream() = default;
 
     virtual void copy() override;
diff --git a/src/std-data/model.cmake b/src/std-data/model.cmake
index ef69f30..2454295 100644
--- a/src/std-data/model.cmake
+++ b/src/std-data/model.cmake
@@ -19,9 +19,30 @@ register_flag_optional(NVHPC_OFFLOAD
            ccall - Compile for all supported compute capabilities"
         "")
 
+register_flag_optional(ONEDPL_OFFLOAD
+        "Use the DPC++ oneDPL library which supports STL algorithms on SYCL, TBB, and OpenMP.
+         This option only supports the oneDPL library shipped with oneAPI, and must use the dpcpp
+         compiler (i.e -DCMAKE_CXX_COMPILER=dpcpp) for this option.
+         Make sure your oneAPI installation includes at least the following components: dpcpp, onedpl, onetbb.
+         The env. variable `TBBROOT` needs to point to the base directory of your TBB install (e.g /opt/intel/oneapi/tbb/latest/).
+         This should be done by oneAPI's `setvars.sh` script automatically.
+
+         Possible values are:
+           TBB   - Use the TBB backend, the correct TBB library will be linked from oneAPI automatically.
+           OMP   - Use the OpenMP backend
+           DPCPP - Use the SYCL (via dpcpp) backend with the default selector.
+                  See https://intel.github.io/llvm-docs/EnvironmentVariables.html#sycl-device-filter
+                  on selecting a non-default device or SYCL backend."
+        "")
+
+
 macro(setup)
     set(CMAKE_CXX_STANDARD 17)
 
+    if (NVHPC_OFFLOAD AND ONEDPL_OFFLOAD)
+        message(FATAL_ERROR "NVHPC_OFFLOAD and NVHPC_OFFLOAD are mutually exclusive")
+    endif ()
+
     if (NVHPC_OFFLOAD)
         set(NVHPC_FLAGS -stdpar -gpu=${NVHPC_OFFLOAD})
         # propagate flags to linker so that it links with the gpu stuff as well
@@ -30,4 +51,33 @@ macro(setup)
     endif ()
 
 
+    if (ONEDPL_OFFLOAD)
+        set(CXX_EXTRA_FLAGS)
+        set(CXX_EXTRA_LIBRARIES /opt/intel/oneapi/tbb/2021.4.0/lib/intel64/gcc4.8/libtbb.so)
+        # propagate flags to linker so that it links with the gpu stuff as well
+        register_append_cxx_flags(ANY -fopenmp -fsycl-unnamed-lambda -fsycl)
+
+        # XXX see https://www.intel.com/content/www/us/en/develop/documentation/oneapi-dpcpp-library-guide/top/oneapi-dpc-library-onedpl-overview.html
+        # this is to avoid the system TBB headers (if exists) from having precedence which isn't compatible with oneDPL's par implementation
+        register_definitions(
+                PSTL_USE_PARALLEL_POLICIES=0
+                _GLIBCXX_USE_TBB_PAR_BACKEND=0
+        )
+
+        register_definitions(USE_ONEDPL)
+        if (ONEDPL_OFFLOAD STREQUAL "TBB")
+            register_definitions(ONEDPL_USE_TBB_BACKEND=1)
+        elseif (ONEDPL_OFFLOAD STREQUAL "OPENMP")
+            register_definitions(ONEDPL_USE_OPENMP_BACKEND=1)
+        elseif (ONEDPL_OFFLOAD STREQUAL "SYCL")
+            register_definitions(ONEDPL_USE_DPCPP_BACKEND=1)
+        else ()
+            message(FATAL_ERROR "Unsupported ONEDPL_OFFLOAD backend: ${ONEDPL_OFFLOAD}")
+        endif ()
+
+        # even with the workaround above, -ltbb may still end up with the wrong one, so be explicit here
+        register_link_library($ENV{TBBROOT}/lib/intel64/gcc4.8/libtbb.so)
+
+    endif ()
+
 endmacro()
diff --git a/src/std-indices/STDIndicesStream.cpp b/src/std-indices/STDIndicesStream.cpp
index a88dd18..ceb9d3d 100644
--- a/src/std-indices/STDIndicesStream.cpp
+++ b/src/std-indices/STDIndicesStream.cpp
@@ -5,21 +5,32 @@
 // source code
 
 #include "STDIndicesStream.h"
-
-#include <algorithm>
-#include <execution>
-#include <numeric>
-
-// There are three execution policies:
-// auto exe_policy = std::execution::seq;
-// auto exe_policy = std::execution::par;
-auto exe_policy = std::execution::par_unseq;
-
+#include <iostream>
 
 template <class T>
-STDIndicesStream<T>::STDIndicesStream(const int ARRAY_SIZE, int device)
-  noexcept : array_size{ARRAY_SIZE}, range(0, array_size), a(array_size), b(array_size), c(array_size) 
+STDIndicesStream<T>::STDIndicesStream(const int ARRAY_SIZE, int device) :
+array_size{ARRAY_SIZE}, range_start(0), range_end(array_size),
+#if defined(ONEDPL_USE_DPCPP_BACKEND)
+exe_policy(oneapi::dpl::execution::make_device_policy(cl::sycl::default_selector{})),
+    allocator(exe_policy.queue()),
+    a(array_size, allocator), b(array_size, allocator), c(array_size, allocator)
+#else
+a(array_size), b(array_size),c(array_size)
+#endif
 {
+#if USE_ONEDPL
+    std::cout << "Using oneDPL backend: ";
+  #if defined(ONEDPL_USE_DPCPP_BACKEND)
+    std::cout << "SYCL USM (device=" << exe_policy.queue().get_device().get_info<sycl::info::device::name>() << ")";
+  #elif defined(ONEDPL_USE_TBB_BACKEND)
+    std::cout << "TBB";
+  #elif defined(ONEDPL_USE_OPENMP_BACKEND)
+    std::cout << "OpenMP";
+  #else
+    std::cout << "Default";
+  #endif
+  std::cout << std::endl;
+#endif
 }
 
 template <class T>
@@ -33,11 +44,13 @@ void STDIndicesStream<T>::init_arrays(T initA, T initB, T initC)
 template <class T>
 void STDIndicesStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
 {
-  h_a = a;
-  h_b = b;
-  h_c = c;
+  // operator = is deleted because h_* vectors may have different allocator type compared to ours
+  std::copy(a.begin(), a.end(), h_a.begin());
+  std::copy(b.begin(), b.end(), h_b.begin());
+  std::copy(c.begin(), c.end(), h_c.begin());
 }
 
+
 template <class T>
 void STDIndicesStream<T>::copy()
 {
@@ -49,7 +62,7 @@ template <class T>
 void STDIndicesStream<T>::mul()
 {
   //  b[i] = scalar * c[i];
-  std::transform(exe_policy, range.begin(), range.end(), b.begin(), [&, scalar = startScalar](int i) {
+  std::transform(exe_policy, range_start, range_end, b.begin(), [&, scalar = startScalar](int i) {
     return scalar * c[i];
   });
 }
@@ -58,7 +71,7 @@ template <class T>
 void STDIndicesStream<T>::add()
 {
   //  c[i] = a[i] + b[i];
-  std::transform(exe_policy, range.begin(), range.end(), c.begin(), [&](int i) {
+  std::transform(exe_policy, range_start, range_end, c.begin(), [&](int i) {
     return a[i] + b[i];
   });
 }
@@ -67,7 +80,7 @@ template <class T>
 void STDIndicesStream<T>::triad()
 {
   //  a[i] = b[i] + scalar * c[i];
-  std::transform(exe_policy, range.begin(), range.end(), a.begin(), [&, scalar = startScalar](int i) {
+  std::transform(exe_policy, range_start, range_end, a.begin(), [&, scalar = startScalar](int i) {
     return b[i] + scalar * c[i];
   });
 }
@@ -79,7 +92,7 @@ void STDIndicesStream<T>::nstream()
   //  Need to do in two stages with C++11 STL.
   //  1: a[i] += b[i]
   //  2: a[i] += scalar * c[i];
-  std::transform(exe_policy, range.begin(), range.end(), a.begin(), [&, scalar = startScalar](int i) {
+  std::transform(exe_policy, range_start, range_end, a.begin(), [&, scalar = startScalar](int i) {
     return a[i] + b[i] + scalar * c[i];
   });
 }
diff --git a/src/std-indices/STDIndicesStream.h b/src/std-indices/STDIndicesStream.h
index 66c8bb0..dfb63f6 100644
--- a/src/std-indices/STDIndicesStream.h
+++ b/src/std-indices/STDIndicesStream.h
@@ -6,46 +6,62 @@
 
 #pragma once
 
-#include <iostream>
-#include <stdexcept>
 #include "Stream.h"
 
 #define IMPLEMENTATION_STRING "STD (index-oriented)"
 
 
+
+#ifdef USE_ONEDPL
+    #define PSTL_USAGE_WARNINGS 1
+
+    #include <oneapi/dpl/execution>
+    #include <oneapi/dpl/iterator>
+    #include <oneapi/dpl/algorithm>
+    #include <oneapi/dpl/memory>
+    #include <oneapi/dpl/numeric>
+
+    #ifdef ONEDPL_USE_DPCPP_BACKEND
+        #include <CL/sycl.hpp>
+    #endif
+#else
+    #include <algorithm>
+    #include <execution>
+    #include <numeric>
+
+    #if defined(ONEDPL_USE_DPCPP_BACKEND) || \
+            defined(ONEDPL_USE_TBB_BACKEND)   || \
+            defined(ONEDPL_USE_OPENMP_BACKEND)
+    #error oneDPL missing (ONEDPL_VERSION_MAJOR not defined) but backend (ONEDPL_USE_*_BACKEND) specified
+    #endif
+
+#endif
+
 // A lightweight counting iterator which will be used by the STL algorithms
 // NB: C++ <= 17 doesn't have this built-in, and it's only added later in ranges-v3 (C++2a) which this
 // implementation doesn't target
 template <typename N>
-class ranged {
-  N from, to;
+class ranged_iterator {
+  N num;
 public:
-  ranged(N from, N to ): from(from), to(to) {}
-    class iterator {
-      N num;
-    public:
-      using difference_type = N;
-      using value_type = N;
-      using pointer = const N*;
-      using reference = const N&;
-      using iterator_category = std::random_access_iterator_tag;
-      explicit iterator(N _num = 0) : num(_num) {}
+  using difference_type = N;
+  using value_type = N;
+  using pointer = const N*;
+  using reference = const N&;
+  using iterator_category = std::random_access_iterator_tag;
+  explicit ranged_iterator(N _num = 0) : num(_num) {}
 
-      iterator& operator++() { num++; return *this; }
-      iterator operator++(int) { iterator retval = *this; ++(*this); return retval; }
-      iterator operator+(const value_type v) const { return iterator(num + v); }
+  ranged_iterator<N>& operator++() { num++; return *this; }
+  ranged_iterator<N> operator++(int) { ranged_iterator<N> retval = *this; ++(*this); return retval; }
+  ranged_iterator<N> operator+(const value_type v) const { return ranged_iterator<N>(num + v); }
 
-      bool operator==(iterator other) const { return num == other.num; }
-      bool operator!=(iterator other) const { return *this != other; }
-      bool operator<(iterator other) const { return num < other.num; }
+  bool operator==(ranged_iterator<N> other) const { return num == other.num; }
+  bool operator!=(ranged_iterator<N> other) const { return *this != other; }
+  bool operator<(ranged_iterator<N> other) const { return num < other.num; }
 
-      reference operator*() const { return num;}
-      difference_type operator-(const iterator &it) const { return num - it.num; }
-      value_type operator[](const difference_type &i) const { return num + i; }
-
-    };
-    iterator begin() { return iterator(from); }
-    iterator end() { return iterator(to >= from? to+1 : to-1); }
+  reference operator*() const { return num;}
+  difference_type operator-(const ranged_iterator<N> &it) const { return num - it.num; }
+  value_type operator[](const difference_type &i) const { return num + i; }
 };
 
 template <class T>
@@ -55,17 +71,37 @@ class STDIndicesStream : public Stream<T>
     // Size of arrays
     int array_size;
 
-    // induction range
-    ranged<int> range;
+#if defined(ONEDPL_USE_DPCPP_BACKEND)
+    // SYCL oneDPL backend
+    using ExecutionPolicy = oneapi::dpl::execution::device_policy<>;
+    using Allocator = sycl::usm_allocator<T, sycl::usm::alloc::shared>;
+    using IteratorType = oneapi::dpl::counting_iterator<int>;
+#elif defined(USE_ONEDPL)
+    // every other non-SYCL oneDPL backend (i.e TBB, OMP)
+    using ExecutionPolicy = decltype(oneapi::dpl::execution::par_unseq);
+    using Allocator = std::allocator<T>;
+    using IteratorType = oneapi::dpl::counting_iterator<int>;
+#else
+    // normal std execution policies
+    using ExecutionPolicy = decltype(std::execution::par_unseq);
+    using Allocator = std::allocator<T>;
+    using IteratorType = ranged_iterator<int>;
+#endif
+
+    IteratorType range_start;
+    IteratorType range_end;
+
+    ExecutionPolicy exe_policy{};
+    Allocator allocator;
 
     // Device side pointers
-    std::vector<T> a;
-    std::vector<T> b;
-    std::vector<T> c;
+    std::vector<T, Allocator> a;
+    std::vector<T, Allocator> b;
+    std::vector<T, Allocator> c;
 
 
   public:
-    STDIndicesStream(const int, int) noexcept;
+    STDIndicesStream(const int, int);
     ~STDIndicesStream() = default;
 
     virtual void copy() override;
diff --git a/src/std-indices/model.cmake b/src/std-indices/model.cmake
index ef69f30..2454295 100644
--- a/src/std-indices/model.cmake
+++ b/src/std-indices/model.cmake
@@ -19,9 +19,30 @@ register_flag_optional(NVHPC_OFFLOAD
            ccall - Compile for all supported compute capabilities"
         "")
 
+register_flag_optional(ONEDPL_OFFLOAD
+        "Use the DPC++ oneDPL library which supports STL algorithms on SYCL, TBB, and OpenMP.
+         This option only supports the oneDPL library shipped with oneAPI, and must use the dpcpp
+         compiler (i.e -DCMAKE_CXX_COMPILER=dpcpp) for this option.
+         Make sure your oneAPI installation includes at least the following components: dpcpp, onedpl, onetbb.
+         The env. variable `TBBROOT` needs to point to the base directory of your TBB install (e.g /opt/intel/oneapi/tbb/latest/).
+         This should be done by oneAPI's `setvars.sh` script automatically.
+
+         Possible values are:
+           TBB   - Use the TBB backend, the correct TBB library will be linked from oneAPI automatically.
+           OMP   - Use the OpenMP backend
+           DPCPP - Use the SYCL (via dpcpp) backend with the default selector.
+                  See https://intel.github.io/llvm-docs/EnvironmentVariables.html#sycl-device-filter
+                  on selecting a non-default device or SYCL backend."
+        "")
+
+
 macro(setup)
     set(CMAKE_CXX_STANDARD 17)
 
+    if (NVHPC_OFFLOAD AND ONEDPL_OFFLOAD)
+        message(FATAL_ERROR "NVHPC_OFFLOAD and NVHPC_OFFLOAD are mutually exclusive")
+    endif ()
+
     if (NVHPC_OFFLOAD)
         set(NVHPC_FLAGS -stdpar -gpu=${NVHPC_OFFLOAD})
         # propagate flags to linker so that it links with the gpu stuff as well
@@ -30,4 +51,33 @@ macro(setup)
     endif ()
 
 
+    if (ONEDPL_OFFLOAD)
+        set(CXX_EXTRA_FLAGS)
+        set(CXX_EXTRA_LIBRARIES /opt/intel/oneapi/tbb/2021.4.0/lib/intel64/gcc4.8/libtbb.so)
+        # propagate flags to linker so that it links with the gpu stuff as well
+        register_append_cxx_flags(ANY -fopenmp -fsycl-unnamed-lambda -fsycl)
+
+        # XXX see https://www.intel.com/content/www/us/en/develop/documentation/oneapi-dpcpp-library-guide/top/oneapi-dpc-library-onedpl-overview.html
+        # this is to avoid the system TBB headers (if exists) from having precedence which isn't compatible with oneDPL's par implementation
+        register_definitions(
+                PSTL_USE_PARALLEL_POLICIES=0
+                _GLIBCXX_USE_TBB_PAR_BACKEND=0
+        )
+
+        register_definitions(USE_ONEDPL)
+        if (ONEDPL_OFFLOAD STREQUAL "TBB")
+            register_definitions(ONEDPL_USE_TBB_BACKEND=1)
+        elseif (ONEDPL_OFFLOAD STREQUAL "OPENMP")
+            register_definitions(ONEDPL_USE_OPENMP_BACKEND=1)
+        elseif (ONEDPL_OFFLOAD STREQUAL "SYCL")
+            register_definitions(ONEDPL_USE_DPCPP_BACKEND=1)
+        else ()
+            message(FATAL_ERROR "Unsupported ONEDPL_OFFLOAD backend: ${ONEDPL_OFFLOAD}")
+        endif ()
+
+        # even with the workaround above, -ltbb may still end up with the wrong one, so be explicit here
+        register_link_library($ENV{TBBROOT}/lib/intel64/gcc4.8/libtbb.so)
+
+    endif ()
+
 endmacro()

From 2cd52228b733368e8cd57075f444c53f113c1f1a Mon Sep 17 00:00:00 2001
From: Tom Lin <tom91136@gmail.com>
Date: Wed, 22 Dec 2021 12:20:33 +0000
Subject: [PATCH 5/9] Revert "Add oneDPL support for std-data and std-indices"

This reverts commit 6ea09a9620674d1bf2626cacda582b7ec6256ee4.
---
 CMakeLists.txt                       |   2 +-
 src/std-data/STDDataStream.cpp       |  41 ++++-------
 src/std-data/STDDataStream.h         |  52 ++------------
 src/std-data/model.cmake             |  50 -------------
 src/std-indices/STDIndicesStream.cpp |  51 +++++---------
 src/std-indices/STDIndicesStream.h   | 102 +++++++++------------------
 src/std-indices/model.cmake          |  50 -------------
 7 files changed, 74 insertions(+), 274 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index edf05f5..3730602 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -3,7 +3,7 @@ cmake_minimum_required(VERSION 3.13 FATAL_ERROR)
 project(BabelStream VERSION 3.5 LANGUAGES CXX)
 
 # uncomment for debugging build issues:
-set(CMAKE_VERBOSE_MAKEFILE ON)
+#set(CMAKE_VERBOSE_MAKEFILE ON)
 
 # some nicer defaults for standard C++
 set(CMAKE_CXX_EXTENSIONS OFF)
diff --git a/src/std-data/STDDataStream.cpp b/src/std-data/STDDataStream.cpp
index 0be23a3..a2628e1 100644
--- a/src/std-data/STDDataStream.cpp
+++ b/src/std-data/STDDataStream.cpp
@@ -5,31 +5,21 @@
 // source code
 
 #include "STDDataStream.h"
-#include <iostream>
+
+#include <algorithm>
+#include <execution>
+#include <numeric>
+
+// There are three execution policies:
+// auto exe_policy = std::execution::seq;
+// auto exe_policy = std::execution::par;
+auto exe_policy = std::execution::par_unseq;
+
 
 template <class T>
-STDDataStream<T>::STDDataStream(const int ARRAY_SIZE, int device) : array_size{ARRAY_SIZE},
-#if defined(ONEDPL_USE_DPCPP_BACKEND)
-    exe_policy(oneapi::dpl::execution::make_device_policy(cl::sycl::default_selector{})),
-    allocator(exe_policy.queue()),
-    a(array_size, allocator), b(array_size, allocator), c(array_size, allocator)
-#else
-    a(array_size), b(array_size),c(array_size)
-#endif
+STDDataStream<T>::STDDataStream(const int ARRAY_SIZE, int device)
+  noexcept : array_size{ARRAY_SIZE}, a(array_size), b(array_size), c(array_size)
 {
-#if USE_ONEDPL
-  std::cout << "Using oneDPL backend: ";
-  #if defined(ONEDPL_USE_DPCPP_BACKEND)
-    std::cout << "SYCL USM (device=" << exe_policy.queue().get_device().get_info<sycl::info::device::name>() << ")";
-  #elif defined(ONEDPL_USE_TBB_BACKEND)
-    std::cout << "TBB";
-  #elif defined(ONEDPL_USE_OPENMP_BACKEND)
-    std::cout << "OpenMP";
-  #else
-    std::cout << "Default";
-  #endif
-  std::cout << std::endl;
-#endif
 }
 
 template <class T>
@@ -43,10 +33,9 @@ void STDDataStream<T>::init_arrays(T initA, T initB, T initC)
 template <class T>
 void STDDataStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
 {
-  // operator = is deleted because h_* vectors may have different allocator type compared to ours
-  std::copy(a.begin(), a.end(), h_a.begin());
-  std::copy(b.begin(), b.end(), h_b.begin());
-  std::copy(c.begin(), c.end(), h_c.begin());
+  h_a = a;
+  h_b = b;
+  h_c = c;
 }
 
 template <class T>
diff --git a/src/std-data/STDDataStream.h b/src/std-data/STDDataStream.h
index c9a1f43..f8bc302 100644
--- a/src/std-data/STDDataStream.h
+++ b/src/std-data/STDDataStream.h
@@ -6,35 +6,12 @@
 
 #pragma once
 
+#include <iostream>
+#include <stdexcept>
 #include "Stream.h"
 
 #define IMPLEMENTATION_STRING "STD (data-oriented)"
 
-#ifdef USE_ONEDPL
-    #define PSTL_USAGE_WARNINGS 1
-
-    #include <oneapi/dpl/execution>
-    #include <oneapi/dpl/iterator>
-    #include <oneapi/dpl/algorithm>
-    #include <oneapi/dpl/memory>
-    #include <oneapi/dpl/numeric>
-
-    #ifdef ONEDPL_USE_DPCPP_BACKEND
-        #include <CL/sycl.hpp>
-    #endif
-#else
-    #include <algorithm>
-    #include <execution>
-    #include <numeric>
-
-    #if defined(ONEDPL_USE_DPCPP_BACKEND) || \
-        defined(ONEDPL_USE_TBB_BACKEND)   || \
-        defined(ONEDPL_USE_OPENMP_BACKEND)
-        #error oneDPL missing (ONEDPL_VERSION_MAJOR not defined) but backend (ONEDPL_USE_*_BACKEND) specified
-    #endif
-
-#endif
-
 
 template <class T>
 class STDDataStream : public Stream<T>
@@ -43,31 +20,14 @@ class STDDataStream : public Stream<T>
     // Size of arrays
     int array_size;
 
-#if defined(ONEDPL_USE_DPCPP_BACKEND)
-    // SYCL oneDPL backend
-    using ExecutionPolicy = oneapi::dpl::execution::device_policy<>;
-    using Allocator = sycl::usm_allocator<T, sycl::usm::alloc::shared>;
-#elif defined(USE_ONEDPL)
-    // every other non-SYCL oneDPL backend (i.e TBB, OMP)
-    using ExecutionPolicy = decltype(oneapi::dpl::execution::par_unseq);
-    using Allocator = std::allocator<T>;
-#else
-    // normal std execution policies
-    using ExecutionPolicy = decltype(std::execution::par_unseq);
-    using Allocator = std::allocator<T>;
-#endif
-
-    ExecutionPolicy exe_policy{};
-    Allocator allocator;
-
     // Device side pointers
-    std::vector<T, Allocator> a;
-    std::vector<T, Allocator> b;
-    std::vector<T, Allocator> c;
+    std::vector<T> a;
+    std::vector<T> b;
+    std::vector<T> c;
 
 
   public:
-    STDDataStream(const int, int);
+    STDDataStream(const int, int) noexcept;
     ~STDDataStream() = default;
 
     virtual void copy() override;
diff --git a/src/std-data/model.cmake b/src/std-data/model.cmake
index 2454295..ef69f30 100644
--- a/src/std-data/model.cmake
+++ b/src/std-data/model.cmake
@@ -19,30 +19,9 @@ register_flag_optional(NVHPC_OFFLOAD
            ccall - Compile for all supported compute capabilities"
         "")
 
-register_flag_optional(ONEDPL_OFFLOAD
-        "Use the DPC++ oneDPL library which supports STL algorithms on SYCL, TBB, and OpenMP.
-         This option only supports the oneDPL library shipped with oneAPI, and must use the dpcpp
-         compiler (i.e -DCMAKE_CXX_COMPILER=dpcpp) for this option.
-         Make sure your oneAPI installation includes at least the following components: dpcpp, onedpl, onetbb.
-         The env. variable `TBBROOT` needs to point to the base directory of your TBB install (e.g /opt/intel/oneapi/tbb/latest/).
-         This should be done by oneAPI's `setvars.sh` script automatically.
-
-         Possible values are:
-           TBB   - Use the TBB backend, the correct TBB library will be linked from oneAPI automatically.
-           OMP   - Use the OpenMP backend
-           DPCPP - Use the SYCL (via dpcpp) backend with the default selector.
-                  See https://intel.github.io/llvm-docs/EnvironmentVariables.html#sycl-device-filter
-                  on selecting a non-default device or SYCL backend."
-        "")
-
-
 macro(setup)
     set(CMAKE_CXX_STANDARD 17)
 
-    if (NVHPC_OFFLOAD AND ONEDPL_OFFLOAD)
-        message(FATAL_ERROR "NVHPC_OFFLOAD and NVHPC_OFFLOAD are mutually exclusive")
-    endif ()
-
     if (NVHPC_OFFLOAD)
         set(NVHPC_FLAGS -stdpar -gpu=${NVHPC_OFFLOAD})
         # propagate flags to linker so that it links with the gpu stuff as well
@@ -51,33 +30,4 @@ macro(setup)
     endif ()
 
 
-    if (ONEDPL_OFFLOAD)
-        set(CXX_EXTRA_FLAGS)
-        set(CXX_EXTRA_LIBRARIES /opt/intel/oneapi/tbb/2021.4.0/lib/intel64/gcc4.8/libtbb.so)
-        # propagate flags to linker so that it links with the gpu stuff as well
-        register_append_cxx_flags(ANY -fopenmp -fsycl-unnamed-lambda -fsycl)
-
-        # XXX see https://www.intel.com/content/www/us/en/develop/documentation/oneapi-dpcpp-library-guide/top/oneapi-dpc-library-onedpl-overview.html
-        # this is to avoid the system TBB headers (if exists) from having precedence which isn't compatible with oneDPL's par implementation
-        register_definitions(
-                PSTL_USE_PARALLEL_POLICIES=0
-                _GLIBCXX_USE_TBB_PAR_BACKEND=0
-        )
-
-        register_definitions(USE_ONEDPL)
-        if (ONEDPL_OFFLOAD STREQUAL "TBB")
-            register_definitions(ONEDPL_USE_TBB_BACKEND=1)
-        elseif (ONEDPL_OFFLOAD STREQUAL "OPENMP")
-            register_definitions(ONEDPL_USE_OPENMP_BACKEND=1)
-        elseif (ONEDPL_OFFLOAD STREQUAL "SYCL")
-            register_definitions(ONEDPL_USE_DPCPP_BACKEND=1)
-        else ()
-            message(FATAL_ERROR "Unsupported ONEDPL_OFFLOAD backend: ${ONEDPL_OFFLOAD}")
-        endif ()
-
-        # even with the workaround above, -ltbb may still end up with the wrong one, so be explicit here
-        register_link_library($ENV{TBBROOT}/lib/intel64/gcc4.8/libtbb.so)
-
-    endif ()
-
 endmacro()
diff --git a/src/std-indices/STDIndicesStream.cpp b/src/std-indices/STDIndicesStream.cpp
index ceb9d3d..a88dd18 100644
--- a/src/std-indices/STDIndicesStream.cpp
+++ b/src/std-indices/STDIndicesStream.cpp
@@ -5,32 +5,21 @@
 // source code
 
 #include "STDIndicesStream.h"
-#include <iostream>
+
+#include <algorithm>
+#include <execution>
+#include <numeric>
+
+// There are three execution policies:
+// auto exe_policy = std::execution::seq;
+// auto exe_policy = std::execution::par;
+auto exe_policy = std::execution::par_unseq;
+
 
 template <class T>
-STDIndicesStream<T>::STDIndicesStream(const int ARRAY_SIZE, int device) :
-array_size{ARRAY_SIZE}, range_start(0), range_end(array_size),
-#if defined(ONEDPL_USE_DPCPP_BACKEND)
-exe_policy(oneapi::dpl::execution::make_device_policy(cl::sycl::default_selector{})),
-    allocator(exe_policy.queue()),
-    a(array_size, allocator), b(array_size, allocator), c(array_size, allocator)
-#else
-a(array_size), b(array_size),c(array_size)
-#endif
+STDIndicesStream<T>::STDIndicesStream(const int ARRAY_SIZE, int device)
+  noexcept : array_size{ARRAY_SIZE}, range(0, array_size), a(array_size), b(array_size), c(array_size) 
 {
-#if USE_ONEDPL
-    std::cout << "Using oneDPL backend: ";
-  #if defined(ONEDPL_USE_DPCPP_BACKEND)
-    std::cout << "SYCL USM (device=" << exe_policy.queue().get_device().get_info<sycl::info::device::name>() << ")";
-  #elif defined(ONEDPL_USE_TBB_BACKEND)
-    std::cout << "TBB";
-  #elif defined(ONEDPL_USE_OPENMP_BACKEND)
-    std::cout << "OpenMP";
-  #else
-    std::cout << "Default";
-  #endif
-  std::cout << std::endl;
-#endif
 }
 
 template <class T>
@@ -44,13 +33,11 @@ void STDIndicesStream<T>::init_arrays(T initA, T initB, T initC)
 template <class T>
 void STDIndicesStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
 {
-  // operator = is deleted because h_* vectors may have different allocator type compared to ours
-  std::copy(a.begin(), a.end(), h_a.begin());
-  std::copy(b.begin(), b.end(), h_b.begin());
-  std::copy(c.begin(), c.end(), h_c.begin());
+  h_a = a;
+  h_b = b;
+  h_c = c;
 }
 
-
 template <class T>
 void STDIndicesStream<T>::copy()
 {
@@ -62,7 +49,7 @@ template <class T>
 void STDIndicesStream<T>::mul()
 {
   //  b[i] = scalar * c[i];
-  std::transform(exe_policy, range_start, range_end, b.begin(), [&, scalar = startScalar](int i) {
+  std::transform(exe_policy, range.begin(), range.end(), b.begin(), [&, scalar = startScalar](int i) {
     return scalar * c[i];
   });
 }
@@ -71,7 +58,7 @@ template <class T>
 void STDIndicesStream<T>::add()
 {
   //  c[i] = a[i] + b[i];
-  std::transform(exe_policy, range_start, range_end, c.begin(), [&](int i) {
+  std::transform(exe_policy, range.begin(), range.end(), c.begin(), [&](int i) {
     return a[i] + b[i];
   });
 }
@@ -80,7 +67,7 @@ template <class T>
 void STDIndicesStream<T>::triad()
 {
   //  a[i] = b[i] + scalar * c[i];
-  std::transform(exe_policy, range_start, range_end, a.begin(), [&, scalar = startScalar](int i) {
+  std::transform(exe_policy, range.begin(), range.end(), a.begin(), [&, scalar = startScalar](int i) {
     return b[i] + scalar * c[i];
   });
 }
@@ -92,7 +79,7 @@ void STDIndicesStream<T>::nstream()
   //  Need to do in two stages with C++11 STL.
   //  1: a[i] += b[i]
   //  2: a[i] += scalar * c[i];
-  std::transform(exe_policy, range_start, range_end, a.begin(), [&, scalar = startScalar](int i) {
+  std::transform(exe_policy, range.begin(), range.end(), a.begin(), [&, scalar = startScalar](int i) {
     return a[i] + b[i] + scalar * c[i];
   });
 }
diff --git a/src/std-indices/STDIndicesStream.h b/src/std-indices/STDIndicesStream.h
index dfb63f6..66c8bb0 100644
--- a/src/std-indices/STDIndicesStream.h
+++ b/src/std-indices/STDIndicesStream.h
@@ -6,62 +6,46 @@
 
 #pragma once
 
+#include <iostream>
+#include <stdexcept>
 #include "Stream.h"
 
 #define IMPLEMENTATION_STRING "STD (index-oriented)"
 
 
-
-#ifdef USE_ONEDPL
-    #define PSTL_USAGE_WARNINGS 1
-
-    #include <oneapi/dpl/execution>
-    #include <oneapi/dpl/iterator>
-    #include <oneapi/dpl/algorithm>
-    #include <oneapi/dpl/memory>
-    #include <oneapi/dpl/numeric>
-
-    #ifdef ONEDPL_USE_DPCPP_BACKEND
-        #include <CL/sycl.hpp>
-    #endif
-#else
-    #include <algorithm>
-    #include <execution>
-    #include <numeric>
-
-    #if defined(ONEDPL_USE_DPCPP_BACKEND) || \
-            defined(ONEDPL_USE_TBB_BACKEND)   || \
-            defined(ONEDPL_USE_OPENMP_BACKEND)
-    #error oneDPL missing (ONEDPL_VERSION_MAJOR not defined) but backend (ONEDPL_USE_*_BACKEND) specified
-    #endif
-
-#endif
-
 // A lightweight counting iterator which will be used by the STL algorithms
 // NB: C++ <= 17 doesn't have this built-in, and it's only added later in ranges-v3 (C++2a) which this
 // implementation doesn't target
 template <typename N>
-class ranged_iterator {
-  N num;
+class ranged {
+  N from, to;
 public:
-  using difference_type = N;
-  using value_type = N;
-  using pointer = const N*;
-  using reference = const N&;
-  using iterator_category = std::random_access_iterator_tag;
-  explicit ranged_iterator(N _num = 0) : num(_num) {}
+  ranged(N from, N to ): from(from), to(to) {}
+    class iterator {
+      N num;
+    public:
+      using difference_type = N;
+      using value_type = N;
+      using pointer = const N*;
+      using reference = const N&;
+      using iterator_category = std::random_access_iterator_tag;
+      explicit iterator(N _num = 0) : num(_num) {}
 
-  ranged_iterator<N>& operator++() { num++; return *this; }
-  ranged_iterator<N> operator++(int) { ranged_iterator<N> retval = *this; ++(*this); return retval; }
-  ranged_iterator<N> operator+(const value_type v) const { return ranged_iterator<N>(num + v); }
+      iterator& operator++() { num++; return *this; }
+      iterator operator++(int) { iterator retval = *this; ++(*this); return retval; }
+      iterator operator+(const value_type v) const { return iterator(num + v); }
 
-  bool operator==(ranged_iterator<N> other) const { return num == other.num; }
-  bool operator!=(ranged_iterator<N> other) const { return *this != other; }
-  bool operator<(ranged_iterator<N> other) const { return num < other.num; }
+      bool operator==(iterator other) const { return num == other.num; }
+      bool operator!=(iterator other) const { return *this != other; }
+      bool operator<(iterator other) const { return num < other.num; }
 
-  reference operator*() const { return num;}
-  difference_type operator-(const ranged_iterator<N> &it) const { return num - it.num; }
-  value_type operator[](const difference_type &i) const { return num + i; }
+      reference operator*() const { return num;}
+      difference_type operator-(const iterator &it) const { return num - it.num; }
+      value_type operator[](const difference_type &i) const { return num + i; }
+
+    };
+    iterator begin() { return iterator(from); }
+    iterator end() { return iterator(to >= from? to+1 : to-1); }
 };
 
 template <class T>
@@ -71,37 +55,17 @@ class STDIndicesStream : public Stream<T>
     // Size of arrays
     int array_size;
 
-#if defined(ONEDPL_USE_DPCPP_BACKEND)
-    // SYCL oneDPL backend
-    using ExecutionPolicy = oneapi::dpl::execution::device_policy<>;
-    using Allocator = sycl::usm_allocator<T, sycl::usm::alloc::shared>;
-    using IteratorType = oneapi::dpl::counting_iterator<int>;
-#elif defined(USE_ONEDPL)
-    // every other non-SYCL oneDPL backend (i.e TBB, OMP)
-    using ExecutionPolicy = decltype(oneapi::dpl::execution::par_unseq);
-    using Allocator = std::allocator<T>;
-    using IteratorType = oneapi::dpl::counting_iterator<int>;
-#else
-    // normal std execution policies
-    using ExecutionPolicy = decltype(std::execution::par_unseq);
-    using Allocator = std::allocator<T>;
-    using IteratorType = ranged_iterator<int>;
-#endif
-
-    IteratorType range_start;
-    IteratorType range_end;
-
-    ExecutionPolicy exe_policy{};
-    Allocator allocator;
+    // induction range
+    ranged<int> range;
 
     // Device side pointers
-    std::vector<T, Allocator> a;
-    std::vector<T, Allocator> b;
-    std::vector<T, Allocator> c;
+    std::vector<T> a;
+    std::vector<T> b;
+    std::vector<T> c;
 
 
   public:
-    STDIndicesStream(const int, int);
+    STDIndicesStream(const int, int) noexcept;
     ~STDIndicesStream() = default;
 
     virtual void copy() override;
diff --git a/src/std-indices/model.cmake b/src/std-indices/model.cmake
index 2454295..ef69f30 100644
--- a/src/std-indices/model.cmake
+++ b/src/std-indices/model.cmake
@@ -19,30 +19,9 @@ register_flag_optional(NVHPC_OFFLOAD
            ccall - Compile for all supported compute capabilities"
         "")
 
-register_flag_optional(ONEDPL_OFFLOAD
-        "Use the DPC++ oneDPL library which supports STL algorithms on SYCL, TBB, and OpenMP.
-         This option only supports the oneDPL library shipped with oneAPI, and must use the dpcpp
-         compiler (i.e -DCMAKE_CXX_COMPILER=dpcpp) for this option.
-         Make sure your oneAPI installation includes at least the following components: dpcpp, onedpl, onetbb.
-         The env. variable `TBBROOT` needs to point to the base directory of your TBB install (e.g /opt/intel/oneapi/tbb/latest/).
-         This should be done by oneAPI's `setvars.sh` script automatically.
-
-         Possible values are:
-           TBB   - Use the TBB backend, the correct TBB library will be linked from oneAPI automatically.
-           OMP   - Use the OpenMP backend
-           DPCPP - Use the SYCL (via dpcpp) backend with the default selector.
-                  See https://intel.github.io/llvm-docs/EnvironmentVariables.html#sycl-device-filter
-                  on selecting a non-default device or SYCL backend."
-        "")
-
-
 macro(setup)
     set(CMAKE_CXX_STANDARD 17)
 
-    if (NVHPC_OFFLOAD AND ONEDPL_OFFLOAD)
-        message(FATAL_ERROR "NVHPC_OFFLOAD and NVHPC_OFFLOAD are mutually exclusive")
-    endif ()
-
     if (NVHPC_OFFLOAD)
         set(NVHPC_FLAGS -stdpar -gpu=${NVHPC_OFFLOAD})
         # propagate flags to linker so that it links with the gpu stuff as well
@@ -51,33 +30,4 @@ macro(setup)
     endif ()
 
 
-    if (ONEDPL_OFFLOAD)
-        set(CXX_EXTRA_FLAGS)
-        set(CXX_EXTRA_LIBRARIES /opt/intel/oneapi/tbb/2021.4.0/lib/intel64/gcc4.8/libtbb.so)
-        # propagate flags to linker so that it links with the gpu stuff as well
-        register_append_cxx_flags(ANY -fopenmp -fsycl-unnamed-lambda -fsycl)
-
-        # XXX see https://www.intel.com/content/www/us/en/develop/documentation/oneapi-dpcpp-library-guide/top/oneapi-dpc-library-onedpl-overview.html
-        # this is to avoid the system TBB headers (if exists) from having precedence which isn't compatible with oneDPL's par implementation
-        register_definitions(
-                PSTL_USE_PARALLEL_POLICIES=0
-                _GLIBCXX_USE_TBB_PAR_BACKEND=0
-        )
-
-        register_definitions(USE_ONEDPL)
-        if (ONEDPL_OFFLOAD STREQUAL "TBB")
-            register_definitions(ONEDPL_USE_TBB_BACKEND=1)
-        elseif (ONEDPL_OFFLOAD STREQUAL "OPENMP")
-            register_definitions(ONEDPL_USE_OPENMP_BACKEND=1)
-        elseif (ONEDPL_OFFLOAD STREQUAL "SYCL")
-            register_definitions(ONEDPL_USE_DPCPP_BACKEND=1)
-        else ()
-            message(FATAL_ERROR "Unsupported ONEDPL_OFFLOAD backend: ${ONEDPL_OFFLOAD}")
-        endif ()
-
-        # even with the workaround above, -ltbb may still end up with the wrong one, so be explicit here
-        register_link_library($ENV{TBBROOT}/lib/intel64/gcc4.8/libtbb.so)
-
-    endif ()
-
 endmacro()

From 0b51c76af991abcf1a4f3ffd6aa54d27ee850f01 Mon Sep 17 00:00:00 2001
From: Tom Deakin <tom.deakin@bristol.ac.uk>
Date: Wed, 22 Dec 2021 12:28:28 +0000
Subject: [PATCH 6/9] Update src/std-data/STDDataStream.h

---
 src/std-data/STDDataStream.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/std-data/STDDataStream.h b/src/std-data/STDDataStream.h
index f8bc302..82dae7d 100644
--- a/src/std-data/STDDataStream.h
+++ b/src/std-data/STDDataStream.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2020 Tom Deakin
+// Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
 // University of Bristol HPC
 //
 // For full license terms please see the LICENSE file distributed with this

From 91e83067a336bb0d43add962050251222befb06f Mon Sep 17 00:00:00 2001
From: Tom Deakin <tom.deakin@bristol.ac.uk>
Date: Wed, 22 Dec 2021 12:28:54 +0000
Subject: [PATCH 7/9] Update src/std-data/STDDataStream.h

---
 src/std-data/STDDataStream.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/std-data/STDDataStream.h b/src/std-data/STDDataStream.h
index 82dae7d..1c15b7c 100644
--- a/src/std-data/STDDataStream.h
+++ b/src/std-data/STDDataStream.h
@@ -1,5 +1,4 @@
 // Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
-// University of Bristol HPC
 //
 // For full license terms please see the LICENSE file distributed with this
 // source code

From fe06ac03b0b50fee685ff37cb984771a8c017318 Mon Sep 17 00:00:00 2001
From: Tom Deakin <tom.deakin@bristol.ac.uk>
Date: Wed, 22 Dec 2021 12:31:29 +0000
Subject: [PATCH 8/9] Update src/std-data/STDDataStream.cpp

---
 src/std-data/STDDataStream.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/std-data/STDDataStream.cpp b/src/std-data/STDDataStream.cpp
index a2628e1..2261017 100644
--- a/src/std-data/STDDataStream.cpp
+++ b/src/std-data/STDDataStream.cpp
@@ -1,4 +1,5 @@
-// Copyright (c) 2020 Tom Deakin
+// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+// Updated in 2021 by University of Bristol
 // University of Bristol HPC
 //
 // For full license terms please see the LICENSE file distributed with this

From e620a7ac36114513e7ffa18b9e0b4d019a763d3a Mon Sep 17 00:00:00 2001
From: Tom Deakin <tom.deakin@bristol.ac.uk>
Date: Wed, 22 Dec 2021 12:33:49 +0000
Subject: [PATCH 9/9] ammend copyright headers

---
 src/std-data/STDDataStream.cpp       | 3 +--
 src/std-data/STDDataStream.h         | 1 +
 src/std-indices/STDIndicesStream.cpp | 2 +-
 src/std-indices/STDIndicesStream.h   | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/std-data/STDDataStream.cpp b/src/std-data/STDDataStream.cpp
index 2261017..343e247 100644
--- a/src/std-data/STDDataStream.cpp
+++ b/src/std-data/STDDataStream.cpp
@@ -1,6 +1,5 @@
 // Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
-// Updated in 2021 by University of Bristol
-// University of Bristol HPC
+// Updated 2021 by University of Bristol
 //
 // For full license terms please see the LICENSE file distributed with this
 // source code
diff --git a/src/std-data/STDDataStream.h b/src/std-data/STDDataStream.h
index 1c15b7c..741fd6c 100644
--- a/src/std-data/STDDataStream.h
+++ b/src/std-data/STDDataStream.h
@@ -1,4 +1,5 @@
 // Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
+// Updated 2021 by University of Bristol
 //
 // For full license terms please see the LICENSE file distributed with this
 // source code
diff --git a/src/std-indices/STDIndicesStream.cpp b/src/std-indices/STDIndicesStream.cpp
index a88dd18..2221f90 100644
--- a/src/std-indices/STDIndicesStream.cpp
+++ b/src/std-indices/STDIndicesStream.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2020 Tom Deakin
+// Copyright (c) 2021 Tom Deakin and Tom Lin
 // University of Bristol HPC
 //
 // For full license terms please see the LICENSE file distributed with this
diff --git a/src/std-indices/STDIndicesStream.h b/src/std-indices/STDIndicesStream.h
index 66c8bb0..bc068aa 100644
--- a/src/std-indices/STDIndicesStream.h
+++ b/src/std-indices/STDIndicesStream.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2020 Tom Deakin
+// Copyright (c) 2021 Tom Deakin and Tom Lin
 // University of Bristol HPC
 //
 // For full license terms please see the LICENSE file distributed with this