Add options for std::vector or raw pointers for TBB/STD

2022-07-24 21:17:17 +01:00 · 2022-07-24 21:17:17 +01:00 · 5f6e714bdd
commit 5f6e714bdd
parent 240962722f
12 changed files with 200 additions and 72 deletions
--- a/src/std-data/STDDataStream.cpp
+++ b/src/std-data/STDDataStream.cpp
@ -10,60 +10,79 @@
 #include <execution>
 #include <numeric>

+#ifndef ALIGNMENT
+#define ALIGNMENT (2*1024*1024) // 2MB
+#endif
+
+#ifdef USE_VECTOR
+#define BEGIN(x) (x).begin()
+#define END(x) (x).end()
+#else
+#define BEGIN(x) (x)
+#define END(x) ((x) + array_size)
+#endif
+
 // There are three execution policies:
 // auto exe_policy = std::execution::seq;
 // auto exe_policy = std::execution::par;
-auto exe_policy = std::execution::par_unseq;
+constexpr auto exe_policy = std::execution::par_unseq;


 template <class T>
 STDDataStream<T>::STDDataStream(const int ARRAY_SIZE, int device)
-  noexcept : array_size{ARRAY_SIZE}, a(array_size), b(array_size), c(array_size)
-{
-}
+  noexcept : array_size{ARRAY_SIZE},
+#ifdef USE_VECTOR
+  a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE)
+#else
+  // Raw buffers: aligned_alloc leaves the contents uninitialised until they are first written.
+  a((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)),
+  b((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)),
+  c((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE))
+#endif
+{ std::cout <<"Backing storage typeid: " << typeid(a).name() << std::endl; }

 template <class T>
 void STDDataStream<T>::init_arrays(T initA, T initB, T initC)
 {
-  std::fill(exe_policy, a.begin(), a.end(), initA);
-  std::fill(exe_policy, b.begin(), b.end(), initB);
-  std::fill(exe_policy, c.begin(), c.end(), initC);
+  std::fill(exe_policy, BEGIN(a), END(a), initA);
+  std::fill(exe_policy, BEGIN(b), END(b), initB);
+  std::fill(exe_policy, BEGIN(c), END(c), initC);
 }

 template <class T>
 void STDDataStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
 {
-  h_a = a;
-  h_b = b;
-  h_c = c;
+  std::copy(BEGIN(a), END(a), h_a.begin());
+  std::copy(BEGIN(b), END(b), h_b.begin());
+  std::copy(BEGIN(c), END(c), h_c.begin());
 }

 template <class T>
 void STDDataStream<T>::copy()
 {
  // c[i] = a[i]
-  std::copy(exe_policy, a.begin(), a.end(), c.begin());
+  std::copy(exe_policy, BEGIN(a), END(a), BEGIN(c));
 }

 template <class T>
 void STDDataStream<T>::mul()
 {
  //  b[i] = scalar * c[i];
-  std::transform(exe_policy, c.begin(), c.end(), b.begin(), [scalar = startScalar](T ci){ return scalar*ci; });
+  std::transform(exe_policy, BEGIN(c), END(c), BEGIN(b), [scalar = startScalar](T ci){ return scalar*ci; });
 }

 template <class T>
 void STDDataStream<T>::add()
 {
  //  c[i] = a[i] + b[i];
-  std::transform(exe_policy, a.begin(), a.end(), b.begin(), c.begin(), std::plus<T>());
+  std::transform(exe_policy, BEGIN(a), END(a), BEGIN(b), BEGIN(c), std::plus<T>());
 }

 template <class T>
 void STDDataStream<T>::triad()
 {
  //  a[i] = b[i] + scalar * c[i];
-  std::transform(exe_policy, b.begin(), b.end(), c.begin(), a.begin(), [scalar = startScalar](T bi, T ci){ return bi+scalar*ci; });
+  std::transform(exe_policy, BEGIN(b), END(b), BEGIN(c), BEGIN(a), [scalar = startScalar](T bi, T ci){ return bi+scalar*ci; });
 }

 template <class T>
@ -73,8 +92,8 @@ void STDDataStream<T>::nstream()
  //  Need to do in two stages with C++11 STL.
  //  1: a[i] += b[i]
  //  2: a[i] += scalar * c[i];
-  std::transform(exe_policy, a.begin(), a.end(), b.begin(), a.begin(), [](T ai, T bi){ return ai + bi; });
-  std::transform(exe_policy, a.begin(), a.end(), c.begin(), a.begin(), [scalar = startScalar](T ai, T ci){ return ai + scalar*ci; });
+  std::transform(exe_policy, BEGIN(a), END(a), BEGIN(b), BEGIN(a), [](T ai, T bi){ return ai + bi; });
+  std::transform(exe_policy, BEGIN(a), END(a), BEGIN(c), BEGIN(a), [scalar = startScalar](T ai, T ci){ return ai + scalar*ci; });
 }
   

@ -82,7 +101,7 @@ template <class T>
 T STDDataStream<T>::dot()
 {
  // sum = 0; sum += a[i]*b[i]; return sum;
-  return std::transform_reduce(exe_policy, a.begin(), a.end(), b.begin(), 0.0);
+  return std::transform_reduce(exe_policy, BEGIN(a), END(a), BEGIN(b), 0.0);
 }

 void listDevices(void)
@ -102,3 +121,5 @@ std::string getDeviceDriver(const int)
 template class STDDataStream<float>;
 template class STDDataStream<double>;

+#undef BEGIN
+#undef END
--- a/src/std-data/STDDataStream.h
+++ b/src/std-data/STDDataStream.h
@ -21,9 +21,11 @@ class STDDataStream : public Stream<T>
    int array_size;

    // Device side pointers
-    std::vector<T> a;
-    std::vector<T> b;
-    std::vector<T> c;
+#ifdef USE_VECTOR
+    std::vector<T> a, b, c;
+#else
+    T *a, *b, *c;
+#endif


  public:
--- a/src/std-data/model.cmake
+++ b/src/std-data/model.cmake
@ -3,6 +3,10 @@ register_flag_optional(CMAKE_CXX_COMPILER
        "Any CXX compiler that is supported by CMake detection"
        "c++")

+register_flag_optional(USE_VECTOR
+        "Whether to use std::vector<T> for storage or use aligned_alloc. C++ vectors are *zero* initialised, whereas aligned_alloc memory is uninitialised before first use."
+        "OFF")
+
 register_flag_optional(NVHPC_OFFLOAD
        "Enable offloading support (via the non-standard `-stdpar`) for the new NVHPC SDK.
         The values are Nvidia architectures in ccXY format will be passed in via `-gpu=` (e.g `cc70`)
@ -28,6 +32,8 @@ macro(setup)
        register_append_cxx_flags(ANY ${NVHPC_FLAGS})
        register_append_link_flags(${NVHPC_FLAGS})
    endif ()
-
+    if(USE_VECTOR)
+        register_definitions(USE_VECTOR)
+    endif()

 endmacro()
--- a/src/std-indices/STDIndicesStream.cpp
+++ b/src/std-indices/STDIndicesStream.cpp
@ -10,46 +10,63 @@
 #include <execution>
 #include <numeric>

+#ifndef ALIGNMENT
+#define ALIGNMENT (2*1024*1024) // 2MB
+#endif
+
+#ifdef USE_VECTOR
+#define BEGIN(x) (x).begin()
+#define END(x) (x).end()
+#else
+#define BEGIN(x) (x)
+#define END(x) ((x) + array_size)
+#endif
+
 // There are three execution policies:
 // auto exe_policy = std::execution::seq;
 // auto exe_policy = std::execution::par;
-auto exe_policy = std::execution::par_unseq;
-
+constexpr auto exe_policy = std::execution::par_unseq;

 template <class T>
 STDIndicesStream<T>::STDIndicesStream(const int ARRAY_SIZE, int device)
-  noexcept : array_size{ARRAY_SIZE}, range(0, array_size), a(array_size), b(array_size), c(array_size) 
-{
-}
+  noexcept : array_size{ARRAY_SIZE}, range(0, array_size),
+#ifdef USE_VECTOR
+  a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE)
+#else
+  a((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)),
+  b((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)),
+  c((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE))
+#endif
+{ std::cout <<"Backing storage typeid: " << typeid(a).name() << std::endl; }

 template <class T>
 void STDIndicesStream<T>::init_arrays(T initA, T initB, T initC)
 {
-  std::fill(exe_policy, a.begin(), a.end(), initA);
-  std::fill(exe_policy, b.begin(), b.end(), initB);
-  std::fill(exe_policy, c.begin(), c.end(), initC);
+  std::fill(exe_policy, BEGIN(a), END(a), initA);
+  std::fill(exe_policy, BEGIN(b), END(b), initB);
+  std::fill(exe_policy, BEGIN(c), END(c), initC);
 }

 template <class T>
 void STDIndicesStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
 {
-  h_a = a;
-  h_b = b;
-  h_c = c;
+  std::copy(BEGIN(a), END(a), h_a.begin());
+  std::copy(BEGIN(b), END(b), h_b.begin());
+  std::copy(BEGIN(c), END(c), h_c.begin());
 }

 template <class T>
 void STDIndicesStream<T>::copy()
 {
  // c[i] = a[i]
-  std::copy(exe_policy, a.begin(), a.end(), c.begin());
+  std::copy(exe_policy, BEGIN(a), END(a), BEGIN(c));
 }

 template <class T>
 void STDIndicesStream<T>::mul()
 {
  //  b[i] = scalar * c[i];
-  std::transform(exe_policy, range.begin(), range.end(), b.begin(), [&, scalar = startScalar](int i) {
+  std::transform(exe_policy, range.begin(), range.end(), BEGIN(b), [&, scalar = startScalar](int i) {
    return scalar * c[i];
  });
 }
@ -58,7 +75,7 @@ template <class T>
 void STDIndicesStream<T>::add()
 {
  //  c[i] = a[i] + b[i];
-  std::transform(exe_policy, range.begin(), range.end(), c.begin(), [&](int i) {
+  std::transform(exe_policy, range.begin(), range.end(), BEGIN(c), [&](int i) {
    return a[i] + b[i];
  });
 }
@ -67,7 +84,7 @@ template <class T>
 void STDIndicesStream<T>::triad()
 {
  //  a[i] = b[i] + scalar * c[i];
-  std::transform(exe_policy, range.begin(), range.end(), a.begin(), [&, scalar = startScalar](int i) {
+  std::transform(exe_policy, range.begin(), range.end(), BEGIN(a), [&, scalar = startScalar](int i) {
    return b[i] + scalar * c[i];
  });
 }
@ -79,7 +96,7 @@ void STDIndicesStream<T>::nstream()
  //  Need to do in two stages with C++11 STL.
  //  1: a[i] += b[i]
  //  2: a[i] += scalar * c[i];
-  std::transform(exe_policy, range.begin(), range.end(), a.begin(), [&, scalar = startScalar](int i) {
+  std::transform(exe_policy, range.begin(), range.end(), BEGIN(a), [&, scalar = startScalar](int i) {
    return a[i] + b[i] + scalar * c[i];
  });
 }
@ -89,7 +106,7 @@ template <class T>
 T STDIndicesStream<T>::dot()
 {
  // sum = 0; sum += a[i]*b[i]; return sum;
-  return std::transform_reduce(exe_policy, a.begin(), a.end(), b.begin(), 0.0);
+  return std::transform_reduce(exe_policy, BEGIN(a), END(a), BEGIN(b), 0.0);
 }

 void listDevices(void)
@ -109,3 +126,5 @@ std::string getDeviceDriver(const int)
 template class STDIndicesStream<float>;
 template class STDIndicesStream<double>;

+#undef BEGIN
+#undef END
--- a/src/std-indices/STDIndicesStream.h
+++ b/src/std-indices/STDIndicesStream.h
@ -10,6 +10,11 @@
 #include <stdexcept>
 #include "Stream.h"

+#ifdef USE_SPAN
+#include <span>
+#endif
+
+
 #define IMPLEMENTATION_STRING "STD (index-oriented)"


@ -60,9 +65,11 @@ class STDIndicesStream : public Stream<T>
    ranged<int> range;

    // Device side pointers
-    std::vector<T> a;
-    std::vector<T> b;
-    std::vector<T> c;
+#ifdef USE_VECTOR
+    std::vector<T> a, b, c;
+#else
+    T *a, *b, *c;
+#endif


  public:
--- a/src/std-indices/model.cmake
+++ b/src/std-indices/model.cmake
@ -3,6 +3,10 @@ register_flag_optional(CMAKE_CXX_COMPILER
        "Any CXX compiler that is supported by CMake detection"
        "c++")

+register_flag_optional(USE_VECTOR
+        "Whether to use std::vector<T> for storage or use aligned_alloc. C++ vectors are *zero* initialised, whereas aligned_alloc memory is uninitialised before first use."
+        "OFF")
+
 register_flag_optional(NVHPC_OFFLOAD
        "Enable offloading support (via the non-standard `-stdpar`) for the new NVHPC SDK.
         The values are Nvidia architectures in ccXY format will be passed in via `-gpu=` (e.g `cc70`)
@ -28,6 +32,8 @@ macro(setup)
        register_append_cxx_flags(ANY ${NVHPC_FLAGS})
        register_append_link_flags(${NVHPC_FLAGS})
    endif ()
-
+    if(USE_VECTOR)
+        register_definitions(USE_VECTOR)
+    endif()

 endmacro()
--- a/src/std-ranges/STDRangesStream.cpp
+++ b/src/std-ranges/STDRangesStream.cpp
@ -10,20 +10,40 @@
 #include <execution>
 #include <ranges>

+#ifndef ALIGNMENT
+#define ALIGNMENT (2*1024*1024) // 2MB
+#endif
+
+#ifdef USE_VECTOR
+#define BEGIN(x) (x).begin()
+#define END(x) (x).end()
+#else
+#define BEGIN(x) (x)
+#define END(x) ((x) + array_size)
+#endif
+
+// There are three execution policies:
+// auto exe_policy = std::execution::seq;
+// auto exe_policy = std::execution::par;
+constexpr auto exe_policy = std::execution::par_unseq;
+
 template <class T>
 STDRangesStream<T>::STDRangesStream(const int ARRAY_SIZE, int device)
- : array_size{ARRAY_SIZE}
-{
-  a = std::vector<T>(array_size);
-  b = std::vector<T>(array_size);
-  c = std::vector<T>(array_size);
-}
+ : array_size{ARRAY_SIZE},
+#ifdef USE_VECTOR
+  a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE)
+#else
+  a((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)),
+  b((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)),
+  c((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE))
+#endif
+{ std::cout <<"Backing storage typeid: " << typeid(a).name() << std::endl; }

 template <class T>
 void STDRangesStream<T>::init_arrays(T initA, T initB, T initC)
 {
  std::for_each_n(
-    std::execution::par_unseq,
+    exe_policy,
    std::views::iota(0).begin(), array_size, // loop range
    [&] (int i) {
      a[i] = initA;
@ -37,16 +57,16 @@ template <class T>
 void STDRangesStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
 {
  // Element-wise copy.
-  h_a = a;
-  h_b = b;
-  h_c = c;
+    std::copy(BEGIN(a), END(a), h_a.begin());
+    std::copy(BEGIN(b), END(b), h_b.begin());
+    std::copy(BEGIN(c), END(c), h_c.begin());
 }

 template <class T>
 void STDRangesStream<T>::copy()
 {
  std::for_each_n(
-    std::execution::par_unseq,
+    exe_policy,
    std::views::iota(0).begin(), array_size,
    [&] (int i) {
      c[i] = a[i];
@ -60,7 +80,7 @@ void STDRangesStream<T>::mul()
  const T scalar = startScalar;

  std::for_each_n(
-    std::execution::par_unseq,
+    exe_policy,
    std::views::iota(0).begin(), array_size,
    [&] (int i) {
      b[i] = scalar * c[i];
@ -72,7 +92,7 @@ template <class T>
 void STDRangesStream<T>::add()
 {
  std::for_each_n(
-    std::execution::par_unseq,
+    exe_policy,
    std::views::iota(0).begin(), array_size,
    [&] (int i) {
      c[i] = a[i] + b[i];
@ -86,7 +106,7 @@ void STDRangesStream<T>::triad()
  const T scalar = startScalar;

  std::for_each_n(
-    std::execution::par_unseq,
+    exe_policy,
    std::views::iota(0).begin(), array_size,
    [&] (int i) {
      a[i] = b[i] + scalar * c[i];
@ -100,7 +120,7 @@ void STDRangesStream<T>::nstream()
  const T scalar = startScalar;

  std::for_each_n(
-    std::execution::par_unseq,
+    exe_policy,
    std::views::iota(0).begin(), array_size,
    [&] (int i) {
      a[i] += b[i] + scalar * c[i];
@ -114,8 +134,8 @@ T STDRangesStream<T>::dot()
  // sum += a[i] * b[i];
  return
    std::transform_reduce(
-      std::execution::par_unseq,
-      a.begin(), a.end(), b.begin(), 0.0);
+      exe_policy,
+      BEGIN(a), END(a), BEGIN(b), 0.0);
 }

 void listDevices(void)
@ -136,3 +156,5 @@ std::string getDeviceDriver(const int)
 template class STDRangesStream<float>;
 template class STDRangesStream<double>;

+#undef BEGIN
+#undef END
--- a/src/std-ranges/STDRangesStream.hpp
+++ b/src/std-ranges/STDRangesStream.hpp
@ -21,9 +21,11 @@ class STDRangesStream : public Stream<T>
    int array_size;

    // Device side pointers
-    std::vector<T> a;
-    std::vector<T> b;
-    std::vector<T> c;
+#ifdef USE_VECTOR
+    std::vector<T> a, b, c;
+#else
+    T *a, *b, *c;
+#endif

  public:
    STDRangesStream(const int, int);
--- a/src/std-ranges/model.cmake
+++ b/src/std-ranges/model.cmake
@ -3,6 +3,10 @@ register_flag_optional(CMAKE_CXX_COMPILER
        "Any CXX compiler that is supported by CMake detection and supports C++20 Ranges"
        "c++")

+register_flag_optional(USE_VECTOR
+        "Whether to use std::vector<T> for storage or use aligned_alloc. C++ vectors are *zero* initialised, whereas aligned_alloc memory is uninitialised before first use."
+        "OFF")
+
 macro(setup)

    # TODO this needs to eventually be removed when CMake adds proper C++20 support or at least update the flag used here
@ -13,4 +17,7 @@ macro(setup)
    unset(CMAKE_CXX_STANDARD) # drop any existing standard we have set by default
    # and append our own:
    register_append_cxx_flags(ANY -std=c++2a)
+    if(USE_VECTOR)
+        register_definitions(USE_VECTOR)
+    endif()
 endmacro()
--- a/src/tbb/TBBStream.cpp
+++ b/src/tbb/TBBStream.cpp
@ -5,15 +5,37 @@
 // source code

 #include "TBBStream.hpp"
+#include <cstdlib>
+
+#ifndef ALIGNMENT
+#define ALIGNMENT (2*1024*1024) // 2MB
+#endif
+
+#ifdef USE_VECTOR
+#define BEGIN(x) (x).begin()
+#define END(x) (x).end()
+#else
+#define BEGIN(x) (x)
+#define END(x) ((x) + array_size)
+#endif

 template <class T>
 TBBStream<T>::TBBStream(const int ARRAY_SIZE, int device)
- : partitioner(), range(0, ARRAY_SIZE), a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE)
+ : partitioner(), range(0, ARRAY_SIZE),
+#ifdef USE_VECTOR
+   a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE)
+#else
+   array_size(ARRAY_SIZE),
+   a((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)),
+   b((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)),
+   c((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE))
+#endif
 {
  if(device != 0){
    throw std::runtime_error("Device != 0 is not supported by TBB");
  }
  std::cout << "Using TBB partitioner: " PARTITIONER_NAME << std::endl;
+  std::cout << "Backing storage typeid: " << typeid(a).name() << std::endl;
 }


@ -35,9 +57,9 @@ template <class T>
 void TBBStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
 {
  // Element-wise copy.
-  h_a = a;
-  h_b = b;
-  h_c = c;
+  std::copy(BEGIN(a), END(a), h_a.begin());
+  std::copy(BEGIN(b), END(b), h_b.begin());
+  std::copy(BEGIN(c), END(c), h_c.begin());
 }

 template <class T>
@ -132,3 +154,5 @@ std::string getDeviceDriver(const int)
 template class TBBStream<float>;
 template class TBBStream<double>;

+#undef BEGIN
+#undef END
--- a/src/tbb/TBBStream.hpp
+++ b/src/tbb/TBBStream.hpp
@ -40,10 +40,15 @@ class TBBStream : public Stream<T>
    tbb_partitioner partitioner;
    tbb::blocked_range<size_t> range;
    // Device side pointers
-    std::vector<T> a;
-    std::vector<T> b;
-    std::vector<T> c;
- 
+#ifdef USE_VECTOR
+    std::vector<T> a, b, c;
+#else
+    size_t array_size;
+    T *a, *b, *c;
+#endif
+
+
+
  public:
    TBBStream(const int, int);
    ~TBBStream() = default;
--- a/src/tbb/model.cmake
+++ b/src/tbb/model.cmake
@ -1,7 +1,7 @@

 register_flag_optional(ONE_TBB_DIR
        "Absolute path to oneTBB (with header `onetbb/tbb.h`) distribution, the directory should contain at least `include/` and `lib/.
-         If unspecified, the system TBB (with header `tbb/tbb.h`) will be used via CMake's find_package(TBB)." 
+         If unspecified, the system TBB (with header `tbb/tbb.h`) will be used via CMake's find_package(TBB)."
        "")


@ -15,15 +15,22 @@ register_flag_optional(PARTITIONER
            See https://spec.oneapi.com/versions/latest/elements/oneTBB/source/algorithms.html#partitioners for more details."
        "AUTO")

+register_flag_optional(USE_VECTOR
+        "Whether to use std::vector<T> for storage or use aligned_alloc. C++ vectors are *zero* initialised, whereas aligned_alloc memory is uninitialised before first use."
+        "OFF")
+
 macro(setup)
    if(ONE_TBB_DIR)
        set(TBB_ROOT "${ONE_TBB_DIR}") # see https://github.com/Kitware/VTK/blob/0a31a9a3c1531ae238ac96a372fec4be42282863/CMake/FindTBB.cmake#L34
        # docs on Intel's website refers to TBB_DIR which is not correct
    endif()
-    
+

    # see https://github.com/oneapi-src/oneTBB/blob/master/cmake/README.md#tbbconfig---integration-of-binary-packages
    find_package(TBB REQUIRED)
    register_link_library(TBB::tbb)
    register_definitions(PARTITIONER_${PARTITIONER})
+    if(USE_VECTOR)
+        register_definitions(USE_VECTOR)
+    endif()
 endmacro()