Add options for std::vector or raw pointers for TBB/STD

This commit is contained in:
Tom Lin 2022-07-24 21:17:17 +01:00
parent 240962722f
commit 5f6e714bdd
12 changed files with 200 additions and 72 deletions

View File

@ -10,60 +10,79 @@
#include <execution> #include <execution>
#include <numeric> #include <numeric>
#ifndef ALIGNMENT
#define ALIGNMENT (2*1024*1024) // 2MB
#endif
#ifdef USE_VECTOR
#define BEGIN(x) (x).begin()
#define END(x) (x).end()
#else
#define BEGIN(x) (x)
#define END(x) ((x) + array_size)
#endif
// There are three execution policies: // There are three execution policies:
// auto exe_policy = std::execution::seq; // auto exe_policy = std::execution::seq;
// auto exe_policy = std::execution::par; // auto exe_policy = std::execution::par;
auto exe_policy = std::execution::par_unseq; constexpr auto exe_policy = std::execution::par_unseq;
template <class T> template <class T>
STDDataStream<T>::STDDataStream(const int ARRAY_SIZE, int device) STDDataStream<T>::STDDataStream(const int ARRAY_SIZE, int device)
noexcept : array_size{ARRAY_SIZE}, a(array_size), b(array_size), c(array_size) noexcept : array_size{ARRAY_SIZE},
{ #ifdef USE_VECTOR
} a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE)
#else
// array_size is already initialised above for both storage modes; repeating it here would be a duplicate mem-initializer.
a((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)),
b((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)),
c((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE))
#endif
{ std::cout <<"Backing storage typeid: " << typeid(a).name() << std::endl; }
template <class T> template <class T>
void STDDataStream<T>::init_arrays(T initA, T initB, T initC) void STDDataStream<T>::init_arrays(T initA, T initB, T initC)
{ {
std::fill(exe_policy, a.begin(), a.end(), initA); std::fill(exe_policy, BEGIN(a), END(a), initA);
std::fill(exe_policy, b.begin(), b.end(), initB); std::fill(exe_policy, BEGIN(b), END(b), initB);
std::fill(exe_policy, c.begin(), c.end(), initC); std::fill(exe_policy, BEGIN(c), END(c), initC);
} }
template <class T> template <class T>
void STDDataStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c) void STDDataStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
{ {
h_a = a; std::copy(BEGIN(a), END(a), h_a.begin());
h_b = b; std::copy(BEGIN(b), END(b), h_b.begin());
h_c = c; std::copy(BEGIN(c), END(c), h_c.begin());
} }
template <class T> template <class T>
void STDDataStream<T>::copy() void STDDataStream<T>::copy()
{ {
// c[i] = a[i] // c[i] = a[i]
std::copy(exe_policy, a.begin(), a.end(), c.begin()); std::copy(exe_policy, BEGIN(a), END(a), BEGIN(c));
} }
template <class T> template <class T>
void STDDataStream<T>::mul() void STDDataStream<T>::mul()
{ {
// b[i] = scalar * c[i]; // b[i] = scalar * c[i];
std::transform(exe_policy, c.begin(), c.end(), b.begin(), [scalar = startScalar](T ci){ return scalar*ci; }); std::transform(exe_policy, BEGIN(c), END(c), BEGIN(b), [scalar = startScalar](T ci){ return scalar*ci; });
} }
template <class T> template <class T>
void STDDataStream<T>::add() void STDDataStream<T>::add()
{ {
// c[i] = a[i] + b[i]; // c[i] = a[i] + b[i];
std::transform(exe_policy, a.begin(), a.end(), b.begin(), c.begin(), std::plus<T>()); std::transform(exe_policy, BEGIN(a), END(a), BEGIN(b), BEGIN(c), std::plus<T>());
} }
template <class T> template <class T>
void STDDataStream<T>::triad() void STDDataStream<T>::triad()
{ {
// a[i] = b[i] + scalar * c[i]; // a[i] = b[i] + scalar * c[i];
std::transform(exe_policy, b.begin(), b.end(), c.begin(), a.begin(), [scalar = startScalar](T bi, T ci){ return bi+scalar*ci; }); std::transform(exe_policy, BEGIN(b), END(b), BEGIN(c), BEGIN(a), [scalar = startScalar](T bi, T ci){ return bi+scalar*ci; });
} }
template <class T> template <class T>
@ -73,8 +92,8 @@ void STDDataStream<T>::nstream()
// Need to do in two stages with C++11 STL. // Need to do in two stages with C++11 STL.
// 1: a[i] += b[i] // 1: a[i] += b[i]
// 2: a[i] += scalar * c[i]; // 2: a[i] += scalar * c[i];
std::transform(exe_policy, a.begin(), a.end(), b.begin(), a.begin(), [](T ai, T bi){ return ai + bi; }); std::transform(exe_policy, BEGIN(a), END(a), BEGIN(b), BEGIN(a), [](T ai, T bi){ return ai + bi; });
std::transform(exe_policy, a.begin(), a.end(), c.begin(), a.begin(), [scalar = startScalar](T ai, T ci){ return ai + scalar*ci; }); std::transform(exe_policy, BEGIN(a), END(a), BEGIN(c), BEGIN(a), [scalar = startScalar](T ai, T ci){ return ai + scalar*ci; });
} }
@ -82,7 +101,7 @@ template <class T>
T STDDataStream<T>::dot() T STDDataStream<T>::dot()
{ {
// sum = 0; sum += a[i]*b[i]; return sum; // sum = 0; sum += a[i]*b[i]; return sum;
return std::transform_reduce(exe_policy, a.begin(), a.end(), b.begin(), 0.0); return std::transform_reduce(exe_policy, BEGIN(a), END(a), BEGIN(b), 0.0);
} }
void listDevices(void) void listDevices(void)
@ -102,3 +121,5 @@ std::string getDeviceDriver(const int)
template class STDDataStream<float>; template class STDDataStream<float>;
template class STDDataStream<double>; template class STDDataStream<double>;
#undef BEGIN
#undef END

View File

@ -21,9 +21,11 @@ class STDDataStream : public Stream<T>
int array_size; int array_size;
// Device side pointers // Device side pointers
std::vector<T> a; #ifdef USE_VECTOR
std::vector<T> b; std::vector<T> a, b, c;
std::vector<T> c; #else
T *a, *b, *c;
#endif
public: public:

View File

@ -3,6 +3,10 @@ register_flag_optional(CMAKE_CXX_COMPILER
"Any CXX compiler that is supported by CMake detection" "Any CXX compiler that is supported by CMake detection"
"c++") "c++")
register_flag_optional(USE_VECTOR
"Whether to use std::vector<T> for storage or use aligned_alloc. C++ vectors are *zero* initialised, whereas aligned_alloc memory is uninitialised before first use."
"OFF")
register_flag_optional(NVHPC_OFFLOAD register_flag_optional(NVHPC_OFFLOAD
"Enable offloading support (via the non-standard `-stdpar`) for the new NVHPC SDK. "Enable offloading support (via the non-standard `-stdpar`) for the new NVHPC SDK.
The values are Nvidia architectures in ccXY format will be passed in via `-gpu=` (e.g `cc70`) The values are Nvidia architectures in ccXY format will be passed in via `-gpu=` (e.g `cc70`)
@ -28,6 +32,8 @@ macro(setup)
register_append_cxx_flags(ANY ${NVHPC_FLAGS}) register_append_cxx_flags(ANY ${NVHPC_FLAGS})
register_append_link_flags(${NVHPC_FLAGS}) register_append_link_flags(${NVHPC_FLAGS})
endif () endif ()
if(USE_VECTOR)
register_definitions(USE_VECTOR)
endif()
endmacro() endmacro()

View File

@ -10,46 +10,63 @@
#include <execution> #include <execution>
#include <numeric> #include <numeric>
#ifndef ALIGNMENT
#define ALIGNMENT (2*1024*1024) // 2MB
#endif
#ifdef USE_VECTOR
#define BEGIN(x) (x).begin()
#define END(x) (x).end()
#else
#define BEGIN(x) (x)
#define END(x) ((x) + array_size)
#endif
// There are three execution policies: // There are three execution policies:
// auto exe_policy = std::execution::seq; // auto exe_policy = std::execution::seq;
// auto exe_policy = std::execution::par; // auto exe_policy = std::execution::par;
auto exe_policy = std::execution::par_unseq; constexpr auto exe_policy = std::execution::par_unseq;
template <class T> template <class T>
STDIndicesStream<T>::STDIndicesStream(const int ARRAY_SIZE, int device) STDIndicesStream<T>::STDIndicesStream(const int ARRAY_SIZE, int device)
noexcept : array_size{ARRAY_SIZE}, range(0, array_size), a(array_size), b(array_size), c(array_size) noexcept : array_size{ARRAY_SIZE}, range(0, array_size),
{ #ifdef USE_VECTOR
} a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE)
#else
a((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)),
b((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)),
c((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE))
#endif
{ std::cout <<"Backing storage typeid: " << typeid(a).name() << std::endl; }
template <class T> template <class T>
void STDIndicesStream<T>::init_arrays(T initA, T initB, T initC) void STDIndicesStream<T>::init_arrays(T initA, T initB, T initC)
{ {
std::fill(exe_policy, a.begin(), a.end(), initA); std::fill(exe_policy, BEGIN(a), END(a), initA);
std::fill(exe_policy, b.begin(), b.end(), initB); std::fill(exe_policy, BEGIN(b), END(b), initB);
std::fill(exe_policy, c.begin(), c.end(), initC); std::fill(exe_policy, BEGIN(c), END(c), initC);
} }
template <class T> template <class T>
void STDIndicesStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c) void STDIndicesStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
{ {
h_a = a; std::copy(BEGIN(a), END(a), h_a.begin());
h_b = b; std::copy(BEGIN(b), END(b), h_b.begin());
h_c = c; std::copy(BEGIN(c), END(c), h_c.begin());
} }
template <class T> template <class T>
void STDIndicesStream<T>::copy() void STDIndicesStream<T>::copy()
{ {
// c[i] = a[i] // c[i] = a[i]
std::copy(exe_policy, a.begin(), a.end(), c.begin()); std::copy(exe_policy, BEGIN(a), END(a), BEGIN(c));
} }
template <class T> template <class T>
void STDIndicesStream<T>::mul() void STDIndicesStream<T>::mul()
{ {
// b[i] = scalar * c[i]; // b[i] = scalar * c[i];
std::transform(exe_policy, range.begin(), range.end(), b.begin(), [&, scalar = startScalar](int i) { std::transform(exe_policy, range.begin(), range.end(), BEGIN(b), [&, scalar = startScalar](int i) {
return scalar * c[i]; return scalar * c[i];
}); });
} }
@ -58,7 +75,7 @@ template <class T>
void STDIndicesStream<T>::add() void STDIndicesStream<T>::add()
{ {
// c[i] = a[i] + b[i]; // c[i] = a[i] + b[i];
std::transform(exe_policy, range.begin(), range.end(), c.begin(), [&](int i) { std::transform(exe_policy, range.begin(), range.end(), BEGIN(c), [&](int i) {
return a[i] + b[i]; return a[i] + b[i];
}); });
} }
@ -67,7 +84,7 @@ template <class T>
void STDIndicesStream<T>::triad() void STDIndicesStream<T>::triad()
{ {
// a[i] = b[i] + scalar * c[i]; // a[i] = b[i] + scalar * c[i];
std::transform(exe_policy, range.begin(), range.end(), a.begin(), [&, scalar = startScalar](int i) { std::transform(exe_policy, range.begin(), range.end(), BEGIN(a), [&, scalar = startScalar](int i) {
return b[i] + scalar * c[i]; return b[i] + scalar * c[i];
}); });
} }
@ -79,7 +96,7 @@ void STDIndicesStream<T>::nstream()
// Need to do in two stages with C++11 STL. // Need to do in two stages with C++11 STL.
// 1: a[i] += b[i] // 1: a[i] += b[i]
// 2: a[i] += scalar * c[i]; // 2: a[i] += scalar * c[i];
std::transform(exe_policy, range.begin(), range.end(), a.begin(), [&, scalar = startScalar](int i) { std::transform(exe_policy, range.begin(), range.end(), BEGIN(a), [&, scalar = startScalar](int i) {
return a[i] + b[i] + scalar * c[i]; return a[i] + b[i] + scalar * c[i];
}); });
} }
@ -89,7 +106,7 @@ template <class T>
T STDIndicesStream<T>::dot() T STDIndicesStream<T>::dot()
{ {
// sum = 0; sum += a[i]*b[i]; return sum; // sum = 0; sum += a[i]*b[i]; return sum;
return std::transform_reduce(exe_policy, a.begin(), a.end(), b.begin(), 0.0); return std::transform_reduce(exe_policy, BEGIN(a), END(a), BEGIN(b), 0.0);
} }
void listDevices(void) void listDevices(void)
@ -109,3 +126,5 @@ std::string getDeviceDriver(const int)
template class STDIndicesStream<float>; template class STDIndicesStream<float>;
template class STDIndicesStream<double>; template class STDIndicesStream<double>;
#undef BEGIN
#undef END

View File

@ -10,6 +10,11 @@
#include <stdexcept> #include <stdexcept>
#include "Stream.h" #include "Stream.h"
#ifdef USE_SPAN
#include <span>
#endif
#define IMPLEMENTATION_STRING "STD (index-oriented)" #define IMPLEMENTATION_STRING "STD (index-oriented)"
@ -60,9 +65,11 @@ class STDIndicesStream : public Stream<T>
ranged<int> range; ranged<int> range;
// Device side pointers // Device side pointers
std::vector<T> a; #ifdef USE_VECTOR
std::vector<T> b; std::vector<T> a, b, c;
std::vector<T> c; #else
T *a, *b, *c;
#endif
public: public:

View File

@ -3,6 +3,10 @@ register_flag_optional(CMAKE_CXX_COMPILER
"Any CXX compiler that is supported by CMake detection" "Any CXX compiler that is supported by CMake detection"
"c++") "c++")
register_flag_optional(USE_VECTOR
"Whether to use std::vector<T> for storage or use aligned_alloc. C++ vectors are *zero* initialised, whereas aligned_alloc memory is uninitialised before first use."
"OFF")
register_flag_optional(NVHPC_OFFLOAD register_flag_optional(NVHPC_OFFLOAD
"Enable offloading support (via the non-standard `-stdpar`) for the new NVHPC SDK. "Enable offloading support (via the non-standard `-stdpar`) for the new NVHPC SDK.
The values are Nvidia architectures in ccXY format will be passed in via `-gpu=` (e.g `cc70`) The values are Nvidia architectures in ccXY format will be passed in via `-gpu=` (e.g `cc70`)
@ -28,6 +32,8 @@ macro(setup)
register_append_cxx_flags(ANY ${NVHPC_FLAGS}) register_append_cxx_flags(ANY ${NVHPC_FLAGS})
register_append_link_flags(${NVHPC_FLAGS}) register_append_link_flags(${NVHPC_FLAGS})
endif () endif ()
if(USE_VECTOR)
register_definitions(USE_VECTOR)
endif()
endmacro() endmacro()

View File

@ -10,20 +10,40 @@
#include <execution> #include <execution>
#include <ranges> #include <ranges>
#ifndef ALIGNMENT
#define ALIGNMENT (2*1024*1024) // 2MB
#endif
#ifdef USE_VECTOR
#define BEGIN(x) (x).begin()
#define END(x) (x).end()
#else
#define BEGIN(x) (x)
#define END(x) ((x) + array_size)
#endif
// There are three execution policies:
// auto exe_policy = std::execution::seq;
// auto exe_policy = std::execution::par;
constexpr auto exe_policy = std::execution::par_unseq;
template <class T> template <class T>
STDRangesStream<T>::STDRangesStream(const int ARRAY_SIZE, int device) STDRangesStream<T>::STDRangesStream(const int ARRAY_SIZE, int device)
: array_size{ARRAY_SIZE} : array_size{ARRAY_SIZE},
{ #ifdef USE_VECTOR
a = std::vector<T>(array_size); a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE)
b = std::vector<T>(array_size); #else
c = std::vector<T>(array_size); a((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)),
} b((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)),
c((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE))
#endif
{ std::cout <<"Backing storage typeid: " << typeid(a).name() << std::endl; }
template <class T> template <class T>
void STDRangesStream<T>::init_arrays(T initA, T initB, T initC) void STDRangesStream<T>::init_arrays(T initA, T initB, T initC)
{ {
std::for_each_n( std::for_each_n(
std::execution::par_unseq, exe_policy,
std::views::iota(0).begin(), array_size, // loop range std::views::iota(0).begin(), array_size, // loop range
[&] (int i) { [&] (int i) {
a[i] = initA; a[i] = initA;
@ -37,16 +57,16 @@ template <class T>
void STDRangesStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c) void STDRangesStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
{ {
// Element-wise copy. // Element-wise copy.
h_a = a; std::copy(BEGIN(a), END(a), h_a.begin());
h_b = b; std::copy(BEGIN(b), END(b), h_b.begin());
h_c = c; std::copy(BEGIN(c), END(c), h_c.begin());
} }
template <class T> template <class T>
void STDRangesStream<T>::copy() void STDRangesStream<T>::copy()
{ {
std::for_each_n( std::for_each_n(
std::execution::par_unseq, exe_policy,
std::views::iota(0).begin(), array_size, std::views::iota(0).begin(), array_size,
[&] (int i) { [&] (int i) {
c[i] = a[i]; c[i] = a[i];
@ -60,7 +80,7 @@ void STDRangesStream<T>::mul()
const T scalar = startScalar; const T scalar = startScalar;
std::for_each_n( std::for_each_n(
std::execution::par_unseq, exe_policy,
std::views::iota(0).begin(), array_size, std::views::iota(0).begin(), array_size,
[&] (int i) { [&] (int i) {
b[i] = scalar * c[i]; b[i] = scalar * c[i];
@ -72,7 +92,7 @@ template <class T>
void STDRangesStream<T>::add() void STDRangesStream<T>::add()
{ {
std::for_each_n( std::for_each_n(
std::execution::par_unseq, exe_policy,
std::views::iota(0).begin(), array_size, std::views::iota(0).begin(), array_size,
[&] (int i) { [&] (int i) {
c[i] = a[i] + b[i]; c[i] = a[i] + b[i];
@ -86,7 +106,7 @@ void STDRangesStream<T>::triad()
const T scalar = startScalar; const T scalar = startScalar;
std::for_each_n( std::for_each_n(
std::execution::par_unseq, exe_policy,
std::views::iota(0).begin(), array_size, std::views::iota(0).begin(), array_size,
[&] (int i) { [&] (int i) {
a[i] = b[i] + scalar * c[i]; a[i] = b[i] + scalar * c[i];
@ -100,7 +120,7 @@ void STDRangesStream<T>::nstream()
const T scalar = startScalar; const T scalar = startScalar;
std::for_each_n( std::for_each_n(
std::execution::par_unseq, exe_policy,
std::views::iota(0).begin(), array_size, std::views::iota(0).begin(), array_size,
[&] (int i) { [&] (int i) {
a[i] += b[i] + scalar * c[i]; a[i] += b[i] + scalar * c[i];
@ -114,8 +134,8 @@ T STDRangesStream<T>::dot()
// sum += a[i] * b[i]; // sum += a[i] * b[i];
return return
std::transform_reduce( std::transform_reduce(
std::execution::par_unseq, exe_policy,
a.begin(), a.end(), b.begin(), 0.0); BEGIN(a), END(a), BEGIN(b), 0.0);
} }
void listDevices(void) void listDevices(void)
@ -136,3 +156,5 @@ std::string getDeviceDriver(const int)
template class STDRangesStream<float>; template class STDRangesStream<float>;
template class STDRangesStream<double>; template class STDRangesStream<double>;
#undef BEGIN
#undef END

View File

@ -21,9 +21,11 @@ class STDRangesStream : public Stream<T>
int array_size; int array_size;
// Device side pointers // Device side pointers
std::vector<T> a; #ifdef USE_VECTOR
std::vector<T> b; std::vector<T> a, b, c;
std::vector<T> c; #else
T *a, *b, *c;
#endif
public: public:
STDRangesStream(const int, int); STDRangesStream(const int, int);

View File

@ -3,6 +3,10 @@ register_flag_optional(CMAKE_CXX_COMPILER
"Any CXX compiler that is supported by CMake detection and supports C++20 Ranges" "Any CXX compiler that is supported by CMake detection and supports C++20 Ranges"
"c++") "c++")
register_flag_optional(USE_VECTOR
"Whether to use std::vector<T> for storage or use aligned_alloc. C++ vectors are *zero* initialised, whereas aligned_alloc memory is uninitialised before first use."
"OFF")
macro(setup) macro(setup)
# TODO this needs to eventually be removed when CMake adds proper C++20 support or at least update the flag used here # TODO this needs to eventually be removed when CMake adds proper C++20 support or at least update the flag used here
@ -13,4 +17,7 @@ macro(setup)
unset(CMAKE_CXX_STANDARD) # drop any existing standard we have set by default unset(CMAKE_CXX_STANDARD) # drop any existing standard we have set by default
# and append our own: # and append our own:
register_append_cxx_flags(ANY -std=c++2a) register_append_cxx_flags(ANY -std=c++2a)
if(USE_VECTOR)
register_definitions(USE_VECTOR)
endif()
endmacro() endmacro()

View File

@ -5,15 +5,37 @@
// source code // source code
#include "TBBStream.hpp" #include "TBBStream.hpp"
#include <cstdlib>
#ifndef ALIGNMENT
#define ALIGNMENT (2*1024*1024) // 2MB
#endif
#ifdef USE_VECTOR
#define BEGIN(x) (x).begin()
#define END(x) (x).end()
#else
#define BEGIN(x) (x)
#define END(x) ((x) + array_size)
#endif
template <class T> template <class T>
TBBStream<T>::TBBStream(const int ARRAY_SIZE, int device) TBBStream<T>::TBBStream(const int ARRAY_SIZE, int device)
: partitioner(), range(0, ARRAY_SIZE), a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE) : partitioner(), range(0, ARRAY_SIZE),
#ifdef USE_VECTOR
a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE)
#else
array_size(ARRAY_SIZE),
a((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)),
b((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)),
c((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE))
#endif
{ {
if(device != 0){ if(device != 0){
throw std::runtime_error("Device != 0 is not supported by TBB"); throw std::runtime_error("Device != 0 is not supported by TBB");
} }
std::cout << "Using TBB partitioner: " PARTITIONER_NAME << std::endl; std::cout << "Using TBB partitioner: " PARTITIONER_NAME << std::endl;
std::cout << "Backing storage typeid: " << typeid(a).name() << std::endl;
} }
@ -35,9 +57,9 @@ template <class T>
void TBBStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c) void TBBStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
{ {
// Element-wise copy. // Element-wise copy.
h_a = a; std::copy(BEGIN(a), END(a), h_a.begin());
h_b = b; std::copy(BEGIN(b), END(b), h_b.begin());
h_c = c; std::copy(BEGIN(c), END(c), h_c.begin());
} }
template <class T> template <class T>
@ -132,3 +154,5 @@ std::string getDeviceDriver(const int)
template class TBBStream<float>; template class TBBStream<float>;
template class TBBStream<double>; template class TBBStream<double>;
#undef BEGIN
#undef END

View File

@ -40,9 +40,14 @@ class TBBStream : public Stream<T>
tbb_partitioner partitioner; tbb_partitioner partitioner;
tbb::blocked_range<size_t> range; tbb::blocked_range<size_t> range;
// Device side pointers // Device side pointers
std::vector<T> a; #ifdef USE_VECTOR
std::vector<T> b; std::vector<T> a, b, c;
std::vector<T> c; #else
size_t array_size;
T *a, *b, *c;
#endif
public: public:
TBBStream(const int, int); TBBStream(const int, int);

View File

@ -15,6 +15,10 @@ register_flag_optional(PARTITIONER
See https://spec.oneapi.com/versions/latest/elements/oneTBB/source/algorithms.html#partitioners for more details." See https://spec.oneapi.com/versions/latest/elements/oneTBB/source/algorithms.html#partitioners for more details."
"AUTO") "AUTO")
register_flag_optional(USE_VECTOR
"Whether to use std::vector<T> for storage or use aligned_alloc. C++ vectors are *zero* initialised, whereas aligned_alloc memory is uninitialised before first use."
"OFF")
macro(setup) macro(setup)
if(ONE_TBB_DIR) if(ONE_TBB_DIR)
set(TBB_ROOT "${ONE_TBB_DIR}") # see https://github.com/Kitware/VTK/blob/0a31a9a3c1531ae238ac96a372fec4be42282863/CMake/FindTBB.cmake#L34 set(TBB_ROOT "${ONE_TBB_DIR}") # see https://github.com/Kitware/VTK/blob/0a31a9a3c1531ae238ac96a372fec4be42282863/CMake/FindTBB.cmake#L34
@ -26,4 +30,7 @@ macro(setup)
find_package(TBB REQUIRED) find_package(TBB REQUIRED)
register_link_library(TBB::tbb) register_link_library(TBB::tbb)
register_definitions(PARTITIONER_${PARTITIONER}) register_definitions(PARTITIONER_${PARTITIONER})
if(USE_VECTOR)
register_definitions(USE_VECTOR)
endif()
endmacro() endmacro()