Add options for std::vector or raw pointers for TBB/STD

This commit is contained in:
Tom Lin 2022-07-24 21:17:17 +01:00
parent 240962722f
commit 5f6e714bdd
12 changed files with 200 additions and 72 deletions

View File

@ -10,60 +10,79 @@
#include <execution>
#include <numeric>
#ifndef ALIGNMENT
#define ALIGNMENT (2*1024*1024) // 2MB
#endif
#ifdef USE_VECTOR
#define BEGIN(x) (x).begin()
#define END(x) (x).end()
#else
#define BEGIN(x) (x)
#define END(x) ((x) + array_size)
#endif
// There are three execution policies:
// auto exe_policy = std::execution::seq;
// auto exe_policy = std::execution::par;
auto exe_policy = std::execution::par_unseq;
constexpr auto exe_policy = std::execution::par_unseq;
// Constructs the stream and allocates its three working arrays a, b and c,
// each holding ARRAY_SIZE elements of T.
//
// Storage backend is chosen at compile time:
//   - USE_VECTOR defined: std::vector<T>(ARRAY_SIZE), elements value-initialised.
//   - otherwise: raw buffers from aligned_alloc with ALIGNMENT-byte alignment;
//     contents are uninitialised until init_arrays() writes them.
//
// `device` is not used by this constructor.
//
// NOTE(review): no release of the aligned_alloc buffers is visible in this
// chunk — confirm the destructor frees a, b and c in the raw-pointer build.
// NOTE(review): the allocation result is not checked for nullptr, and the
// constructor is noexcept, so it cannot report failure by throwing — confirm
// this is acceptable for the benchmark.
template <class T>
STDDataStream<T>::STDDataStream(const int ARRAY_SIZE, int device)
  noexcept : array_size{ARRAY_SIZE},
#ifdef USE_VECTOR
  a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE)
#else
  // array_size is already initialised above for both backends; listing it a
  // second time in this branch would be a duplicate member initialiser and
  // fail to compile.
  a(static_cast<T *>(aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE))),
  b(static_cast<T *>(aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE))),
  c(static_cast<T *>(aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)))
#endif
{ std::cout <<"Backing storage typeid: " << typeid(a).name() << std::endl; }
template <class T>
void STDDataStream<T>::init_arrays(T initA, T initB, T initC)
{
std::fill(exe_policy, a.begin(), a.end(), initA);
std::fill(exe_policy, b.begin(), b.end(), initB);
std::fill(exe_policy, c.begin(), c.end(), initC);
std::fill(exe_policy, BEGIN(a), END(a), initA);
std::fill(exe_policy, BEGIN(b), END(b), initB);
std::fill(exe_policy, BEGIN(c), END(c), initC);
}
template <class T>
void STDDataStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
{
h_a = a;
h_b = b;
h_c = c;
std::copy(BEGIN(a), END(a), h_a.begin());
std::copy(BEGIN(b), END(b), h_b.begin());
std::copy(BEGIN(c), END(c), h_c.begin());
}
template <class T>
void STDDataStream<T>::copy()
{
// c[i] = a[i]
std::copy(exe_policy, a.begin(), a.end(), c.begin());
std::copy(exe_policy, BEGIN(a), END(a), BEGIN(c));
}
template <class T>
void STDDataStream<T>::mul()
{
// b[i] = scalar * c[i];
std::transform(exe_policy, c.begin(), c.end(), b.begin(), [scalar = startScalar](T ci){ return scalar*ci; });
std::transform(exe_policy, BEGIN(c), END(c), BEGIN(b), [scalar = startScalar](T ci){ return scalar*ci; });
}
template <class T>
void STDDataStream<T>::add()
{
// c[i] = a[i] + b[i];
std::transform(exe_policy, a.begin(), a.end(), b.begin(), c.begin(), std::plus<T>());
std::transform(exe_policy, BEGIN(a), END(a), BEGIN(b), BEGIN(c), std::plus<T>());
}
template <class T>
void STDDataStream<T>::triad()
{
// a[i] = b[i] + scalar * c[i];
std::transform(exe_policy, b.begin(), b.end(), c.begin(), a.begin(), [scalar = startScalar](T bi, T ci){ return bi+scalar*ci; });
std::transform(exe_policy, BEGIN(b), END(b), BEGIN(c), BEGIN(a), [scalar = startScalar](T bi, T ci){ return bi+scalar*ci; });
}
template <class T>
@ -73,8 +92,8 @@ void STDDataStream<T>::nstream()
// Need to do in two stages with C++11 STL.
// 1: a[i] += b[i]
// 2: a[i] += scalar * c[i];
std::transform(exe_policy, a.begin(), a.end(), b.begin(), a.begin(), [](T ai, T bi){ return ai + bi; });
std::transform(exe_policy, a.begin(), a.end(), c.begin(), a.begin(), [scalar = startScalar](T ai, T ci){ return ai + scalar*ci; });
std::transform(exe_policy, BEGIN(a), END(a), BEGIN(b), BEGIN(a), [](T ai, T bi){ return ai + bi; });
std::transform(exe_policy, BEGIN(a), END(a), BEGIN(c), BEGIN(a), [scalar = startScalar](T ai, T ci){ return ai + scalar*ci; });
}
@ -82,7 +101,7 @@ template <class T>
T STDDataStream<T>::dot()
{
// sum = 0; sum += a[i]*b[i]; return sum;
return std::transform_reduce(exe_policy, a.begin(), a.end(), b.begin(), 0.0);
return std::transform_reduce(exe_policy, BEGIN(a), END(a), BEGIN(b), 0.0);
}
void listDevices(void)
@ -102,3 +121,5 @@ std::string getDeviceDriver(const int)
template class STDDataStream<float>;
template class STDDataStream<double>;
#undef BEGIN
#undef END

View File

@ -21,9 +21,11 @@ class STDDataStream : public Stream<T>
int array_size;
// Device side pointers
std::vector<T> a;
std::vector<T> b;
std::vector<T> c;
#ifdef USE_VECTOR
std::vector<T> a, b, c;
#else
T *a, *b, *c;
#endif
public:

View File

@ -3,6 +3,10 @@ register_flag_optional(CMAKE_CXX_COMPILER
"Any CXX compiler that is supported by CMake detection"
"c++")
# Storage backend switch: when ON, USE_VECTOR is passed as a compile definition
# and the stream stores its arrays in std::vector<T>; when OFF (the default)
# the arrays are raw pointers allocated with aligned_alloc.
register_flag_optional(USE_VECTOR
"Whether to use std::vector<T> for storage or use aligned_alloc. C++ vectors are *zero* initialised whereas aligned_alloc is uninitialised before first use."
"OFF")
register_flag_optional(NVHPC_OFFLOAD
"Enable offloading support (via the non-standard `-stdpar`) for the new NVHPC SDK.
The values are Nvidia architectures in ccXY format will be passed in via `-gpu=` (e.g `cc70`)
@ -28,6 +32,8 @@ macro(setup)
register_append_cxx_flags(ANY ${NVHPC_FLAGS})
register_append_link_flags(${NVHPC_FLAGS})
endif ()
if(USE_VECTOR)
register_definitions(USE_VECTOR)
endif()
endmacro()

View File

@ -10,46 +10,63 @@
#include <execution>
#include <numeric>
#ifndef ALIGNMENT
#define ALIGNMENT (2*1024*1024) // 2MB
#endif
#ifdef USE_VECTOR
#define BEGIN(x) (x).begin()
#define END(x) (x).end()
#else
#define BEGIN(x) (x)
#define END(x) ((x) + array_size)
#endif
// There are three execution policies:
// auto exe_policy = std::execution::seq;
// auto exe_policy = std::execution::par;
auto exe_policy = std::execution::par_unseq;
constexpr auto exe_policy = std::execution::par_unseq;
template <class T>
STDIndicesStream<T>::STDIndicesStream(const int ARRAY_SIZE, int device)
noexcept : array_size{ARRAY_SIZE}, range(0, array_size), a(array_size), b(array_size), c(array_size)
{
}
noexcept : array_size{ARRAY_SIZE}, range(0, array_size),
#ifdef USE_VECTOR
a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE)
#else
a((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)),
b((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)),
c((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE))
#endif
{ std::cout <<"Backing storage typeid: " << typeid(a).name() << std::endl; }
template <class T>
void STDIndicesStream<T>::init_arrays(T initA, T initB, T initC)
{
std::fill(exe_policy, a.begin(), a.end(), initA);
std::fill(exe_policy, b.begin(), b.end(), initB);
std::fill(exe_policy, c.begin(), c.end(), initC);
std::fill(exe_policy, BEGIN(a), END(a), initA);
std::fill(exe_policy, BEGIN(b), END(b), initB);
std::fill(exe_policy, BEGIN(c), END(c), initC);
}
template <class T>
void STDIndicesStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
{
h_a = a;
h_b = b;
h_c = c;
std::copy(BEGIN(a), END(a), h_a.begin());
std::copy(BEGIN(b), END(b), h_b.begin());
std::copy(BEGIN(c), END(c), h_c.begin());
}
template <class T>
void STDIndicesStream<T>::copy()
{
// c[i] = a[i]
std::copy(exe_policy, a.begin(), a.end(), c.begin());
std::copy(exe_policy, BEGIN(a), END(a), BEGIN(c));
}
template <class T>
void STDIndicesStream<T>::mul()
{
// b[i] = scalar * c[i];
std::transform(exe_policy, range.begin(), range.end(), b.begin(), [&, scalar = startScalar](int i) {
std::transform(exe_policy, range.begin(), range.end(), BEGIN(b), [&, scalar = startScalar](int i) {
return scalar * c[i];
});
}
@ -58,7 +75,7 @@ template <class T>
void STDIndicesStream<T>::add()
{
// c[i] = a[i] + b[i];
std::transform(exe_policy, range.begin(), range.end(), c.begin(), [&](int i) {
std::transform(exe_policy, range.begin(), range.end(), BEGIN(c), [&](int i) {
return a[i] + b[i];
});
}
@ -67,7 +84,7 @@ template <class T>
void STDIndicesStream<T>::triad()
{
// a[i] = b[i] + scalar * c[i];
std::transform(exe_policy, range.begin(), range.end(), a.begin(), [&, scalar = startScalar](int i) {
std::transform(exe_policy, range.begin(), range.end(), BEGIN(a), [&, scalar = startScalar](int i) {
return b[i] + scalar * c[i];
});
}
@ -79,7 +96,7 @@ void STDIndicesStream<T>::nstream()
// Need to do in two stages with C++11 STL.
// 1: a[i] += b[i]
// 2: a[i] += scalar * c[i];
std::transform(exe_policy, range.begin(), range.end(), a.begin(), [&, scalar = startScalar](int i) {
std::transform(exe_policy, range.begin(), range.end(), BEGIN(a), [&, scalar = startScalar](int i) {
return a[i] + b[i] + scalar * c[i];
});
}
@ -89,7 +106,7 @@ template <class T>
T STDIndicesStream<T>::dot()
{
// sum = 0; sum += a[i]*b[i]; return sum;
return std::transform_reduce(exe_policy, a.begin(), a.end(), b.begin(), 0.0);
return std::transform_reduce(exe_policy, BEGIN(a), END(a), BEGIN(b), 0.0);
}
void listDevices(void)
@ -109,3 +126,5 @@ std::string getDeviceDriver(const int)
template class STDIndicesStream<float>;
template class STDIndicesStream<double>;
#undef BEGIN
#undef END

View File

@ -10,6 +10,11 @@
#include <stdexcept>
#include "Stream.h"
#ifdef USE_SPAN
#include <span>
#endif
#define IMPLEMENTATION_STRING "STD (index-oriented)"
@ -60,9 +65,11 @@ class STDIndicesStream : public Stream<T>
ranged<int> range;
// Device side pointers
std::vector<T> a;
std::vector<T> b;
std::vector<T> c;
#ifdef USE_VECTOR
std::vector<T> a, b, c;
#else
T *a, *b, *c;
#endif
public:

View File

@ -3,6 +3,10 @@ register_flag_optional(CMAKE_CXX_COMPILER
"Any CXX compiler that is supported by CMake detection"
"c++")
# Storage backend switch: when ON, USE_VECTOR is passed as a compile definition
# and the stream stores its arrays in std::vector<T>; when OFF (the default)
# the arrays are raw pointers allocated with aligned_alloc.
register_flag_optional(USE_VECTOR
"Whether to use std::vector<T> for storage or use aligned_alloc. C++ vectors are *zero* initialised whereas aligned_alloc is uninitialised before first use."
"OFF")
register_flag_optional(NVHPC_OFFLOAD
"Enable offloading support (via the non-standard `-stdpar`) for the new NVHPC SDK.
The values are Nvidia architectures in ccXY format will be passed in via `-gpu=` (e.g `cc70`)
@ -28,6 +32,8 @@ macro(setup)
register_append_cxx_flags(ANY ${NVHPC_FLAGS})
register_append_link_flags(${NVHPC_FLAGS})
endif ()
if(USE_VECTOR)
register_definitions(USE_VECTOR)
endif()
endmacro()

View File

@ -10,20 +10,40 @@
#include <execution>
#include <ranges>
#ifndef ALIGNMENT
#define ALIGNMENT (2*1024*1024) // 2MB
#endif
#ifdef USE_VECTOR
#define BEGIN(x) (x).begin()
#define END(x) (x).end()
#else
#define BEGIN(x) (x)
#define END(x) ((x) + array_size)
#endif
// There are three execution policies:
// auto exe_policy = std::execution::seq;
// auto exe_policy = std::execution::par;
constexpr auto exe_policy = std::execution::par_unseq;
template <class T>
STDRangesStream<T>::STDRangesStream(const int ARRAY_SIZE, int device)
: array_size{ARRAY_SIZE}
{
a = std::vector<T>(array_size);
b = std::vector<T>(array_size);
c = std::vector<T>(array_size);
}
: array_size{ARRAY_SIZE},
#ifdef USE_VECTOR
a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE)
#else
a((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)),
b((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)),
c((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE))
#endif
{ std::cout <<"Backing storage typeid: " << typeid(a).name() << std::endl; }
template <class T>
void STDRangesStream<T>::init_arrays(T initA, T initB, T initC)
{
std::for_each_n(
std::execution::par_unseq,
exe_policy,
std::views::iota(0).begin(), array_size, // loop range
[&] (int i) {
a[i] = initA;
@ -37,16 +57,16 @@ template <class T>
void STDRangesStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
{
// Element-wise copy.
h_a = a;
h_b = b;
h_c = c;
std::copy(BEGIN(a), END(a), h_a.begin());
std::copy(BEGIN(b), END(b), h_b.begin());
std::copy(BEGIN(c), END(c), h_c.begin());
}
template <class T>
void STDRangesStream<T>::copy()
{
std::for_each_n(
std::execution::par_unseq,
exe_policy,
std::views::iota(0).begin(), array_size,
[&] (int i) {
c[i] = a[i];
@ -60,7 +80,7 @@ void STDRangesStream<T>::mul()
const T scalar = startScalar;
std::for_each_n(
std::execution::par_unseq,
exe_policy,
std::views::iota(0).begin(), array_size,
[&] (int i) {
b[i] = scalar * c[i];
@ -72,7 +92,7 @@ template <class T>
void STDRangesStream<T>::add()
{
std::for_each_n(
std::execution::par_unseq,
exe_policy,
std::views::iota(0).begin(), array_size,
[&] (int i) {
c[i] = a[i] + b[i];
@ -86,7 +106,7 @@ void STDRangesStream<T>::triad()
const T scalar = startScalar;
std::for_each_n(
std::execution::par_unseq,
exe_policy,
std::views::iota(0).begin(), array_size,
[&] (int i) {
a[i] = b[i] + scalar * c[i];
@ -100,7 +120,7 @@ void STDRangesStream<T>::nstream()
const T scalar = startScalar;
std::for_each_n(
std::execution::par_unseq,
exe_policy,
std::views::iota(0).begin(), array_size,
[&] (int i) {
a[i] += b[i] + scalar * c[i];
@ -114,8 +134,8 @@ T STDRangesStream<T>::dot()
// sum += a[i] * b[i];
return
std::transform_reduce(
std::execution::par_unseq,
a.begin(), a.end(), b.begin(), 0.0);
exe_policy,
BEGIN(a), END(a), BEGIN(b), 0.0);
}
void listDevices(void)
@ -136,3 +156,5 @@ std::string getDeviceDriver(const int)
template class STDRangesStream<float>;
template class STDRangesStream<double>;
#undef BEGIN
#undef END

View File

@ -21,9 +21,11 @@ class STDRangesStream : public Stream<T>
int array_size;
// Device side pointers
std::vector<T> a;
std::vector<T> b;
std::vector<T> c;
#ifdef USE_VECTOR
std::vector<T> a, b, c;
#else
T *a, *b, *c;
#endif
public:
STDRangesStream(const int, int);

View File

@ -3,6 +3,10 @@ register_flag_optional(CMAKE_CXX_COMPILER
"Any CXX compiler that is supported by CMake detection and supports C++20 Ranges"
"c++")
# Storage backend switch: when ON, USE_VECTOR is passed as a compile definition
# and the stream stores its arrays in std::vector<T>; when OFF (the default)
# the arrays are raw pointers allocated with aligned_alloc.
register_flag_optional(USE_VECTOR
"Whether to use std::vector<T> for storage or use aligned_alloc. C++ vectors are *zero* initialised whereas aligned_alloc is uninitialised before first use."
"OFF")
macro(setup)
# TODO this needs to eventually be removed when CMake adds proper C++20 support or at least update the flag used here
@ -13,4 +17,7 @@ macro(setup)
unset(CMAKE_CXX_STANDARD) # drop any existing standard we have set by default
# and append our own:
register_append_cxx_flags(ANY -std=c++2a)
if(USE_VECTOR)
register_definitions(USE_VECTOR)
endif()
endmacro()

View File

@ -5,15 +5,37 @@
// source code
#include "TBBStream.hpp"
#include <cstdlib>
#ifndef ALIGNMENT
#define ALIGNMENT (2*1024*1024) // 2MB
#endif
#ifdef USE_VECTOR
#define BEGIN(x) (x).begin()
#define END(x) (x).end()
#else
#define BEGIN(x) (x)
#define END(x) ((x) + array_size)
#endif
template <class T>
TBBStream<T>::TBBStream(const int ARRAY_SIZE, int device)
: partitioner(), range(0, ARRAY_SIZE), a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE)
: partitioner(), range(0, ARRAY_SIZE),
#ifdef USE_VECTOR
a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE)
#else
array_size(ARRAY_SIZE),
a((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)),
b((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)),
c((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE))
#endif
{
if(device != 0){
throw std::runtime_error("Device != 0 is not supported by TBB");
}
std::cout << "Using TBB partitioner: " PARTITIONER_NAME << std::endl;
std::cout << "Backing storage typeid: " << typeid(a).name() << std::endl;
}
@ -35,9 +57,9 @@ template <class T>
void TBBStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
{
// Element-wise copy.
h_a = a;
h_b = b;
h_c = c;
std::copy(BEGIN(a), END(a), h_a.begin());
std::copy(BEGIN(b), END(b), h_b.begin());
std::copy(BEGIN(c), END(c), h_c.begin());
}
template <class T>
@ -132,3 +154,5 @@ std::string getDeviceDriver(const int)
template class TBBStream<float>;
template class TBBStream<double>;
#undef BEGIN
#undef END

View File

@ -40,10 +40,15 @@ class TBBStream : public Stream<T>
tbb_partitioner partitioner;
tbb::blocked_range<size_t> range;
// Device side pointers
std::vector<T> a;
std::vector<T> b;
std::vector<T> c;
#ifdef USE_VECTOR
std::vector<T> a, b, c;
#else
size_t array_size;
T *a, *b, *c;
#endif
public:
TBBStream(const int, int);
~TBBStream() = default;

View File

@ -1,7 +1,7 @@
register_flag_optional(ONE_TBB_DIR
"Absolute path to oneTBB (with header `onetbb/tbb.h`) distribution, the directory should contain at least `include/` and `lib/.
If unspecified, the system TBB (with header `tbb/tbb.h`) will be used via CMake's find_package(TBB)."
If unspecified, the system TBB (with header `tbb/tbb.h`) will be used via CMake's find_package(TBB)."
"")
@ -15,15 +15,22 @@ register_flag_optional(PARTITIONER
See https://spec.oneapi.com/versions/latest/elements/oneTBB/source/algorithms.html#partitioners for more details."
"AUTO")
# Storage backend switch: when ON, USE_VECTOR is passed as a compile definition
# and the stream stores its arrays in std::vector<T>; when OFF (the default)
# the arrays are raw pointers allocated with aligned_alloc.
register_flag_optional(USE_VECTOR
"Whether to use std::vector<T> for storage or use aligned_alloc. C++ vectors are *zero* initialised whereas aligned_alloc is uninitialised before first use."
"OFF")
macro(setup)
if(ONE_TBB_DIR)
set(TBB_ROOT "${ONE_TBB_DIR}") # see https://github.com/Kitware/VTK/blob/0a31a9a3c1531ae238ac96a372fec4be42282863/CMake/FindTBB.cmake#L34
# docs on Intel's website refers to TBB_DIR which is not correct
endif()
# see https://github.com/oneapi-src/oneTBB/blob/master/cmake/README.md#tbbconfig---integration-of-binary-packages
find_package(TBB REQUIRED)
register_link_library(TBB::tbb)
register_definitions(PARTITIONER_${PARTITIONER})
if(USE_VECTOR)
register_definitions(USE_VECTOR)
endif()
endmacro()