From 5f6e714bdd8d34c305876f392df9cc569df33e5c Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Sun, 24 Jul 2022 21:17:17 +0100 Subject: [PATCH] Add options for std::vector or raw pointers for TBB/STD --- src/std-data/STDDataStream.cpp | 55 ++++++++++++++++++--------- src/std-data/STDDataStream.h | 8 ++-- src/std-data/model.cmake | 8 +++- src/std-indices/STDIndicesStream.cpp | 53 +++++++++++++++++--------- src/std-indices/STDIndicesStream.h | 13 +++++-- src/std-indices/model.cmake | 8 +++- src/std-ranges/STDRangesStream.cpp | 56 +++++++++++++++++++--------- src/std-ranges/STDRangesStream.hpp | 8 ++-- src/std-ranges/model.cmake | 7 ++++ src/tbb/TBBStream.cpp | 32 ++++++++++++++-- src/tbb/TBBStream.hpp | 13 +++++-- src/tbb/model.cmake | 11 +++++- 12 files changed, 200 insertions(+), 72 deletions(-) diff --git a/src/std-data/STDDataStream.cpp b/src/std-data/STDDataStream.cpp index 343e247..2dead3b 100644 --- a/src/std-data/STDDataStream.cpp +++ b/src/std-data/STDDataStream.cpp @@ -10,60 +10,79 @@ #include #include +#ifndef ALIGNMENT +#define ALIGNMENT (2*1024*1024) // 2MB +#endif + +#ifdef USE_VECTOR +#define BEGIN(x) (x).begin() +#define END(x) (x).end() +#else +#define BEGIN(x) (x) +#define END(x) ((x) + array_size) +#endif + // There are three execution policies: // auto exe_policy = std::execution::seq; // auto exe_policy = std::execution::par; -auto exe_policy = std::execution::par_unseq; +constexpr auto exe_policy = std::execution::par_unseq; template STDDataStream::STDDataStream(const int ARRAY_SIZE, int device) - noexcept : array_size{ARRAY_SIZE}, a(array_size), b(array_size), c(array_size) -{ -} + noexcept : array_size{ARRAY_SIZE}, +#ifdef USE_VECTOR + a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE) +#else + // array_size is initialised once above; a duplicate mem-initializer in this branch is ill-formed + a((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)), + b((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)), + c((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)) +#endif +{ std::cout <<"Backing storage typeid: " << 
typeid(a).name() << std::endl; } template void STDDataStream::init_arrays(T initA, T initB, T initC) { - std::fill(exe_policy, a.begin(), a.end(), initA); - std::fill(exe_policy, b.begin(), b.end(), initB); - std::fill(exe_policy, c.begin(), c.end(), initC); + std::fill(exe_policy, BEGIN(a), END(a), initA); + std::fill(exe_policy, BEGIN(b), END(b), initB); + std::fill(exe_policy, BEGIN(c), END(c), initC); } template void STDDataStream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) { - h_a = a; - h_b = b; - h_c = c; + std::copy(BEGIN(a), END(a), h_a.begin()); + std::copy(BEGIN(b), END(b), h_b.begin()); + std::copy(BEGIN(c), END(c), h_c.begin()); } template void STDDataStream::copy() { // c[i] = a[i] - std::copy(exe_policy, a.begin(), a.end(), c.begin()); + std::copy(exe_policy, BEGIN(a), END(a), BEGIN(c)); } template void STDDataStream::mul() { // b[i] = scalar * c[i]; - std::transform(exe_policy, c.begin(), c.end(), b.begin(), [scalar = startScalar](T ci){ return scalar*ci; }); + std::transform(exe_policy, BEGIN(c), END(c), BEGIN(b), [scalar = startScalar](T ci){ return scalar*ci; }); } template void STDDataStream::add() { // c[i] = a[i] + b[i]; - std::transform(exe_policy, a.begin(), a.end(), b.begin(), c.begin(), std::plus()); + std::transform(exe_policy, BEGIN(a), END(a), BEGIN(b), BEGIN(c), std::plus()); } template void STDDataStream::triad() { // a[i] = b[i] + scalar * c[i]; - std::transform(exe_policy, b.begin(), b.end(), c.begin(), a.begin(), [scalar = startScalar](T bi, T ci){ return bi+scalar*ci; }); + std::transform(exe_policy, BEGIN(b), END(b), BEGIN(c), BEGIN(a), [scalar = startScalar](T bi, T ci){ return bi+scalar*ci; }); } template @@ -73,8 +92,8 @@ void STDDataStream::nstream() // Need to do in two stages with C++11 STL. 
// 1: a[i] += b[i] // 2: a[i] += scalar * c[i]; - std::transform(exe_policy, a.begin(), a.end(), b.begin(), a.begin(), [](T ai, T bi){ return ai + bi; }); - std::transform(exe_policy, a.begin(), a.end(), c.begin(), a.begin(), [scalar = startScalar](T ai, T ci){ return ai + scalar*ci; }); + std::transform(exe_policy, BEGIN(a), END(a), BEGIN(b), BEGIN(a), [](T ai, T bi){ return ai + bi; }); + std::transform(exe_policy, BEGIN(a), END(a), BEGIN(c), BEGIN(a), [scalar = startScalar](T ai, T ci){ return ai + scalar*ci; }); } @@ -82,7 +101,7 @@ template T STDDataStream::dot() { // sum = 0; sum += a[i]*b[i]; return sum; - return std::transform_reduce(exe_policy, a.begin(), a.end(), b.begin(), 0.0); + return std::transform_reduce(exe_policy, BEGIN(a), END(a), BEGIN(b), 0.0); } void listDevices(void) @@ -102,3 +121,5 @@ std::string getDeviceDriver(const int) template class STDDataStream; template class STDDataStream; +#undef BEGIN +#undef END \ No newline at end of file diff --git a/src/std-data/STDDataStream.h b/src/std-data/STDDataStream.h index 741fd6c..84b4dcf 100644 --- a/src/std-data/STDDataStream.h +++ b/src/std-data/STDDataStream.h @@ -21,9 +21,11 @@ class STDDataStream : public Stream int array_size; // Device side pointers - std::vector a; - std::vector b; - std::vector c; +#ifdef USE_VECTOR + std::vector a, b, c; +#else + T *a, *b, *c; +#endif public: diff --git a/src/std-data/model.cmake b/src/std-data/model.cmake index ef69f30..6f87bc9 100644 --- a/src/std-data/model.cmake +++ b/src/std-data/model.cmake @@ -3,6 +3,10 @@ register_flag_optional(CMAKE_CXX_COMPILER "Any CXX compiler that is supported by CMake detection" "c++") +register_flag_optional(USE_VECTOR + "Whether to use std::vector for storage or use aligned_alloc. C++ vectors are *zero* initialised where as aligned_alloc is uninitialised before first use." + "OFF") + register_flag_optional(NVHPC_OFFLOAD "Enable offloading support (via the non-standard `-stdpar`) for the new NVHPC SDK. 
The values are Nvidia architectures in ccXY format will be passed in via `-gpu=` (e.g `cc70`) @@ -28,6 +32,8 @@ macro(setup) register_append_cxx_flags(ANY ${NVHPC_FLAGS}) register_append_link_flags(${NVHPC_FLAGS}) endif () - + if(USE_VECTOR) + register_definitions(USE_VECTOR) + endif() endmacro() diff --git a/src/std-indices/STDIndicesStream.cpp b/src/std-indices/STDIndicesStream.cpp index 2221f90..8c0958c 100644 --- a/src/std-indices/STDIndicesStream.cpp +++ b/src/std-indices/STDIndicesStream.cpp @@ -10,46 +10,63 @@ #include #include +#ifndef ALIGNMENT +#define ALIGNMENT (2*1024*1024) // 2MB +#endif + +#ifdef USE_VECTOR +#define BEGIN(x) (x).begin() +#define END(x) (x).end() +#else +#define BEGIN(x) (x) +#define END(x) ((x) + array_size) +#endif + // There are three execution policies: // auto exe_policy = std::execution::seq; // auto exe_policy = std::execution::par; -auto exe_policy = std::execution::par_unseq; - +constexpr auto exe_policy = std::execution::par_unseq; template STDIndicesStream::STDIndicesStream(const int ARRAY_SIZE, int device) - noexcept : array_size{ARRAY_SIZE}, range(0, array_size), a(array_size), b(array_size), c(array_size) -{ -} + noexcept : array_size{ARRAY_SIZE}, range(0, array_size), +#ifdef USE_VECTOR + a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE) +#else + a((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)), + b((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)), + c((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)) +#endif +{ std::cout <<"Backing storage typeid: " << typeid(a).name() << std::endl; } template void STDIndicesStream::init_arrays(T initA, T initB, T initC) { - std::fill(exe_policy, a.begin(), a.end(), initA); - std::fill(exe_policy, b.begin(), b.end(), initB); - std::fill(exe_policy, c.begin(), c.end(), initC); + std::fill(exe_policy, BEGIN(a), END(a), initA); + std::fill(exe_policy, BEGIN(b), END(b), initB); + std::fill(exe_policy, BEGIN(c), END(c), initC); } template void 
STDIndicesStream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) { - h_a = a; - h_b = b; - h_c = c; + std::copy(BEGIN(a), END(a), h_a.begin()); + std::copy(BEGIN(b), END(b), h_b.begin()); + std::copy(BEGIN(c), END(c), h_c.begin()); } template void STDIndicesStream::copy() { // c[i] = a[i] - std::copy(exe_policy, a.begin(), a.end(), c.begin()); + std::copy(exe_policy, BEGIN(a), END(a), BEGIN(c)); } template void STDIndicesStream::mul() { // b[i] = scalar * c[i]; - std::transform(exe_policy, range.begin(), range.end(), b.begin(), [&, scalar = startScalar](int i) { + std::transform(exe_policy, range.begin(), range.end(), BEGIN(b), [&, scalar = startScalar](int i) { return scalar * c[i]; }); } @@ -58,7 +75,7 @@ template void STDIndicesStream::add() { // c[i] = a[i] + b[i]; - std::transform(exe_policy, range.begin(), range.end(), c.begin(), [&](int i) { + std::transform(exe_policy, range.begin(), range.end(), BEGIN(c), [&](int i) { return a[i] + b[i]; }); } @@ -67,7 +84,7 @@ template void STDIndicesStream::triad() { // a[i] = b[i] + scalar * c[i]; - std::transform(exe_policy, range.begin(), range.end(), a.begin(), [&, scalar = startScalar](int i) { + std::transform(exe_policy, range.begin(), range.end(), BEGIN(a), [&, scalar = startScalar](int i) { return b[i] + scalar * c[i]; }); } @@ -79,7 +96,7 @@ void STDIndicesStream::nstream() // Need to do in two stages with C++11 STL. 
// 1: a[i] += b[i] // 2: a[i] += scalar * c[i]; - std::transform(exe_policy, range.begin(), range.end(), a.begin(), [&, scalar = startScalar](int i) { + std::transform(exe_policy, range.begin(), range.end(), BEGIN(a), [&, scalar = startScalar](int i) { return a[i] + b[i] + scalar * c[i]; }); } @@ -89,7 +106,7 @@ template T STDIndicesStream::dot() { // sum = 0; sum += a[i]*b[i]; return sum; - return std::transform_reduce(exe_policy, a.begin(), a.end(), b.begin(), 0.0); + return std::transform_reduce(exe_policy, BEGIN(a), END(a), BEGIN(b), 0.0); } void listDevices(void) @@ -109,3 +126,5 @@ std::string getDeviceDriver(const int) template class STDIndicesStream; template class STDIndicesStream; +#undef BEGIN +#undef END \ No newline at end of file diff --git a/src/std-indices/STDIndicesStream.h b/src/std-indices/STDIndicesStream.h index 26c7cb0..6810888 100644 --- a/src/std-indices/STDIndicesStream.h +++ b/src/std-indices/STDIndicesStream.h @@ -10,6 +10,11 @@ #include #include "Stream.h" +#ifdef USE_SPAN +#include +#endif + + #define IMPLEMENTATION_STRING "STD (index-oriented)" @@ -60,9 +65,11 @@ class STDIndicesStream : public Stream ranged range; // Device side pointers - std::vector a; - std::vector b; - std::vector c; +#ifdef USE_VECTOR + std::vector a, b, c; +#else + T *a, *b, *c; +#endif public: diff --git a/src/std-indices/model.cmake b/src/std-indices/model.cmake index ef69f30..6f87bc9 100644 --- a/src/std-indices/model.cmake +++ b/src/std-indices/model.cmake @@ -3,6 +3,10 @@ register_flag_optional(CMAKE_CXX_COMPILER "Any CXX compiler that is supported by CMake detection" "c++") +register_flag_optional(USE_VECTOR + "Whether to use std::vector for storage or use aligned_alloc. C++ vectors are *zero* initialised where as aligned_alloc is uninitialised before first use." + "OFF") + register_flag_optional(NVHPC_OFFLOAD "Enable offloading support (via the non-standard `-stdpar`) for the new NVHPC SDK. 
The values are Nvidia architectures in ccXY format will be passed in via `-gpu=` (e.g `cc70`) @@ -28,6 +32,8 @@ macro(setup) register_append_cxx_flags(ANY ${NVHPC_FLAGS}) register_append_link_flags(${NVHPC_FLAGS}) endif () - + if(USE_VECTOR) + register_definitions(USE_VECTOR) + endif() endmacro() diff --git a/src/std-ranges/STDRangesStream.cpp b/src/std-ranges/STDRangesStream.cpp index de61528..fc71fee 100644 --- a/src/std-ranges/STDRangesStream.cpp +++ b/src/std-ranges/STDRangesStream.cpp @@ -10,20 +10,40 @@ #include #include +#ifndef ALIGNMENT +#define ALIGNMENT (2*1024*1024) // 2MB +#endif + +#ifdef USE_VECTOR +#define BEGIN(x) (x).begin() +#define END(x) (x).end() +#else +#define BEGIN(x) (x) +#define END(x) ((x) + array_size) +#endif + +// There are three execution policies: +// auto exe_policy = std::execution::seq; +// auto exe_policy = std::execution::par; +constexpr auto exe_policy = std::execution::par_unseq; + template STDRangesStream::STDRangesStream(const int ARRAY_SIZE, int device) - : array_size{ARRAY_SIZE} -{ - a = std::vector(array_size); - b = std::vector(array_size); - c = std::vector(array_size); -} + : array_size{ARRAY_SIZE}, +#ifdef USE_VECTOR + a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE) +#else + a((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)), + b((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)), + c((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)) +#endif +{ std::cout <<"Backing storage typeid: " << typeid(a).name() << std::endl; } template void STDRangesStream::init_arrays(T initA, T initB, T initC) { std::for_each_n( - std::execution::par_unseq, + exe_policy, std::views::iota(0).begin(), array_size, // loop range [&] (int i) { a[i] = initA; @@ -37,16 +57,16 @@ template void STDRangesStream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) { // Element-wise copy. 
- h_a = a; - h_b = b; - h_c = c; + std::copy(BEGIN(a), END(a), h_a.begin()); + std::copy(BEGIN(b), END(b), h_b.begin()); + std::copy(BEGIN(c), END(c), h_c.begin()); } template void STDRangesStream::copy() { std::for_each_n( - std::execution::par_unseq, + exe_policy, std::views::iota(0).begin(), array_size, [&] (int i) { c[i] = a[i]; @@ -60,7 +80,7 @@ void STDRangesStream::mul() const T scalar = startScalar; std::for_each_n( - std::execution::par_unseq, + exe_policy, std::views::iota(0).begin(), array_size, [&] (int i) { b[i] = scalar * c[i]; @@ -72,7 +92,7 @@ template void STDRangesStream::add() { std::for_each_n( - std::execution::par_unseq, + exe_policy, std::views::iota(0).begin(), array_size, [&] (int i) { c[i] = a[i] + b[i]; @@ -86,7 +106,7 @@ void STDRangesStream::triad() const T scalar = startScalar; std::for_each_n( - std::execution::par_unseq, + exe_policy, std::views::iota(0).begin(), array_size, [&] (int i) { a[i] = b[i] + scalar * c[i]; @@ -100,7 +120,7 @@ void STDRangesStream::nstream() const T scalar = startScalar; std::for_each_n( - std::execution::par_unseq, + exe_policy, std::views::iota(0).begin(), array_size, [&] (int i) { a[i] += b[i] + scalar * c[i]; @@ -114,8 +134,8 @@ T STDRangesStream::dot() // sum += a[i] * b[i]; return std::transform_reduce( - std::execution::par_unseq, - a.begin(), a.end(), b.begin(), 0.0); + exe_policy, + BEGIN(a), END(a), BEGIN(b), 0.0); } void listDevices(void) @@ -136,3 +156,5 @@ std::string getDeviceDriver(const int) template class STDRangesStream; template class STDRangesStream; +#undef BEGIN +#undef END \ No newline at end of file diff --git a/src/std-ranges/STDRangesStream.hpp b/src/std-ranges/STDRangesStream.hpp index 890e893..33bc77b 100644 --- a/src/std-ranges/STDRangesStream.hpp +++ b/src/std-ranges/STDRangesStream.hpp @@ -21,9 +21,11 @@ class STDRangesStream : public Stream int array_size; // Device side pointers - std::vector a; - std::vector b; - std::vector c; +#ifdef USE_VECTOR + std::vector a, b, c; 
+#else + T *a, *b, *c; +#endif public: STDRangesStream(const int, int); diff --git a/src/std-ranges/model.cmake b/src/std-ranges/model.cmake index fd07387..ac56962 100644 --- a/src/std-ranges/model.cmake +++ b/src/std-ranges/model.cmake @@ -3,6 +3,10 @@ register_flag_optional(CMAKE_CXX_COMPILER "Any CXX compiler that is supported by CMake detection and supports C++20 Ranges" "c++") +register_flag_optional(USE_VECTOR + "Whether to use std::vector for storage or use aligned_alloc. C++ vectors are *zero* initialised where as aligned_alloc is uninitialised before first use." + "OFF") + macro(setup) # TODO this needs to eventually be removed when CMake adds proper C++20 support or at least update the flag used here @@ -13,4 +17,7 @@ macro(setup) unset(CMAKE_CXX_STANDARD) # drop any existing standard we have set by default # and append our own: register_append_cxx_flags(ANY -std=c++2a) + if(USE_VECTOR) + register_definitions(USE_VECTOR) + endif() endmacro() diff --git a/src/tbb/TBBStream.cpp b/src/tbb/TBBStream.cpp index 9c34a50..bd94443 100644 --- a/src/tbb/TBBStream.cpp +++ b/src/tbb/TBBStream.cpp @@ -5,15 +5,37 @@ // source code #include "TBBStream.hpp" +#include + +#ifndef ALIGNMENT +#define ALIGNMENT (2*1024*1024) // 2MB +#endif + +#ifdef USE_VECTOR +#define BEGIN(x) (x).begin() +#define END(x) (x).end() +#else +#define BEGIN(x) (x) +#define END(x) ((x) + array_size) +#endif template TBBStream::TBBStream(const int ARRAY_SIZE, int device) - : partitioner(), range(0, ARRAY_SIZE), a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE) + : partitioner(), range(0, ARRAY_SIZE), +#ifdef USE_VECTOR + a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE) +#else + array_size(ARRAY_SIZE), + a((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)), + b((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)), + c((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)) +#endif { if(device != 0){ throw std::runtime_error("Device != 0 is not supported by TBB"); } std::cout << "Using TBB partitioner: " 
PARTITIONER_NAME << std::endl; + std::cout << "Backing storage typeid: " << typeid(a).name() << std::endl; } @@ -35,9 +57,9 @@ template void TBBStream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) { // Element-wise copy. - h_a = a; - h_b = b; - h_c = c; + std::copy(BEGIN(a), END(a), h_a.begin()); + std::copy(BEGIN(b), END(b), h_b.begin()); + std::copy(BEGIN(c), END(c), h_c.begin()); } template @@ -132,3 +154,5 @@ std::string getDeviceDriver(const int) template class TBBStream; template class TBBStream; +#undef BEGIN +#undef END \ No newline at end of file diff --git a/src/tbb/TBBStream.hpp b/src/tbb/TBBStream.hpp index 90763a9..2744afc 100644 --- a/src/tbb/TBBStream.hpp +++ b/src/tbb/TBBStream.hpp @@ -40,10 +40,15 @@ class TBBStream : public Stream tbb_partitioner partitioner; tbb::blocked_range range; // Device side pointers - std::vector a; - std::vector b; - std::vector c; - +#ifdef USE_VECTOR + std::vector a, b, c; +#else + size_t array_size; + T *a, *b, *c; +#endif + + + public: TBBStream(const int, int); ~TBBStream() = default; diff --git a/src/tbb/model.cmake b/src/tbb/model.cmake index e4d6bac..c1ff9aa 100644 --- a/src/tbb/model.cmake +++ b/src/tbb/model.cmake @@ -1,7 +1,7 @@ register_flag_optional(ONE_TBB_DIR "Absolute path to oneTBB (with header `onetbb/tbb.h`) distribution, the directory should contain at least `include/` and `lib/. - If unspecified, the system TBB (with header `tbb/tbb.h`) will be used via CMake's find_package(TBB)." + If unspecified, the system TBB (with header `tbb/tbb.h`) will be used via CMake's find_package(TBB)." "") @@ -15,15 +15,22 @@ register_flag_optional(PARTITIONER See https://spec.oneapi.com/versions/latest/elements/oneTBB/source/algorithms.html#partitioners for more details." "AUTO") +register_flag_optional(USE_VECTOR + "Whether to use std::vector for storage or use aligned_alloc. C++ vectors are *zero* initialised where as aligned_alloc is uninitialised before first use." 
+ "OFF") + macro(setup) if(ONE_TBB_DIR) set(TBB_ROOT "${ONE_TBB_DIR}") # see https://github.com/Kitware/VTK/blob/0a31a9a3c1531ae238ac96a372fec4be42282863/CMake/FindTBB.cmake#L34 # docs on Intel's website refers to TBB_DIR which is not correct endif() - + # see https://github.com/oneapi-src/oneTBB/blob/master/cmake/README.md#tbbconfig---integration-of-binary-packages find_package(TBB REQUIRED) register_link_library(TBB::tbb) register_definitions(PARTITIONER_${PARTITIONER}) + if(USE_VECTOR) + register_definitions(USE_VECTOR) + endif() endmacro()