From 6c47b22cc3e6731792364fba8f913e3ef898e44c Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Fri, 12 Nov 2021 12:38:54 +0000 Subject: [PATCH 1/9] Capture vectors by reference directly Add custom range implementation --- STDStream.cpp | 45 +++++++++++++++++++++++---------------------- STDStream.h | 39 +++++++++++++++++++++++++++++++++++---- 2 files changed, 58 insertions(+), 26 deletions(-) diff --git a/STDStream.cpp b/STDStream.cpp index 30ad420..536fd1c 100644 --- a/STDStream.cpp +++ b/STDStream.cpp @@ -14,62 +14,61 @@ // auto exe_policy = std::execution::par; auto exe_policy = std::execution::par_unseq; -template -STDStream::STDStream(const int ARRAY_SIZE, int device) - noexcept : array_size{ARRAY_SIZE}, a{new T[array_size]}, b{new T[array_size]}, c{new T[array_size]} -{ -} template -STDStream::~STDStream() +STDStream::STDStream(const int ARRAY_SIZE, int device) + noexcept : array_size{ARRAY_SIZE}, range(0, array_size), a(array_size), b(array_size), c(array_size) { - delete[] a; - delete[] b; - delete[] c; } template void STDStream::init_arrays(T initA, T initB, T initC) { - std::fill(exe_policy, a, a+array_size, initA); - std::fill(exe_policy, b, b+array_size, initB); - std::fill(exe_policy, c, c+array_size, initC); + std::fill(exe_policy, a.begin(), a.end(), initA); + std::fill(exe_policy, b.begin(), b.end(), initB); + std::fill(exe_policy, c.begin(), c.end(), initC); } template void STDStream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) { - std::copy(exe_policy, a, a+array_size, h_a.data()); - std::copy(exe_policy, b, b+array_size, h_b.data()); - std::copy(exe_policy, c, c+array_size, h_c.data()); + h_a = a; + h_b = b; + h_c = c; } template void STDStream::copy() { // c[i] = a[i] - std::copy(exe_policy, a, a+array_size, c) ; + std::copy(exe_policy, a.begin(), a.end(), c.begin()); } template void STDStream::mul() { // b[i] = scalar * c[i]; - std::transform(exe_policy, c, c+array_size, b, [](T ci){ return startScalar*ci; }); + std::transform(exe_policy, range.begin(), range.end(), b.begin(), [&, scalar = startScalar](int i) { + return scalar * c[i]; + }); } template void STDStream::add() { // c[i] = a[i] + b[i]; - std::transform(exe_policy, a, a+array_size, b, c, std::plus()); + std::transform(exe_policy, range.begin(), range.end(), c.begin(), [&](int i) { + return a[i] + b[i]; + }); } template void STDStream::triad() { // a[i] = b[i] + scalar * c[i]; - std::transform(exe_policy, b, b+array_size, c, a, [](T bi, T ci){ return bi+startScalar*ci; }); + std::transform(exe_policy, range.begin(), range.end(), a.begin(), [&, scalar = startScalar](int i) { + return b[i] + scalar * c[i]; + }); } template @@ -79,15 +78,17 @@ void STDStream::nstream() // Need to do in two stages with C++11 STL. // 1: a[i] += b[i] // 2: a[i] += scalar * c[i]; - std::transform(exe_policy, a, a+array_size, b, a, [](T ai, T bi){ return ai + bi; }); - std::transform(exe_policy, a, a+array_size, c, a, [](T ai, T ci){ return ai + startScalar*ci; }); + std::transform(exe_policy, range.begin(), range.end(), a.begin(), [&, scalar = startScalar](int i) { + return a[i] + b[i] + scalar * c[i]; + }); } + template T STDStream::dot() { // sum = 0; sum += a[i]*b[i]; return sum; - return std::transform_reduce(exe_policy, a, a+array_size, b, 0.0); + return std::transform_reduce(exe_policy, a.begin(), a.end(), b.begin(), 0.0); } void listDevices(void) diff --git a/STDStream.h b/STDStream.h index 2249812..b50bad1 100644 --- a/STDStream.h +++ b/STDStream.h @@ -11,6 +11,33 @@ #define IMPLEMENTATION_STRING "STD" +template +class ranged { + N from, to; +public: + ranged(N from, N to ): from(from), to(to) {} + class iterator { + N num; + public: + using difference_type = N; + using value_type = N; + using pointer = const N*; + using reference = N&; + using iterator_category = std::random_access_iterator_tag; + explicit iterator(N _num = 0) : num(_num) {} + iterator& operator++() { num++; return *this; } + iterator operator++(int) { iterator retval = *this; ++(*this); return retval; } + bool operator==(iterator other) const { return num == other.num; } + bool operator!=(iterator other) const { return *this != other; } + reference operator*() const { return num;} + difference_type operator-(const iterator &it) const { return num - it.num; } + value_type operator[](const difference_type &i) const { return num+i; } + + }; + iterator begin() { return iterator(from); } + iterator end() { return iterator(to >= from? to+1 : to-1); } +}; + template class STDStream : public Stream { @@ -18,14 +45,18 @@ class STDStream : public Stream // Size of arrays int array_size; + // induction range + ranged range; + // Device side pointers - T *a; - T *b; - T *c; + std::vector a; + std::vector b; + std::vector c; + public: STDStream(const int, int) noexcept; - ~STDStream(); + ~STDStream() = default; virtual void copy() override; virtual void add() override; From f9bba3c0de0035db88f0b6557435895938915af0 Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Wed, 15 Dec 2021 21:38:56 +0000 Subject: [PATCH 2/9] Split implementation for index/data oriented std variants Fix missing range iterator functions for std-indices Rename std20 to std-ranges --- CMakeLists.txt | 11 +- src/ci-test-compile.sh | 16 ++- src/main.cpp | 26 +++-- src/std-data/STDDataStream.cpp | 103 ++++++++++++++++++ src/std-data/STDDataStream.h | 42 +++++++ src/{std => std-data}/model.cmake | 0 .../STDIndicesStream.cpp} | 24 ++-- .../STDIndicesStream.h} | 21 ++-- src/std-indices/model.cmake | 33 ++++++ .../STDRangesStream.cpp} | 24 ++-- .../STDRangesStream.hpp} | 8 +- src/{std20 => std-ranges}/model.cmake | 0 12 files changed, 251 insertions(+), 57 deletions(-) create mode 100644 src/std-data/STDDataStream.cpp create mode 100644 src/std-data/STDDataStream.h rename src/{std => std-data}/model.cmake (100%) rename src/{std/STDStream.cpp => std-indices/STDIndicesStream.cpp} (79%) rename src/{std/STDStream.h => std-indices/STDIndicesStream.h} (80%) create mode 100644 src/std-indices/model.cmake rename src/{std20/STD20Stream.cpp => std-ranges/STDRangesStream.cpp} (79%) rename src/{std20/STD20Stream.hpp => std-ranges/STDRangesStream.hpp} (83%) rename src/{std20 => std-ranges}/model.cmake (100%) diff --git a/CMakeLists.txt b/CMakeLists.txt index ad12dbc..d47dcad 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -34,9 +34,9 @@ endmacro() #set(CUDA_ARCH sm_70) #set(BLT_DIR /home/tom/Downloads/blt-0.3.6/) -#set(MODEL STD) -#set(ARCH cc70) -#set(CXX_EXTRA_FLAGS -v) +#set(MODEL std-data) +#set(CMAKE_CXX_COMPILER /home/tom/Downloads/nvhpc_2021_219_Linux_x86_64_cuda_multi/install_components/Linux_x86_64/21.9/compilers/bin/nvc++) +#set(NVHPC_OFFLOAD "cuda11.4,cc61") #set(MODEL CUDA) #set(ARCH sm_70) @@ -117,8 +117,9 @@ include(cmake/register_models.cmake) # register out models register_model(omp OMP OMPStream.cpp) register_model(ocl OCL OCLStream.cpp) -register_model(std STD STDStream.cpp) -register_model(std20 STD20 STD20Stream.cpp) +register_model(std-data STD_DATA STDDataStream.cpp) +register_model(std-indices STD_INDICES STDIndicesStream.cpp) +register_model(std-ranges STD_RANGES STDRangesStream.cpp) register_model(hip HIP HIPStream.cpp) register_model(cuda CUDA CUDAStream.cu) register_model(kokkos KOKKOS KokkosStream.cpp) diff --git a/src/ci-test-compile.sh b/src/ci-test-compile.sh index 3f54aaf..9388643 100755 --- a/src/ci-test-compile.sh +++ b/src/ci-test-compile.sh @@ -115,7 +115,7 @@ run_build() { # GCC_STD_PAR_LIB="tbb" # CLANG_STD_PAR_LIB="tbb" # GCC_OMP_OFFLOAD_AMD=false -# GCC_OMP_OFFLOAD_NVIDIA=true +# GCC_OMP_OFFLOAD_NVIDIA=false # CLANG_OMP_OFFLOAD_AMD=false # CLANG_OMP_OFFLOAD_NVIDIA=false ### @@ -136,8 +136,9 @@ build_gcc() { fi # some distributions like Ubuntu bionic implements std par with TBB, so conditionally link it here - run_build $name "${GCC_CXX:?}" std "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}" - run_build $name "${GCC_CXX:?}" std20 "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}" + run_build $name "${GCC_CXX:?}" std-data "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}" + run_build $name "${GCC_CXX:?}" std-indices "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}" + run_build $name "${GCC_CXX:?}" std-ranges "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}" run_build $name "${GCC_CXX:?}" tbb "$cxx -DONE_TBB_DIR=$TBB_LIB" run_build $name "${GCC_CXX:?}" tbb "$cxx" # build TBB again with the system TBB @@ -211,7 +212,8 @@ build_clang() { run_build $name "${CLANG_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DMEM=PAGEFAULT" run_build $name "${CLANG_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON" run_build $name "${CLANG_CXX:?}" ocl "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}" - run_build $name "${CLANG_CXX:?}" std "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" + run_build $name "${CLANG_CXX:?}" std-data "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" + run_build $name "${CLANG_CXX:?}" std-indices "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" # run_build $name "${LANG_CXX:?}" std20 "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" # not yet supported run_build $name "${CLANG_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?}" run_build $name "${CLANG_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH" @@ -219,7 +221,8 @@ build_clang() { run_build $name "${CLANG_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DMEM=PAGEFAULT" run_build $name "${CLANG_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON" run_build $name "${CLANG_CXX:?}" ocl "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}" - run_build $name "${CLANG_CXX:?}" std "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" + run_build $name "${CLANG_CXX:?}" std-data "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" + run_build $name "${CLANG_CXX:?}" std-indices "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" # run_build $name "${LANG_CXX:?}" std20 "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" # not yet supported run_build $name "${CLANG_CXX:?}" tbb "$cxx -DONE_TBB_DIR=$TBB_LIB" @@ -232,7 +235,8 @@ build_clang() { build_nvhpc() { local name="nvhpc_build" local cxx="-DCMAKE_CXX_COMPILER=${NVHPC_NVCXX:?}" - run_build $name "${NVHPC_NVCXX:?}" std "$cxx -DNVHPC_OFFLOAD=$NV_ARCH_CCXY" + run_build $name "${NVHPC_NVCXX:?}" std-data "$cxx -DNVHPC_OFFLOAD=$NV_ARCH_CCXY" + run_build $name "${NVHPC_NVCXX:?}" std-indices "$cxx -DNVHPC_OFFLOAD=$NV_ARCH_CCXY" run_build $name "${NVHPC_NVCXX:?}" acc "$cxx -DTARGET_DEVICE=gpu -DTARGET_PROCESSOR=px -DCUDA_ARCH=$NV_ARCH_CCXY" run_build $name "${NVHPC_NVCXX:?}" acc "$cxx -DTARGET_DEVICE=multicore -DTARGET_PROCESSOR=zen" } diff --git a/src/main.cpp b/src/main.cpp index 5a01b74..13a0021 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -21,10 +21,12 @@ #if defined(CUDA) #include "CUDAStream.h" -#elif defined(STD) -#include "STDStream.h" -#elif defined(STD20) -#include "STD20Stream.hpp" +#elif defined(STD_DATA) +#include "STDDataStream.h" +#elif defined(STD_INDICES) +#include "STDIndicesStream.h" +#elif defined(STD_RANGES) +#include "STDRangesStream.hpp" #elif defined(TBB) #include "TBBStream.hpp" #elif defined(THRUST) @@ -264,13 +266,17 @@ void run() // Use the Kokkos implementation stream = new KokkosStream(ARRAY_SIZE, deviceIndex); -#elif defined(STD) - // Use the STD implementation - stream = new STDStream(ARRAY_SIZE, deviceIndex); +#elif defined(STD_DATA) + // Use the C++ STD data-oriented implementation + stream = new STDDataStream(ARRAY_SIZE, deviceIndex); -#elif defined(STD20) - // Use the C++20 implementation - stream = new STD20Stream(ARRAY_SIZE, deviceIndex); +#elif defined(STD_INDICES) + // Use the C++ STD index-oriented implementation + stream = new STDIndicesStream(ARRAY_SIZE, deviceIndex); + +#elif defined(STD_RANGES) + // Use the C++ STD ranges implementation + stream = new STDRangesStream(ARRAY_SIZE, deviceIndex); #elif defined(TBB) // Use the C++20 implementation diff --git a/src/std-data/STDDataStream.cpp b/src/std-data/STDDataStream.cpp new file mode 100644 index 0000000..64a37a4 --- /dev/null +++ b/src/std-data/STDDataStream.cpp @@ -0,0 +1,103 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#include "STDDataStream.h" + +#include +#include +#include + +// There are three execution policies: +// auto exe_policy = std::execution::seq; +// auto exe_policy = std::execution::par; +auto exe_policy = std::execution::par_unseq; + + +template +STDDataStream::STDDataStream(const int ARRAY_SIZE, int device) + noexcept : array_size{ARRAY_SIZE}, a(array_size), b(array_size), c(array_size) +{ +} + +template +void STDDataStream::init_arrays(T initA, T initB, T initC) +{ + std::fill(exe_policy, a.begin(), a.end(), initA); + std::fill(exe_policy, b.begin(), b.end(), initB); + std::fill(exe_policy, c.begin(), c.end(), initC); +} + +template +void STDDataStream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) +{ + h_a = a; + h_b = b; + h_c = c; +} + +template +void STDDataStream::copy() +{ + // c[i] = a[i] + std::copy(exe_policy, a.begin(), a.end(), c.begin()); +} + +template +void STDDataStream::mul() +{ + // b[i] = scalar * c[i]; + std::transform(exe_policy, c.begin(), c.end(), b.begin(), [scalar = startScalar](T ci){ return scalar*ci; }); +} + +template +void STDDataStream::add() +{ + // c[i] = a[i] + b[i]; + std::transform(exe_policy, a.begin(), a.end(), b.begin(), c.begin(), std::plus()); +} + +template +void STDDataStream::triad() +{ + // a[i] = b[i] + scalar * c[i]; + std::transform(exe_policy, b.begin(), b.end(), c.begin(), a.begin(), [scalar = startScalar](T bi, T ci){ return bi+scalar*ci; }); +} + +template +void STDDataStream::nstream() +{ + // a[i] += b[i] + scalar * c[i]; + // Need to do in two stages with C++11 STL. + // 1: a[i] += b[i] + // 2: a[i] += scalar * c[i]; + std::transform(exe_policy, a.begin(), a.end(), b.begin(), a.begin(), [](T ai, T bi){ return ai + bi; }); + std::transform(exe_policy, a.begin(), a.end(), c.begin(), a.begin(), [scalar = startScalar](T ai, T ci){ return ai + scalar*ci; }); +} + + +template +T STDDataStream::dot() +{ + // sum = 0; sum += a[i]*b[i]; return sum; + return std::transform_reduce(exe_policy, a.begin(), a.end(), b.begin(), 0.0); +} + +void listDevices(void) +{ + std::cout << "Listing devices is not supported by the Parallel STL" << std::endl; +} + +std::string getDeviceName(const int) +{ + return std::string("Device name unavailable"); +} + +std::string getDeviceDriver(const int) +{ + return std::string("Device driver unavailable"); +} +template class STDDataStream; +template class STDDataStream; + diff --git a/src/std-data/STDDataStream.h b/src/std-data/STDDataStream.h new file mode 100644 index 0000000..7279070 --- /dev/null +++ b/src/std-data/STDDataStream.h @@ -0,0 +1,42 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#pragma once + +#include +#include +#include "Stream.h" + +#define IMPLEMENTATION_STRING "STD (data-oriented)" + + +template +class STDDataStream : public Stream +{ + protected: + // Size of arrays + int array_size; + + // Device side pointers + std::vector a; + std::vector b; + std::vector c; + + + public: + STDDataStream(const int, int) noexcept; + ~STDDataStream() = default; + + virtual void copy() override; + virtual void add() override; + virtual void mul() override; + virtual void triad() override; + virtual void nstream() override; + virtual T dot() override; + + virtual void init_arrays(T initA, T initB, T initC) override; + virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; +}; + diff --git a/src/std/model.cmake b/src/std-data/model.cmake similarity index 100% rename from src/std/model.cmake rename to src/std-data/model.cmake diff --git a/src/std/STDStream.cpp b/src/std-indices/STDIndicesStream.cpp similarity index 79% rename from src/std/STDStream.cpp rename to src/std-indices/STDIndicesStream.cpp index 536fd1c..e16fb91 100644 --- a/src/std/STDStream.cpp +++ b/src/std-indices/STDIndicesStream.cpp @@ -3,7 +3,7 @@ // For full license terms please see the LICENSE file distributed with this // source code -#include "STDStream.h" +#include "STDIndicesStream.h" #include #include @@ -16,13 +16,13 @@ auto exe_policy = std::execution::par_unseq; template -STDStream::STDStream(const int ARRAY_SIZE, int device) +STDIndicesStream::STDIndicesStream(const int ARRAY_SIZE, int device) noexcept : array_size{ARRAY_SIZE}, range(0, array_size), a(array_size), b(array_size), c(array_size) { } template -void STDStream::init_arrays(T initA, T initB, T initC) +void STDIndicesStream::init_arrays(T initA, T initB, T initC) { std::fill(exe_policy, a.begin(), a.end(), initA); std::fill(exe_policy, b.begin(), b.end(), initB); @@ -30,7 +30,7 @@ void STDStream::init_arrays(T initA, T initB, T initC) } template -void STDStream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) +void STDIndicesStream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) { h_a = a; h_b = b; @@ -38,14 +38,14 @@ void STDStream::read_arrays(std::vector& h_a, std::vector& h_b, std::ve } template -void STDStream::copy() +void STDIndicesStream::copy() { // c[i] = a[i] std::copy(exe_policy, a.begin(), a.end(), c.begin()); } template -void STDStream::mul() +void STDIndicesStream::mul() { // b[i] = scalar * c[i]; std::transform(exe_policy, range.begin(), range.end(), b.begin(), [&, scalar = startScalar](int i) { @@ -54,7 +54,7 @@ void STDStream::mul() } template -void STDStream::add() +void STDIndicesStream::add() { // c[i] = a[i] + b[i]; std::transform(exe_policy, range.begin(), range.end(), c.begin(), [&](int i) { @@ -63,7 +63,7 @@ void STDStream::add() } template -void STDStream::triad() +void STDIndicesStream::triad() { // a[i] = b[i] + scalar * c[i]; std::transform(exe_policy, range.begin(), range.end(), a.begin(), [&, scalar = startScalar](int i) { @@ -72,7 +72,7 @@ void STDStream::triad() } template -void STDStream::nstream() +void STDIndicesStream::nstream() { // a[i] += b[i] + scalar * c[i]; // Need to do in two stages with C++11 STL. @@ -85,7 +85,7 @@ void STDStream::nstream() template -T STDStream::dot() +T STDIndicesStream::dot() { // sum = 0; sum += a[i]*b[i]; return sum; return std::transform_reduce(exe_policy, a.begin(), a.end(), b.begin(), 0.0); @@ -105,6 +105,6 @@ std::string getDeviceDriver(const int) { return std::string("Device driver unavailable"); } -template class STDStream; -template class STDStream; +template class STDIndicesStream; +template class STDIndicesStream; diff --git a/src/std/STDStream.h b/src/std-indices/STDIndicesStream.h similarity index 80% rename from src/std/STDStream.h rename to src/std-indices/STDIndicesStream.h index b50bad1..87592a5 100644 --- a/src/std/STDStream.h +++ b/src/std-indices/STDIndicesStream.h @@ -9,11 +9,11 @@ #include #include "Stream.h" -#define IMPLEMENTATION_STRING "STD" +#define IMPLEMENTATION_STRING "STD (index-oriented)" -template +template class ranged { - N from, to; + N from, to; public: ranged(N from, N to ): from(from), to(to) {} class iterator { @@ -22,16 +22,21 @@ public: using difference_type = N; using value_type = N; using pointer = const N*; - using reference = N&; + using reference = const N&; using iterator_category = std::random_access_iterator_tag; explicit iterator(N _num = 0) : num(_num) {} + iterator& operator++() { num++; return *this; } iterator operator++(int) { iterator retval = *this; ++(*this); return retval; } + iterator operator+(const value_type v) const { return iterator(num + v); } + bool operator==(iterator other) const { return num == other.num; } bool operator!=(iterator other) const { return *this != other; } + bool operator<(iterator other) const { return num < other.num; } + reference operator*() const { return num;} difference_type operator-(const iterator &it) const { return num - it.num; } - value_type operator[](const difference_type &i) const { return num+i; } + value_type operator[](const difference_type &i) const { return num + i; } }; iterator begin() { return iterator(from); } @@ -39,7 +44,7 @@ public: }; template -class STDStream : public Stream +class STDIndicesStream : public Stream { protected: // Size of arrays @@ -55,8 +60,8 @@ class STDStream : public Stream public: - STDStream(const int, int) noexcept; - ~STDStream() = default; + STDIndicesStream(const int, int) noexcept; + ~STDIndicesStream() = default; virtual void copy() override; virtual void add() override; diff --git a/src/std-indices/model.cmake b/src/std-indices/model.cmake new file mode 100644 index 0000000..ef69f30 --- /dev/null +++ b/src/std-indices/model.cmake @@ -0,0 +1,33 @@ + +register_flag_optional(CMAKE_CXX_COMPILER + "Any CXX compiler that is supported by CMake detection" + "c++") + +register_flag_optional(NVHPC_OFFLOAD + "Enable offloading support (via the non-standard `-stdpar`) for the new NVHPC SDK. + The values are Nvidia architectures in ccXY format will be passed in via `-gpu=` (e.g `cc70`) + + Possible values are: + cc35 - Compile for compute capability 3.5 + cc50 - Compile for compute capability 5.0 + cc60 - Compile for compute capability 6.0 + cc62 - Compile for compute capability 6.2 + cc70 - Compile for compute capability 7.0 + cc72 - Compile for compute capability 7.2 + cc75 - Compile for compute capability 7.5 + cc80 - Compile for compute capability 8.0 + ccall - Compile for all supported compute capabilities" + "") + +macro(setup) + set(CMAKE_CXX_STANDARD 17) + + if (NVHPC_OFFLOAD) + set(NVHPC_FLAGS -stdpar -gpu=${NVHPC_OFFLOAD}) + # propagate flags to linker so that it links with the gpu stuff as well + register_append_cxx_flags(ANY ${NVHPC_FLAGS}) + register_append_link_flags(${NVHPC_FLAGS}) + endif () + + +endmacro() diff --git a/src/std20/STD20Stream.cpp b/src/std-ranges/STDRangesStream.cpp similarity index 79% rename from src/std20/STD20Stream.cpp rename to src/std-ranges/STDRangesStream.cpp index 8290033..de61528 100644 --- a/src/std20/STD20Stream.cpp +++ b/src/std-ranges/STDRangesStream.cpp @@ -4,14 +4,14 @@ // For full license terms please see the LICENSE file distributed with this // source code -#include "STD20Stream.hpp" +#include "STDRangesStream.hpp" #include #include #include template -STD20Stream::STD20Stream(const int ARRAY_SIZE, int device) +STDRangesStream::STDRangesStream(const int ARRAY_SIZE, int device) : array_size{ARRAY_SIZE} { a = std::vector(array_size); @@ -20,7 +20,7 @@ STD20Stream::STD20Stream(const int ARRAY_SIZE, int device) } template -void STD20Stream::init_arrays(T initA, T initB, T initC) +void STDRangesStream::init_arrays(T initA, T initB, T initC) { std::for_each_n( std::execution::par_unseq, @@ -34,7 +34,7 @@ void STD20Stream::init_arrays(T initA, T initB, T initC) } template -void STD20Stream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) +void STDRangesStream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) { // Element-wise copy. h_a = a; @@ -43,7 +43,7 @@ void STD20Stream::read_arrays(std::vector& h_a, std::vector& h_b, std:: } template -void STD20Stream::copy() +void STDRangesStream::copy() { std::for_each_n( std::execution::par_unseq, @@ -55,7 +55,7 @@ void STD20Stream::copy() } template -void STD20Stream::mul() +void STDRangesStream::mul() { const T scalar = startScalar; @@ -69,7 +69,7 @@ void STD20Stream::mul() } template -void STD20Stream::add() +void STDRangesStream::add() { std::for_each_n( std::execution::par_unseq, @@ -81,7 +81,7 @@ void STD20Stream::add() } template -void STD20Stream::triad() +void STDRangesStream::triad() { const T scalar = startScalar; @@ -95,7 +95,7 @@ void STD20Stream::triad() } template -void STD20Stream::nstream() +void STDRangesStream::nstream() { const T scalar = startScalar; @@ -109,7 +109,7 @@ void STD20Stream::nstream() } template -T STD20Stream::dot() +T STDRangesStream::dot() { // sum += a[i] * b[i]; return @@ -133,6 +133,6 @@ std::string getDeviceDriver(const int) return std::string("Device driver unavailable"); } -template class STD20Stream; -template class STD20Stream; +template class STDRangesStream; +template class STDRangesStream; diff --git a/src/std20/STD20Stream.hpp b/src/std-ranges/STDRangesStream.hpp similarity index 83% rename from src/std20/STD20Stream.hpp rename to src/std-ranges/STDRangesStream.hpp index e5daa3c..890e893 100644 --- a/src/std20/STD20Stream.hpp +++ b/src/std-ranges/STDRangesStream.hpp @@ -11,10 +11,10 @@ #include "Stream.h" -#define IMPLEMENTATION_STRING "C++20" +#define IMPLEMENTATION_STRING "STD C++ ranges" template -class STD20Stream : public Stream +class STDRangesStream : public Stream { protected: // Size of arrays @@ -26,8 +26,8 @@ class STD20Stream : public Stream std::vector c; public: - STD20Stream(const int, int); - ~STD20Stream() = default; + STDRangesStream(const int, int); + ~STDRangesStream() = default; virtual void copy() override; virtual void add() override; diff --git a/src/std20/model.cmake b/src/std-ranges/model.cmake similarity index 100% rename from src/std20/model.cmake rename to src/std-ranges/model.cmake From fbd2e1bdc1b040db645f771b5d0efa7c01e2741c Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Wed, 22 Dec 2021 09:10:41 +0000 Subject: [PATCH 3/9] Add comments for custom range iterator Remove debug leftovers from CMakeList Update copyright --- CMakeLists.txt | 40 +--------------------------- src/std-data/STDDataStream.cpp | 3 ++- src/std-data/STDDataStream.h | 3 ++- src/std-indices/STDIndicesStream.cpp | 3 ++- src/std-indices/STDIndicesStream.h | 7 ++++- 5 files changed, 13 insertions(+), 43 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d47dcad..3730602 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,6 +2,7 @@ cmake_minimum_required(VERSION 3.13 FATAL_ERROR) project(BabelStream VERSION 3.5 LANGUAGES CXX) +# uncomment for debugging build issues: #set(CMAKE_VERBOSE_MAKEFILE ON) # some nicer defaults for standard C++ @@ -23,45 +24,6 @@ macro(setup_opencl_header_includes) endif () endmacro() -#set(MODEL SYCL) -#set(SYCL_COMPILER COMPUTECPP) -#set(SYCL_COMPILER_DIR /home/tom/Desktop/computecpp_archive/ComputeCpp-CE-2.3.0-x86_64-linux-gnu) -#set(MODEL RAJA) -#set(RAJA_IN_TREE /home/tom/Downloads/RAJA-v0.13.0/) -#set(ENABLE_CUDA ON) -#set(TARGET NVIDIA) -#set(CUDA_TOOLKIT_ROOT_DIR /opt/cuda-11.2) -#set(CUDA_ARCH sm_70) -#set(BLT_DIR /home/tom/Downloads/blt-0.3.6/) - -#set(MODEL std-data) -#set(CMAKE_CXX_COMPILER /home/tom/Downloads/nvhpc_2021_219_Linux_x86_64_cuda_multi/install_components/Linux_x86_64/21.9/compilers/bin/nvc++) -#set(NVHPC_OFFLOAD "cuda11.4,cc61") - -#set(MODEL CUDA) -#set(ARCH sm_70) -#set(CMAKE_CUDA_COMPILER /opt/cuda-11.2/bin/nvcc) - -#set(MODEL OCL) -#set(OpenCL_LIBRARY /opt/rocm-4.0.0/opencl/lib/libOpenCL.so) -#set(OpenCL_INCLUDE_DIR /opt/rocm-4.0.0/opencl/lib) -#set(RELEASE_FLAGS -Ofast) -#set(CXX_EXTRA_FLAGS -O2) - -#set(CMAKE_CXX_COMPILER /usr/lib/aomp/bin/clang++) -#set(MODEL omp) -##set(OFFLOAD "AMD:gfx803") -#set(OFFLOAD "NVIDIA:sm_35") -#set(CXX_EXTRA_FLAGS --cuda-path=/opt/cuda-10.2/) - -#set(OFFLOAD "AMD:_70") -#set(CXX_EXTRA_FLAGS --cuda-path=/opt/cuda-10.2/ --gcc-toolchain=/home/tom/spack/opt/spack/linux-fedora33-zen2/gcc-10.2.1/gcc-8.3.0-latmjo2hl2yv53255xkwko7k3y7bx2vv) -#set(CXX_EXTRA_LINKER_FLAGS ) -#set(MODEL HIP) - -#set(MODEL KOKKOS) -#set(KOKKOS_IN_TREE /home/tom/Downloads/kokkos-3.3.00/) - # the final executable name set(EXE_NAME babelstream) diff --git a/src/std-data/STDDataStream.cpp b/src/std-data/STDDataStream.cpp index 64a37a4..a2628e1 100644 --- a/src/std-data/STDDataStream.cpp +++ b/src/std-data/STDDataStream.cpp @@ -1,4 +1,5 @@ -// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// Copyright (c) 2020 Tom Deakin +// University of Bristol HPC // // For full license terms please see the LICENSE file distributed with this // source code diff --git a/src/std-data/STDDataStream.h b/src/std-data/STDDataStream.h index 7279070..f8bc302 100644 --- a/src/std-data/STDDataStream.h +++ b/src/std-data/STDDataStream.h @@ -1,4 +1,5 @@ -// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// Copyright (c) 2020 Tom Deakin +// University of Bristol HPC // // For full license terms please see the LICENSE file distributed with this // source code diff --git a/src/std-indices/STDIndicesStream.cpp b/src/std-indices/STDIndicesStream.cpp index e16fb91..a88dd18 100644 --- a/src/std-indices/STDIndicesStream.cpp +++ b/src/std-indices/STDIndicesStream.cpp @@ -1,4 +1,5 @@ -// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// Copyright (c) 2020 Tom Deakin +// University of Bristol HPC // // For full license terms please see the LICENSE file distributed with this // source code diff --git a/src/std-indices/STDIndicesStream.h b/src/std-indices/STDIndicesStream.h index 87592a5..66c8bb0 100644 --- a/src/std-indices/STDIndicesStream.h +++ b/src/std-indices/STDIndicesStream.h @@ -1,4 +1,5 @@ -// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// Copyright (c) 2020 Tom Deakin +// University of Bristol HPC // // For full license terms please see the LICENSE file distributed with this // source code @@ -11,6 +12,10 @@ #define IMPLEMENTATION_STRING "STD (index-oriented)" + +// A lightweight counting iterator which will be used by the STL algorithms +// NB: C++ <= 17 doesn't have this built-in, and it's only added later in ranges-v3 (C++2a) which this +// implementation doesn't target template class ranged { N from, to; From 6ea09a9620674d1bf2626cacda582b7ec6256ee4 Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Wed, 22 Dec 2021 09:14:46 +0000 Subject: [PATCH 4/9] Add oneDPL support for std-data and std-indices --- CMakeLists.txt | 2 +- src/std-data/STDDataStream.cpp | 41 +++++++---- src/std-data/STDDataStream.h | 52 ++++++++++++-- src/std-data/model.cmake | 50 +++++++++++++ src/std-indices/STDIndicesStream.cpp | 51 +++++++++----- src/std-indices/STDIndicesStream.h | 102 ++++++++++++++++++--------- src/std-indices/model.cmake | 50 +++++++++++++ 7 files changed, 274 insertions(+), 74 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3730602..edf05f5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,7 +3,7 @@ cmake_minimum_required(VERSION 3.13 FATAL_ERROR) project(BabelStream VERSION 3.5 LANGUAGES CXX) # uncomment for debugging build issues: -#set(CMAKE_VERBOSE_MAKEFILE ON) +set(CMAKE_VERBOSE_MAKEFILE ON) # some nicer defaults for standard C++ set(CMAKE_CXX_EXTENSIONS OFF) diff --git a/src/std-data/STDDataStream.cpp b/src/std-data/STDDataStream.cpp index a2628e1..0be23a3 100644 --- a/src/std-data/STDDataStream.cpp +++ b/src/std-data/STDDataStream.cpp @@ -5,21 +5,31 @@ // source code #include "STDDataStream.h" - -#include -#include -#include - -// There are three execution policies: -// auto exe_policy = std::execution::seq; -// auto exe_policy = std::execution::par; -auto exe_policy = std::execution::par_unseq; - +#include template -STDDataStream::STDDataStream(const int ARRAY_SIZE, int device) - noexcept : array_size{ARRAY_SIZE}, a(array_size), b(array_size), c(array_size) +STDDataStream::STDDataStream(const int ARRAY_SIZE, int device) : array_size{ARRAY_SIZE}, +#if defined(ONEDPL_USE_DPCPP_BACKEND) + exe_policy(oneapi::dpl::execution::make_device_policy(cl::sycl::default_selector{})), + allocator(exe_policy.queue()), + a(array_size, allocator), b(array_size, allocator), c(array_size, allocator) +#else + a(array_size), b(array_size),c(array_size) +#endif { +#if USE_ONEDPL + std::cout << "Using oneDPL backend: "; + #if defined(ONEDPL_USE_DPCPP_BACKEND) + std::cout << "SYCL USM (device=" << exe_policy.queue().get_device().get_info() << ")"; + #elif defined(ONEDPL_USE_TBB_BACKEND) + std::cout << "TBB"; + #elif defined(ONEDPL_USE_OPENMP_BACKEND) + std::cout << "OpenMP"; + #else + std::cout << "Default"; + #endif + std::cout << std::endl; +#endif } template @@ -33,9 +43,10 @@ void STDDataStream::init_arrays(T initA, T initB, T initC) template void STDDataStream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) { - h_a = a; - h_b = b; - h_c = c; + // operator = is deleted because h_* vectors may have different allocator type compared to ours + std::copy(a.begin(), a.end(), h_a.begin()); + std::copy(b.begin(), b.end(), h_b.begin()); + std::copy(c.begin(), c.end(), h_c.begin()); } template diff --git a/src/std-data/STDDataStream.h b/src/std-data/STDDataStream.h index f8bc302..c9a1f43 100644 --- a/src/std-data/STDDataStream.h +++ b/src/std-data/STDDataStream.h @@ -6,12 +6,35 @@ #pragma once -#include -#include #include "Stream.h" #define IMPLEMENTATION_STRING "STD (data-oriented)" +#ifdef USE_ONEDPL + #define PSTL_USAGE_WARNINGS 1 + + #include + #include + #include + #include + #include + + #ifdef ONEDPL_USE_DPCPP_BACKEND + #include + #endif +#else + #include + #include + #include + + #if defined(ONEDPL_USE_DPCPP_BACKEND) || \ + defined(ONEDPL_USE_TBB_BACKEND) || \ + defined(ONEDPL_USE_OPENMP_BACKEND) + #error oneDPL missing (ONEDPL_VERSION_MAJOR not defined) but backend (ONEDPL_USE_*_BACKEND) specified + #endif + +#endif + template class STDDataStream : public Stream @@ -20,14 +43,31 @@ class STDDataStream : public Stream // Size of arrays int array_size; +#if defined(ONEDPL_USE_DPCPP_BACKEND) + // SYCL oneDPL backend + using ExecutionPolicy = oneapi::dpl::execution::device_policy<>; + using Allocator = sycl::usm_allocator; +#elif defined(USE_ONEDPL) + // every other non-SYCL oneDPL backend (i.e TBB, OMP) + using ExecutionPolicy = decltype(oneapi::dpl::execution::par_unseq); + using Allocator = std::allocator; +#else + // normal std execution policies + using ExecutionPolicy = decltype(std::execution::par_unseq); + using Allocator = std::allocator; +#endif + + ExecutionPolicy exe_policy{}; + Allocator allocator; + // Device side pointers - std::vector a; - std::vector b; - std::vector c; + std::vector a; + std::vector b; + std::vector c; public: - STDDataStream(const int, int) noexcept; + STDDataStream(const int, int); ~STDDataStream() = default; virtual void copy() override; diff --git a/src/std-data/model.cmake b/src/std-data/model.cmake index ef69f30..2454295 100644 --- a/src/std-data/model.cmake +++ b/src/std-data/model.cmake @@ -19,9 +19,30 @@ register_flag_optional(NVHPC_OFFLOAD ccall - Compile for all supported compute capabilities" "") +register_flag_optional(ONEDPL_OFFLOAD + "Use the DPC++ oneDPL library which supports STL algorithms on SYCL, TBB, and OpenMP. + This option only supports the oneDPL library shipped with oneAPI, and must use the dpcpp + compiler (i.e -DCMAKE_CXX_COMPILER=dpcpp) for this option. + Make sure your oneAPI installation includes at least the following components: dpcpp, onedpl, onetbb. + The env. variable `TBBROOT` needs to point to the base directory of your TBB install (e.g /opt/intel/oneapi/tbb/latest/). + This should be done by oneAPI's `setvars.sh` script automatically. + + Possible values are: + TBB - Use the TBB backend, the correct TBB library will be linked from oneAPI automatically. + OMP - Use the OpenMP backend + DPCPP - Use the SYCL (via dpcpp) backend with the default selector. + See https://intel.github.io/llvm-docs/EnvironmentVariables.html#sycl-device-filter + on selecting a non-default device or SYCL backend." + "") + + macro(setup) set(CMAKE_CXX_STANDARD 17) + if (NVHPC_OFFLOAD AND ONEDPL_OFFLOAD) + message(FATAL_ERROR "NVHPC_OFFLOAD and NVHPC_OFFLOAD are mutually exclusive") + endif () + if (NVHPC_OFFLOAD) set(NVHPC_FLAGS -stdpar -gpu=${NVHPC_OFFLOAD}) # propagate flags to linker so that it links with the gpu stuff as well @@ -30,4 +51,33 @@ macro(setup) endif () + if (ONEDPL_OFFLOAD) + set(CXX_EXTRA_FLAGS) + set(CXX_EXTRA_LIBRARIES /opt/intel/oneapi/tbb/2021.4.0/lib/intel64/gcc4.8/libtbb.so) + # propagate flags to linker so that it links with the gpu stuff as well + register_append_cxx_flags(ANY -fopenmp -fsycl-unnamed-lambda -fsycl) + + # XXX see https://www.intel.com/content/www/us/en/develop/documentation/oneapi-dpcpp-library-guide/top/oneapi-dpc-library-onedpl-overview.html + # this is to avoid the system TBB headers (if exists) from having precedence which isn't compatible with oneDPL's par implementation + register_definitions( + PSTL_USE_PARALLEL_POLICIES=0 + _GLIBCXX_USE_TBB_PAR_BACKEND=0 + ) + + register_definitions(USE_ONEDPL) + if (ONEDPL_OFFLOAD STREQUAL "TBB") + register_definitions(ONEDPL_USE_TBB_BACKEND=1) + elseif (ONEDPL_OFFLOAD STREQUAL "OPENMP") + register_definitions(ONEDPL_USE_OPENMP_BACKEND=1) + elseif (ONEDPL_OFFLOAD STREQUAL "SYCL") + register_definitions(ONEDPL_USE_DPCPP_BACKEND=1) + else () + message(FATAL_ERROR "Unsupported ONEDPL_OFFLOAD backend: ${ONEDPL_OFFLOAD}") + endif () + + # even with the workaround above, -ltbb may still end up with the wrong one, so be explicit here + register_link_library($ENV{TBBROOT}/lib/intel64/gcc4.8/libtbb.so) + + endif () + endmacro() diff --git a/src/std-indices/STDIndicesStream.cpp b/src/std-indices/STDIndicesStream.cpp index a88dd18..ceb9d3d 100644 --- a/src/std-indices/STDIndicesStream.cpp +++ b/src/std-indices/STDIndicesStream.cpp @@ -5,21 +5,32 @@ // source code #include "STDIndicesStream.h" - -#include -#include -#include - -// There are three execution policies: -// auto exe_policy = std::execution::seq; -// auto exe_policy = std::execution::par; -auto exe_policy = std::execution::par_unseq; - +#include template -STDIndicesStream::STDIndicesStream(const int ARRAY_SIZE, int device) - noexcept : array_size{ARRAY_SIZE}, range(0, array_size), a(array_size), b(array_size), c(array_size) +STDIndicesStream::STDIndicesStream(const int ARRAY_SIZE, int device) : +array_size{ARRAY_SIZE}, range_start(0), range_end(array_size), +#if defined(ONEDPL_USE_DPCPP_BACKEND) +exe_policy(oneapi::dpl::execution::make_device_policy(cl::sycl::default_selector{})), + allocator(exe_policy.queue()), + a(array_size, allocator), b(array_size, allocator), c(array_size, allocator) +#else +a(array_size), b(array_size),c(array_size) +#endif { +#if USE_ONEDPL + std::cout << "Using oneDPL backend: "; + #if defined(ONEDPL_USE_DPCPP_BACKEND) + std::cout << "SYCL USM (device=" << exe_policy.queue().get_device().get_info() << ")"; + #elif defined(ONEDPL_USE_TBB_BACKEND) + std::cout << "TBB"; + #elif defined(ONEDPL_USE_OPENMP_BACKEND) + std::cout << "OpenMP"; + #else + std::cout << "Default"; + #endif + std::cout << std::endl; +#endif } template @@ -33,11 +44,13 @@ void STDIndicesStream::init_arrays(T initA, T initB, T initC) template void STDIndicesStream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) { - h_a = a; - h_b = b; - h_c = c; + // operator = is deleted because h_* vectors may have different allocator type compared to ours + std::copy(a.begin(), a.end(), h_a.begin()); + std::copy(b.begin(), b.end(), h_b.begin()); + std::copy(c.begin(), c.end(), h_c.begin()); } + template void STDIndicesStream::copy() { @@ -49,7 +62,7 @@ template void STDIndicesStream::mul() { // b[i] = scalar * c[i]; - std::transform(exe_policy, range.begin(), range.end(), b.begin(), [&, scalar = startScalar](int i) { + std::transform(exe_policy, range_start, range_end, b.begin(), [&, scalar = startScalar](int i) { return scalar * c[i]; }); } @@ -58,7 +71,7 @@ template void STDIndicesStream::add() { // c[i] = a[i] + b[i]; - std::transform(exe_policy, range.begin(), range.end(), c.begin(), [&](int i) { + std::transform(exe_policy, range_start, range_end, c.begin(), [&](int i) { return a[i] + b[i]; }); } @@ -67,7 +80,7 @@ template void STDIndicesStream::triad() { // a[i] = b[i] + scalar * c[i]; - std::transform(exe_policy, range.begin(), range.end(), a.begin(), [&, scalar = startScalar](int i) { + std::transform(exe_policy, range_start, range_end, a.begin(), [&, scalar = startScalar](int i) { return b[i] + scalar * c[i]; }); } @@ -79,7 +92,7 @@ void STDIndicesStream::nstream() // Need to do in two stages with C++11 STL. // 1: a[i] += b[i] // 2: a[i] += scalar * c[i]; - std::transform(exe_policy, range.begin(), range.end(), a.begin(), [&, scalar = startScalar](int i) { + std::transform(exe_policy, range_start, range_end, a.begin(), [&, scalar = startScalar](int i) { return a[i] + b[i] + scalar * c[i]; }); } diff --git a/src/std-indices/STDIndicesStream.h b/src/std-indices/STDIndicesStream.h index 66c8bb0..dfb63f6 100644 --- a/src/std-indices/STDIndicesStream.h +++ b/src/std-indices/STDIndicesStream.h @@ -6,46 +6,62 @@ #pragma once -#include -#include #include "Stream.h" #define IMPLEMENTATION_STRING "STD (index-oriented)" + +#ifdef USE_ONEDPL + #define PSTL_USAGE_WARNINGS 1 + + #include + #include + #include + #include + #include + + #ifdef ONEDPL_USE_DPCPP_BACKEND + #include + #endif +#else + #include + #include + #include + + #if defined(ONEDPL_USE_DPCPP_BACKEND) || \ + defined(ONEDPL_USE_TBB_BACKEND) || \ + defined(ONEDPL_USE_OPENMP_BACKEND) + #error oneDPL missing (ONEDPL_VERSION_MAJOR not defined) but backend (ONEDPL_USE_*_BACKEND) specified + #endif + +#endif + // A lightweight counting iterator which will be used by the STL algorithms // NB: C++ <= 17 doesn't have this built-in, and it's only added later in ranges-v3 (C++2a) which this // implementation doesn't target template -class ranged { - N from, to; +class ranged_iterator { + N num; public: - ranged(N from, N to ): from(from), to(to) {} - class iterator { - N num; - public: - using difference_type = N; - using value_type = N; - using pointer = const N*; - using reference = const N&; - using iterator_category = std::random_access_iterator_tag; - explicit iterator(N _num = 0) : num(_num) {} + using difference_type = N; + using value_type = N; + using pointer = const N*; + using reference = const N&; + using iterator_category = std::random_access_iterator_tag; + explicit ranged_iterator(N _num = 0) : num(_num) {} - iterator& operator++() { num++; return *this; } - iterator operator++(int) { iterator retval = *this; ++(*this); return retval; } - iterator operator+(const value_type v) const { return iterator(num + v); } + ranged_iterator& operator++() { num++; return *this; } + ranged_iterator operator++(int) { ranged_iterator retval = *this; ++(*this); return retval; } + ranged_iterator operator+(const value_type v) const { return ranged_iterator(num + v); } - bool operator==(iterator other) const { return num == other.num; } - bool operator!=(iterator other) const { return *this != other; } - bool operator<(iterator other) const { return num < other.num; } + bool operator==(ranged_iterator other) const { return num == other.num; } + bool operator!=(ranged_iterator other) const { return *this != other; } + bool operator<(ranged_iterator other) const { return num < other.num; } - reference operator*() const { return num;} - difference_type operator-(const iterator &it) const { return num - it.num; } - value_type operator[](const difference_type &i) const { return num + i; } - - }; - iterator begin() { return iterator(from); } - iterator end() { return iterator(to >= from? to+1 : to-1); } + reference operator*() const { return num;} + difference_type operator-(const ranged_iterator &it) const { return num - it.num; } + value_type operator[](const difference_type &i) const { return num + i; } }; template @@ -55,17 +71,37 @@ class STDIndicesStream : public Stream // Size of arrays int array_size; - // induction range - ranged range; +#if defined(ONEDPL_USE_DPCPP_BACKEND) + // SYCL oneDPL backend + using ExecutionPolicy = oneapi::dpl::execution::device_policy<>; + using Allocator = sycl::usm_allocator; + using IteratorType = oneapi::dpl::counting_iterator; +#elif defined(USE_ONEDPL) + // every other non-SYCL oneDPL backend (i.e TBB, OMP) + using ExecutionPolicy = decltype(oneapi::dpl::execution::par_unseq); + using Allocator = std::allocator; + using IteratorType = oneapi::dpl::counting_iterator; +#else + // normal std execution policies + using ExecutionPolicy = decltype(std::execution::par_unseq); + using Allocator = std::allocator; + using IteratorType = ranged_iterator; +#endif + + IteratorType range_start; + IteratorType range_end; + + ExecutionPolicy exe_policy{}; + Allocator allocator; // Device side pointers - std::vector a; - std::vector b; - std::vector c; + std::vector a; + std::vector b; + std::vector c; public: - STDIndicesStream(const int, int) noexcept; + STDIndicesStream(const int, int); ~STDIndicesStream() = default; virtual void copy() override; diff --git a/src/std-indices/model.cmake b/src/std-indices/model.cmake index ef69f30..2454295 100644 --- a/src/std-indices/model.cmake +++ b/src/std-indices/model.cmake @@ -19,9 +19,30 @@ register_flag_optional(NVHPC_OFFLOAD ccall - Compile for all supported compute capabilities" "") +register_flag_optional(ONEDPL_OFFLOAD + "Use the DPC++ oneDPL library which supports STL algorithms on SYCL, TBB, and OpenMP. + This option only supports the oneDPL library shipped with oneAPI, and must use the dpcpp + compiler (i.e -DCMAKE_CXX_COMPILER=dpcpp) for this option. + Make sure your oneAPI installation includes at least the following components: dpcpp, onedpl, onetbb. + The env. variable `TBBROOT` needs to point to the base directory of your TBB install (e.g /opt/intel/oneapi/tbb/latest/). + This should be done by oneAPI's `setvars.sh` script automatically. + + Possible values are: + TBB - Use the TBB backend, the correct TBB library will be linked from oneAPI automatically. + OMP - Use the OpenMP backend + DPCPP - Use the SYCL (via dpcpp) backend with the default selector. + See https://intel.github.io/llvm-docs/EnvironmentVariables.html#sycl-device-filter + on selecting a non-default device or SYCL backend." + "") + + macro(setup) set(CMAKE_CXX_STANDARD 17) + if (NVHPC_OFFLOAD AND ONEDPL_OFFLOAD) + message(FATAL_ERROR "NVHPC_OFFLOAD and NVHPC_OFFLOAD are mutually exclusive") + endif () + if (NVHPC_OFFLOAD) set(NVHPC_FLAGS -stdpar -gpu=${NVHPC_OFFLOAD}) # propagate flags to linker so that it links with the gpu stuff as well @@ -30,4 +51,33 @@ macro(setup) endif () + if (ONEDPL_OFFLOAD) + set(CXX_EXTRA_FLAGS) + set(CXX_EXTRA_LIBRARIES /opt/intel/oneapi/tbb/2021.4.0/lib/intel64/gcc4.8/libtbb.so) + # propagate flags to linker so that it links with the gpu stuff as well + register_append_cxx_flags(ANY -fopenmp -fsycl-unnamed-lambda -fsycl) + + # XXX see https://www.intel.com/content/www/us/en/develop/documentation/oneapi-dpcpp-library-guide/top/oneapi-dpc-library-onedpl-overview.html + # this is to avoid the system TBB headers (if exists) from having precedence which isn't compatible with oneDPL's par implementation + register_definitions( + PSTL_USE_PARALLEL_POLICIES=0 + _GLIBCXX_USE_TBB_PAR_BACKEND=0 + ) + + register_definitions(USE_ONEDPL) + if (ONEDPL_OFFLOAD STREQUAL "TBB") + register_definitions(ONEDPL_USE_TBB_BACKEND=1) + elseif (ONEDPL_OFFLOAD STREQUAL "OPENMP") + register_definitions(ONEDPL_USE_OPENMP_BACKEND=1) + elseif (ONEDPL_OFFLOAD STREQUAL "SYCL") + register_definitions(ONEDPL_USE_DPCPP_BACKEND=1) + else () + message(FATAL_ERROR "Unsupported ONEDPL_OFFLOAD backend: ${ONEDPL_OFFLOAD}") + endif () + + # even with the workaround above, -ltbb may still end up with the wrong one, so be explicit here + register_link_library($ENV{TBBROOT}/lib/intel64/gcc4.8/libtbb.so) + + endif () + endmacro() From 2cd52228b733368e8cd57075f444c53f113c1f1a Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Wed, 22 Dec 2021 12:20:33 +0000 Subject: [PATCH 5/9] Revert "Add oneDPL support for std-data and std-indices" This reverts commit 6ea09a9620674d1bf2626cacda582b7ec6256ee4. --- CMakeLists.txt | 2 +- src/std-data/STDDataStream.cpp | 41 ++++------- src/std-data/STDDataStream.h | 52 ++------------ src/std-data/model.cmake | 50 ------------- src/std-indices/STDIndicesStream.cpp | 51 +++++--------- src/std-indices/STDIndicesStream.h | 102 +++++++++------------------ src/std-indices/model.cmake | 50 ------------- 7 files changed, 74 insertions(+), 274 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index edf05f5..3730602 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,7 +3,7 @@ cmake_minimum_required(VERSION 3.13 FATAL_ERROR) project(BabelStream VERSION 3.5 LANGUAGES CXX) # uncomment for debugging build issues: -set(CMAKE_VERBOSE_MAKEFILE ON) +#set(CMAKE_VERBOSE_MAKEFILE ON) # some nicer defaults for standard C++ set(CMAKE_CXX_EXTENSIONS OFF) diff --git a/src/std-data/STDDataStream.cpp b/src/std-data/STDDataStream.cpp index 0be23a3..a2628e1 100644 --- a/src/std-data/STDDataStream.cpp +++ b/src/std-data/STDDataStream.cpp @@ -5,31 +5,21 @@ // source code #include "STDDataStream.h" -#include + +#include +#include +#include + +// There are three execution policies: +// auto exe_policy = std::execution::seq; +// auto exe_policy = std::execution::par; +auto exe_policy = std::execution::par_unseq; + template -STDDataStream::STDDataStream(const int ARRAY_SIZE, int device) : array_size{ARRAY_SIZE}, -#if defined(ONEDPL_USE_DPCPP_BACKEND) - exe_policy(oneapi::dpl::execution::make_device_policy(cl::sycl::default_selector{})), - allocator(exe_policy.queue()), - a(array_size, allocator), b(array_size, allocator), c(array_size, allocator) -#else - a(array_size), b(array_size),c(array_size) -#endif +STDDataStream::STDDataStream(const int ARRAY_SIZE, int device) + noexcept : array_size{ARRAY_SIZE}, a(array_size), b(array_size), c(array_size) { -#if USE_ONEDPL - std::cout << "Using oneDPL backend: "; - #if defined(ONEDPL_USE_DPCPP_BACKEND) - std::cout << "SYCL USM (device=" << exe_policy.queue().get_device().get_info() << ")"; - #elif defined(ONEDPL_USE_TBB_BACKEND) - std::cout << "TBB"; - #elif defined(ONEDPL_USE_OPENMP_BACKEND) - std::cout << "OpenMP"; - #else - std::cout << "Default"; - #endif - std::cout << std::endl; -#endif } template @@ -43,10 +33,9 @@ void STDDataStream::init_arrays(T initA, T initB, T initC) template void STDDataStream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) { - // operator = is deleted because h_* vectors may have different allocator type compared to ours - std::copy(a.begin(), a.end(), h_a.begin()); - std::copy(b.begin(), b.end(), h_b.begin()); - std::copy(c.begin(), c.end(), h_c.begin()); + h_a = a; + h_b = b; + h_c = c; } template diff --git a/src/std-data/STDDataStream.h b/src/std-data/STDDataStream.h index c9a1f43..f8bc302 100644 --- a/src/std-data/STDDataStream.h +++ b/src/std-data/STDDataStream.h @@ -6,35 +6,12 @@ #pragma once +#include +#include #include "Stream.h" #define IMPLEMENTATION_STRING "STD (data-oriented)" -#ifdef USE_ONEDPL - #define PSTL_USAGE_WARNINGS 1 - - #include - #include - #include - #include - #include - - #ifdef ONEDPL_USE_DPCPP_BACKEND - #include - #endif -#else - #include - #include - #include - - #if defined(ONEDPL_USE_DPCPP_BACKEND) || \ - defined(ONEDPL_USE_TBB_BACKEND) || \ - defined(ONEDPL_USE_OPENMP_BACKEND) - #error oneDPL missing (ONEDPL_VERSION_MAJOR not defined) but backend (ONEDPL_USE_*_BACKEND) specified - #endif - -#endif - template class STDDataStream : public Stream @@ -43,31 +20,14 @@ class STDDataStream : public Stream // Size of arrays int array_size; -#if defined(ONEDPL_USE_DPCPP_BACKEND) - // SYCL oneDPL backend - using ExecutionPolicy = oneapi::dpl::execution::device_policy<>; - using Allocator = sycl::usm_allocator; -#elif defined(USE_ONEDPL) - // every other non-SYCL oneDPL backend (i.e TBB, OMP) - using ExecutionPolicy = decltype(oneapi::dpl::execution::par_unseq); - using Allocator = std::allocator; -#else - // normal std execution policies - using ExecutionPolicy = decltype(std::execution::par_unseq); - using Allocator = std::allocator; -#endif - - ExecutionPolicy exe_policy{}; - Allocator allocator; - // Device side pointers - std::vector a; - std::vector b; - std::vector c; + std::vector a; + std::vector b; + std::vector c; public: - STDDataStream(const int, int); + STDDataStream(const int, int) noexcept; ~STDDataStream() = default; virtual void copy() override; diff --git a/src/std-data/model.cmake b/src/std-data/model.cmake index 2454295..ef69f30 100644 --- a/src/std-data/model.cmake +++ b/src/std-data/model.cmake @@ -19,30 +19,9 @@ register_flag_optional(NVHPC_OFFLOAD ccall - Compile for all supported compute capabilities" "") -register_flag_optional(ONEDPL_OFFLOAD - "Use the DPC++ oneDPL library which supports STL algorithms on SYCL, TBB, and OpenMP. - This option only supports the oneDPL library shipped with oneAPI, and must use the dpcpp - compiler (i.e -DCMAKE_CXX_COMPILER=dpcpp) for this option. - Make sure your oneAPI installation includes at least the following components: dpcpp, onedpl, onetbb. - The env. variable `TBBROOT` needs to point to the base directory of your TBB install (e.g /opt/intel/oneapi/tbb/latest/). - This should be done by oneAPI's `setvars.sh` script automatically. - - Possible values are: - TBB - Use the TBB backend, the correct TBB library will be linked from oneAPI automatically. - OMP - Use the OpenMP backend - DPCPP - Use the SYCL (via dpcpp) backend with the default selector. - See https://intel.github.io/llvm-docs/EnvironmentVariables.html#sycl-device-filter - on selecting a non-default device or SYCL backend." - "") - - macro(setup) set(CMAKE_CXX_STANDARD 17) - if (NVHPC_OFFLOAD AND ONEDPL_OFFLOAD) - message(FATAL_ERROR "NVHPC_OFFLOAD and NVHPC_OFFLOAD are mutually exclusive") - endif () - if (NVHPC_OFFLOAD) set(NVHPC_FLAGS -stdpar -gpu=${NVHPC_OFFLOAD}) # propagate flags to linker so that it links with the gpu stuff as well @@ -51,33 +30,4 @@ macro(setup) endif () - if (ONEDPL_OFFLOAD) - set(CXX_EXTRA_FLAGS) - set(CXX_EXTRA_LIBRARIES /opt/intel/oneapi/tbb/2021.4.0/lib/intel64/gcc4.8/libtbb.so) - # propagate flags to linker so that it links with the gpu stuff as well - register_append_cxx_flags(ANY -fopenmp -fsycl-unnamed-lambda -fsycl) - - # XXX see https://www.intel.com/content/www/us/en/develop/documentation/oneapi-dpcpp-library-guide/top/oneapi-dpc-library-onedpl-overview.html - # this is to avoid the system TBB headers (if exists) from having precedence which isn't compatible with oneDPL's par implementation - register_definitions( - PSTL_USE_PARALLEL_POLICIES=0 - _GLIBCXX_USE_TBB_PAR_BACKEND=0 - ) - - register_definitions(USE_ONEDPL) - if (ONEDPL_OFFLOAD STREQUAL "TBB") - register_definitions(ONEDPL_USE_TBB_BACKEND=1) - elseif (ONEDPL_OFFLOAD STREQUAL "OPENMP") - register_definitions(ONEDPL_USE_OPENMP_BACKEND=1) - elseif (ONEDPL_OFFLOAD STREQUAL "SYCL") - register_definitions(ONEDPL_USE_DPCPP_BACKEND=1) - else () - message(FATAL_ERROR "Unsupported ONEDPL_OFFLOAD backend: ${ONEDPL_OFFLOAD}") - endif () - - # even with the workaround above, -ltbb may still end up with the wrong one, so be explicit here - register_link_library($ENV{TBBROOT}/lib/intel64/gcc4.8/libtbb.so) - - endif () - endmacro() diff --git a/src/std-indices/STDIndicesStream.cpp b/src/std-indices/STDIndicesStream.cpp index ceb9d3d..a88dd18 100644 --- a/src/std-indices/STDIndicesStream.cpp +++ b/src/std-indices/STDIndicesStream.cpp @@ -5,32 +5,21 @@ // source code #include "STDIndicesStream.h" -#include + +#include +#include +#include + +// There are three execution policies: +// auto exe_policy = std::execution::seq; +// auto exe_policy = std::execution::par; +auto exe_policy = std::execution::par_unseq; + template -STDIndicesStream::STDIndicesStream(const int ARRAY_SIZE, int device) : -array_size{ARRAY_SIZE}, range_start(0), range_end(array_size), -#if defined(ONEDPL_USE_DPCPP_BACKEND) -exe_policy(oneapi::dpl::execution::make_device_policy(cl::sycl::default_selector{})), - allocator(exe_policy.queue()), - a(array_size, allocator), b(array_size, allocator), c(array_size, allocator) -#else -a(array_size), b(array_size),c(array_size) -#endif +STDIndicesStream::STDIndicesStream(const int ARRAY_SIZE, int device) + noexcept : array_size{ARRAY_SIZE}, range(0, array_size), a(array_size), b(array_size), c(array_size) { -#if USE_ONEDPL - std::cout << "Using oneDPL backend: "; - #if defined(ONEDPL_USE_DPCPP_BACKEND) - std::cout << "SYCL USM (device=" << exe_policy.queue().get_device().get_info() << ")"; - #elif defined(ONEDPL_USE_TBB_BACKEND) - std::cout << "TBB"; - #elif defined(ONEDPL_USE_OPENMP_BACKEND) - std::cout << "OpenMP"; - #else - std::cout << "Default"; - #endif - std::cout << std::endl; -#endif } template @@ -44,13 +33,11 @@ void STDIndicesStream::init_arrays(T initA, T initB, T initC) template void STDIndicesStream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) { - // operator = is deleted because h_* vectors may have different allocator type compared to ours - std::copy(a.begin(), a.end(), h_a.begin()); - std::copy(b.begin(), b.end(), h_b.begin()); - std::copy(c.begin(), c.end(), h_c.begin()); + h_a = a; + h_b = b; + h_c = c; } - template void STDIndicesStream::copy() { @@ -62,7 +49,7 @@ template void STDIndicesStream::mul() { // b[i] = scalar * c[i]; - std::transform(exe_policy, range_start, range_end, b.begin(), [&, scalar = startScalar](int i) { + std::transform(exe_policy, range.begin(), range.end(), b.begin(), [&, scalar = startScalar](int i) { return scalar * c[i]; }); } @@ -71,7 +58,7 @@ template void STDIndicesStream::add() { // c[i] = a[i] + b[i]; - std::transform(exe_policy, range_start, range_end, c.begin(), [&](int i) { + std::transform(exe_policy, range.begin(), range.end(), c.begin(), [&](int i) { return a[i] + b[i]; }); } @@ -80,7 +67,7 @@ template void STDIndicesStream::triad() { // a[i] = b[i] + scalar * c[i]; - std::transform(exe_policy, range_start, range_end, a.begin(), [&, scalar = startScalar](int i) { + std::transform(exe_policy, range.begin(), range.end(), a.begin(), [&, scalar = startScalar](int i) { return b[i] + scalar * c[i]; }); } @@ -92,7 +79,7 @@ void STDIndicesStream::nstream() // Need to do in two stages with C++11 STL. // 1: a[i] += b[i] // 2: a[i] += scalar * c[i]; - std::transform(exe_policy, range_start, range_end, a.begin(), [&, scalar = startScalar](int i) { + std::transform(exe_policy, range.begin(), range.end(), a.begin(), [&, scalar = startScalar](int i) { return a[i] + b[i] + scalar * c[i]; }); } diff --git a/src/std-indices/STDIndicesStream.h b/src/std-indices/STDIndicesStream.h index dfb63f6..66c8bb0 100644 --- a/src/std-indices/STDIndicesStream.h +++ b/src/std-indices/STDIndicesStream.h @@ -6,62 +6,46 @@ #pragma once +#include +#include #include "Stream.h" #define IMPLEMENTATION_STRING "STD (index-oriented)" - -#ifdef USE_ONEDPL - #define PSTL_USAGE_WARNINGS 1 - - #include - #include - #include - #include - #include - - #ifdef ONEDPL_USE_DPCPP_BACKEND - #include - #endif -#else - #include - #include - #include - - #if defined(ONEDPL_USE_DPCPP_BACKEND) || \ - defined(ONEDPL_USE_TBB_BACKEND) || \ - defined(ONEDPL_USE_OPENMP_BACKEND) - #error oneDPL missing (ONEDPL_VERSION_MAJOR not defined) but backend (ONEDPL_USE_*_BACKEND) specified - #endif - -#endif - // A lightweight counting iterator which will be used by the STL algorithms // NB: C++ <= 17 doesn't have this built-in, and it's only added later in ranges-v3 (C++2a) which this // implementation doesn't target template -class ranged_iterator { - N num; +class ranged { + N from, to; public: - using difference_type = N; - using value_type = N; - using pointer = const N*; - using reference = const N&; - using iterator_category = std::random_access_iterator_tag; - explicit ranged_iterator(N _num = 0) : num(_num) {} + ranged(N from, N to ): from(from), to(to) {} + class iterator { + N num; + public: + using difference_type = N; + using value_type = N; + using pointer = const N*; + using reference = const N&; + using iterator_category = std::random_access_iterator_tag; + explicit iterator(N _num = 0) : num(_num) {} - ranged_iterator& operator++() { num++; return *this; } - ranged_iterator operator++(int) { ranged_iterator retval = *this; ++(*this); return retval; } - ranged_iterator operator+(const value_type v) const { return ranged_iterator(num + v); } + iterator& operator++() { num++; return *this; } + iterator operator++(int) { iterator retval = *this; ++(*this); return retval; } + iterator operator+(const value_type v) const { return iterator(num + v); } - bool operator==(ranged_iterator other) const { return num == other.num; } - bool operator!=(ranged_iterator other) const { return *this != other; } - bool operator<(ranged_iterator other) const { return num < other.num; } + bool operator==(iterator other) const { return num == other.num; } + bool operator!=(iterator other) const { return *this != other; } + bool operator<(iterator other) const { return num < other.num; } - reference operator*() const { return num;} - difference_type operator-(const ranged_iterator &it) const { return num - it.num; } - value_type operator[](const difference_type &i) const { return num + i; } + reference operator*() const { return num;} + difference_type operator-(const iterator &it) const { return num - it.num; } + value_type operator[](const difference_type &i) const { return num + i; } + + }; + iterator begin() { return iterator(from); } + iterator end() { return iterator(to >= from? to+1 : to-1); } }; template @@ -71,37 +55,17 @@ class STDIndicesStream : public Stream // Size of arrays int array_size; -#if defined(ONEDPL_USE_DPCPP_BACKEND) - // SYCL oneDPL backend - using ExecutionPolicy = oneapi::dpl::execution::device_policy<>; - using Allocator = sycl::usm_allocator; - using IteratorType = oneapi::dpl::counting_iterator; -#elif defined(USE_ONEDPL) - // every other non-SYCL oneDPL backend (i.e TBB, OMP) - using ExecutionPolicy = decltype(oneapi::dpl::execution::par_unseq); - using Allocator = std::allocator; - using IteratorType = oneapi::dpl::counting_iterator; -#else - // normal std execution policies - using ExecutionPolicy = decltype(std::execution::par_unseq); - using Allocator = std::allocator; - using IteratorType = ranged_iterator; -#endif - - IteratorType range_start; - IteratorType range_end; - - ExecutionPolicy exe_policy{}; - Allocator allocator; + // induction range + ranged range; // Device side pointers - std::vector a; - std::vector b; - std::vector c; + std::vector a; + std::vector b; + std::vector c; public: - STDIndicesStream(const int, int); + STDIndicesStream(const int, int) noexcept; ~STDIndicesStream() = default; virtual void copy() override; diff --git a/src/std-indices/model.cmake b/src/std-indices/model.cmake index 2454295..ef69f30 100644 --- a/src/std-indices/model.cmake +++ b/src/std-indices/model.cmake @@ -19,30 +19,9 @@ register_flag_optional(NVHPC_OFFLOAD ccall - Compile for all supported compute capabilities" "") -register_flag_optional(ONEDPL_OFFLOAD - "Use the DPC++ oneDPL library which supports STL algorithms on SYCL, TBB, and OpenMP. - This option only supports the oneDPL library shipped with oneAPI, and must use the dpcpp - compiler (i.e -DCMAKE_CXX_COMPILER=dpcpp) for this option. - Make sure your oneAPI installation includes at least the following components: dpcpp, onedpl, onetbb. - The env. variable `TBBROOT` needs to point to the base directory of your TBB install (e.g /opt/intel/oneapi/tbb/latest/). - This should be done by oneAPI's `setvars.sh` script automatically. - - Possible values are: - TBB - Use the TBB backend, the correct TBB library will be linked from oneAPI automatically. - OMP - Use the OpenMP backend - DPCPP - Use the SYCL (via dpcpp) backend with the default selector. - See https://intel.github.io/llvm-docs/EnvironmentVariables.html#sycl-device-filter - on selecting a non-default device or SYCL backend." - "") - - macro(setup) set(CMAKE_CXX_STANDARD 17) - if (NVHPC_OFFLOAD AND ONEDPL_OFFLOAD) - message(FATAL_ERROR "NVHPC_OFFLOAD and NVHPC_OFFLOAD are mutually exclusive") - endif () - if (NVHPC_OFFLOAD) set(NVHPC_FLAGS -stdpar -gpu=${NVHPC_OFFLOAD}) # propagate flags to linker so that it links with the gpu stuff as well @@ -51,33 +30,4 @@ macro(setup) endif () - if (ONEDPL_OFFLOAD) - set(CXX_EXTRA_FLAGS) - set(CXX_EXTRA_LIBRARIES /opt/intel/oneapi/tbb/2021.4.0/lib/intel64/gcc4.8/libtbb.so) - # propagate flags to linker so that it links with the gpu stuff as well - register_append_cxx_flags(ANY -fopenmp -fsycl-unnamed-lambda -fsycl) - - # XXX see https://www.intel.com/content/www/us/en/develop/documentation/oneapi-dpcpp-library-guide/top/oneapi-dpc-library-onedpl-overview.html - # this is to avoid the system TBB headers (if exists) from having precedence which isn't compatible with oneDPL's par implementation - register_definitions( - PSTL_USE_PARALLEL_POLICIES=0 - _GLIBCXX_USE_TBB_PAR_BACKEND=0 - ) - - register_definitions(USE_ONEDPL) - if (ONEDPL_OFFLOAD STREQUAL "TBB") - register_definitions(ONEDPL_USE_TBB_BACKEND=1) - elseif (ONEDPL_OFFLOAD STREQUAL "OPENMP") - register_definitions(ONEDPL_USE_OPENMP_BACKEND=1) - elseif (ONEDPL_OFFLOAD STREQUAL "SYCL") - register_definitions(ONEDPL_USE_DPCPP_BACKEND=1) - else () - message(FATAL_ERROR "Unsupported ONEDPL_OFFLOAD backend: ${ONEDPL_OFFLOAD}") - endif () - - # even with the workaround above, -ltbb may still end up with the wrong one, so be explicit here - register_link_library($ENV{TBBROOT}/lib/intel64/gcc4.8/libtbb.so) - - endif () - endmacro() From 0b51c76af991abcf1a4f3ffd6aa54d27ee850f01 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 22 Dec 2021 12:28:28 +0000 Subject: [PATCH 6/9] Update src/std-data/STDDataStream.h --- src/std-data/STDDataStream.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/std-data/STDDataStream.h b/src/std-data/STDDataStream.h index f8bc302..82dae7d 100644 --- a/src/std-data/STDDataStream.h +++ b/src/std-data/STDDataStream.h @@ -1,4 +1,4 @@ -// Copyright (c) 2020 Tom Deakin +// Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved. // University of Bristol HPC // // For full license terms please see the LICENSE file distributed with this From 91e83067a336bb0d43add962050251222befb06f Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 22 Dec 2021 12:28:54 +0000 Subject: [PATCH 7/9] Update src/std-data/STDDataStream.h --- src/std-data/STDDataStream.h | 1 - 1 file changed, 1 deletion(-) diff --git a/src/std-data/STDDataStream.h b/src/std-data/STDDataStream.h index 82dae7d..1c15b7c 100644 --- a/src/std-data/STDDataStream.h +++ b/src/std-data/STDDataStream.h @@ -1,5 +1,4 @@ // Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved. -// University of Bristol HPC // // For full license terms please see the LICENSE file distributed with this // source code From fe06ac03b0b50fee685ff37cb984771a8c017318 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 22 Dec 2021 12:31:29 +0000 Subject: [PATCH 8/9] Update src/std-data/STDDataStream.cpp --- src/std-data/STDDataStream.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/std-data/STDDataStream.cpp b/src/std-data/STDDataStream.cpp index a2628e1..2261017 100644 --- a/src/std-data/STDDataStream.cpp +++ b/src/std-data/STDDataStream.cpp @@ -1,4 +1,5 @@ -// Copyright (c) 2020 Tom Deakin +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// Updated in 2021 by University of Bristol // University of Bristol HPC // // For full license terms please see the LICENSE file distributed with this From e620a7ac36114513e7ffa18b9e0b4d019a763d3a Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 22 Dec 2021 12:33:49 +0000 Subject: [PATCH 9/9] ammend copyright headers --- src/std-data/STDDataStream.cpp | 3 +-- src/std-data/STDDataStream.h | 1 + src/std-indices/STDIndicesStream.cpp | 2 +- src/std-indices/STDIndicesStream.h | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/std-data/STDDataStream.cpp b/src/std-data/STDDataStream.cpp index 2261017..343e247 100644 --- a/src/std-data/STDDataStream.cpp +++ b/src/std-data/STDDataStream.cpp @@ -1,6 +1,5 @@ // Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -// Updated in 2021 by University of Bristol -// University of Bristol HPC +// Updated 2021 by University of Bristol // // For full license terms please see the LICENSE file distributed with this // source code diff --git a/src/std-data/STDDataStream.h b/src/std-data/STDDataStream.h index 1c15b7c..741fd6c 100644 --- a/src/std-data/STDDataStream.h +++ b/src/std-data/STDDataStream.h @@ -1,4 +1,5 @@ // Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved. +// Updated 2021 by University of Bristol // // For full license terms please see the LICENSE file distributed with this // source code diff --git a/src/std-indices/STDIndicesStream.cpp b/src/std-indices/STDIndicesStream.cpp index a88dd18..2221f90 100644 --- a/src/std-indices/STDIndicesStream.cpp +++ b/src/std-indices/STDIndicesStream.cpp @@ -1,4 +1,4 @@ -// Copyright (c) 2020 Tom Deakin +// Copyright (c) 2021 Tom Deakin and Tom Lin // University of Bristol HPC // // For full license terms please see the LICENSE file distributed with this diff --git a/src/std-indices/STDIndicesStream.h b/src/std-indices/STDIndicesStream.h index 66c8bb0..bc068aa 100644 --- a/src/std-indices/STDIndicesStream.h +++ b/src/std-indices/STDIndicesStream.h @@ -1,4 +1,4 @@ -// Copyright (c) 2020 Tom Deakin +// Copyright (c) 2021 Tom Deakin and Tom Lin // University of Bristol HPC // // For full license terms please see the LICENSE file distributed with this