From 2eca3974e64c2517a6c35a8161d5574677dafc77 Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Wed, 21 Apr 2021 16:28:12 +0100 Subject: [PATCH 01/14] Disable CI for RAJA on gcc-10+CUDA due to ICE Update changelog to include RAJA 0.13.x --- CHANGELOG.md | 1 + ci-test-compile.sh | 14 +++++++++----- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 19c2a6d..29702d7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -29,6 +29,7 @@ All notable changes to this project will be documented in this file. - Unified run function in driver code to reduce code duplication, output should be uneffected. - Normalise sum result by expected value to help false negative errors. - HC version deprecated and moved to a legacy directory. +- Update RAJA to v0.13.0 (w/ code changes as this is a source incompatible update). ### Removed - Pre-building of kernels in SYCL version to ensure compatibility with SYCL 1.2.1. diff --git a/ci-test-compile.sh b/ci-test-compile.sh index 46046c4..696f5cd 100755 --- a/ci-test-compile.sh +++ b/ci-test-compile.sh @@ -146,11 +146,15 @@ build_gcc() { run_build "cuda_$name" "${GCC_CXX:?}" KOKKOS "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON" run_build $name "${GCC_CXX:?}" OCL "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}" run_build $name "${GCC_CXX:?}" RAJA "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?}" - run_build "cuda_$name" "${GCC_CXX:?}" RAJA "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?} \ - -DENABLE_CUDA=ON \ - -DTARGET=NVIDIA \ - -DCUDA_TOOLKIT_ROOT_DIR=${NVHPC_CUDA_DIR:?} \ - -DCUDA_ARCH=$NV_ARCH" + +# FIXME fails due to https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100102 +# FIXME we also got https://github.com/NVIDIA/nccl/issues/494 + +# run_build "cuda_$name" "${GCC_CXX:?}" RAJA "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?} \ +# -DENABLE_CUDA=ON \ +# -DTARGET=NVIDIA \ +# -DCUDA_TOOLKIT_ROOT_DIR=${NVHPC_CUDA_DIR:?} \ +# -DCUDA_ARCH=$NV_ARCH" } From 75a4394830d209614c3026ed4b20efd2bf6baffa Mon Sep 17 00:00:00 2001 From: Tobias Burnus 
Date: Mon, 19 Apr 2021 18:55:35 +0200 Subject: [PATCH 02/14] Include stdlib.h for aligned_alloc Silence "error: there are no arguments to 'aligned_alloc' that depend on a template parameter, so a declaration of 'aligned_alloc' must be available" * OMPStream.cpp: #include . * RAJAStream.cpp: Likewise. --- OMPStream.cpp | 1 + RAJAStream.cpp | 1 + 2 files changed, 2 insertions(+) diff --git a/OMPStream.cpp b/OMPStream.cpp index 8063987..0cd8035 100644 --- a/OMPStream.cpp +++ b/OMPStream.cpp @@ -5,6 +5,7 @@ // For full license terms please see the LICENSE file distributed with this // source code +#include // For aligned_alloc #include "OMPStream.h" #ifndef ALIGNMENT diff --git a/RAJAStream.cpp b/RAJAStream.cpp index 44db5ed..d271ea4 100644 --- a/RAJAStream.cpp +++ b/RAJAStream.cpp @@ -5,6 +5,7 @@ // For full license terms please see the LICENSE file distributed with this // source code +#include // For aligned_alloc #include #include "RAJAStream.hpp" From cc16547e4defe8279ea8bf2f96b6f4f36bb61c7a Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Mon, 10 May 2021 17:50:36 +0100 Subject: [PATCH 03/14] Add install target for CMake --- CMakeLists.txt | 2 ++ ci-test-compile.sh | 10 ++++++++++ 2 files changed, 12 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 17669a3..d4a11cd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -188,3 +188,5 @@ target_link_options(${EXE_NAME} PUBLIC ${LINK_FLAGS} ${CXX_EXTRA_LINK_FLAGS}) if (COMMAND setup_target) setup_target(${EXE_NAME}) endif () + +install (TARGETS ${EXE_NAME} DESTINATION bin) \ No newline at end of file diff --git a/ci-test-compile.sh b/ci-test-compile.sh index 696f5cd..1b5c1bb 100755 --- a/ci-test-compile.sh +++ b/ci-test-compile.sh @@ -44,21 +44,26 @@ run_build() { rm -rf "$build" set +e + local install_dir="$build/install" # shellcheck disable=SC2086 "$CMAKE_BIN" -B"$build" -H. 
\ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_VERBOSE_MAKEFILE=ON \ + -DCMAKE_INSTALL_PREFIX="$install_dir" \ -DMODEL="$model" $flags &>>"$log" local model_lower=$(echo "$model" | awk '{print tolower($0)}') local cmake_code=$? "$CMAKE_BIN" --build "$build" -j "$(nproc)" &>>"$log" + "$CMAKE_BIN" --build "$build" --target install -j "$(nproc)" &>>"$log" local cmake_code=$? set -e local bin="./$build/$model_lower-stream" + local installed_bin="./$install_dir/bin/$model_lower-stream" + echo "Checking for final executable: $bin" if [[ -f "$bin" ]]; then echo "$(tput setaf 2)[PASS!]($model->$build)$(tput sgr0): -DMODEL=$model $flags" @@ -66,6 +71,11 @@ run_build() { cat "$log" | sed '/^--/d' | grep -i "/bin/nvcc" | sed 's/^/ /' cat "$log" | sed '/^--/d' | grep -i "$grep_kw" | sed 's/^/ /' cat "$log" | sed '/^--/d' | grep -i "warning" | sed "s/.*/ $(tput setaf 3)&$(tput sgr0)/" + if [[ ! -f "$installed_bin" ]]; then + echo "$(tput setaf 1)[ERR!] looking for $installed_bin from --target install but it's not there!$(tput sgr0)" + cat "$log" + exit 1 + fi else echo "$(tput setaf 1)[FAIL!]($model->$build)$(tput sgr0): -DMODEL=$model $flags" echo " $(tput setaf 1)CMake exited with code $cmake_code, see full build log at $log, reproduced below:$(tput sgr0)" From e20aecd845fd8736e2a7905f17483c104dd9651e Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Mon, 17 May 2021 15:25:43 +0100 Subject: [PATCH 04/14] [SYCL 1.2.1] Add check for FP64 support Fixes #98 --- SYCLStream.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/SYCLStream.cpp b/SYCLStream.cpp index 49ad3ac..00c043f 100644 --- a/SYCLStream.cpp +++ b/SYCLStream.cpp @@ -28,6 +28,14 @@ SYCLStream::SYCLStream(const int ARRAY_SIZE, const int device_index) throw std::runtime_error("Invalid device index"); device dev = devices[device_index]; + // Check device can support FP64 if needed + if (sizeof(T) == sizeof(double)) + { + if (dev.get_info().size() == 0) { + throw std::runtime_error("Device does not support double 
precision, please use --float"); + } + } + // Determine sensible dot kernel NDRange configuration if (dev.is_cpu()) { From 6581ee63b809f446a0e4ad3ec479e9b76e0c6591 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Mon, 17 May 2021 15:33:54 +0100 Subject: [PATCH 05/14] Update changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 29702d7..3dbabed 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,7 @@ All notable changes to this project will be documented in this file. - Support for CUDA Managed Memory and Page Fault memory. - Added nstream kernel from PRK with associate command line option. - CMake build system added for all models. +- SYCL device check for FP64 support. ### Changed - Default branch renamed from `master` to `main`. From 45ebd09ef27c4c8da2cc45c07e01070e53e1e07b Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Mon, 17 May 2021 20:00:00 +0100 Subject: [PATCH 06/14] Don't use hardcoded rocm path --- ci-prepare-bionic.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ci-prepare-bionic.sh b/ci-prepare-bionic.sh index d8ae312..290e87b 100755 --- a/ci-prepare-bionic.sh +++ b/ci-prepare-bionic.sh @@ -240,8 +240,7 @@ setup_rocm() { echo 'deb [arch=amd64] https://repo.radeon.com/rocm/apt/debian/ xenial main' | sudo tee /etc/apt/sources.list.d/rocm.list sudo apt-get update -qq sudo apt-get install -y -qq rocm-dev - # AMD needs this rocm_path thing exported... 
- export_var ROCM_PATH "/opt/rocm-4.1.0" + export_var ROCM_PATH "/opt/rocm" export_var HIP_CXX "$ROCM_PATH/bin/hipcc" verify_bin_exists "$HIP_CXX" "$HIP_CXX" --version From b772d00fe407bfb0499b05d79513bc7986dbff3a Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 18 May 2021 16:44:06 +0100 Subject: [PATCH 07/14] Revert "Add check for FP64 support" --- CHANGELOG.md | 1 - SYCLStream.cpp | 8 -------- 2 files changed, 9 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3dbabed..29702d7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,7 +13,6 @@ All notable changes to this project will be documented in this file. - Support for CUDA Managed Memory and Page Fault memory. - Added nstream kernel from PRK with associate command line option. - CMake build system added for all models. -- SYCL device check for FP64 support. ### Changed - Default branch renamed from `master` to `main`. diff --git a/SYCLStream.cpp b/SYCLStream.cpp index 00c043f..49ad3ac 100644 --- a/SYCLStream.cpp +++ b/SYCLStream.cpp @@ -28,14 +28,6 @@ SYCLStream::SYCLStream(const int ARRAY_SIZE, const int device_index) throw std::runtime_error("Invalid device index"); device dev = devices[device_index]; - // Check device can support FP64 if needed - if (sizeof(T) == sizeof(double)) - { - if (dev.get_info().size() == 0) { - throw std::runtime_error("Device does not support double precision, please use --float"); - } - } - // Determine sensible dot kernel NDRange configuration if (dev.is_cpu()) { From 82dedad6766d12909edce2437cd20f17ca47d1f8 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Mon, 17 May 2021 15:25:43 +0100 Subject: [PATCH 08/14] [SYCL 1.2.1] Add check for FP64 support Fixes #98 --- SYCLStream.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/SYCLStream.cpp b/SYCLStream.cpp index 49ad3ac..00c043f 100644 --- a/SYCLStream.cpp +++ b/SYCLStream.cpp @@ -28,6 +28,14 @@ SYCLStream::SYCLStream(const int ARRAY_SIZE, const int device_index) throw std::runtime_error("Invalid device 
index"); device dev = devices[device_index]; + // Check device can support FP64 if needed + if (sizeof(T) == sizeof(double)) + { + if (dev.get_info().size() == 0) { + throw std::runtime_error("Device does not support double precision, please use --float"); + } + } + // Determine sensible dot kernel NDRange configuration if (dev.is_cpu()) { From 2ab68ab39e8604f67f5f98ec89d125654fabd0b4 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Mon, 17 May 2021 15:33:54 +0100 Subject: [PATCH 09/14] Update changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 29702d7..3dbabed 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,7 @@ All notable changes to this project will be documented in this file. - Support for CUDA Managed Memory and Page Fault memory. - Added nstream kernel from PRK with associate command line option. - CMake build system added for all models. +- SYCL device check for FP64 support. ### Changed - Default branch renamed from `master` to `main`. 
From 742f0629be70c143e05df4ecf73bcf71bba150a7 Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Thu, 27 May 2021 09:28:40 +0100 Subject: [PATCH 10/14] Initial TBB implementation --- .gitignore | 1 + CMakeLists.txt | 1 + README.md | 3 +- TBB.cmake | 10 +++ TBB.make | 28 ++++++++ TBBStream.cpp | 157 +++++++++++++++++++++++++++++++++++++++++++ TBBStream.hpp | 56 +++++++++++++++ ci-prepare-bionic.sh | 16 +++++ ci-test-compile.sh | 5 +- main.cpp | 6 ++ 10 files changed, 281 insertions(+), 2 deletions(-) create mode 100644 TBB.cmake create mode 100644 TBB.make create mode 100644 TBBStream.cpp create mode 100644 TBBStream.hpp diff --git a/.gitignore b/.gitignore index c3ea1da..31af301 100644 --- a/.gitignore +++ b/.gitignore @@ -8,6 +8,7 @@ kokkos-stream std-stream sycl-stream hip-stream +tbb-stream *.o *.bc diff --git a/CMakeLists.txt b/CMakeLists.txt index d4a11cd..797a9c0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -112,6 +112,7 @@ register_model(SYCL SYCL SYCLStream.cpp) register_model(ACC ACC ACCStream.cpp) # defining RAJA collides with the RAJA namespace so USE_RAJA register_model(RAJA USE_RAJA RAJAStream.cpp) +register_model(TBB TBB TBBStream.cpp) set(USAGE ON CACHE BOOL "Whether to print all custom flags for the selected model") diff --git a/README.md b/README.md index 8ca7398..68908a3 100644 --- a/README.md +++ b/README.md @@ -19,6 +19,7 @@ Currently implemented are: - Kokkos - RAJA - SYCL + - TBB This code was previously called GPU-STREAM. 
@@ -90,7 +91,7 @@ For example: Use this for linking extra libraries (e.g `-lmylib`, or simply `mylib`) -- CXX_EXTRA_LINKER_FLAGS: Append to linker flags (i.e GCC's `-Wl` or equivalent) --- Available models: OMP;OCL;STD;STD20;HIP;CUDA;KOKKOS;SYCL;ACC;RAJA +-- Available models: OMP;OCL;STD;STD20;HIP;CUDA;KOKKOS;SYCL;ACC;RAJA;TBB -- Selected model : OCL -- Supported flags: diff --git a/TBB.cmake b/TBB.cmake new file mode 100644 index 0000000..a92ea82 --- /dev/null +++ b/TBB.cmake @@ -0,0 +1,10 @@ + +register_flag_required(TBB_DIR + "Absolute path to oneTBB distribution, the directory should contains at least `include/` and `lib/") + +macro(setup) + set(CMAKE_CXX_STANDARD 14) # we use auto in lambda parameters for the different partitioners + # see https://github.com/oneapi-src/oneTBB/blob/master/cmake/README.md#tbbconfig---integration-of-binary-packages + find_package(TBB REQUIRED) + register_link_library(TBB::tbb) +endmacro() diff --git a/TBB.make b/TBB.make new file mode 100644 index 0000000..e3b5c86 --- /dev/null +++ b/TBB.make @@ -0,0 +1,28 @@ + +ifndef COMPILER +define compiler_help +Set COMPILER to change flags (defaulting to GNU). 
+Available compilers are: + GNU + +endef +$(info $(compiler_help)) +COMPILER=GNU +endif + +TBB_LIB= + +COMPILER_GNU = g++ +CXX = $(COMPILER_$(COMPILER)) + +FLAGS_GNU = -O3 -std=c++14 -march=native +CXXFLAGS = $(FLAGS_$(COMPILER)) + + +tbb-stream: main.cpp TBBStream.cpp + $(CXX) -DTBB $(CXXFLAGS) $^ $(EXTRA_FLAGS) -I$(TBB_DIR)/include -Wl,-rpath,$(TBB_DIR)/lib/intel64/gcc4.8 $(TBB_DIR)/lib/intel64/gcc4.8/libtbb.so -o $@ + +.PHONY: clean +clean: + rm -f tbb-stream + diff --git a/TBBStream.cpp b/TBBStream.cpp new file mode 100644 index 0000000..4201796 --- /dev/null +++ b/TBBStream.cpp @@ -0,0 +1,157 @@ +// Copyright (c) 2020 Tom Deakin +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#include "TBBStream.hpp" +#include "oneapi/tbb.h" + +template +TBBStream::TBBStream(const int ARRAY_SIZE, int device) + : partitioner(static_cast(device)), range(0, ARRAY_SIZE), a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE) +{ + std::cout << "Using TBB partitioner: " << getDeviceName(device) << std::endl; +} + +template +template +U TBBStream::with_partitioner(const F &f) +{ + switch(partitioner){ + case Partitioner::Auto: return f(tbb::auto_partitioner{}); + case Partitioner::Affinity: { tbb::affinity_partitioner p; return f(p); } // parallel_* doesn't take const affinity_partitioner here + case Partitioner::Static: return f(tbb::static_partitioner{}); + case Partitioner::Simple: return f(tbb::simple_partitioner{}); + default: throw std::runtime_error("Error asking for name for non-existant device"); + } +} + +template +template +void TBBStream::parallel_for(const F &f) +{ + // using size_t as per the range type (also used in the official documentation) + with_partitioner([&](auto &&p) { + tbb::parallel_for(range, [&](const tbb::blocked_range& r) { + for (size_t i = r.begin(); i < r.end(); ++i) { + f(i); + } + }, p); + return nullptr; // what we really want here is std::monostate, but we don't want to be 
C++17 only so nullptr_t it is + }); +} + +template +template +T TBBStream::parallel_reduce(T init, const Op &op, const F &f) +{ + return with_partitioner([&](auto &&p) { + return tbb::parallel_reduce(range, init, [&](const tbb::blocked_range& r, T acc) { + for (size_t i = r.begin(); i < r.end(); ++i) { + acc = op(acc, f(i)); + } + return acc; + }, op, p); + }); +} + +template +void TBBStream::init_arrays(T initA, T initB, T initC) +{ + + parallel_for([&](size_t i){ + a[i] = initA; + b[i] = initB; + c[i] = initC; + }); + +} + +template +void TBBStream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) +{ + // Element-wise copy. + h_a = a; + h_b = b; + h_c = c; +} + +template +void TBBStream::copy() +{ + parallel_for([&](size_t i){ c[i] = a[i]; }); +} + +template +void TBBStream::mul() +{ + const T scalar = startScalar; + + parallel_for([&](size_t i){ b[i] = scalar * c[i]; }); + +} + +template +void TBBStream::add() +{ + + parallel_for([&](size_t i){ c[i] = a[i] + b[i]; }); + +} + +template +void TBBStream::triad() +{ + const T scalar = startScalar; + + parallel_for([&](size_t i){ a[i] = b[i] + scalar * c[i]; }); + +} + +template +void TBBStream::nstream() +{ + const T scalar = startScalar; + + parallel_for([&](size_t i){ a[i] += b[i] + scalar * c[i]; }); + +} + +template +T TBBStream::dot() +{ + // sum += a[i] * b[i]; + return parallel_reduce(0.0, std::plus(), [&](size_t i) { return a[i] * b[i]; }); +} + +void listDevices(void) +{ + std::cout + << "[" << static_cast(Partitioner::Auto) << "] auto partitioner\n" + << "[" << static_cast(Partitioner::Affinity) << "] affinity partitioner\n" + << "[" << static_cast(Partitioner::Static) << "] static partitioner\n" + << "[" << static_cast(Partitioner::Simple) << "] simple partitioner\n" + << "See https://spec.oneapi.com/versions/latest/elements/oneTBB/source/algorithms.html#partitioners for more details" + << std::endl; +} + +std::string getDeviceName(const int device) +{ + switch(static_cast(device)){ + 
case Partitioner::Auto: return "auto_partitioner"; + case Partitioner::Affinity: return "affinity_partitioner"; + case Partitioner::Static: return "static_partitioner"; + case Partitioner::Simple: return "simple_partitioner"; + default: throw std::runtime_error("Error asking for name for non-existant device"); + } +} + +std::string getDeviceDriver(const int) +{ + return std::string("Device driver unavailable"); +} + +template class TBBStream; +template class TBBStream; + diff --git a/TBBStream.hpp b/TBBStream.hpp new file mode 100644 index 0000000..dc87ea6 --- /dev/null +++ b/TBBStream.hpp @@ -0,0 +1,56 @@ +// Copyright (c) 2020 Tom Deakin +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#pragma once + +#include +#include +#include "oneapi/tbb.h" +#include "Stream.h" + +#define IMPLEMENTATION_STRING "TBB" + +enum class Partitioner : int { Auto = 0, Affinity, Static, Simple}; + +template +class TBBStream : public Stream +{ + protected: + + + Partitioner partitioner; + tbb::blocked_range range; + // Device side pointers + std::vector a; + std::vector b; + std::vector c; + + + template < typename U, typename F> + U with_partitioner(const F &f); + + template + void parallel_for(const F &f); + + template + T parallel_reduce(T init, const Op &op, const F &f); + + public: + TBBStream(const int, int); + ~TBBStream() = default; + + virtual void copy() override; + virtual void add() override; + virtual void mul() override; + virtual void triad() override; + virtual void nstream() override; + virtual T dot() override; + + virtual void init_arrays(T initA, T initB, T initC) override; + virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; + +}; + diff --git a/ci-prepare-bionic.sh b/ci-prepare-bionic.sh index 290e87b..fb69c05 100755 --- a/ci-prepare-bionic.sh +++ b/ci-prepare-bionic.sh @@ -208,6 +208,20 @@ setup_raja() { check_size } +setup_tbb() { + echo "Preparing 
TBB" + local tbb_ver="2021.2.0" + local tarball="oneapi-tbb-$tbb_ver-lin.tgz" + + local url="https://github.com/oneapi-src/oneTBB/releases/download/v$tbb_ver/oneapi-tbb-$tbb_ver-lin.tgz" + # local url="http://localhost:8000/oneapi-tbb-$tbb_ver-lin.tgz" + + get_and_untar "$tarball" "$url" + export_var TBB_LIB "$PWD/oneapi-tbb-$tbb_ver" + verify_dir_exists "$TBB_LIB" + check_size +} + setup_clang_gcc() { echo "deb http://archive.ubuntu.com/ubuntu focal main universe" | sudo tee -a /etc/apt/sources.list @@ -354,6 +368,7 @@ if [ "$PARALLEL" = true ]; then setup_dpcpp & setup_kokkos & setup_raja & + setup_tbb & wait else setup_cmake @@ -364,6 +379,7 @@ else setup_dpcpp setup_kokkos setup_raja + setup_tbb # these need apt setup_clang_gcc setup_rocm diff --git a/ci-test-compile.sh b/ci-test-compile.sh index 1b5c1bb..24a7091 100755 --- a/ci-test-compile.sh +++ b/ci-test-compile.sh @@ -112,6 +112,8 @@ run_build() { #ICPX_CXX="/opt/intel/oneapi/compiler/2021.1.2/linux/bin/icpx" #ICPC_CXX="/opt/intel/oneapi/compiler/2021.1.2/linux/bin/intel64/icpc" # +#TBB_LIB="/home/tom/Downloads/oneapi-tbb-2021.1.1/" +# #GCC_STD_PAR_LIB="tbb" #CLANG_STD_PAR_LIB="tbb" #GCC_OMP_OFFLOAD_AMD=false @@ -138,7 +140,7 @@ build_gcc() { # some distributions like Ubuntu bionic implements std par with TBB, so conditionally link it here run_build $name "${GCC_CXX:?}" STD "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}" run_build $name "${GCC_CXX:?}" STD20 "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}" - + run_build $name "${GCC_CXX:?}" TBB "$cxx -DTBB_DIR=$TBB_LIB" if [ "${GCC_OMP_OFFLOAD_AMD:-false}" != "false" ]; then run_build "amd_$name" "${GCC_CXX:?}" ACC "$cxx -DCXX_EXTRA_FLAGS=-foffload=amdgcn-amdhsa" run_build "amd_$name" "${GCC_CXX:?}" OMP "$cxx -DOFFLOAD=AMD:$AMD_ARCH" @@ -188,6 +190,7 @@ build_clang() { run_build $name "${CLANG_CXX:?}" OCL "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}" run_build $name "${CLANG_CXX:?}" STD "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" # run_build $name 
"${LANG_CXX:?}" STD20 "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" # not yet supported + run_build $name "${CLANG_CXX:?}" TBB "$cxx -DTBB_DIR=$TBB_LIB" run_build $name "${CLANG_CXX:?}" RAJA "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?}" # no clang /w RAJA+cuda because it needs nvcc which needs gcc } diff --git a/main.cpp b/main.cpp index e78d7a1..de301ce 100644 --- a/main.cpp +++ b/main.cpp @@ -25,6 +25,8 @@ #include "STDStream.h" #elif defined(STD20) #include "STD20Stream.hpp" +#elif defined(TBB) +#include "TBBStream.hpp" #elif defined(HIP) #include "HIPStream.h" #elif defined(HC) @@ -266,6 +268,10 @@ void run() // Use the C++20 implementation stream = new STD20Stream(ARRAY_SIZE, deviceIndex); +#elif defined(TBB) + // Use the TBB implementation + stream = new TBBStream(ARRAY_SIZE, deviceIndex); + #elif defined(ACC) // Use the OpenACC implementation stream = new ACCStream(ARRAY_SIZE, deviceIndex); From 7a130a59bc3c4b827cb1f5f95add54b0501110bd Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Thu, 27 May 2021 10:23:06 +0100 Subject: [PATCH 11/14] Don't tie implementation to oneTBB specific headers Fix wrong TBB_ROOT detection --- TBB.cmake | 12 +++++-- TBBStream.cpp | 2 +- ci-test-compile.sh | 78 +++++++++++++++++++++++++--------------------- 3 files changed, 53 insertions(+), 39 deletions(-) diff --git a/TBB.cmake b/TBB.cmake index a92ea82..99e31f7 100644 --- a/TBB.cmake +++ b/TBB.cmake @@ -1,8 +1,16 @@ -register_flag_required(TBB_DIR - "Absolute path to oneTBB distribution, the directory should contains at least `include/` and `lib/") +register_flag_optional(ONE_TBB_DIR + "Absolute path to oneTBB (with header `onetbb/tbb.h`) distribution, the directory should contain at least `include/` and `lib/. + If unspecified, the system TBB (with header `tbb/tbb.h`) will be used via CMake's find_package(TBB)." 
+ "") macro(setup) + if(ONE_TBB_DIR) + set(TBB_ROOT "${ONE_TBB_DIR}") # see https://github.com/Kitware/VTK/blob/0a31a9a3c1531ae238ac96a372fec4be42282863/CMake/FindTBB.cmake#L34 + # docs on Intel's website refers to TBB_DIR which hasn't been correct for 6 years + endif() + + set(CMAKE_CXX_STANDARD 14) # we use auto in lambda parameters for the different partitioners # see https://github.com/oneapi-src/oneTBB/blob/master/cmake/README.md#tbbconfig---integration-of-binary-packages find_package(TBB REQUIRED) diff --git a/TBBStream.cpp b/TBBStream.cpp index 4201796..1d09927 100644 --- a/TBBStream.cpp +++ b/TBBStream.cpp @@ -5,7 +5,7 @@ // source code #include "TBBStream.hpp" -#include "oneapi/tbb.h" +#include "tbb/tbb.h" template TBBStream::TBBStream(const int ARRAY_SIZE, int device) diff --git a/ci-test-compile.sh b/ci-test-compile.sh index 24a7091..456f836 100755 --- a/ci-test-compile.sh +++ b/ci-test-compile.sh @@ -86,40 +86,40 @@ run_build() { } ### -#KOKKOS_SRC="/home/tom/Downloads/kokkos-3.3.00" -#RAJA_SRC="/home/tom/Downloads/RAJA-v0.13.0" -# -#GCC_CXX="/usr/bin/g++" -#CLANG_CXX="/usr/bin/clang++" -# -#NVSDK="/home/tom/Downloads/nvhpc_2021_212_Linux_x86_64_cuda_11.2/install_components/Linux_x86_64/21.2/" -#NVHPC_NVCXX="$NVSDK/compilers/bin/nvc++" -#NVHPC_NVCC="$NVSDK/cuda/11.2/bin/nvcc" -#NVHPC_CUDA_DIR="$NVSDK/cuda/11.2" -#"$NVSDK/compilers/bin/makelocalrc" "$NVSDK/compilers/bin/" -x -# -#AOCC_CXX="/opt/AMD/aocc-compiler-2.3.0/bin/clang++" -#AOMP_CXX="/usr/lib/aomp/bin/clang++" -#OCL_LIB="/home/tom/Downloads/oclcpuexp-2020.11.11.0.04_rel/x64/libOpenCL.so" -# -## AMD needs this rocm_path thing exported... 
-#export ROCM_PATH="/opt/rocm-4.0.0" -#HIP_CXX="/opt/rocm-4.0.0/bin/hipcc" -#COMPUTECPP_DIR="/home/tom/Desktop/computecpp_archive/ComputeCpp-CE-2.3.0-x86_64-linux-gnu" -#DPCPP_DIR="/home/tom/Downloads/dpcpp_compiler" -#HIPSYCL_DIR="/opt/hipsycl/cff515c/" -# -#ICPX_CXX="/opt/intel/oneapi/compiler/2021.1.2/linux/bin/icpx" -#ICPC_CXX="/opt/intel/oneapi/compiler/2021.1.2/linux/bin/intel64/icpc" -# -#TBB_LIB="/home/tom/Downloads/oneapi-tbb-2021.1.1/" -# -#GCC_STD_PAR_LIB="tbb" -#CLANG_STD_PAR_LIB="tbb" -#GCC_OMP_OFFLOAD_AMD=false -#GCC_OMP_OFFLOAD_NVIDIA=true -#CLANG_OMP_OFFLOAD_AMD=false -#CLANG_OMP_OFFLOAD_NVIDIA=false +# KOKKOS_SRC="/home/tom/Downloads/kokkos-3.3.00" +# RAJA_SRC="/home/tom/Downloads/RAJA-v0.13.0" + +# GCC_CXX="/usr/bin/g++" +# CLANG_CXX="/usr/bin/clang++" + +# NVSDK="/home/tom/Downloads/nvhpc_2021_212_Linux_x86_64_cuda_11.2/install_components/Linux_x86_64/21.2/" +# NVHPC_NVCXX="$NVSDK/compilers/bin/nvc++" +# NVHPC_NVCC="$NVSDK/cuda/11.2/bin/nvcc" +# NVHPC_CUDA_DIR="$NVSDK/cuda/11.2" +# "$NVSDK/compilers/bin/makelocalrc" "$NVSDK/compilers/bin/" -x + +# AOCC_CXX="/opt/AMD/aocc-compiler-2.3.0/bin/clang++" +# AOMP_CXX="/usr/lib/aomp/bin/clang++" +# OCL_LIB="/home/tom/Downloads/oclcpuexp-2020.11.11.0.04_rel/x64/libOpenCL.so" + +# # AMD needs this rocm_path thing exported... 
+# export ROCM_PATH="/opt/rocm-4.0.0" +# HIP_CXX="/opt/rocm-4.0.0/bin/hipcc" +# COMPUTECPP_DIR="/home/tom/Desktop/computecpp_archive/ComputeCpp-CE-2.3.0-x86_64-linux-gnu" +# DPCPP_DIR="/home/tom/Downloads/dpcpp_compiler" +# HIPSYCL_DIR="/opt/hipsycl/cff515c/" + +# ICPX_CXX="/opt/intel/oneapi/compiler/2021.1.2/linux/bin/icpx" +# ICPC_CXX="/opt/intel/oneapi/compiler/2021.1.2/linux/bin/intel64/icpc" + +# TBB_LIB="/home/tom/Downloads/oneapi-tbb-2021.1.1/" + +# GCC_STD_PAR_LIB="tbb" +# CLANG_STD_PAR_LIB="tbb" +# GCC_OMP_OFFLOAD_AMD=false +# GCC_OMP_OFFLOAD_NVIDIA=true +# CLANG_OMP_OFFLOAD_AMD=false +# CLANG_OMP_OFFLOAD_NVIDIA=false ### AMD_ARCH="gfx_903" @@ -140,7 +140,10 @@ build_gcc() { # some distributions like Ubuntu bionic implements std par with TBB, so conditionally link it here run_build $name "${GCC_CXX:?}" STD "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}" run_build $name "${GCC_CXX:?}" STD20 "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}" - run_build $name "${GCC_CXX:?}" TBB "$cxx -DTBB_DIR=$TBB_LIB" + + run_build $name "${GCC_CXX:?}" TBB "$cxx -DONE_TBB_DIR=$TBB_LIB" + run_build $name "${GCC_CXX:?}" TBB "$cxx" # build TBB again with the system TBB + if [ "${GCC_OMP_OFFLOAD_AMD:-false}" != "false" ]; then run_build "amd_$name" "${GCC_CXX:?}" ACC "$cxx -DCXX_EXTRA_FLAGS=-foffload=amdgcn-amdhsa" run_build "amd_$name" "${GCC_CXX:?}" OMP "$cxx -DOFFLOAD=AMD:$AMD_ARCH" @@ -190,7 +193,10 @@ build_clang() { run_build $name "${CLANG_CXX:?}" OCL "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}" run_build $name "${CLANG_CXX:?}" STD "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" # run_build $name "${LANG_CXX:?}" STD20 "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" # not yet supported - run_build $name "${CLANG_CXX:?}" TBB "$cxx -DTBB_DIR=$TBB_LIB" + + run_build $name "${CLANG_CXX:?}" TBB "$cxx -DONE_TBB_DIR=$TBB_LIB" + run_build $name "${CLANG_CXX:?}" TBB "$cxx" # build TBB again with the system TBB + run_build $name "${CLANG_CXX:?}" RAJA "$cxx 
-DRAJA_IN_TREE=${RAJA_SRC:?}" # no clang /w RAJA+cuda because it needs nvcc which needs gcc } From 0867115d8dcc6d6c9b63b56dc3a124f230f626c5 Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Thu, 27 May 2021 10:51:45 +0100 Subject: [PATCH 12/14] Remove references to oneapi/tbb.h --- TBBStream.cpp | 1 - TBBStream.hpp | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/TBBStream.cpp b/TBBStream.cpp index 1d09927..08b83c8 100644 --- a/TBBStream.cpp +++ b/TBBStream.cpp @@ -5,7 +5,6 @@ // source code #include "TBBStream.hpp" -#include "tbb/tbb.h" template TBBStream::TBBStream(const int ARRAY_SIZE, int device) diff --git a/TBBStream.hpp b/TBBStream.hpp index dc87ea6..6ba9741 100644 --- a/TBBStream.hpp +++ b/TBBStream.hpp @@ -8,7 +8,7 @@ #include #include -#include "oneapi/tbb.h" +#include "tbb/tbb.h" #include "Stream.h" #define IMPLEMENTATION_STRING "TBB" From 0e3727d8f853195cb6bc29307159749fa74d9947 Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Thu, 3 Jun 2021 13:43:12 +0100 Subject: [PATCH 13/14] Make partitioner a compile option Inline all abstractions Add intel compilers for Make --- TBB.cmake | 15 ++++++- TBB.make | 40 ++++++++++++++--- TBBStream.cpp | 116 ++++++++++++++++++++------------------------------ TBBStream.hpp | 32 ++++++++------ 4 files changed, 113 insertions(+), 90 deletions(-) diff --git a/TBB.cmake b/TBB.cmake index 99e31f7..e4d6bac 100644 --- a/TBB.cmake +++ b/TBB.cmake @@ -4,15 +4,26 @@ register_flag_optional(ONE_TBB_DIR If unspecified, the system TBB (with header `tbb/tbb.h`) will be used via CMake's find_package(TBB)." "") + +register_flag_optional(PARTITIONER + "Partitioner specifies how a loop template should partition its work among threads. + Possible values are: + AUTO - Optimize range subdivision based on work-stealing events. + AFFINITY - Proportional splitting that optimizes for cache affinity. + STATIC - Distribute work uniformly with no additional load balancing. 
+ SIMPLE - Recursively split its range until it cannot be further subdivided. + See https://spec.oneapi.com/versions/latest/elements/oneTBB/source/algorithms.html#partitioners for more details." + "AUTO") + macro(setup) if(ONE_TBB_DIR) set(TBB_ROOT "${ONE_TBB_DIR}") # see https://github.com/Kitware/VTK/blob/0a31a9a3c1531ae238ac96a372fec4be42282863/CMake/FindTBB.cmake#L34 - # docs on Intel's website refers to TBB_DIR which hasn't been correct for 6 years + # docs on Intel's website refers to TBB_DIR which is not correct endif() - set(CMAKE_CXX_STANDARD 14) # we use auto in lambda parameters for the different partitioners # see https://github.com/oneapi-src/oneTBB/blob/master/cmake/README.md#tbbconfig---integration-of-binary-packages find_package(TBB REQUIRED) register_link_library(TBB::tbb) + register_definitions(PARTITIONER_${PARTITIONER}) endmacro() diff --git a/TBB.make b/TBB.make index e3b5c86..c224a5a 100644 --- a/TBB.make +++ b/TBB.make @@ -3,24 +3,52 @@ ifndef COMPILER define compiler_help Set COMPILER to change flags (defaulting to GNU). Available compilers are: - GNU + GNU INTEL INTEL_LEGACY endef $(info $(compiler_help)) COMPILER=GNU endif -TBB_LIB= -COMPILER_GNU = g++ +CXX_GNU = g++ +CXX_INTEL = icpx +CXX_INTEL_LEGACY = icpc CXX = $(COMPILER_$(COMPILER)) -FLAGS_GNU = -O3 -std=c++14 -march=native -CXXFLAGS = $(FLAGS_$(COMPILER)) +CXXFLAGS_GNU = -march=native +CXXFLAGS_INTEL = -march=native +CXXFLAGS_INTEL_LEGACY = -qopt-streaming-stores=always + +CXX = $(CXX_$(COMPILER)) +CXXFLAGS = -std=c++11 -O3 $(CXXFLAGS_$(COMPILER)) + + + +ifndef PARTITIONER +define partitioner_help +Set PARTITIONER to select TBB's partitioner. +Partitioner specifies how a loop template should partition its work among threads. + +Available options: + AUTO - Optimize range subdivision based on work-stealing events. + AFFINITY - Proportional splitting that optimizes for cache affinity. + STATIC - Distribute work uniformly with no additional load balancing. 
+ SIMPLE - Recursively split its range until it cannot be further subdivided. + +See https://spec.oneapi.com/versions/latest/elements/oneTBB/source/algorithms.html#partitioners +for more details. + +endef +$(info $(partitioner_help)) +PARTITIONER=AUTO +endif + +PARTITIONER_MODE = -DPARTITIONER_$(PARTITIONER) tbb-stream: main.cpp TBBStream.cpp - $(CXX) -DTBB $(CXXFLAGS) $^ $(EXTRA_FLAGS) -I$(TBB_DIR)/include -Wl,-rpath,$(TBB_DIR)/lib/intel64/gcc4.8 $(TBB_DIR)/lib/intel64/gcc4.8/libtbb.so -o $@ + $(CXX) -DTBB $(PARTITIONER_MODE) $(CXXFLAGS) $^ $(EXTRA_FLAGS) -I$(TBB_DIR)/include -Wl,-rpath,$(TBB_DIR)/lib/intel64/gcc4.8 $(TBB_DIR)/lib/intel64/gcc4.8/libtbb.so -o $@ .PHONY: clean clean: diff --git a/TBBStream.cpp b/TBBStream.cpp index 08b83c8..9c34a50 100644 --- a/TBBStream.cpp +++ b/TBBStream.cpp @@ -8,62 +8,26 @@ template TBBStream::TBBStream(const int ARRAY_SIZE, int device) - : partitioner(static_cast(device)), range(0, ARRAY_SIZE), a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE) + : partitioner(), range(0, ARRAY_SIZE), a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE) { - std::cout << "Using TBB partitioner: " << getDeviceName(device) << std::endl; -} - -template -template -U TBBStream::with_partitioner(const F &f) -{ - switch(partitioner){ - case Partitioner::Auto: return f(tbb::auto_partitioner{}); - case Partitioner::Affinity: { tbb::affinity_partitioner p; return f(p); } // parallel_* doesn't take const affinity_partitioner here - case Partitioner::Static: return f(tbb::static_partitioner{}); - case Partitioner::Simple: return f(tbb::simple_partitioner{}); - default: throw std::runtime_error("Error asking for name for non-existant device"); + if(device != 0){ + throw std::runtime_error("Device != 0 is not supported by TBB"); } + std::cout << "Using TBB partitioner: " PARTITIONER_NAME << std::endl; } -template -template -void TBBStream::parallel_for(const F &f) -{ - // using size_t as per the range type (also used in the official documentation) - 
with_partitioner([&](auto &&p) { - tbb::parallel_for(range, [&](const tbb::blocked_range& r) { - for (size_t i = r.begin(); i < r.end(); ++i) { - f(i); - } - }, p); - return nullptr; // what we really want here is std::monostate, but we don't want to be C++17 only so nullptr_t it is - }); -} - -template -template -T TBBStream::parallel_reduce(T init, const Op &op, const F &f) -{ - return with_partitioner([&](auto &&p) { - return tbb::parallel_reduce(range, init, [&](const tbb::blocked_range& r, T acc) { - for (size_t i = r.begin(); i < r.end(); ++i) { - acc = op(acc, f(i)); - } - return acc; - }, op, p); - }); -} template void TBBStream::init_arrays(T initA, T initB, T initC) { - parallel_for([&](size_t i){ - a[i] = initA; - b[i] = initB; - c[i] = initC; - }); + tbb::parallel_for(range, [&](const tbb::blocked_range& r) { + for (size_t i = r.begin(); i < r.end(); ++i) { + a[i] = initA; + b[i] = initB; + c[i] = initC; + } + }, partitioner); } @@ -79,23 +43,35 @@ void TBBStream::read_arrays(std::vector& h_a, std::vector& h_b, std::ve template void TBBStream::copy() { - parallel_for([&](size_t i){ c[i] = a[i]; }); + tbb::parallel_for(range, [&](const tbb::blocked_range& r) { + for (size_t i = r.begin(); i < r.end(); ++i) { + c[i] = a[i]; + } + }, partitioner); } template void TBBStream::mul() { const T scalar = startScalar; - - parallel_for([&](size_t i){ b[i] = scalar * c[i]; }); - + + tbb::parallel_for(range, [&](const tbb::blocked_range& r) { + for (size_t i = r.begin(); i < r.end(); ++i) { + b[i] = scalar * c[i]; + } + }, partitioner); + } template void TBBStream::add() { - parallel_for([&](size_t i){ c[i] = a[i] + b[i]; }); + tbb::parallel_for(range, [&](const tbb::blocked_range& r) { + for (size_t i = r.begin(); i < r.end(); ++i) { + c[i] = a[i] + b[i]; + } + }, partitioner); } @@ -104,7 +80,11 @@ void TBBStream::triad() { const T scalar = startScalar; - parallel_for([&](size_t i){ a[i] = b[i] + scalar * c[i]; }); + tbb::parallel_for(range, [&](const 
tbb::blocked_range& r) { + for (size_t i = r.begin(); i < r.end(); ++i) { + a[i] = b[i] + scalar * c[i]; + } + }, partitioner); } @@ -113,7 +93,11 @@ void TBBStream::nstream() { const T scalar = startScalar; - parallel_for([&](size_t i){ a[i] += b[i] + scalar * c[i]; }); + tbb::parallel_for(range, [&](const tbb::blocked_range& r) { + for (size_t i = r.begin(); i < r.end(); ++i) { + a[i] += b[i] + scalar * c[i]; + } + }, partitioner); } @@ -121,29 +105,23 @@ template T TBBStream::dot() { // sum += a[i] * b[i]; - return parallel_reduce(0.0, std::plus(), [&](size_t i) { return a[i] * b[i]; }); + return + tbb::parallel_reduce(range, T{}, [&](const tbb::blocked_range& r, T acc) { + for (size_t i = r.begin(); i < r.end(); ++i) { + acc += a[i] * b[i]; + } + return acc; + }, std::plus(), partitioner); } void listDevices(void) { - std::cout - << "[" << static_cast(Partitioner::Auto) << "] auto partitioner\n" - << "[" << static_cast(Partitioner::Affinity) << "] affinity partitioner\n" - << "[" << static_cast(Partitioner::Static) << "] static partitioner\n" - << "[" << static_cast(Partitioner::Simple) << "] simple partitioner\n" - << "See https://spec.oneapi.com/versions/latest/elements/oneTBB/source/algorithms.html#partitioners for more details" - << std::endl; + std::cout << "Listing devices is not supported by TBB" << std::endl; } std::string getDeviceName(const int device) { - switch(static_cast(device)){ - case Partitioner::Auto: return "auto_partitioner"; - case Partitioner::Affinity: return "affinity_partitioner"; - case Partitioner::Static: return "static_partitioner"; - case Partitioner::Simple: return "simple_partitioner"; - default: throw std::runtime_error("Error asking for name for non-existant device"); - } + return std::string("Device name unavailable"); } std::string getDeviceDriver(const int) diff --git a/TBBStream.hpp b/TBBStream.hpp index 6ba9741..90763a9 100644 --- a/TBBStream.hpp +++ b/TBBStream.hpp @@ -13,31 +13,37 @@ #define IMPLEMENTATION_STRING "TBB" 
-enum class Partitioner : int { Auto = 0, Affinity, Static, Simple}; +#if defined(PARTITIONER_AUTO) +using tbb_partitioner = tbb::auto_partitioner; +#define PARTITIONER_NAME "auto_partitioner" +#elif defined(PARTITIONER_AFFINITY) +using tbb_partitioner = tbb::affinity_partitioner; +#define PARTITIONER_NAME "affinity_partitioner" +#elif defined(PARTITIONER_STATIC) +using tbb_partitioner = tbb::static_partitioner; +#define PARTITIONER_NAME "static_partitioner" +#elif defined(PARTITIONER_SIMPLE) +using tbb_partitioner = tbb::simple_partitioner; +#define PARTITIONER_NAME "simple_partitioner" +#else +// default to auto +using tbb_partitioner = tbb::auto_partitioner; +#define PARTITIONER_NAME "auto_partitioner" +#endif + template class TBBStream : public Stream { protected: - - Partitioner partitioner; + tbb_partitioner partitioner; tbb::blocked_range range; // Device side pointers std::vector a; std::vector b; std::vector c; - - - template < typename U, typename F> - U with_partitioner(const F &f); - template - void parallel_for(const F &f); - - template - T parallel_reduce(T init, const Op &op, const F &f); - public: TBBStream(const int, int); ~TBBStream() = default; From 25e021caa351aba0d5bcf32cbf639d55370d41ee Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 3 Jun 2021 16:08:14 +0100 Subject: [PATCH 14/14] Update CHANGELOG.md --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3dbabed..5d209e2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,7 @@ All notable changes to this project will be documented in this file. - Added nstream kernel from PRK with associate command line option. - CMake build system added for all models. - SYCL device check for FP64 support. +- New implementation using TBB. ### Changed - Default branch renamed from `master` to `main`.