From 742f0629be70c143e05df4ecf73bcf71bba150a7 Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Thu, 27 May 2021 09:28:40 +0100 Subject: [PATCH 1/4] Initial TBB implementation --- .gitignore | 1 + CMakeLists.txt | 1 + README.md | 3 +- TBB.cmake | 10 +++ TBB.make | 28 ++++++++ TBBStream.cpp | 157 +++++++++++++++++++++++++++++++++++++++++++ TBBStream.hpp | 56 +++++++++++++++ ci-prepare-bionic.sh | 16 +++++ ci-test-compile.sh | 5 +- main.cpp | 6 ++ 10 files changed, 281 insertions(+), 2 deletions(-) create mode 100644 TBB.cmake create mode 100644 TBB.make create mode 100644 TBBStream.cpp create mode 100644 TBBStream.hpp diff --git a/.gitignore b/.gitignore index c3ea1da..31af301 100644 --- a/.gitignore +++ b/.gitignore @@ -8,6 +8,7 @@ kokkos-stream std-stream sycl-stream hip-stream +tbb-stream *.o *.bc diff --git a/CMakeLists.txt b/CMakeLists.txt index d4a11cd..797a9c0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -112,6 +112,7 @@ register_model(SYCL SYCL SYCLStream.cpp) register_model(ACC ACC ACCStream.cpp) # defining RAJA collides with the RAJA namespace so USE_RAJA register_model(RAJA USE_RAJA RAJAStream.cpp) +register_model(TBB TBB TBBStream.cpp) set(USAGE ON CACHE BOOL "Whether to print all custom flags for the selected model") diff --git a/README.md b/README.md index 8ca7398..68908a3 100644 --- a/README.md +++ b/README.md @@ -19,6 +19,7 @@ Currently implemented are: - Kokkos - RAJA - SYCL + - TBB This code was previously called GPU-STREAM. 
@@ -90,7 +91,7 @@ For example: Use this for linking extra libraries (e.g `-lmylib`, or simply `mylib`) -- CXX_EXTRA_LINKER_FLAGS: Append to linker flags (i.e GCC's `-Wl` or equivalent) --- Available models: OMP;OCL;STD;STD20;HIP;CUDA;KOKKOS;SYCL;ACC;RAJA +-- Available models: OMP;OCL;STD;STD20;HIP;CUDA;KOKKOS;SYCL;ACC;RAJA;TBB -- Selected model : OCL -- Supported flags: diff --git a/TBB.cmake b/TBB.cmake new file mode 100644 index 0000000..a92ea82 --- /dev/null +++ b/TBB.cmake @@ -0,0 +1,10 @@ + +register_flag_required(TBB_DIR + "Absolute path to oneTBB distribution, the directory should contains at least `include/` and `lib/") + +macro(setup) + set(CMAKE_CXX_STANDARD 14) # we use auto in lambda parameters for the different partitioners + # see https://github.com/oneapi-src/oneTBB/blob/master/cmake/README.md#tbbconfig---integration-of-binary-packages + find_package(TBB REQUIRED) + register_link_library(TBB::tbb) +endmacro() diff --git a/TBB.make b/TBB.make new file mode 100644 index 0000000..e3b5c86 --- /dev/null +++ b/TBB.make @@ -0,0 +1,28 @@ + +ifndef COMPILER +define compiler_help +Set COMPILER to change flags (defaulting to GNU). 
+Available compilers are: + GNU + +endef +$(info $(compiler_help)) +COMPILER=GNU +endif + +TBB_LIB= + +COMPILER_GNU = g++ +CXX = $(COMPILER_$(COMPILER)) + +FLAGS_GNU = -O3 -std=c++14 -march=native +CXXFLAGS = $(FLAGS_$(COMPILER)) + + +tbb-stream: main.cpp TBBStream.cpp + $(CXX) -DTBB $(CXXFLAGS) $^ $(EXTRA_FLAGS) -I$(TBB_DIR)/include -Wl,-rpath,$(TBB_DIR)/lib/intel64/gcc4.8 $(TBB_DIR)/lib/intel64/gcc4.8/libtbb.so -o $@ + +.PHONY: clean +clean: + rm -f tbb-stream + diff --git a/TBBStream.cpp b/TBBStream.cpp new file mode 100644 index 0000000..4201796 --- /dev/null +++ b/TBBStream.cpp @@ -0,0 +1,157 @@ +// Copyright (c) 2020 Tom Deakin +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#include "TBBStream.hpp" +#include "oneapi/tbb.h" + +template +TBBStream::TBBStream(const int ARRAY_SIZE, int device) + : partitioner(static_cast(device)), range(0, ARRAY_SIZE), a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE) +{ + std::cout << "Using TBB partitioner: " << getDeviceName(device) << std::endl; +} + +template +template +U TBBStream::with_partitioner(const F &f) +{ + switch(partitioner){ + case Partitioner::Auto: return f(tbb::auto_partitioner{}); + case Partitioner::Affinity: { tbb::affinity_partitioner p; return f(p); } // parallel_* doesn't take const affinity_partitioner here + case Partitioner::Static: return f(tbb::static_partitioner{}); + case Partitioner::Simple: return f(tbb::simple_partitioner{}); + default: throw std::runtime_error("Error asking for name for non-existant device"); + } +} + +template +template +void TBBStream::parallel_for(const F &f) +{ + // using size_t as per the range type (also used in the official documentation) + with_partitioner([&](auto &&p) { + tbb::parallel_for(range, [&](const tbb::blocked_range& r) { + for (size_t i = r.begin(); i < r.end(); ++i) { + f(i); + } + }, p); + return nullptr; // what we really want here is std::monostate, but we don't want to be 
C++17 only so nullptr_t it is + }); +} + +template +template +T TBBStream::parallel_reduce(T init, const Op &op, const F &f) +{ + return with_partitioner([&](auto &&p) { + return tbb::parallel_reduce(range, init, [&](const tbb::blocked_range& r, T acc) { + for (size_t i = r.begin(); i < r.end(); ++i) { + acc = op(acc, f(i)); + } + return acc; + }, op, p); + }); +} + +template +void TBBStream::init_arrays(T initA, T initB, T initC) +{ + + parallel_for([&](size_t i){ + a[i] = initA; + b[i] = initB; + c[i] = initC; + }); + +} + +template +void TBBStream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) +{ + // Element-wise copy. + h_a = a; + h_b = b; + h_c = c; +} + +template +void TBBStream::copy() +{ + parallel_for([&](size_t i){ c[i] = a[i]; }); +} + +template +void TBBStream::mul() +{ + const T scalar = startScalar; + + parallel_for([&](size_t i){ b[i] = scalar * c[i]; }); + +} + +template +void TBBStream::add() +{ + + parallel_for([&](size_t i){ c[i] = a[i] + b[i]; }); + +} + +template +void TBBStream::triad() +{ + const T scalar = startScalar; + + parallel_for([&](size_t i){ a[i] = b[i] + scalar * c[i]; }); + +} + +template +void TBBStream::nstream() +{ + const T scalar = startScalar; + + parallel_for([&](size_t i){ a[i] += b[i] + scalar * c[i]; }); + +} + +template +T TBBStream::dot() +{ + // sum += a[i] * b[i]; + return parallel_reduce(0.0, std::plus(), [&](size_t i) { return a[i] * b[i]; }); +} + +void listDevices(void) +{ + std::cout + << "[" << static_cast(Partitioner::Auto) << "] auto partitioner\n" + << "[" << static_cast(Partitioner::Affinity) << "] affinity partitioner\n" + << "[" << static_cast(Partitioner::Static) << "] static partitioner\n" + << "[" << static_cast(Partitioner::Simple) << "] simple partitioner\n" + << "See https://spec.oneapi.com/versions/latest/elements/oneTBB/source/algorithms.html#partitioners for more details" + << std::endl; +} + +std::string getDeviceName(const int device) +{ + switch(static_cast(device)){ + 
case Partitioner::Auto: return "auto_partitioner"; + case Partitioner::Affinity: return "affinity_partitioner"; + case Partitioner::Static: return "static_partitioner"; + case Partitioner::Simple: return "simple_partitioner"; + default: throw std::runtime_error("Error asking for name for non-existant device"); + } +} + +std::string getDeviceDriver(const int) +{ + return std::string("Device driver unavailable"); +} + +template class TBBStream; +template class TBBStream; + diff --git a/TBBStream.hpp b/TBBStream.hpp new file mode 100644 index 0000000..dc87ea6 --- /dev/null +++ b/TBBStream.hpp @@ -0,0 +1,56 @@ +// Copyright (c) 2020 Tom Deakin +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#pragma once + +#include +#include +#include "oneapi/tbb.h" +#include "Stream.h" + +#define IMPLEMENTATION_STRING "TBB" + +enum class Partitioner : int { Auto = 0, Affinity, Static, Simple}; + +template +class TBBStream : public Stream +{ + protected: + + + Partitioner partitioner; + tbb::blocked_range range; + // Device side pointers + std::vector a; + std::vector b; + std::vector c; + + + template < typename U, typename F> + U with_partitioner(const F &f); + + template + void parallel_for(const F &f); + + template + T parallel_reduce(T init, const Op &op, const F &f); + + public: + TBBStream(const int, int); + ~TBBStream() = default; + + virtual void copy() override; + virtual void add() override; + virtual void mul() override; + virtual void triad() override; + virtual void nstream() override; + virtual T dot() override; + + virtual void init_arrays(T initA, T initB, T initC) override; + virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; + +}; + diff --git a/ci-prepare-bionic.sh b/ci-prepare-bionic.sh index 290e87b..fb69c05 100755 --- a/ci-prepare-bionic.sh +++ b/ci-prepare-bionic.sh @@ -208,6 +208,20 @@ setup_raja() { check_size } +setup_tbb() { + echo "Preparing 
TBB" + local tbb_ver="2021.2.0" + local tarball="oneapi-tbb-$tbb_ver-lin.tgz" + + local url="https://github.com/oneapi-src/oneTBB/releases/download/v$tbb_ver/oneapi-tbb-$tbb_ver-lin.tgz" + # local url="http://localhost:8000/oneapi-tbb-$tbb_ver-lin.tgz" + + get_and_untar "$tarball" "$url" + export_var TBB_LIB "$PWD/oneapi-tbb-$tbb_ver" + verify_dir_exists "$TBB_LIB" + check_size +} + setup_clang_gcc() { echo "deb http://archive.ubuntu.com/ubuntu focal main universe" | sudo tee -a /etc/apt/sources.list @@ -354,6 +368,7 @@ if [ "$PARALLEL" = true ]; then setup_dpcpp & setup_kokkos & setup_raja & + setup_tbb & wait else setup_cmake @@ -364,6 +379,7 @@ else setup_dpcpp setup_kokkos setup_raja + setup_tbb # these need apt setup_clang_gcc setup_rocm diff --git a/ci-test-compile.sh b/ci-test-compile.sh index 1b5c1bb..24a7091 100755 --- a/ci-test-compile.sh +++ b/ci-test-compile.sh @@ -112,6 +112,8 @@ run_build() { #ICPX_CXX="/opt/intel/oneapi/compiler/2021.1.2/linux/bin/icpx" #ICPC_CXX="/opt/intel/oneapi/compiler/2021.1.2/linux/bin/intel64/icpc" # +#TBB_LIB="/home/tom/Downloads/oneapi-tbb-2021.1.1/" +# #GCC_STD_PAR_LIB="tbb" #CLANG_STD_PAR_LIB="tbb" #GCC_OMP_OFFLOAD_AMD=false @@ -138,7 +140,7 @@ build_gcc() { # some distributions like Ubuntu bionic implements std par with TBB, so conditionally link it here run_build $name "${GCC_CXX:?}" STD "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}" run_build $name "${GCC_CXX:?}" STD20 "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}" - + run_build $name "${GCC_CXX:?}" TBB "$cxx -DTBB_DIR=$TBB_LIB" if [ "${GCC_OMP_OFFLOAD_AMD:-false}" != "false" ]; then run_build "amd_$name" "${GCC_CXX:?}" ACC "$cxx -DCXX_EXTRA_FLAGS=-foffload=amdgcn-amdhsa" run_build "amd_$name" "${GCC_CXX:?}" OMP "$cxx -DOFFLOAD=AMD:$AMD_ARCH" @@ -188,6 +190,7 @@ build_clang() { run_build $name "${CLANG_CXX:?}" OCL "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}" run_build $name "${CLANG_CXX:?}" STD "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" # run_build $name 
"${LANG_CXX:?}" STD20 "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" # not yet supported + run_build $name "${CLANG_CXX:?}" TBB "$cxx -DTBB_DIR=$TBB_LIB" run_build $name "${CLANG_CXX:?}" RAJA "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?}" # no clang /w RAJA+cuda because it needs nvcc which needs gcc } diff --git a/main.cpp b/main.cpp index e78d7a1..de301ce 100644 --- a/main.cpp +++ b/main.cpp @@ -25,6 +25,8 @@ #include "STDStream.h" #elif defined(STD20) #include "STD20Stream.hpp" +#elif defined(TBB) +#include "TBBStream.hpp" #elif defined(HIP) #include "HIPStream.h" #elif defined(HC) @@ -266,6 +268,10 @@ void run() // Use the C++20 implementation stream = new STD20Stream(ARRAY_SIZE, deviceIndex); +#elif defined(TBB) + // Use the TBB implementation + stream = new TBBStream(ARRAY_SIZE, deviceIndex); + #elif defined(ACC) // Use the OpenACC implementation stream = new ACCStream(ARRAY_SIZE, deviceIndex); From 7a130a59bc3c4b827cb1f5f95add54b0501110bd Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Thu, 27 May 2021 10:23:06 +0100 Subject: [PATCH 2/4] Don't tie implementation to oneTBB specific headers Fix wrong TBB_ROOT detection --- TBB.cmake | 12 +++++-- TBBStream.cpp | 2 +- ci-test-compile.sh | 78 +++++++++++++++++++++++++--------------------- 3 files changed, 53 insertions(+), 39 deletions(-) diff --git a/TBB.cmake b/TBB.cmake index a92ea82..99e31f7 100644 --- a/TBB.cmake +++ b/TBB.cmake @@ -1,8 +1,16 @@ -register_flag_required(TBB_DIR - "Absolute path to oneTBB distribution, the directory should contains at least `include/` and `lib/") +register_flag_optional(ONE_TBB_DIR + "Absolute path to oneTBB (with header `oneapi/tbb.h`) distribution, the directory should contain at least `include/` and `lib/`. + If unspecified, the system TBB (with header `tbb/tbb.h`) will be used via CMake's find_package(TBB)." 
+ "") macro(setup) + if(ONE_TBB_DIR) + set(TBB_ROOT "${ONE_TBB_DIR}") # see https://github.com/Kitware/VTK/blob/0a31a9a3c1531ae238ac96a372fec4be42282863/CMake/FindTBB.cmake#L34 + # docs on Intel's website refers to TBB_DIR which hasn't been correct for 6 years + endif() + + set(CMAKE_CXX_STANDARD 14) # we use auto in lambda parameters for the different partitioners # see https://github.com/oneapi-src/oneTBB/blob/master/cmake/README.md#tbbconfig---integration-of-binary-packages find_package(TBB REQUIRED) diff --git a/TBBStream.cpp b/TBBStream.cpp index 4201796..1d09927 100644 --- a/TBBStream.cpp +++ b/TBBStream.cpp @@ -5,7 +5,7 @@ // source code #include "TBBStream.hpp" -#include "oneapi/tbb.h" +#include "tbb/tbb.h" template TBBStream::TBBStream(const int ARRAY_SIZE, int device) diff --git a/ci-test-compile.sh b/ci-test-compile.sh index 24a7091..456f836 100755 --- a/ci-test-compile.sh +++ b/ci-test-compile.sh @@ -86,40 +86,40 @@ run_build() { } ### -#KOKKOS_SRC="/home/tom/Downloads/kokkos-3.3.00" -#RAJA_SRC="/home/tom/Downloads/RAJA-v0.13.0" -# -#GCC_CXX="/usr/bin/g++" -#CLANG_CXX="/usr/bin/clang++" -# -#NVSDK="/home/tom/Downloads/nvhpc_2021_212_Linux_x86_64_cuda_11.2/install_components/Linux_x86_64/21.2/" -#NVHPC_NVCXX="$NVSDK/compilers/bin/nvc++" -#NVHPC_NVCC="$NVSDK/cuda/11.2/bin/nvcc" -#NVHPC_CUDA_DIR="$NVSDK/cuda/11.2" -#"$NVSDK/compilers/bin/makelocalrc" "$NVSDK/compilers/bin/" -x -# -#AOCC_CXX="/opt/AMD/aocc-compiler-2.3.0/bin/clang++" -#AOMP_CXX="/usr/lib/aomp/bin/clang++" -#OCL_LIB="/home/tom/Downloads/oclcpuexp-2020.11.11.0.04_rel/x64/libOpenCL.so" -# -## AMD needs this rocm_path thing exported... 
-#export ROCM_PATH="/opt/rocm-4.0.0" -#HIP_CXX="/opt/rocm-4.0.0/bin/hipcc" -#COMPUTECPP_DIR="/home/tom/Desktop/computecpp_archive/ComputeCpp-CE-2.3.0-x86_64-linux-gnu" -#DPCPP_DIR="/home/tom/Downloads/dpcpp_compiler" -#HIPSYCL_DIR="/opt/hipsycl/cff515c/" -# -#ICPX_CXX="/opt/intel/oneapi/compiler/2021.1.2/linux/bin/icpx" -#ICPC_CXX="/opt/intel/oneapi/compiler/2021.1.2/linux/bin/intel64/icpc" -# -#TBB_LIB="/home/tom/Downloads/oneapi-tbb-2021.1.1/" -# -#GCC_STD_PAR_LIB="tbb" -#CLANG_STD_PAR_LIB="tbb" -#GCC_OMP_OFFLOAD_AMD=false -#GCC_OMP_OFFLOAD_NVIDIA=true -#CLANG_OMP_OFFLOAD_AMD=false -#CLANG_OMP_OFFLOAD_NVIDIA=false +# KOKKOS_SRC="/home/tom/Downloads/kokkos-3.3.00" +# RAJA_SRC="/home/tom/Downloads/RAJA-v0.13.0" + +# GCC_CXX="/usr/bin/g++" +# CLANG_CXX="/usr/bin/clang++" + +# NVSDK="/home/tom/Downloads/nvhpc_2021_212_Linux_x86_64_cuda_11.2/install_components/Linux_x86_64/21.2/" +# NVHPC_NVCXX="$NVSDK/compilers/bin/nvc++" +# NVHPC_NVCC="$NVSDK/cuda/11.2/bin/nvcc" +# NVHPC_CUDA_DIR="$NVSDK/cuda/11.2" +# "$NVSDK/compilers/bin/makelocalrc" "$NVSDK/compilers/bin/" -x + +# AOCC_CXX="/opt/AMD/aocc-compiler-2.3.0/bin/clang++" +# AOMP_CXX="/usr/lib/aomp/bin/clang++" +# OCL_LIB="/home/tom/Downloads/oclcpuexp-2020.11.11.0.04_rel/x64/libOpenCL.so" + +# # AMD needs this rocm_path thing exported... 
+# export ROCM_PATH="/opt/rocm-4.0.0" +# HIP_CXX="/opt/rocm-4.0.0/bin/hipcc" +# COMPUTECPP_DIR="/home/tom/Desktop/computecpp_archive/ComputeCpp-CE-2.3.0-x86_64-linux-gnu" +# DPCPP_DIR="/home/tom/Downloads/dpcpp_compiler" +# HIPSYCL_DIR="/opt/hipsycl/cff515c/" + +# ICPX_CXX="/opt/intel/oneapi/compiler/2021.1.2/linux/bin/icpx" +# ICPC_CXX="/opt/intel/oneapi/compiler/2021.1.2/linux/bin/intel64/icpc" + +# TBB_LIB="/home/tom/Downloads/oneapi-tbb-2021.1.1/" + +# GCC_STD_PAR_LIB="tbb" +# CLANG_STD_PAR_LIB="tbb" +# GCC_OMP_OFFLOAD_AMD=false +# GCC_OMP_OFFLOAD_NVIDIA=true +# CLANG_OMP_OFFLOAD_AMD=false +# CLANG_OMP_OFFLOAD_NVIDIA=false ### AMD_ARCH="gfx_903" @@ -140,7 +140,10 @@ build_gcc() { # some distributions like Ubuntu bionic implements std par with TBB, so conditionally link it here run_build $name "${GCC_CXX:?}" STD "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}" run_build $name "${GCC_CXX:?}" STD20 "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}" - run_build $name "${GCC_CXX:?}" TBB "$cxx -DTBB_DIR=$TBB_LIB" + + run_build $name "${GCC_CXX:?}" TBB "$cxx -DONE_TBB_DIR=$TBB_LIB" + run_build $name "${GCC_CXX:?}" TBB "$cxx" # build TBB again with the system TBB + if [ "${GCC_OMP_OFFLOAD_AMD:-false}" != "false" ]; then run_build "amd_$name" "${GCC_CXX:?}" ACC "$cxx -DCXX_EXTRA_FLAGS=-foffload=amdgcn-amdhsa" run_build "amd_$name" "${GCC_CXX:?}" OMP "$cxx -DOFFLOAD=AMD:$AMD_ARCH" @@ -190,7 +193,10 @@ build_clang() { run_build $name "${CLANG_CXX:?}" OCL "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}" run_build $name "${CLANG_CXX:?}" STD "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" # run_build $name "${LANG_CXX:?}" STD20 "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" # not yet supported - run_build $name "${CLANG_CXX:?}" TBB "$cxx -DTBB_DIR=$TBB_LIB" + + run_build $name "${CLANG_CXX:?}" TBB "$cxx -DONE_TBB_DIR=$TBB_LIB" + run_build $name "${CLANG_CXX:?}" TBB "$cxx" # build TBB again with the system TBB + run_build $name "${CLANG_CXX:?}" RAJA "$cxx 
-DRAJA_IN_TREE=${RAJA_SRC:?}" # no clang /w RAJA+cuda because it needs nvcc which needs gcc } From 0867115d8dcc6d6c9b63b56dc3a124f230f626c5 Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Thu, 27 May 2021 10:51:45 +0100 Subject: [PATCH 3/4] Remove references to oneapi/tbb.h --- TBBStream.cpp | 1 - TBBStream.hpp | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/TBBStream.cpp b/TBBStream.cpp index 1d09927..08b83c8 100644 --- a/TBBStream.cpp +++ b/TBBStream.cpp @@ -5,7 +5,6 @@ // source code #include "TBBStream.hpp" -#include "tbb/tbb.h" template TBBStream::TBBStream(const int ARRAY_SIZE, int device) diff --git a/TBBStream.hpp b/TBBStream.hpp index dc87ea6..6ba9741 100644 --- a/TBBStream.hpp +++ b/TBBStream.hpp @@ -8,7 +8,7 @@ #include #include -#include "oneapi/tbb.h" +#include "tbb/tbb.h" #include "Stream.h" #define IMPLEMENTATION_STRING "TBB" From 0e3727d8f853195cb6bc29307159749fa74d9947 Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Thu, 3 Jun 2021 13:43:12 +0100 Subject: [PATCH 4/4] Make partitioner a compile option Inline all abstractions Add intel compilers for Make --- TBB.cmake | 15 ++++++- TBB.make | 40 ++++++++++++++--- TBBStream.cpp | 116 ++++++++++++++++++++------------------------------ TBBStream.hpp | 32 ++++++++------ 4 files changed, 113 insertions(+), 90 deletions(-) diff --git a/TBB.cmake b/TBB.cmake index 99e31f7..e4d6bac 100644 --- a/TBB.cmake +++ b/TBB.cmake @@ -4,15 +4,26 @@ register_flag_optional(ONE_TBB_DIR If unspecified, the system TBB (with header `tbb/tbb.h`) will be used via CMake's find_package(TBB)." "") + +register_flag_optional(PARTITIONER + "Partitioner specifies how a loop template should partition its work among threads. + Possible values are: + AUTO - Optimize range subdivision based on work-stealing events. + AFFINITY - Proportional splitting that optimizes for cache affinity. + STATIC - Distribute work uniformly with no additional load balancing. 
+ SIMPLE - Recursively split its range until it cannot be further subdivided. + See https://spec.oneapi.com/versions/latest/elements/oneTBB/source/algorithms.html#partitioners for more details." + "AUTO") + macro(setup) if(ONE_TBB_DIR) set(TBB_ROOT "${ONE_TBB_DIR}") # see https://github.com/Kitware/VTK/blob/0a31a9a3c1531ae238ac96a372fec4be42282863/CMake/FindTBB.cmake#L34 - # docs on Intel's website refers to TBB_DIR which hasn't been correct for 6 years + # docs on Intel's website refer to TBB_DIR, which is not correct endif() - set(CMAKE_CXX_STANDARD 14) # we use auto in lambda parameters for the different partitioners # see https://github.com/oneapi-src/oneTBB/blob/master/cmake/README.md#tbbconfig---integration-of-binary-packages find_package(TBB REQUIRED) register_link_library(TBB::tbb) + register_definitions(PARTITIONER_${PARTITIONER}) endmacro() diff --git a/TBB.make b/TBB.make index e3b5c86..c224a5a 100644 --- a/TBB.make +++ b/TBB.make @@ -3,24 +3,52 @@ ifndef COMPILER define compiler_help Set COMPILER to change flags (defaulting to GNU). Available compilers are: - GNU + GNU INTEL INTEL_LEGACY endef $(info $(compiler_help)) COMPILER=GNU endif -TBB_LIB= -COMPILER_GNU = g++ +CXX_GNU = g++ +CXX_INTEL = icpx +CXX_INTEL_LEGACY = icpc CXX = $(COMPILER_$(COMPILER)) -FLAGS_GNU = -O3 -std=c++14 -march=native -CXXFLAGS = $(FLAGS_$(COMPILER)) +CXXFLAGS_GNU = -march=native +CXXFLAGS_INTEL = -march=native +CXXFLAGS_INTEL_LEGACY = -qopt-streaming-stores=always + +CXX = $(CXX_$(COMPILER)) +CXXFLAGS = -std=c++11 -O3 $(CXXFLAGS_$(COMPILER)) + + + +ifndef PARTITIONER +define partitioner_help +Set PARTITIONER to select TBB's partitioner. +Partitioner specifies how a loop template should partition its work among threads. + +Available options: + AUTO - Optimize range subdivision based on work-stealing events. + AFFINITY - Proportional splitting that optimizes for cache affinity. + STATIC - Distribute work uniformly with no additional load balancing. 
+ SIMPLE - Recursively split its range until it cannot be further subdivided. + +See https://spec.oneapi.com/versions/latest/elements/oneTBB/source/algorithms.html#partitioners +for more details. + +endef +$(info $(partitioner_help)) +PARTITIONER=AUTO +endif + +PARTITIONER_MODE = -DPARTITIONER_$(PARTITIONER) tbb-stream: main.cpp TBBStream.cpp - $(CXX) -DTBB $(CXXFLAGS) $^ $(EXTRA_FLAGS) -I$(TBB_DIR)/include -Wl,-rpath,$(TBB_DIR)/lib/intel64/gcc4.8 $(TBB_DIR)/lib/intel64/gcc4.8/libtbb.so -o $@ + $(CXX) -DTBB $(PARTITIONER_MODE) $(CXXFLAGS) $^ $(EXTRA_FLAGS) -I$(TBB_DIR)/include -Wl,-rpath,$(TBB_DIR)/lib/intel64/gcc4.8 $(TBB_DIR)/lib/intel64/gcc4.8/libtbb.so -o $@ .PHONY: clean clean: diff --git a/TBBStream.cpp b/TBBStream.cpp index 08b83c8..9c34a50 100644 --- a/TBBStream.cpp +++ b/TBBStream.cpp @@ -8,62 +8,26 @@ template TBBStream::TBBStream(const int ARRAY_SIZE, int device) - : partitioner(static_cast(device)), range(0, ARRAY_SIZE), a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE) + : partitioner(), range(0, ARRAY_SIZE), a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE) { - std::cout << "Using TBB partitioner: " << getDeviceName(device) << std::endl; -} - -template -template -U TBBStream::with_partitioner(const F &f) -{ - switch(partitioner){ - case Partitioner::Auto: return f(tbb::auto_partitioner{}); - case Partitioner::Affinity: { tbb::affinity_partitioner p; return f(p); } // parallel_* doesn't take const affinity_partitioner here - case Partitioner::Static: return f(tbb::static_partitioner{}); - case Partitioner::Simple: return f(tbb::simple_partitioner{}); - default: throw std::runtime_error("Error asking for name for non-existant device"); + if(device != 0){ + throw std::runtime_error("Device != 0 is not supported by TBB"); } + std::cout << "Using TBB partitioner: " PARTITIONER_NAME << std::endl; } -template -template -void TBBStream::parallel_for(const F &f) -{ - // using size_t as per the range type (also used in the official documentation) - 
with_partitioner([&](auto &&p) { - tbb::parallel_for(range, [&](const tbb::blocked_range& r) { - for (size_t i = r.begin(); i < r.end(); ++i) { - f(i); - } - }, p); - return nullptr; // what we really want here is std::monostate, but we don't want to be C++17 only so nullptr_t it is - }); -} - -template -template -T TBBStream::parallel_reduce(T init, const Op &op, const F &f) -{ - return with_partitioner([&](auto &&p) { - return tbb::parallel_reduce(range, init, [&](const tbb::blocked_range& r, T acc) { - for (size_t i = r.begin(); i < r.end(); ++i) { - acc = op(acc, f(i)); - } - return acc; - }, op, p); - }); -} template void TBBStream::init_arrays(T initA, T initB, T initC) { - parallel_for([&](size_t i){ - a[i] = initA; - b[i] = initB; - c[i] = initC; - }); + tbb::parallel_for(range, [&](const tbb::blocked_range& r) { + for (size_t i = r.begin(); i < r.end(); ++i) { + a[i] = initA; + b[i] = initB; + c[i] = initC; + } + }, partitioner); } @@ -79,23 +43,35 @@ void TBBStream::read_arrays(std::vector& h_a, std::vector& h_b, std::ve template void TBBStream::copy() { - parallel_for([&](size_t i){ c[i] = a[i]; }); + tbb::parallel_for(range, [&](const tbb::blocked_range& r) { + for (size_t i = r.begin(); i < r.end(); ++i) { + c[i] = a[i]; + } + }, partitioner); } template void TBBStream::mul() { const T scalar = startScalar; - - parallel_for([&](size_t i){ b[i] = scalar * c[i]; }); - + + tbb::parallel_for(range, [&](const tbb::blocked_range& r) { + for (size_t i = r.begin(); i < r.end(); ++i) { + b[i] = scalar * c[i]; + } + }, partitioner); + } template void TBBStream::add() { - parallel_for([&](size_t i){ c[i] = a[i] + b[i]; }); + tbb::parallel_for(range, [&](const tbb::blocked_range& r) { + for (size_t i = r.begin(); i < r.end(); ++i) { + c[i] = a[i] + b[i]; + } + }, partitioner); } @@ -104,7 +80,11 @@ void TBBStream::triad() { const T scalar = startScalar; - parallel_for([&](size_t i){ a[i] = b[i] + scalar * c[i]; }); + tbb::parallel_for(range, [&](const 
tbb::blocked_range& r) { + for (size_t i = r.begin(); i < r.end(); ++i) { + a[i] = b[i] + scalar * c[i]; + } + }, partitioner); } @@ -113,7 +93,11 @@ void TBBStream::nstream() { const T scalar = startScalar; - parallel_for([&](size_t i){ a[i] += b[i] + scalar * c[i]; }); + tbb::parallel_for(range, [&](const tbb::blocked_range& r) { + for (size_t i = r.begin(); i < r.end(); ++i) { + a[i] += b[i] + scalar * c[i]; + } + }, partitioner); } @@ -121,29 +105,23 @@ template T TBBStream::dot() { // sum += a[i] * b[i]; - return parallel_reduce(0.0, std::plus(), [&](size_t i) { return a[i] * b[i]; }); + return + tbb::parallel_reduce(range, T{}, [&](const tbb::blocked_range& r, T acc) { + for (size_t i = r.begin(); i < r.end(); ++i) { + acc += a[i] * b[i]; + } + return acc; + }, std::plus(), partitioner); } void listDevices(void) { - std::cout - << "[" << static_cast(Partitioner::Auto) << "] auto partitioner\n" - << "[" << static_cast(Partitioner::Affinity) << "] affinity partitioner\n" - << "[" << static_cast(Partitioner::Static) << "] static partitioner\n" - << "[" << static_cast(Partitioner::Simple) << "] simple partitioner\n" - << "See https://spec.oneapi.com/versions/latest/elements/oneTBB/source/algorithms.html#partitioners for more details" - << std::endl; + std::cout << "Listing devices is not supported by TBB" << std::endl; } std::string getDeviceName(const int device) { - switch(static_cast(device)){ - case Partitioner::Auto: return "auto_partitioner"; - case Partitioner::Affinity: return "affinity_partitioner"; - case Partitioner::Static: return "static_partitioner"; - case Partitioner::Simple: return "simple_partitioner"; - default: throw std::runtime_error("Error asking for name for non-existant device"); - } + return std::string("Device name unavailable"); } std::string getDeviceDriver(const int) diff --git a/TBBStream.hpp b/TBBStream.hpp index 6ba9741..90763a9 100644 --- a/TBBStream.hpp +++ b/TBBStream.hpp @@ -13,31 +13,37 @@ #define IMPLEMENTATION_STRING "TBB" 
-enum class Partitioner : int { Auto = 0, Affinity, Static, Simple}; +#if defined(PARTITIONER_AUTO) +using tbb_partitioner = tbb::auto_partitioner; +#define PARTITIONER_NAME "auto_partitioner" +#elif defined(PARTITIONER_AFFINITY) +using tbb_partitioner = tbb::affinity_partitioner; +#define PARTITIONER_NAME "affinity_partitioner" +#elif defined(PARTITIONER_STATIC) +using tbb_partitioner = tbb::static_partitioner; +#define PARTITIONER_NAME "static_partitioner" +#elif defined(PARTITIONER_SIMPLE) +using tbb_partitioner = tbb::simple_partitioner; +#define PARTITIONER_NAME "simple_partitioner" +#else +// default to auto +using tbb_partitioner = tbb::auto_partitioner; +#define PARTITIONER_NAME "auto_partitioner" +#endif + template class TBBStream : public Stream { protected: - - Partitioner partitioner; + tbb_partitioner partitioner; tbb::blocked_range range; // Device side pointers std::vector a; std::vector b; std::vector c; - - - template < typename U, typename F> - U with_partitioner(const F &f); - template - void parallel_for(const F &f); - - template - T parallel_reduce(T init, const Op &op, const F &f); - public: TBBStream(const int, int); ~TBBStream() = default;