From 742f0629be70c143e05df4ecf73bcf71bba150a7 Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Thu, 27 May 2021 09:28:40 +0100 Subject: [PATCH] Initial TBB implementation --- .gitignore | 1 + CMakeLists.txt | 1 + README.md | 3 +- TBB.cmake | 10 +++ TBB.make | 28 ++++++++ TBBStream.cpp | 157 +++++++++++++++++++++++++++++++++++++++++++ TBBStream.hpp | 56 +++++++++++++++ ci-prepare-bionic.sh | 16 +++++ ci-test-compile.sh | 5 +- main.cpp | 6 ++ 10 files changed, 281 insertions(+), 2 deletions(-) create mode 100644 TBB.cmake create mode 100644 TBB.make create mode 100644 TBBStream.cpp create mode 100644 TBBStream.hpp diff --git a/.gitignore b/.gitignore index c3ea1da..31af301 100644 --- a/.gitignore +++ b/.gitignore @@ -8,6 +8,7 @@ kokkos-stream std-stream sycl-stream hip-stream +tbb-stream *.o *.bc diff --git a/CMakeLists.txt b/CMakeLists.txt index d4a11cd..797a9c0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -112,6 +112,7 @@ register_model(SYCL SYCL SYCLStream.cpp) register_model(ACC ACC ACCStream.cpp) # defining RAJA collides with the RAJA namespace so USE_RAJA register_model(RAJA USE_RAJA RAJAStream.cpp) +register_model(TBB TBB TBBStream.cpp) set(USAGE ON CACHE BOOL "Whether to print all custom flags for the selected model") diff --git a/README.md b/README.md index 8ca7398..68908a3 100644 --- a/README.md +++ b/README.md @@ -19,6 +19,7 @@ Currently implemented are: - Kokkos - RAJA - SYCL + - TBB This code was previously called GPU-STREAM. 
@@ -90,7 +91,7 @@ For example: Use this for linking extra libraries (e.g `-lmylib`, or simply `mylib`) -- CXX_EXTRA_LINKER_FLAGS: Append to linker flags (i.e GCC's `-Wl` or equivalent) --- Available models: OMP;OCL;STD;STD20;HIP;CUDA;KOKKOS;SYCL;ACC;RAJA +-- Available models: OMP;OCL;STD;STD20;HIP;CUDA;KOKKOS;SYCL;ACC;RAJA;TBB -- Selected model : OCL -- Supported flags: diff --git a/TBB.cmake b/TBB.cmake new file mode 100644 index 0000000..a92ea82 --- /dev/null +++ b/TBB.cmake @@ -0,0 +1,10 @@ + +register_flag_required(TBB_DIR + "Absolute path to oneTBB distribution, the directory should contains at least `include/` and `lib/") + +macro(setup) + set(CMAKE_CXX_STANDARD 14) # we use auto in lambda parameters for the different partitioners + # see https://github.com/oneapi-src/oneTBB/blob/master/cmake/README.md#tbbconfig---integration-of-binary-packages + find_package(TBB REQUIRED) + register_link_library(TBB::tbb) +endmacro() diff --git a/TBB.make b/TBB.make new file mode 100644 index 0000000..e3b5c86 --- /dev/null +++ b/TBB.make @@ -0,0 +1,28 @@ + +ifndef COMPILER +define compiler_help +Set COMPILER to change flags (defaulting to GNU). 
+Available compilers are: + GNU + +endef +$(info $(compiler_help)) +COMPILER=GNU +endif + +TBB_LIB= + +COMPILER_GNU = g++ +CXX = $(COMPILER_$(COMPILER)) + +FLAGS_GNU = -O3 -std=c++14 -march=native +CXXFLAGS = $(FLAGS_$(COMPILER)) + + +tbb-stream: main.cpp TBBStream.cpp + $(CXX) -DTBB $(CXXFLAGS) $^ $(EXTRA_FLAGS) -I$(TBB_DIR)/include -Wl,-rpath,$(TBB_DIR)/lib/intel64/gcc4.8 $(TBB_DIR)/lib/intel64/gcc4.8/libtbb.so -o $@ + +.PHONY: clean +clean: + rm -f tbb-stream + diff --git a/TBBStream.cpp b/TBBStream.cpp new file mode 100644 index 0000000..4201796 --- /dev/null +++ b/TBBStream.cpp @@ -0,0 +1,157 @@ +// Copyright (c) 2020 Tom Deakin +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#include "TBBStream.hpp" +#include "oneapi/tbb.h" + +template +TBBStream::TBBStream(const int ARRAY_SIZE, int device) + : partitioner(static_cast(device)), range(0, ARRAY_SIZE), a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE) +{ + std::cout << "Using TBB partitioner: " << getDeviceName(device) << std::endl; +} + +template +template +U TBBStream::with_partitioner(const F &f) +{ + switch(partitioner){ + case Partitioner::Auto: return f(tbb::auto_partitioner{}); + case Partitioner::Affinity: { tbb::affinity_partitioner p; return f(p); } // parallel_* doesn't take const affinity_partitioner here + case Partitioner::Static: return f(tbb::static_partitioner{}); + case Partitioner::Simple: return f(tbb::simple_partitioner{}); + default: throw std::runtime_error("Error asking for name for non-existant device"); + } +} + +template +template +void TBBStream::parallel_for(const F &f) +{ + // using size_t as per the range type (also used in the official documentation) + with_partitioner([&](auto &&p) { + tbb::parallel_for(range, [&](const tbb::blocked_range& r) { + for (size_t i = r.begin(); i < r.end(); ++i) { + f(i); + } + }, p); + return nullptr; // what we really want here is std::monostate, but we don't want to be 
C++17 only so nullptr_t it is + }); +} + +template +template +T TBBStream::parallel_reduce(T init, const Op &op, const F &f) +{ + return with_partitioner([&](auto &&p) { + return tbb::parallel_reduce(range, init, [&](const tbb::blocked_range& r, T acc) { + for (size_t i = r.begin(); i < r.end(); ++i) { + acc = op(acc, f(i)); + } + return acc; + }, op, p); + }); +} + +template +void TBBStream::init_arrays(T initA, T initB, T initC) +{ + + parallel_for([&](size_t i){ + a[i] = initA; + b[i] = initB; + c[i] = initC; + }); + +} + +template +void TBBStream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) +{ + // Element-wise copy. + h_a = a; + h_b = b; + h_c = c; +} + +template +void TBBStream::copy() +{ + parallel_for([&](size_t i){ c[i] = a[i]; }); +} + +template +void TBBStream::mul() +{ + const T scalar = startScalar; + + parallel_for([&](size_t i){ b[i] = scalar * c[i]; }); + +} + +template +void TBBStream::add() +{ + + parallel_for([&](size_t i){ c[i] = a[i] + b[i]; }); + +} + +template +void TBBStream::triad() +{ + const T scalar = startScalar; + + parallel_for([&](size_t i){ a[i] = b[i] + scalar * c[i]; }); + +} + +template +void TBBStream::nstream() +{ + const T scalar = startScalar; + + parallel_for([&](size_t i){ a[i] += b[i] + scalar * c[i]; }); + +} + +template +T TBBStream::dot() +{ + // sum += a[i] * b[i]; + return parallel_reduce(0.0, std::plus(), [&](size_t i) { return a[i] * b[i]; }); +} + +void listDevices(void) +{ + std::cout + << "[" << static_cast(Partitioner::Auto) << "] auto partitioner\n" + << "[" << static_cast(Partitioner::Affinity) << "] affinity partitioner\n" + << "[" << static_cast(Partitioner::Static) << "] static partitioner\n" + << "[" << static_cast(Partitioner::Simple) << "] simple partitioner\n" + << "See https://spec.oneapi.com/versions/latest/elements/oneTBB/source/algorithms.html#partitioners for more details" + << std::endl; +} + +std::string getDeviceName(const int device) +{ + switch(static_cast(device)){ + 
case Partitioner::Auto: return "auto_partitioner"; + case Partitioner::Affinity: return "affinity_partitioner"; + case Partitioner::Static: return "static_partitioner"; + case Partitioner::Simple: return "simple_partitioner"; + default: throw std::runtime_error("Error asking for name for non-existant device"); + } +} + +std::string getDeviceDriver(const int) +{ + return std::string("Device driver unavailable"); +} + +template class TBBStream; +template class TBBStream; + diff --git a/TBBStream.hpp b/TBBStream.hpp new file mode 100644 index 0000000..dc87ea6 --- /dev/null +++ b/TBBStream.hpp @@ -0,0 +1,56 @@ +// Copyright (c) 2020 Tom Deakin +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#pragma once + +#include +#include +#include "oneapi/tbb.h" +#include "Stream.h" + +#define IMPLEMENTATION_STRING "TBB" + +enum class Partitioner : int { Auto = 0, Affinity, Static, Simple}; + +template +class TBBStream : public Stream +{ + protected: + + + Partitioner partitioner; + tbb::blocked_range range; + // Array storage (std::vector, not raw device pointers) + std::vector a; + std::vector b; + std::vector c; + + + template < typename U, typename F> + U with_partitioner(const F &f); + + template + void parallel_for(const F &f); + + template + T parallel_reduce(T init, const Op &op, const F &f); + + public: + TBBStream(const int, int); + ~TBBStream() = default; + + virtual void copy() override; + virtual void add() override; + virtual void mul() override; + virtual void triad() override; + virtual void nstream() override; + virtual T dot() override; + + virtual void init_arrays(T initA, T initB, T initC) override; + virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; + +}; + diff --git a/ci-prepare-bionic.sh b/ci-prepare-bionic.sh index 290e87b..fb69c05 100755 --- a/ci-prepare-bionic.sh +++ b/ci-prepare-bionic.sh @@ -208,6 +208,20 @@ setup_raja() { check_size } +setup_tbb() { + echo "Preparing 
TBB" + local tbb_ver="2021.2.0" + local tarball="oneapi-tbb-$tbb_ver-lin.tgz" + + local url="https://github.com/oneapi-src/oneTBB/releases/download/v$tbb_ver/oneapi-tbb-$tbb_ver-lin.tgz" + # local url="http://localhost:8000/oneapi-tbb-$tbb_ver-lin.tgz" + + get_and_untar "$tarball" "$url" + export_var TBB_LIB "$PWD/oneapi-tbb-$tbb_ver" + verify_dir_exists "$TBB_LIB" + check_size +} + setup_clang_gcc() { echo "deb http://archive.ubuntu.com/ubuntu focal main universe" | sudo tee -a /etc/apt/sources.list @@ -354,6 +368,7 @@ if [ "$PARALLEL" = true ]; then setup_dpcpp & setup_kokkos & setup_raja & + setup_tbb & wait else setup_cmake @@ -364,6 +379,7 @@ else setup_dpcpp setup_kokkos setup_raja + setup_tbb # these need apt setup_clang_gcc setup_rocm diff --git a/ci-test-compile.sh b/ci-test-compile.sh index 1b5c1bb..24a7091 100755 --- a/ci-test-compile.sh +++ b/ci-test-compile.sh @@ -112,6 +112,8 @@ run_build() { #ICPX_CXX="/opt/intel/oneapi/compiler/2021.1.2/linux/bin/icpx" #ICPC_CXX="/opt/intel/oneapi/compiler/2021.1.2/linux/bin/intel64/icpc" # +#TBB_LIB="/home/tom/Downloads/oneapi-tbb-2021.1.1/" +# #GCC_STD_PAR_LIB="tbb" #CLANG_STD_PAR_LIB="tbb" #GCC_OMP_OFFLOAD_AMD=false @@ -138,7 +140,7 @@ build_gcc() { # some distributions like Ubuntu bionic implements std par with TBB, so conditionally link it here run_build $name "${GCC_CXX:?}" STD "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}" run_build $name "${GCC_CXX:?}" STD20 "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}" - + run_build $name "${GCC_CXX:?}" TBB "$cxx -DTBB_DIR=$TBB_LIB" if [ "${GCC_OMP_OFFLOAD_AMD:-false}" != "false" ]; then run_build "amd_$name" "${GCC_CXX:?}" ACC "$cxx -DCXX_EXTRA_FLAGS=-foffload=amdgcn-amdhsa" run_build "amd_$name" "${GCC_CXX:?}" OMP "$cxx -DOFFLOAD=AMD:$AMD_ARCH" @@ -188,6 +190,7 @@ build_clang() { run_build $name "${CLANG_CXX:?}" OCL "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}" run_build $name "${CLANG_CXX:?}" STD "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" # run_build $name 
"${LANG_CXX:?}" STD20 "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" # not yet supported + run_build $name "${CLANG_CXX:?}" TBB "$cxx -DTBB_DIR=$TBB_LIB" run_build $name "${CLANG_CXX:?}" RAJA "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?}" # no clang /w RAJA+cuda because it needs nvcc which needs gcc } diff --git a/main.cpp b/main.cpp index e78d7a1..de301ce 100644 --- a/main.cpp +++ b/main.cpp @@ -25,6 +25,8 @@ #include "STDStream.h" #elif defined(STD20) #include "STD20Stream.hpp" +#elif defined(TBB) +#include "TBBStream.hpp" #elif defined(HIP) #include "HIPStream.h" #elif defined(HC) @@ -266,6 +268,10 @@ void run() // Use the C++20 implementation stream = new STD20Stream(ARRAY_SIZE, deviceIndex); +#elif defined(TBB) + // Use the TBB implementation + stream = new TBBStream(ARRAY_SIZE, deviceIndex); + #elif defined(ACC) // Use the OpenACC implementation stream = new ACCStream(ARRAY_SIZE, deviceIndex);