Initial TBB implementation

This commit is contained in:
Tom Lin 2021-05-27 09:28:40 +01:00
parent 2ab68ab39e
commit 742f0629be
10 changed files with 281 additions and 2 deletions

1
.gitignore vendored
View File

@ -8,6 +8,7 @@ kokkos-stream
std-stream
sycl-stream
hip-stream
tbb-stream
*.o
*.bc

View File

@ -112,6 +112,7 @@ register_model(SYCL SYCL SYCLStream.cpp)
register_model(ACC ACC ACCStream.cpp)
# defining RAJA collides with the RAJA namespace so USE_RAJA
register_model(RAJA USE_RAJA RAJAStream.cpp)
register_model(TBB TBB TBBStream.cpp)
set(USAGE ON CACHE BOOL "Whether to print all custom flags for the selected model")

View File

@ -19,6 +19,7 @@ Currently implemented are:
- Kokkos
- RAJA
- SYCL
- TBB
This code was previously called GPU-STREAM.
@ -90,7 +91,7 @@ For example:
Use this for linking extra libraries (e.g `-lmylib`, or simply `mylib`)
-- CXX_EXTRA_LINKER_FLAGS:
Append to linker flags (i.e GCC's `-Wl` or equivalent)
-- Available models: OMP;OCL;STD;STD20;HIP;CUDA;KOKKOS;SYCL;ACC;RAJA
-- Available models: OMP;OCL;STD;STD20;HIP;CUDA;KOKKOS;SYCL;ACC;RAJA;TBB
-- Selected model : OCL
-- Supported flags:

10
TBB.cmake Normal file
View File

@ -0,0 +1,10 @@
# TBB_DIR must be supplied by the user when configuring the TBB model
# (the CI scripts pass it as -DTBB_DIR=<oneTBB distribution root>).
register_flag_required(TBB_DIR
        "Absolute path to oneTBB distribution, the directory should contain at least `include/` and `lib/`")
# setup(): configures the build for the TBB model. It pins the C++ standard,
# locates the TBB package and registers its imported target for linking.
# NOTE(review): presumably invoked by the top-level CMakeLists once this model is selected; confirm.
macro(setup)
set(CMAKE_CXX_STANDARD 14) # generic lambdas (auto parameters) are used to dispatch over the different partitioners
# see https://github.com/oneapi-src/oneTBB/blob/master/cmake/README.md#tbbconfig---integration-of-binary-packages
# REQUIRED stops configuration with an error if the TBB package cannot be found
find_package(TBB REQUIRED)
# register_link_library is a helper defined outside this file; it receives the imported TBB::tbb target
register_link_library(TBB::tbb)
endmacro()

28
TBB.make Normal file
View File

@ -0,0 +1,28 @@
# When COMPILER is not supplied, print the list of supported values at parse
# time (via $(info ...)) and fall back to GNU. Everything between define/endef
# is emitted verbatim, so no comments may be placed inside it.
ifndef COMPILER
define compiler_help
Set COMPILER to change flags (defaulting to GNU).
Available compilers are:
GNU
endef
$(info $(compiler_help))
COMPILER=GNU
endif
# Root of the oneTBB distribution (the directory holding include/ and lib/).
# This is the variable the link rule below actually reads; `?=` keeps any value
# provided through the environment or the make command line, e.g.
#   make -f TBB.make TBB_DIR=/path/to/oneapi-tbb-2021.2.0
TBB_DIR ?=

# Compiler executable, selected through the COMPILER key.
COMPILER_GNU = g++
CXX = $(COMPILER_$(COMPILER))

# Compiler flags, selected through the COMPILER key.
# -std=c++14 is required for the generic lambdas used in TBBStream.cpp.
FLAGS_GNU = -O3 -std=c++14 -march=native
CXXFLAGS = $(FLAGS_$(COMPILER))

# Build the benchmark and link it against the libtbb.so shipped in the
# distribution; the rpath lets the binary find that library at run time.
# The first recipe line aborts with a clear message when TBB_DIR is empty,
# instead of letting the compiler fail on /include and /lib/... paths.
tbb-stream: main.cpp TBBStream.cpp
	$(if $(strip $(TBB_DIR)),,$(error TBB_DIR is not set; point it at the oneTBB distribution root))
	$(CXX) -DTBB $(CXXFLAGS) $^ $(EXTRA_FLAGS) -I$(TBB_DIR)/include -Wl,-rpath,$(TBB_DIR)/lib/intel64/gcc4.8 $(TBB_DIR)/lib/intel64/gcc4.8/libtbb.so -o $@

# Remove the built binary; does not need TBB_DIR.
.PHONY: clean
clean:
	rm -f tbb-stream

157
TBBStream.cpp Normal file
View File

@ -0,0 +1,157 @@
// Copyright (c) 2020 Tom Deakin
// University of Bristol HPC
//
// For full license terms please see the LICENSE file distributed with this
// source code
#include "TBBStream.hpp"
#include "oneapi/tbb.h"
// Constructs the stream: the benchmark's device index is reinterpreted as the
// Partitioner to use, the iteration range covers [0, ARRAY_SIZE) and the three
// arrays are allocated with ARRAY_SIZE elements each.
template <class T>
TBBStream<T>::TBBStream(const int ARRAY_SIZE, int device)
  : partitioner(static_cast<Partitioner>(device)),
    range(0, ARRAY_SIZE),
    a(ARRAY_SIZE),
    b(ARRAY_SIZE),
    c(ARRAY_SIZE)
{
  // getDeviceName throws std::runtime_error if `device` is not a known partitioner
  std::cout << "Using TBB partitioner: " << getDeviceName(device) << std::endl;
}
// Invokes `f` with a partitioner object matching the configured `partitioner`
// and returns whatever `f` returns (as U).
//
// `f` must be callable with every partitioner type (hence the generic lambdas
// at the call sites). Throws std::runtime_error if `partitioner` holds a value
// outside the Partitioner enumerators.
template <class T>
template <typename U, typename F>
U TBBStream<T>::with_partitioner(const F &f)
{
  switch (partitioner) {
    case Partitioner::Auto:
      return f(tbb::auto_partitioner{});
    case Partitioner::Affinity: {
      // parallel_* doesn't take a const affinity_partitioner, so a named lvalue is needed.
      // NOTE(review): a fresh affinity_partitioner is created on every call, so nothing
      // it records can carry over between kernel invocations; confirm whether it should
      // live as long as the stream instead.
      tbb::affinity_partitioner p;
      return f(p);
    }
    case Partitioner::Static:
      return f(tbb::static_partitioner{});
    case Partitioner::Simple:
      return f(tbb::simple_partitioner{});
    default:
      // The previous message was copied from getDeviceName and talked about a device name.
      throw std::runtime_error("Error: unknown TBB partitioner selected");
  }
}
// Runs f(i) for every index i of `range` through tbb::parallel_for, using the
// partitioner selected at construction.
template <class T>
template <typename F>
void TBBStream<T>::parallel_for(const F &f)
{
  // Body applied to each sub-range handed out by TBB.
  // size_t matches the range's value type (also used in the official documentation).
  const auto chunkBody = [&](const tbb::blocked_range<size_t>& chunk) {
    for (size_t idx = chunk.begin(); idx < chunk.end(); ++idx) {
      f(idx);
    }
  };

  with_partitioner<std::nullptr_t>([&](auto &&part) {
    tbb::parallel_for(range, chunkBody, part);
    // with_partitioner needs a return value; std::monostate would be the natural
    // choice but is C++17-only, so nullptr_t is used instead.
    return nullptr;
  });
}
// Reduces f(i) over every index i of `range` through tbb::parallel_reduce.
// `init` is handed to TBB as the starting value of each partial result and `op`
// is used both to fold elements inside a sub-range and to combine partial results.
template <class T>
template <typename F, typename Op>
T TBBStream<T>::parallel_reduce(T init, const Op &op, const F &f)
{
  // Folds one sub-range into the partial result received from TBB.
  const auto foldChunk = [&](const tbb::blocked_range<size_t>& chunk, T partial) {
    for (size_t idx = chunk.begin(); idx < chunk.end(); ++idx) {
      partial = op(partial, f(idx));
    }
    return partial;
  };

  return with_partitioner<T>([&](auto &&part) {
    return tbb::parallel_reduce(range, init, foldChunk, op, part);
  });
}
// Fills a, b and c with initA, initB and initC respectively, in parallel.
template <class T>
void TBBStream<T>::init_arrays(T initA, T initB, T initC)
{
  const auto fill = [&](size_t idx) {
    a[idx] = initA;
    b[idx] = initB;
    c[idx] = initC;
  };
  parallel_for(fill);
}
// Copies the contents of a, b and c into the caller's vectors, replacing
// whatever they held before.
template <class T>
void TBBStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
{
  h_a.assign(a.begin(), a.end());
  h_b.assign(b.begin(), b.end());
  h_c.assign(c.begin(), c.end());
}
// Copy kernel: c[i] = a[i] for every index of the range.
template <class T>
void TBBStream<T>::copy()
{
  const auto kernel = [&](size_t idx) { c[idx] = a[idx]; };
  parallel_for(kernel);
}
// Mul kernel: b[i] = startScalar * c[i] for every index of the range.
template <class T>
void TBBStream<T>::mul()
{
  const T scalar = startScalar;
  const auto kernel = [&](size_t idx) { b[idx] = scalar * c[idx]; };
  parallel_for(kernel);
}
// Add kernel: c[i] = a[i] + b[i] for every index of the range.
template <class T>
void TBBStream<T>::add()
{
  const auto kernel = [&](size_t idx) { c[idx] = a[idx] + b[idx]; };
  parallel_for(kernel);
}
// Triad kernel: a[i] = b[i] + startScalar * c[i] for every index of the range.
template <class T>
void TBBStream<T>::triad()
{
  const T scalar = startScalar;
  const auto kernel = [&](size_t idx) { a[idx] = b[idx] + scalar * c[idx]; };
  parallel_for(kernel);
}
// Nstream kernel: a[i] += b[i] + startScalar * c[i] for every index of the range.
template <class T>
void TBBStream<T>::nstream()
{
  const T scalar = startScalar;
  const auto kernel = [&](size_t idx) { a[idx] += b[idx] + scalar * c[idx]; };
  parallel_for(kernel);
}
// Dot kernel: returns the sum over all indices of a[i] * b[i], starting from 0.0
// and combining terms with std::plus<T>.
template <class T>
T TBBStream<T>::dot()
{
  const auto product = [&](size_t idx) { return a[idx] * b[idx]; };
  return parallel_reduce(0.0, std::plus<T>(), product);
}
// Prints one line per available partitioner, prefixed with the index that
// selects it, followed by a pointer to the oneTBB partitioner documentation.
void listDevices(void)
{
  std::cout << "[" << static_cast<int>(Partitioner::Auto) << "] auto partitioner\n";
  std::cout << "[" << static_cast<int>(Partitioner::Affinity) << "] affinity partitioner\n";
  std::cout << "[" << static_cast<int>(Partitioner::Static) << "] static partitioner\n";
  std::cout << "[" << static_cast<int>(Partitioner::Simple) << "] simple partitioner\n";
  std::cout << "See https://spec.oneapi.com/versions/latest/elements/oneTBB/source/algorithms.html#partitioners for more details"
            << std::endl;
}
std::string getDeviceName(const int device)
{
switch(static_cast<Partitioner>(device)){
case Partitioner::Auto: return "auto_partitioner";
case Partitioner::Affinity: return "affinity_partitioner";
case Partitioner::Static: return "static_partitioner";
case Partitioner::Simple: return "simple_partitioner";
default: throw std::runtime_error("Error asking for name for non-existant device");
}
}
// Returns a fixed placeholder string; the device index argument is ignored.
std::string getDeviceDriver(const int)
{
  return "Device driver unavailable";
}
template class TBBStream<float>;
template class TBBStream<double>;

56
TBBStream.hpp Normal file
View File

@ -0,0 +1,56 @@
// Copyright (c) 2020 Tom Deakin
// University of Bristol HPC
//
// For full license terms please see the LICENSE file distributed with this
// source code
#pragma once
#include <iostream>
#include <vector>
#include "oneapi/tbb.h"
#include "Stream.h"
#define IMPLEMENTATION_STRING "TBB"
// Partitioning strategies selectable through the benchmark's device index;
// the numeric value of each enumerator is the index that selects it.
enum class Partitioner : int {
  Auto = 0,
  Affinity = 1,
  Static = 2,
  Simple = 3
};
// Stream implementation whose kernels run through oneTBB's parallel algorithms.
// The "device" index of the benchmark is used to choose the TBB partitioner.
template <class T>
class TBBStream : public Stream<T>
{
  protected:
    // Members are declared in the order the constructor initialises them.

    // Partitioner chosen at construction
    Partitioner partitioner;
    // Index range every kernel iterates over
    tbb::blocked_range<size_t> range;

    // Benchmark arrays (plain std::vector storage)
    std::vector<T> a;
    std::vector<T> b;
    std::vector<T> c;

    // Calls f with a partitioner object matching `partitioner`, returning f's result as U
    template <typename U, typename F>
    U with_partitioner(const F &f);

    // Applies f to every index of `range`
    template <typename F>
    void parallel_for(const F &f);

    // Reduces f over every index of `range`, combining with op starting from init
    template <typename F, typename Op>
    T parallel_reduce(T init, const Op &op, const F &f);

  public:
    // Arguments: array size, then the device (partitioner) index
    TBBStream(const int, int);
    ~TBBStream() = default;

    // Benchmark kernels
    void copy() override;
    void add() override;
    void mul() override;
    void triad() override;
    void nstream() override;
    T dot() override;

    // Array initialisation and read-back
    void init_arrays(T initA, T initB, T initC) override;
    void read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c) override;
};

View File

@ -208,6 +208,20 @@ setup_raja() {
check_size
}
# Fetches and unpacks the oneTBB Linux release tarball, then publishes the
# unpacked directory as TBB_LIB and checks that it exists.
setup_tbb() {
  echo "Preparing TBB"

  local tbb_ver="2021.2.0"
  local tbb_root="oneapi-tbb-${tbb_ver}"
  local tarball="${tbb_root}-lin.tgz"
  local url="https://github.com/oneapi-src/oneTBB/releases/download/v${tbb_ver}/${tarball}"
  # local url="http://localhost:8000/${tarball}"

  get_and_untar "$tarball" "$url"
  export_var TBB_LIB "$PWD/${tbb_root}"
  verify_dir_exists "$TBB_LIB"
  check_size
}
setup_clang_gcc() {
echo "deb http://archive.ubuntu.com/ubuntu focal main universe" | sudo tee -a /etc/apt/sources.list
@ -354,6 +368,7 @@ if [ "$PARALLEL" = true ]; then
setup_dpcpp &
setup_kokkos &
setup_raja &
setup_tbb &
wait
else
setup_cmake
@ -364,6 +379,7 @@ else
setup_dpcpp
setup_kokkos
setup_raja
setup_tbb
# these need apt
setup_clang_gcc
setup_rocm

View File

@ -112,6 +112,8 @@ run_build() {
#ICPX_CXX="/opt/intel/oneapi/compiler/2021.1.2/linux/bin/icpx"
#ICPC_CXX="/opt/intel/oneapi/compiler/2021.1.2/linux/bin/intel64/icpc"
#
#TBB_LIB="/home/tom/Downloads/oneapi-tbb-2021.1.1/"
#
#GCC_STD_PAR_LIB="tbb"
#CLANG_STD_PAR_LIB="tbb"
#GCC_OMP_OFFLOAD_AMD=false
@ -138,7 +140,7 @@ build_gcc() {
# some distributions like Ubuntu bionic implements std par with TBB, so conditionally link it here
run_build $name "${GCC_CXX:?}" STD "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}"
run_build $name "${GCC_CXX:?}" STD20 "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}"
run_build $name "${GCC_CXX:?}" TBB "$cxx -DTBB_DIR=$TBB_LIB"
if [ "${GCC_OMP_OFFLOAD_AMD:-false}" != "false" ]; then
run_build "amd_$name" "${GCC_CXX:?}" ACC "$cxx -DCXX_EXTRA_FLAGS=-foffload=amdgcn-amdhsa"
run_build "amd_$name" "${GCC_CXX:?}" OMP "$cxx -DOFFLOAD=AMD:$AMD_ARCH"
@ -188,6 +190,7 @@ build_clang() {
run_build $name "${CLANG_CXX:?}" OCL "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}"
run_build $name "${CLANG_CXX:?}" STD "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}"
# run_build $name "${CLANG_CXX:?}" STD20 "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" # not yet supported
run_build $name "${CLANG_CXX:?}" TBB "$cxx -DTBB_DIR=$TBB_LIB"
run_build $name "${CLANG_CXX:?}" RAJA "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?}"
# no clang /w RAJA+cuda because it needs nvcc which needs gcc
}

View File

@ -25,6 +25,8 @@
#include "STDStream.h"
#elif defined(STD20)
#include "STD20Stream.hpp"
#elif defined(TBB)
#include "TBBStream.hpp"
#elif defined(HIP)
#include "HIPStream.h"
#elif defined(HC)
@ -266,6 +268,10 @@ void run()
// Use the C++20 implementation
stream = new STD20Stream<T>(ARRAY_SIZE, deviceIndex);
#elif defined(TBB)
// Use the TBB implementation
stream = new TBBStream<T>(ARRAY_SIZE, deviceIndex);
#elif defined(ACC)
// Use the OpenACC implementation
stream = new ACCStream<T>(ARRAY_SIZE, deviceIndex);