Initial TBB implementation

This commit is contained in:
Tom Lin 2021-05-27 09:28:40 +01:00
parent 2ab68ab39e
commit 742f0629be
10 changed files with 281 additions and 2 deletions

1
.gitignore vendored
View File

@ -8,6 +8,7 @@ kokkos-stream
std-stream std-stream
sycl-stream sycl-stream
hip-stream hip-stream
tbb-stream
*.o *.o
*.bc *.bc

View File

@ -112,6 +112,7 @@ register_model(SYCL SYCL SYCLStream.cpp)
register_model(ACC ACC ACCStream.cpp) register_model(ACC ACC ACCStream.cpp)
# defining RAJA collides with the RAJA namespace so USE_RAJA # defining RAJA collides with the RAJA namespace so USE_RAJA
register_model(RAJA USE_RAJA RAJAStream.cpp) register_model(RAJA USE_RAJA RAJAStream.cpp)
register_model(TBB TBB TBBStream.cpp)
set(USAGE ON CACHE BOOL "Whether to print all custom flags for the selected model") set(USAGE ON CACHE BOOL "Whether to print all custom flags for the selected model")

View File

@ -19,6 +19,7 @@ Currently implemented are:
- Kokkos - Kokkos
- RAJA - RAJA
- SYCL - SYCL
- TBB
This code was previously called GPU-STREAM. This code was previously called GPU-STREAM.
@ -90,7 +91,7 @@ For example:
Use this for linking extra libraries (e.g `-lmylib`, or simply `mylib`) Use this for linking extra libraries (e.g `-lmylib`, or simply `mylib`)
-- CXX_EXTRA_LINKER_FLAGS: -- CXX_EXTRA_LINKER_FLAGS:
Append to linker flags (i.e GCC's `-Wl` or equivalent) Append to linker flags (i.e GCC's `-Wl` or equivalent)
-- Available models: OMP;OCL;STD;STD20;HIP;CUDA;KOKKOS;SYCL;ACC;RAJA -- Available models: OMP;OCL;STD;STD20;HIP;CUDA;KOKKOS;SYCL;ACC;RAJA;TBB
-- Selected model : OCL -- Selected model : OCL
-- Supported flags: -- Supported flags:

10
TBB.cmake Normal file
View File

@ -0,0 +1,10 @@
register_flag_required(TBB_DIR
        "Absolute path to oneTBB distribution, the directory should contain at least `include/` and `lib/`")

# Configures the TBB model: selects the C++ standard, locates oneTBB and links TBB::tbb.
macro(setup)
    set(CMAKE_CXX_STANDARD 14) # generic lambdas (auto parameters) are used to dispatch over the partitioner types
    # find_package() interprets TBB_DIR as the directory that holds TBBConfig.cmake and silently
    # ignores it otherwise. TBB_DIR is documented above as the distribution root, so also offer it
    # as a search prefix; the config shipped under <root>/lib/cmake/tbb is then found either way.
    list(APPEND CMAKE_PREFIX_PATH "${TBB_DIR}")
    # see https://github.com/oneapi-src/oneTBB/blob/master/cmake/README.md#tbbconfig---integration-of-binary-packages
    find_package(TBB REQUIRED)
    register_link_library(TBB::tbb)
endmacro()

28
TBB.make Normal file
View File

@ -0,0 +1,28 @@
# Makefile for the TBB implementation. Builds `tbb-stream` from main.cpp and TBBStream.cpp.
ifndef COMPILER
define compiler_help
Set COMPILER to change flags (defaulting to GNU).
Available compilers are:
GNU
endef
$(info $(compiler_help))
COMPILER=GNU
endif

# Root of the oneTBB distribution; the recipe below expects include/ and lib/intel64/gcc4.8/ under it.
# The recipe reads TBB_DIR. TBB_LIB is accepted as an alias so that supplying either name works.
TBB_LIB ?=
TBB_DIR ?= $(TBB_LIB)

COMPILER_GNU = g++
CXX = $(COMPILER_$(COMPILER))

FLAGS_GNU = -O3 -std=c++14 -march=native
CXXFLAGS = $(FLAGS_$(COMPILER))

# Link directly against the distribution's libtbb.so and record its directory as an rpath.
tbb-stream: main.cpp TBBStream.cpp
	$(CXX) -DTBB $(CXXFLAGS) $^ $(EXTRA_FLAGS) -I$(TBB_DIR)/include -Wl,-rpath,$(TBB_DIR)/lib/intel64/gcc4.8 $(TBB_DIR)/lib/intel64/gcc4.8/libtbb.so -o $@

.PHONY: clean
clean:
	rm -f tbb-stream

157
TBBStream.cpp Normal file
View File

@ -0,0 +1,157 @@
// Copyright (c) 2020 Tom Deakin
// University of Bristol HPC
//
// For full license terms please see the LICENSE file distributed with this
// source code
#include "TBBStream.hpp"
#include "oneapi/tbb.h"
/**
 * Builds a stream whose three arrays hold ARRAY_SIZE elements each.
 *
 * @param ARRAY_SIZE number of elements per array; also the upper bound of the iteration range
 * @param device     integer value of the Partitioner to use (the "device" index selects a partitioner)
 */
template <class T>
TBBStream<T>::TBBStream(const int ARRAY_SIZE, int device)
  : partitioner(static_cast<Partitioner>(device)),
    range(0, ARRAY_SIZE),
    a(ARRAY_SIZE),
    b(ARRAY_SIZE),
    c(ARRAY_SIZE)
{
  // getDeviceName throws for a value that is not a known partitioner
  std::cout << "Using TBB partitioner: " << getDeviceName(device);
  std::cout << std::endl;
}
/**
 * Calls `f` with a TBB partitioner object matching the `partitioner` member and returns its result.
 *
 * @tparam U result type returned by `f`
 * @tparam F callable that accepts any of the four TBB partitioner types
 * @param f  callable to invoke with the selected partitioner
 * @return whatever `f` returns
 * @throws std::runtime_error if `partitioner` holds a value outside the Partitioner enum
 */
template <class T>
template <typename U, typename F>
U TBBStream<T>::with_partitioner(const F &f)
{
  switch(partitioner){
    case Partitioner::Auto: return f(tbb::auto_partitioner{});
    // parallel_* doesn't take a const affinity_partitioner, so it is handed over as a named lvalue.
    // NOTE(review): a fresh affinity_partitioner is built on every call, so nothing it records is
    // carried from one kernel invocation to the next - confirm this is intended.
    case Partitioner::Affinity: { tbb::affinity_partitioner p; return f(p); }
    case Partitioner::Static: return f(tbb::static_partitioner{});
    case Partitioner::Simple: return f(tbb::simple_partitioner{});
    default: throw std::runtime_error("Error selecting non-existent TBB partitioner");
  }
}
/**
 * Runs `f(i)` for every index in `range` via tbb::parallel_for, using the selected partitioner.
 *
 * @param f callable taking a single size_t index
 */
template <class T>
template <typename F>
void TBBStream<T>::parallel_for(const F &f)
{
  // Per-chunk body: walk the sub-range TBB hands us. size_t matches the range's value type
  // (and is what the official documentation uses).
  const auto run_chunk = [&f](const tbb::blocked_range<size_t> &chunk) {
    for (size_t idx = chunk.begin(); idx < chunk.end(); ++idx) {
      f(idx);
    }
  };

  // with_partitioner needs a non-void result type; std::monostate would be the natural choice
  // but requires C++17, so std::nullptr_t stands in as the unit type.
  with_partitioner<std::nullptr_t>([&](auto &&part) {
    tbb::parallel_for(range, run_chunk, part);
    return nullptr;
  });
}
/**
 * Reduces `f(i)` over every index in `range` with tbb::parallel_reduce and the selected partitioner.
 *
 * @param init value forwarded to tbb::parallel_reduce as its second (identity) argument
 * @param op   binary operation used both to fold elements within a chunk and to join chunk results
 * @param f    callable mapping a size_t index to the value to accumulate
 * @return the reduced value
 */
template <class T>
template <typename F, typename Op>
T TBBStream<T>::parallel_reduce(T init, const Op &op, const F &f)
{
  // Per-chunk body: fold the chunk's elements onto the partial result passed in by TBB.
  const auto fold_chunk = [&](const tbb::blocked_range<size_t> &chunk, T partial) {
    for (size_t idx = chunk.begin(); idx < chunk.end(); ++idx) {
      partial = op(partial, f(idx));
    }
    return partial;
  };

  return with_partitioner<T>([&](auto &&part) {
    return tbb::parallel_reduce(range, init, fold_chunk, op, part);
  });
}
/**
 * Fills a, b and c with the given constants, in parallel over the whole range.
 *
 * @param initA value written to every element of a
 * @param initB value written to every element of b
 * @param initC value written to every element of c
 */
template <class T>
void TBBStream<T>::init_arrays(T initA, T initB, T initC)
{
  const auto fill_at = [this, initA, initB, initC](size_t idx) {
    a[idx] = initA;
    b[idx] = initB;
    c[idx] = initC;
  };
  parallel_for(fill_at);
}
/**
 * Copies the current contents of a, b and c into the caller-supplied vectors.
 *
 * @param h_a receives a copy of a
 * @param h_b receives a copy of b
 * @param h_c receives a copy of c
 */
template <class T>
void TBBStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
{
  // Each destination is replaced wholesale with the elements of its source.
  h_a.assign(a.begin(), a.end());
  h_b.assign(b.begin(), b.end());
  h_c.assign(c.begin(), c.end());
}
/// Copy kernel: c[i] = a[i] for every index in the range.
template <class T>
void TBBStream<T>::copy()
{
  const auto copy_at = [this](size_t idx) { c[idx] = a[idx]; };
  parallel_for(copy_at);
}
/// Mul kernel: b[i] = scalar * c[i] for every index in the range, with scalar taken from startScalar.
template <class T>
void TBBStream<T>::mul()
{
  const T scalar = startScalar;
  const auto scale_at = [this, scalar](size_t idx) { b[idx] = scalar * c[idx]; };
  parallel_for(scale_at);
}
/// Add kernel: c[i] = a[i] + b[i] for every index in the range.
template <class T>
void TBBStream<T>::add()
{
  const auto sum_at = [this](size_t idx) { c[idx] = a[idx] + b[idx]; };
  parallel_for(sum_at);
}
/// Triad kernel: a[i] = b[i] + scalar * c[i] for every index in the range, with scalar taken from startScalar.
template <class T>
void TBBStream<T>::triad()
{
  const T scalar = startScalar;
  const auto triad_at = [this, scalar](size_t idx) { a[idx] = b[idx] + scalar * c[idx]; };
  parallel_for(triad_at);
}
/// Nstream kernel: a[i] += b[i] + scalar * c[i] for every index in the range, with scalar taken from startScalar.
template <class T>
void TBBStream<T>::nstream()
{
  const T scalar = startScalar;
  const auto accumulate_at = [this, scalar](size_t idx) { a[idx] += b[idx] + scalar * c[idx]; };
  parallel_for(accumulate_at);
}
/**
 * Dot kernel: sums a[i] * b[i] over every index in the range.
 *
 * @return the accumulated dot product
 */
template <class T>
T TBBStream<T>::dot()
{
  // Start from zero and combine the per-index products with addition.
  const auto product_at = [this](size_t idx) { return a[idx] * b[idx]; };
  return parallel_reduce(T{}, std::plus<T>{}, product_at);
}
/// Prints the selectable partitioners, one per line, each prefixed by its integer index.
void listDevices(void)
{
  // One "[index] label" line per partitioner
  const auto print_entry = [](Partitioner kind, const char *label) {
    std::cout << "[" << static_cast<int>(kind) << "] " << label << "\n";
  };

  print_entry(Partitioner::Auto, "auto partitioner");
  print_entry(Partitioner::Affinity, "affinity partitioner");
  print_entry(Partitioner::Static, "static partitioner");
  print_entry(Partitioner::Simple, "simple partitioner");

  std::cout
    << "See https://spec.oneapi.com/versions/latest/elements/oneTBB/source/algorithms.html#partitioners for more details"
    << std::endl;
}
std::string getDeviceName(const int device)
{
switch(static_cast<Partitioner>(device)){
case Partitioner::Auto: return "auto_partitioner";
case Partitioner::Affinity: return "affinity_partitioner";
case Partitioner::Static: return "static_partitioner";
case Partitioner::Simple: return "simple_partitioner";
default: throw std::runtime_error("Error asking for name for non-existant device");
}
}
/// Returns a fixed placeholder string; the device index argument is ignored.
std::string getDeviceDriver(const int)
{
  return "Device driver unavailable";
}
// Explicit instantiations: the member templates are defined in this .cpp file,
// so the float and double versions are generated here.
template class TBBStream<float>;
template class TBBStream<double>;

56
TBBStream.hpp Normal file
View File

@ -0,0 +1,56 @@
// Copyright (c) 2020 Tom Deakin
// University of Bristol HPC
//
// For full license terms please see the LICENSE file distributed with this
// source code
#pragma once
#include <iostream>
#include <vector>
#include "oneapi/tbb.h"
#include "Stream.h"
#define IMPLEMENTATION_STRING "TBB"
// Selectable TBB partitioners; the integer values double as the "device" indices.
enum class Partitioner : int {
  Auto     = 0,
  Affinity = 1,
  Static   = 2,
  Simple   = 3
};
// Stream implementation whose kernels run through oneTBB parallel algorithms over std::vector storage.
template <class T>
class TBBStream : public Stream<T>
{
  protected:
    // Partitioner selected at construction from the device index
    Partitioner partitioner;
    // Index range [0, ARRAY_SIZE) handed to every TBB algorithm call
    tbb::blocked_range<size_t> range;

    // The three benchmark arrays
    std::vector<T> a;
    std::vector<T> b;
    std::vector<T> c;

    // Invokes fn with a partitioner object matching `partitioner` and returns its result
    template <typename U, typename F>
    U with_partitioner(const F &fn);

    // Runs body(i) for each index in `range`
    template <typename F>
    void parallel_for(const F &body);

    // Reduces body(i) over `range`, combining values with `combine`
    template <typename F, typename Op>
    T parallel_reduce(T init, const Op &combine, const F &body);

  public:
    TBBStream(const int, int);
    ~TBBStream() = default;

    // Benchmark kernels
    void copy() override;
    void add() override;
    void mul() override;
    void triad() override;
    void nstream() override;
    T dot() override;

    // Array setup and read-back
    void init_arrays(T initA, T initB, T initC) override;
    void read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c) override;
};

View File

@ -208,6 +208,20 @@ setup_raja() {
check_size check_size
} }
# Fetches and unpacks the pinned oneTBB binary release, then exports its root directory as TBB_LIB.
setup_tbb() {
  echo "Preparing TBB"

  local tbb_ver="2021.2.0"
  # The archive name is shared by the local tarball and the release download URL
  local tarball="oneapi-tbb-$tbb_ver-lin.tgz"
  local url="https://github.com/oneapi-src/oneTBB/releases/download/v$tbb_ver/$tarball"
  # local url="http://localhost:8000/$tarball"

  get_and_untar "$tarball" "$url"
  export_var TBB_LIB "$PWD/oneapi-tbb-$tbb_ver"
  verify_dir_exists "$TBB_LIB"
  check_size
}
setup_clang_gcc() { setup_clang_gcc() {
echo "deb http://archive.ubuntu.com/ubuntu focal main universe" | sudo tee -a /etc/apt/sources.list echo "deb http://archive.ubuntu.com/ubuntu focal main universe" | sudo tee -a /etc/apt/sources.list
@ -354,6 +368,7 @@ if [ "$PARALLEL" = true ]; then
setup_dpcpp & setup_dpcpp &
setup_kokkos & setup_kokkos &
setup_raja & setup_raja &
setup_tbb &
wait wait
else else
setup_cmake setup_cmake
@ -364,6 +379,7 @@ else
setup_dpcpp setup_dpcpp
setup_kokkos setup_kokkos
setup_raja setup_raja
setup_tbb
# these need apt # these need apt
setup_clang_gcc setup_clang_gcc
setup_rocm setup_rocm

View File

@ -112,6 +112,8 @@ run_build() {
#ICPX_CXX="/opt/intel/oneapi/compiler/2021.1.2/linux/bin/icpx" #ICPX_CXX="/opt/intel/oneapi/compiler/2021.1.2/linux/bin/icpx"
#ICPC_CXX="/opt/intel/oneapi/compiler/2021.1.2/linux/bin/intel64/icpc" #ICPC_CXX="/opt/intel/oneapi/compiler/2021.1.2/linux/bin/intel64/icpc"
# #
#TBB_LIB="/home/tom/Downloads/oneapi-tbb-2021.1.1/"
#
#GCC_STD_PAR_LIB="tbb" #GCC_STD_PAR_LIB="tbb"
#CLANG_STD_PAR_LIB="tbb" #CLANG_STD_PAR_LIB="tbb"
#GCC_OMP_OFFLOAD_AMD=false #GCC_OMP_OFFLOAD_AMD=false
@ -138,7 +140,7 @@ build_gcc() {
# some distributions like Ubuntu bionic implements std par with TBB, so conditionally link it here # some distributions like Ubuntu bionic implements std par with TBB, so conditionally link it here
run_build $name "${GCC_CXX:?}" STD "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}" run_build $name "${GCC_CXX:?}" STD "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}"
run_build $name "${GCC_CXX:?}" STD20 "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}" run_build $name "${GCC_CXX:?}" STD20 "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}"
run_build $name "${GCC_CXX:?}" TBB "$cxx -DTBB_DIR=$TBB_LIB"
if [ "${GCC_OMP_OFFLOAD_AMD:-false}" != "false" ]; then if [ "${GCC_OMP_OFFLOAD_AMD:-false}" != "false" ]; then
run_build "amd_$name" "${GCC_CXX:?}" ACC "$cxx -DCXX_EXTRA_FLAGS=-foffload=amdgcn-amdhsa" run_build "amd_$name" "${GCC_CXX:?}" ACC "$cxx -DCXX_EXTRA_FLAGS=-foffload=amdgcn-amdhsa"
run_build "amd_$name" "${GCC_CXX:?}" OMP "$cxx -DOFFLOAD=AMD:$AMD_ARCH" run_build "amd_$name" "${GCC_CXX:?}" OMP "$cxx -DOFFLOAD=AMD:$AMD_ARCH"
@ -188,6 +190,7 @@ build_clang() {
run_build $name "${CLANG_CXX:?}" OCL "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}" run_build $name "${CLANG_CXX:?}" OCL "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}"
run_build $name "${CLANG_CXX:?}" STD "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" run_build $name "${CLANG_CXX:?}" STD "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}"
# run_build $name "${LANG_CXX:?}" STD20 "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" # not yet supported # run_build $name "${LANG_CXX:?}" STD20 "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" # not yet supported
run_build $name "${CLANG_CXX:?}" TBB "$cxx -DTBB_DIR=$TBB_LIB"
run_build $name "${CLANG_CXX:?}" RAJA "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?}" run_build $name "${CLANG_CXX:?}" RAJA "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?}"
# no clang /w RAJA+cuda because it needs nvcc which needs gcc # no clang /w RAJA+cuda because it needs nvcc which needs gcc
} }

View File

@ -25,6 +25,8 @@
#include "STDStream.h" #include "STDStream.h"
#elif defined(STD20) #elif defined(STD20)
#include "STD20Stream.hpp" #include "STD20Stream.hpp"
#elif defined(TBB)
#include "TBBStream.hpp"
#elif defined(HIP) #elif defined(HIP)
#include "HIPStream.h" #include "HIPStream.h"
#elif defined(HC) #elif defined(HC)
@ -266,6 +268,10 @@ void run()
// Use the C++20 implementation // Use the C++20 implementation
stream = new STD20Stream<T>(ARRAY_SIZE, deviceIndex); stream = new STD20Stream<T>(ARRAY_SIZE, deviceIndex);
#elif defined(TBB)
// Use the TBB implementation
stream = new TBBStream<T>(ARRAY_SIZE, deviceIndex);
#elif defined(ACC) #elif defined(ACC)
// Use the OpenACC implementation // Use the OpenACC implementation
stream = new ACCStream<T>(ARRAY_SIZE, deviceIndex); stream = new ACCStream<T>(ARRAY_SIZE, deviceIndex);