Initial TBB implementation
This commit is contained in:
parent
2ab68ab39e
commit
742f0629be
1
.gitignore
vendored
1
.gitignore
vendored
@ -8,6 +8,7 @@ kokkos-stream
|
|||||||
std-stream
|
std-stream
|
||||||
sycl-stream
|
sycl-stream
|
||||||
hip-stream
|
hip-stream
|
||||||
|
tbb-stream
|
||||||
|
|
||||||
*.o
|
*.o
|
||||||
*.bc
|
*.bc
|
||||||
|
|||||||
@ -112,6 +112,7 @@ register_model(SYCL SYCL SYCLStream.cpp)
|
|||||||
register_model(ACC ACC ACCStream.cpp)
|
register_model(ACC ACC ACCStream.cpp)
|
||||||
# defining RAJA collides with the RAJA namespace so USE_RAJA
|
# defining RAJA collides with the RAJA namespace so USE_RAJA
|
||||||
register_model(RAJA USE_RAJA RAJAStream.cpp)
|
register_model(RAJA USE_RAJA RAJAStream.cpp)
|
||||||
|
register_model(TBB TBB TBBStream.cpp)
|
||||||
|
|
||||||
|
|
||||||
set(USAGE ON CACHE BOOL "Whether to print all custom flags for the selected model")
|
set(USAGE ON CACHE BOOL "Whether to print all custom flags for the selected model")
|
||||||
|
|||||||
@ -19,6 +19,7 @@ Currently implemented are:
|
|||||||
- Kokkos
|
- Kokkos
|
||||||
- RAJA
|
- RAJA
|
||||||
- SYCL
|
- SYCL
|
||||||
|
- TBB
|
||||||
|
|
||||||
This code was previously called GPU-STREAM.
|
This code was previously called GPU-STREAM.
|
||||||
|
|
||||||
@ -90,7 +91,7 @@ For example:
|
|||||||
Use this for linking extra libraries (e.g `-lmylib`, or simply `mylib`)
|
Use this for linking extra libraries (e.g `-lmylib`, or simply `mylib`)
|
||||||
-- CXX_EXTRA_LINKER_FLAGS:
|
-- CXX_EXTRA_LINKER_FLAGS:
|
||||||
Append to linker flags (i.e GCC's `-Wl` or equivalent)
|
Append to linker flags (i.e GCC's `-Wl` or equivalent)
|
||||||
-- Available models: OMP;OCL;STD;STD20;HIP;CUDA;KOKKOS;SYCL;ACC;RAJA
|
-- Available models: OMP;OCL;STD;STD20;HIP;CUDA;KOKKOS;SYCL;ACC;RAJA;TBB
|
||||||
-- Selected model : OCL
|
-- Selected model : OCL
|
||||||
-- Supported flags:
|
-- Supported flags:
|
||||||
|
|
||||||
|
|||||||
10
TBB.cmake
Normal file
10
TBB.cmake
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
|
||||||
|
register_flag_required(TBB_DIR
        "Absolute path to oneTBB distribution, the directory should contain at least `include/` and `lib/`")

# Configures the TBB model: selects the C++ standard, locates oneTBB and links TBB::tbb.
macro(setup)
    set(CMAKE_CXX_STANDARD 14) # we use auto in lambda parameters for the different partitioners
    # see https://github.com/oneapi-src/oneTBB/blob/master/cmake/README.md#tbbconfig---integration-of-binary-packages
    # find_package() interprets TBB_DIR as the directory that holds TBBConfig.cmake and
    # ignores it when no config file is found there. The flag above is documented as the
    # distribution root, so also offer that root as a search prefix; the config file under
    # <root>/lib/cmake/ is then discovered by the normal prefix search.
    list(APPEND CMAKE_PREFIX_PATH "${TBB_DIR}")
    find_package(TBB REQUIRED)
    register_link_library(TBB::tbb)
endmacro()
|
||||||
28
TBB.make
Normal file
28
TBB.make
Normal file
@ -0,0 +1,28 @@
|
|||||||
|
|
||||||
|
# Pick a default toolchain (and tell the user how to change it) when COMPILER is unset.
ifndef COMPILER
define compiler_help
Set COMPILER to change flags (defaulting to GNU).
Available compilers are:
  GNU

endef
$(info $(compiler_help))
COMPILER=GNU
endif

# Root of the oneTBB distribution. The recipe below reads TBB_DIR; TBB_LIB was the only
# variable declared here and was never used, so accept it as a fallback for TBB_DIR.
# Either can be supplied on the command line, e.g. `make -f TBB.make TBB_DIR=/path/to/tbb`.
TBB_LIB=
TBB_DIR ?= $(TBB_LIB)

COMPILER_GNU = g++
CXX = $(COMPILER_$(COMPILER))

FLAGS_GNU = -O3 -std=c++14 -march=native
CXXFLAGS = $(FLAGS_$(COMPILER))


# Builds the benchmark against the libtbb.so shipped in the distribution and embeds an
# rpath to it so the binary runs without extra environment setup.
tbb-stream: main.cpp TBBStream.cpp
	$(CXX) -DTBB $(CXXFLAGS) $^ $(EXTRA_FLAGS) -I$(TBB_DIR)/include -Wl,-rpath,$(TBB_DIR)/lib/intel64/gcc4.8 $(TBB_DIR)/lib/intel64/gcc4.8/libtbb.so -o $@

.PHONY: clean
clean:
	rm -f tbb-stream
|
||||||
|
|
||||||
157
TBBStream.cpp
Normal file
157
TBBStream.cpp
Normal file
@ -0,0 +1,157 @@
|
|||||||
|
// Copyright (c) 2020 Tom Deakin
|
||||||
|
// University of Bristol HPC
|
||||||
|
//
|
||||||
|
// For full license terms please see the LICENSE file distributed with this
|
||||||
|
// source code
|
||||||
|
|
||||||
|
#include "TBBStream.hpp"
|
||||||
|
#include "oneapi/tbb.h"
|
||||||
|
|
||||||
|
template <class T>
|
||||||
|
TBBStream<T>::TBBStream(const int ARRAY_SIZE, int device)
|
||||||
|
: partitioner(static_cast<Partitioner>(device)), range(0, ARRAY_SIZE), a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE)
|
||||||
|
{
|
||||||
|
std::cout << "Using TBB partitioner: " << getDeviceName(device) << std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class T>
|
||||||
|
template <typename U, typename F>
|
||||||
|
U TBBStream<T>::with_partitioner(const F &f)
|
||||||
|
{
|
||||||
|
switch(partitioner){
|
||||||
|
case Partitioner::Auto: return f(tbb::auto_partitioner{});
|
||||||
|
case Partitioner::Affinity: { tbb::affinity_partitioner p; return f(p); } // parallel_* doesn't take const affinity_partitioner here
|
||||||
|
case Partitioner::Static: return f(tbb::static_partitioner{});
|
||||||
|
case Partitioner::Simple: return f(tbb::simple_partitioner{});
|
||||||
|
default: throw std::runtime_error("Error asking for name for non-existant device");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class T>
|
||||||
|
template <typename F>
|
||||||
|
void TBBStream<T>::parallel_for(const F &f)
|
||||||
|
{
|
||||||
|
// using size_t as per the range type (also used in the official documentation)
|
||||||
|
with_partitioner<std::nullptr_t>([&](auto &&p) {
|
||||||
|
tbb::parallel_for(range, [&](const tbb::blocked_range<size_t>& r) {
|
||||||
|
for (size_t i = r.begin(); i < r.end(); ++i) {
|
||||||
|
f(i);
|
||||||
|
}
|
||||||
|
}, p);
|
||||||
|
return nullptr; // what we really want here is std::monostate, but we don't want to be C++17 only so nullptr_t it is
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class T>
|
||||||
|
template <typename F, typename Op>
|
||||||
|
T TBBStream<T>::parallel_reduce(T init, const Op &op, const F &f)
|
||||||
|
{
|
||||||
|
return with_partitioner<T>([&](auto &&p) {
|
||||||
|
return tbb::parallel_reduce(range, init, [&](const tbb::blocked_range<size_t>& r, T acc) {
|
||||||
|
for (size_t i = r.begin(); i < r.end(); ++i) {
|
||||||
|
acc = op(acc, f(i));
|
||||||
|
}
|
||||||
|
return acc;
|
||||||
|
}, op, p);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class T>
|
||||||
|
void TBBStream<T>::init_arrays(T initA, T initB, T initC)
|
||||||
|
{
|
||||||
|
|
||||||
|
parallel_for([&](size_t i){
|
||||||
|
a[i] = initA;
|
||||||
|
b[i] = initB;
|
||||||
|
c[i] = initC;
|
||||||
|
});
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class T>
|
||||||
|
void TBBStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
|
||||||
|
{
|
||||||
|
// Element-wise copy.
|
||||||
|
h_a = a;
|
||||||
|
h_b = b;
|
||||||
|
h_c = c;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class T>
|
||||||
|
void TBBStream<T>::copy()
|
||||||
|
{
|
||||||
|
parallel_for([&](size_t i){ c[i] = a[i]; });
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class T>
|
||||||
|
void TBBStream<T>::mul()
|
||||||
|
{
|
||||||
|
const T scalar = startScalar;
|
||||||
|
|
||||||
|
parallel_for([&](size_t i){ b[i] = scalar * c[i]; });
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class T>
|
||||||
|
void TBBStream<T>::add()
|
||||||
|
{
|
||||||
|
|
||||||
|
parallel_for([&](size_t i){ c[i] = a[i] + b[i]; });
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class T>
|
||||||
|
void TBBStream<T>::triad()
|
||||||
|
{
|
||||||
|
const T scalar = startScalar;
|
||||||
|
|
||||||
|
parallel_for([&](size_t i){ a[i] = b[i] + scalar * c[i]; });
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class T>
|
||||||
|
void TBBStream<T>::nstream()
|
||||||
|
{
|
||||||
|
const T scalar = startScalar;
|
||||||
|
|
||||||
|
parallel_for([&](size_t i){ a[i] += b[i] + scalar * c[i]; });
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class T>
|
||||||
|
T TBBStream<T>::dot()
|
||||||
|
{
|
||||||
|
// sum += a[i] * b[i];
|
||||||
|
return parallel_reduce(0.0, std::plus<T>(), [&](size_t i) { return a[i] * b[i]; });
|
||||||
|
}
|
||||||
|
|
||||||
|
void listDevices(void)
|
||||||
|
{
|
||||||
|
std::cout
|
||||||
|
<< "[" << static_cast<int>(Partitioner::Auto) << "] auto partitioner\n"
|
||||||
|
<< "[" << static_cast<int>(Partitioner::Affinity) << "] affinity partitioner\n"
|
||||||
|
<< "[" << static_cast<int>(Partitioner::Static) << "] static partitioner\n"
|
||||||
|
<< "[" << static_cast<int>(Partitioner::Simple) << "] simple partitioner\n"
|
||||||
|
<< "See https://spec.oneapi.com/versions/latest/elements/oneTBB/source/algorithms.html#partitioners for more details"
|
||||||
|
<< std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string getDeviceName(const int device)
|
||||||
|
{
|
||||||
|
switch(static_cast<Partitioner>(device)){
|
||||||
|
case Partitioner::Auto: return "auto_partitioner";
|
||||||
|
case Partitioner::Affinity: return "affinity_partitioner";
|
||||||
|
case Partitioner::Static: return "static_partitioner";
|
||||||
|
case Partitioner::Simple: return "simple_partitioner";
|
||||||
|
default: throw std::runtime_error("Error asking for name for non-existant device");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// This model reports no driver information; the device index is ignored and a fixed
// placeholder string is returned.
std::string getDeviceDriver(const int)
{
  return "Device driver unavailable";
}
|
||||||
|
|
||||||
|
// Explicit instantiations: the TBBStream member definitions live in this .cpp file, so the
// float and double variants are instantiated here.
template class TBBStream<float>;
|
||||||
|
template class TBBStream<double>;
|
||||||
|
|
||||||
56
TBBStream.hpp
Normal file
56
TBBStream.hpp
Normal file
@ -0,0 +1,56 @@
|
|||||||
|
// Copyright (c) 2020 Tom Deakin
|
||||||
|
// University of Bristol HPC
|
||||||
|
//
|
||||||
|
// For full license terms please see the LICENSE file distributed with this
|
||||||
|
// source code
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <iostream>
|
||||||
|
#include <vector>
|
||||||
|
#include "oneapi/tbb.h"
|
||||||
|
#include "Stream.h"
|
||||||
|
|
||||||
|
#define IMPLEMENTATION_STRING "TBB"
|
||||||
|
|
||||||
|
// TBB partitioner choices; the integer value is the "device" index used to select one.
enum class Partitioner : int {
  Auto = 0,
  Affinity,
  Static,
  Simple
};
|
||||||
|
|
||||||
|
template <class T>
|
||||||
|
class TBBStream : public Stream<T>
|
||||||
|
{
|
||||||
|
protected:
|
||||||
|
|
||||||
|
|
||||||
|
Partitioner partitioner;
|
||||||
|
tbb::blocked_range<size_t> range;
|
||||||
|
// Device side pointers
|
||||||
|
std::vector<T> a;
|
||||||
|
std::vector<T> b;
|
||||||
|
std::vector<T> c;
|
||||||
|
|
||||||
|
|
||||||
|
template < typename U, typename F>
|
||||||
|
U with_partitioner(const F &f);
|
||||||
|
|
||||||
|
template <typename F>
|
||||||
|
void parallel_for(const F &f);
|
||||||
|
|
||||||
|
template <typename F, typename Op>
|
||||||
|
T parallel_reduce(T init, const Op &op, const F &f);
|
||||||
|
|
||||||
|
public:
|
||||||
|
TBBStream(const int, int);
|
||||||
|
~TBBStream() = default;
|
||||||
|
|
||||||
|
virtual void copy() override;
|
||||||
|
virtual void add() override;
|
||||||
|
virtual void mul() override;
|
||||||
|
virtual void triad() override;
|
||||||
|
virtual void nstream() override;
|
||||||
|
virtual T dot() override;
|
||||||
|
|
||||||
|
virtual void init_arrays(T initA, T initB, T initC) override;
|
||||||
|
virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
|
||||||
|
|
||||||
|
};
|
||||||
|
|
||||||
@ -208,6 +208,20 @@ setup_raja() {
|
|||||||
check_size
|
check_size
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Downloads and unpacks the pinned oneTBB binary release, then exports its root as TBB_LIB.
setup_tbb() {
  echo "Preparing TBB"

  local tbb_ver="2021.2.0"
  local tarball="oneapi-tbb-$tbb_ver-lin.tgz"
  # The release asset carries the same file name as the local tarball
  local url="https://github.com/oneapi-src/oneTBB/releases/download/v$tbb_ver/$tarball"
  # local url="http://localhost:8000/oneapi-tbb-$tbb_ver-lin.tgz"

  get_and_untar "$tarball" "$url"
  export_var TBB_LIB "$PWD/oneapi-tbb-$tbb_ver"
  verify_dir_exists "$TBB_LIB"
  check_size
}
|
||||||
|
|
||||||
setup_clang_gcc() {
|
setup_clang_gcc() {
|
||||||
|
|
||||||
echo "deb http://archive.ubuntu.com/ubuntu focal main universe" | sudo tee -a /etc/apt/sources.list
|
echo "deb http://archive.ubuntu.com/ubuntu focal main universe" | sudo tee -a /etc/apt/sources.list
|
||||||
@ -354,6 +368,7 @@ if [ "$PARALLEL" = true ]; then
|
|||||||
setup_dpcpp &
|
setup_dpcpp &
|
||||||
setup_kokkos &
|
setup_kokkos &
|
||||||
setup_raja &
|
setup_raja &
|
||||||
|
setup_tbb &
|
||||||
wait
|
wait
|
||||||
else
|
else
|
||||||
setup_cmake
|
setup_cmake
|
||||||
@ -364,6 +379,7 @@ else
|
|||||||
setup_dpcpp
|
setup_dpcpp
|
||||||
setup_kokkos
|
setup_kokkos
|
||||||
setup_raja
|
setup_raja
|
||||||
|
setup_tbb
|
||||||
# these need apt
|
# these need apt
|
||||||
setup_clang_gcc
|
setup_clang_gcc
|
||||||
setup_rocm
|
setup_rocm
|
||||||
|
|||||||
@ -112,6 +112,8 @@ run_build() {
|
|||||||
#ICPX_CXX="/opt/intel/oneapi/compiler/2021.1.2/linux/bin/icpx"
|
#ICPX_CXX="/opt/intel/oneapi/compiler/2021.1.2/linux/bin/icpx"
|
||||||
#ICPC_CXX="/opt/intel/oneapi/compiler/2021.1.2/linux/bin/intel64/icpc"
|
#ICPC_CXX="/opt/intel/oneapi/compiler/2021.1.2/linux/bin/intel64/icpc"
|
||||||
#
|
#
|
||||||
|
#TBB_LIB="/home/tom/Downloads/oneapi-tbb-2021.1.1/"
|
||||||
|
#
|
||||||
#GCC_STD_PAR_LIB="tbb"
|
#GCC_STD_PAR_LIB="tbb"
|
||||||
#CLANG_STD_PAR_LIB="tbb"
|
#CLANG_STD_PAR_LIB="tbb"
|
||||||
#GCC_OMP_OFFLOAD_AMD=false
|
#GCC_OMP_OFFLOAD_AMD=false
|
||||||
@ -138,7 +140,7 @@ build_gcc() {
|
|||||||
# some distributions like Ubuntu bionic implements std par with TBB, so conditionally link it here
|
# some distributions like Ubuntu bionic implements std par with TBB, so conditionally link it here
|
||||||
run_build $name "${GCC_CXX:?}" STD "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}"
|
run_build $name "${GCC_CXX:?}" STD "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}"
|
||||||
run_build $name "${GCC_CXX:?}" STD20 "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}"
|
run_build $name "${GCC_CXX:?}" STD20 "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}"
|
||||||
|
run_build $name "${GCC_CXX:?}" TBB "$cxx -DTBB_DIR=$TBB_LIB"
|
||||||
if [ "${GCC_OMP_OFFLOAD_AMD:-false}" != "false" ]; then
|
if [ "${GCC_OMP_OFFLOAD_AMD:-false}" != "false" ]; then
|
||||||
run_build "amd_$name" "${GCC_CXX:?}" ACC "$cxx -DCXX_EXTRA_FLAGS=-foffload=amdgcn-amdhsa"
|
run_build "amd_$name" "${GCC_CXX:?}" ACC "$cxx -DCXX_EXTRA_FLAGS=-foffload=amdgcn-amdhsa"
|
||||||
run_build "amd_$name" "${GCC_CXX:?}" OMP "$cxx -DOFFLOAD=AMD:$AMD_ARCH"
|
run_build "amd_$name" "${GCC_CXX:?}" OMP "$cxx -DOFFLOAD=AMD:$AMD_ARCH"
|
||||||
@ -188,6 +190,7 @@ build_clang() {
|
|||||||
run_build $name "${CLANG_CXX:?}" OCL "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}"
|
run_build $name "${CLANG_CXX:?}" OCL "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}"
|
||||||
run_build $name "${CLANG_CXX:?}" STD "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}"
|
run_build $name "${CLANG_CXX:?}" STD "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}"
|
||||||
# run_build $name "${LANG_CXX:?}" STD20 "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" # not yet supported
|
# run_build $name "${LANG_CXX:?}" STD20 "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" # not yet supported
|
||||||
|
run_build $name "${CLANG_CXX:?}" TBB "$cxx -DTBB_DIR=$TBB_LIB"
|
||||||
run_build $name "${CLANG_CXX:?}" RAJA "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?}"
|
run_build $name "${CLANG_CXX:?}" RAJA "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?}"
|
||||||
# no clang /w RAJA+cuda because it needs nvcc which needs gcc
|
# no clang /w RAJA+cuda because it needs nvcc which needs gcc
|
||||||
}
|
}
|
||||||
|
|||||||
6
main.cpp
6
main.cpp
@ -25,6 +25,8 @@
|
|||||||
#include "STDStream.h"
|
#include "STDStream.h"
|
||||||
#elif defined(STD20)
|
#elif defined(STD20)
|
||||||
#include "STD20Stream.hpp"
|
#include "STD20Stream.hpp"
|
||||||
|
#elif defined(TBB)
|
||||||
|
#include "TBBStream.hpp"
|
||||||
#elif defined(HIP)
|
#elif defined(HIP)
|
||||||
#include "HIPStream.h"
|
#include "HIPStream.h"
|
||||||
#elif defined(HC)
|
#elif defined(HC)
|
||||||
@ -266,6 +268,10 @@ void run()
|
|||||||
// Use the C++20 implementation
|
// Use the C++20 implementation
|
||||||
stream = new STD20Stream<T>(ARRAY_SIZE, deviceIndex);
|
stream = new STD20Stream<T>(ARRAY_SIZE, deviceIndex);
|
||||||
|
|
||||||
|
#elif defined(TBB)
|
||||||
|
// Use the TBB implementation
|
||||||
|
stream = new TBBStream<T>(ARRAY_SIZE, deviceIndex);
|
||||||
|
|
||||||
#elif defined(ACC)
|
#elif defined(ACC)
|
||||||
// Use the OpenACC implementation
|
// Use the OpenACC implementation
|
||||||
stream = new ACCStream<T>(ARRAY_SIZE, deviceIndex);
|
stream = new ACCStream<T>(ARRAY_SIZE, deviceIndex);
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user