Merge pull request #105 from UoB-HPC/tbb

Initial TBB implementation
This commit is contained in:
Tom Deakin 2021-06-03 16:07:38 +01:00 committed by GitHub
commit dd90598e20
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 348 additions and 33 deletions

1
.gitignore vendored
View File

@ -8,6 +8,7 @@ kokkos-stream
std-stream std-stream
sycl-stream sycl-stream
hip-stream hip-stream
tbb-stream
*.o *.o
*.bc *.bc

View File

@ -112,6 +112,7 @@ register_model(SYCL SYCL SYCLStream.cpp)
register_model(ACC ACC ACCStream.cpp) register_model(ACC ACC ACCStream.cpp)
# defining RAJA collides with the RAJA namespace so USE_RAJA # defining RAJA collides with the RAJA namespace so USE_RAJA
register_model(RAJA USE_RAJA RAJAStream.cpp) register_model(RAJA USE_RAJA RAJAStream.cpp)
register_model(TBB TBB TBBStream.cpp)
set(USAGE ON CACHE BOOL "Whether to print all custom flags for the selected model") set(USAGE ON CACHE BOOL "Whether to print all custom flags for the selected model")

View File

@ -19,6 +19,7 @@ Currently implemented are:
- Kokkos - Kokkos
- RAJA - RAJA
- SYCL - SYCL
- TBB
This code was previously called GPU-STREAM. This code was previously called GPU-STREAM.
@ -90,7 +91,7 @@ For example:
Use this for linking extra libraries (e.g `-lmylib`, or simply `mylib`) Use this for linking extra libraries (e.g `-lmylib`, or simply `mylib`)
-- CXX_EXTRA_LINKER_FLAGS: -- CXX_EXTRA_LINKER_FLAGS:
Append to linker flags (i.e GCC's `-Wl` or equivalent) Append to linker flags (i.e GCC's `-Wl` or equivalent)
-- Available models: OMP;OCL;STD;STD20;HIP;CUDA;KOKKOS;SYCL;ACC;RAJA -- Available models: OMP;OCL;STD;STD20;HIP;CUDA;KOKKOS;SYCL;ACC;RAJA;TBB
-- Selected model : OCL -- Selected model : OCL
-- Supported flags: -- Supported flags:

29
TBB.cmake Normal file
View File

@ -0,0 +1,29 @@
# Optional location of a standalone oneTBB distribution; left empty, the system TBB is looked up instead.
register_flag_optional(ONE_TBB_DIR
"Absolute path to oneTBB (with header `oneapi/tbb.h`) distribution, the directory should contain at least `include/` and `lib/`.
If unspecified, the system TBB (with header `tbb/tbb.h`) will be used via CMake's find_package(TBB)."
"")
# Selects the TBB partitioner; the value (default AUTO) is later turned into a
# PARTITIONER_<value> compile definition by setup().
register_flag_optional(PARTITIONER
"Partitioner specifies how a loop template should partition its work among threads.
Possible values are:
AUTO - Optimize range subdivision based on work-stealing events.
AFFINITY - Proportional splitting that optimizes for cache affinity.
STATIC - Distribute work uniformly with no additional load balancing.
SIMPLE - Recursively split its range until it cannot be further subdivided.
See https://spec.oneapi.com/versions/latest/elements/oneTBB/source/algorithms.html#partitioners for more details."
"AUTO")
# Setup macro for the TBB model: locates TBB, links against it and forwards the
# partitioner choice to the compiler as a PARTITIONER_<value> definition.
macro(setup)
    if (ONE_TBB_DIR)
        set(TBB_ROOT "${ONE_TBB_DIR}") # see https://github.com/Kitware/VTK/blob/0a31a9a3c1531ae238ac96a372fec4be42282863/CMake/FindTBB.cmake#L34
        # docs on Intel's website refers to TBB_DIR which is not correct
    endif ()

    # TBBStream.hpp only recognises PARTITIONER_{AUTO,AFFINITY,STATIC,SIMPLE}; any other
    # definition silently selects the auto partitioner there, so flag it at configure time.
    if (NOT "${PARTITIONER}" MATCHES "^(AUTO|AFFINITY|STATIC|SIMPLE)$")
        message(WARNING "Unrecognised PARTITIONER '${PARTITIONER}' (expected AUTO, AFFINITY, STATIC or SIMPLE); the auto partitioner will be used")
    endif ()

    # see https://github.com/oneapi-src/oneTBB/blob/master/cmake/README.md#tbbconfig---integration-of-binary-packages
    find_package(TBB REQUIRED)
    register_link_library(TBB::tbb)
    register_definitions(PARTITIONER_${PARTITIONER})
endmacro()

56
TBB.make Normal file
View File

@ -0,0 +1,56 @@
# Compiler selection: COMPILER picks the C++ compiler and its extra flags.
# When it is not set, the help text is printed and GNU is used.
ifndef COMPILER
define compiler_help
Set COMPILER to change flags (defaulting to GNU).
Available compilers are:
GNU INTEL INTEL_LEGACY
endef
$(info $(compiler_help))
COMPILER=GNU
endif

# Compiler executable per COMPILER value
CXX_GNU = g++
CXX_INTEL = icpx
CXX_INTEL_LEGACY = icpc

# Compiler-specific flags per COMPILER value
CXXFLAGS_GNU = -march=native
CXXFLAGS_INTEL = -march=native
CXXFLAGS_INTEL_LEGACY = -qopt-streaming-stores=always

# Resolve the selected compiler and flags through computed variable names.
# (The earlier `CXX = $(COMPILER_$(COMPILER))` referenced variables that are never
# defined and was immediately overwritten, so it has been dropped.)
CXX = $(CXX_$(COMPILER))
CXXFLAGS = -std=c++11 -O3 $(CXXFLAGS_$(COMPILER))
# Partitioner selection: when PARTITIONER is not set, print the help text below
# and fall back to AUTO.
ifndef PARTITIONER
define partitioner_help
Set PARTITIONER to select TBB's partitioner.
Partitioner specifies how a loop template should partition its work among threads.
Available options:
AUTO - Optimize range subdivision based on work-stealing events.
AFFINITY - Proportional splitting that optimizes for cache affinity.
STATIC - Distribute work uniformly with no additional load balancing.
SIMPLE - Recursively split its range until it cannot be further subdivided.
See https://spec.oneapi.com/versions/latest/elements/oneTBB/source/algorithms.html#partitioners
for more details.
endef
$(info $(partitioner_help))
PARTITIONER=AUTO
endif
# Preprocessor definition (-DPARTITIONER_<value>) passed on the compile line.
PARTITIONER_MODE = -DPARTITIONER_$(PARTITIONER)
# Builds the tbb-stream binary from main.cpp and TBBStream.cpp in a single compile/link step.
# TBB_DIR is used as the TBB installation prefix: headers come from TBB_DIR/include and
# libtbb.so from TBB_DIR/lib/intel64/gcc4.8, which is also recorded as an rpath.
tbb-stream: main.cpp TBBStream.cpp
$(CXX) -DTBB $(PARTITIONER_MODE) $(CXXFLAGS) $^ $(EXTRA_FLAGS) -I$(TBB_DIR)/include -Wl,-rpath,$(TBB_DIR)/lib/intel64/gcc4.8 $(TBB_DIR)/lib/intel64/gcc4.8/libtbb.so -o $@
.PHONY: clean
# Removes the built binary.
clean:
rm -f tbb-stream

134
TBBStream.cpp Normal file
View File

@ -0,0 +1,134 @@
// Copyright (c) 2020 Tom Deakin
// University of Bristol HPC
//
// For full license terms please see the LICENSE file distributed with this
// source code
#include "TBBStream.hpp"
// Constructs the stream: sets up the partitioner, the iteration range
// [0, ARRAY_SIZE) and three arrays of ARRAY_SIZE elements each.
// Throws std::runtime_error for any device index other than 0.
template <class T>
TBBStream<T>::TBBStream(const int ARRAY_SIZE, int device)
  : partitioner(),
    range(0, ARRAY_SIZE),
    a(ARRAY_SIZE),
    b(ARRAY_SIZE),
    c(ARRAY_SIZE)
{
  // Only device index 0 is accepted by this implementation.
  const bool deviceSupported = (device == 0);
  if (!deviceSupported)
    throw std::runtime_error("Device != 0 is not supported by TBB");

  // Report which partitioner was selected at compile time.
  std::cout << "Using TBB partitioner: " PARTITIONER_NAME << std::endl;
}
// Fills every element of a, b and c with initA, initB and initC respectively,
// running the loop through tbb::parallel_for with the configured partitioner.
template <class T>
void TBBStream<T>::init_arrays(T initA, T initB, T initC)
{
  const auto fill = [&](const tbb::blocked_range<size_t>& chunk) {
    const size_t stop = chunk.end();
    for (size_t idx = chunk.begin(); idx < stop; ++idx) {
      a[idx] = initA;
      b[idx] = initB;
      c[idx] = initC;
    }
  };
  tbb::parallel_for(range, fill, partitioner);
}
// Copies the contents of the internal arrays a, b and c into the caller's
// vectors h_a, h_b and h_c (in that order) using std::vector copy assignment.
template <class T>
void TBBStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
{
// Whole-vector copy assignment; each destination ends up with the same
// size and element values as its source.
h_a = a;
h_b = b;
h_c = c;
}
// Copy kernel: c[i] = a[i] for every index in the range, executed through
// tbb::parallel_for with the configured partitioner.
template <class T>
void TBBStream<T>::copy()
{
  const auto kernel = [&](const tbb::blocked_range<size_t>& chunk) {
    const size_t stop = chunk.end();
    for (size_t idx = chunk.begin(); idx < stop; ++idx)
      c[idx] = a[idx];
  };
  tbb::parallel_for(range, kernel, partitioner);
}
// Mul kernel: b[i] = scalar * c[i] for every index in the range, where scalar
// is taken from startScalar; executed through tbb::parallel_for.
template <class T>
void TBBStream<T>::mul()
{
  const T scalar = startScalar;
  const auto kernel = [&](const tbb::blocked_range<size_t>& chunk) {
    const size_t stop = chunk.end();
    for (size_t idx = chunk.begin(); idx < stop; ++idx)
      b[idx] = scalar * c[idx];
  };
  tbb::parallel_for(range, kernel, partitioner);
}
// Add kernel: c[i] = a[i] + b[i] for every index in the range, executed
// through tbb::parallel_for with the configured partitioner.
template <class T>
void TBBStream<T>::add()
{
  const auto kernel = [&](const tbb::blocked_range<size_t>& chunk) {
    const size_t stop = chunk.end();
    for (size_t idx = chunk.begin(); idx < stop; ++idx)
      c[idx] = a[idx] + b[idx];
  };
  tbb::parallel_for(range, kernel, partitioner);
}
// Triad kernel: a[i] = b[i] + scalar * c[i] for every index in the range,
// where scalar is taken from startScalar; executed through tbb::parallel_for.
template <class T>
void TBBStream<T>::triad()
{
  const T scalar = startScalar;
  const auto kernel = [&](const tbb::blocked_range<size_t>& chunk) {
    const size_t stop = chunk.end();
    for (size_t idx = chunk.begin(); idx < stop; ++idx)
      a[idx] = b[idx] + scalar * c[idx];
  };
  tbb::parallel_for(range, kernel, partitioner);
}
// Nstream kernel: a[i] += b[i] + scalar * c[i] for every index in the range,
// where scalar is taken from startScalar; executed through tbb::parallel_for.
template <class T>
void TBBStream<T>::nstream()
{
  const T scalar = startScalar;
  const auto kernel = [&](const tbb::blocked_range<size_t>& chunk) {
    const size_t stop = chunk.end();
    for (size_t idx = chunk.begin(); idx < stop; ++idx)
      a[idx] += b[idx] + scalar * c[idx];
  };
  tbb::parallel_for(range, kernel, partitioner);
}
// Dot kernel: returns the sum of a[i] * b[i] over the range.
// Each sub-range accumulates onto the value it is handed by
// tbb::parallel_reduce (identity T{}); partial sums are joined with std::plus<T>.
template <class T>
T TBBStream<T>::dot()
{
  const auto partialDot = [&](const tbb::blocked_range<size_t>& chunk, T running) {
    const size_t stop = chunk.end();
    for (size_t idx = chunk.begin(); idx < stop; ++idx)
      running += a[idx] * b[idx];
    return running;
  };
  return tbb::parallel_reduce(range, T{}, partialDot, std::plus<T>(), partitioner);
}
// Prints a fixed notice to stdout that device listing is unavailable for TBB.
void listDevices(void)
{
  static const char* const notice = "Listing devices is not supported by TBB";
  std::cout << notice << std::endl;
}
// Returns a fixed placeholder string; the device index is ignored, so the
// parameter is left unnamed (matching getDeviceDriver and avoiding an
// unused-parameter warning).
std::string getDeviceName(const int)
{
  return std::string("Device name unavailable");
}
// Returns a fixed placeholder string; the device index is ignored.
std::string getDeviceDriver(const int)
{
  return "Device driver unavailable";
}
template class TBBStream<float>;
template class TBBStream<double>;

62
TBBStream.hpp Normal file
View File

@ -0,0 +1,62 @@
// Copyright (c) 2020 Tom Deakin
// University of Bristol HPC
//
// For full license terms please see the LICENSE file distributed with this
// source code
#pragma once
#include <iostream>
#include <vector>
#include "tbb/tbb.h"
#include "Stream.h"
#define IMPLEMENTATION_STRING "TBB"
// Compile-time partitioner selection.
// The first PARTITIONER_* macro found (checked in the order AUTO, AFFINITY,
// STATIC, SIMPLE) fixes both the tbb_partitioner alias and the
// PARTITIONER_NAME string; with none of them defined, the auto partitioner is used.
#ifdef PARTITIONER_AUTO
  #define PARTITIONER_NAME "auto_partitioner"
  using tbb_partitioner = tbb::auto_partitioner;
#elif defined(PARTITIONER_AFFINITY)
  #define PARTITIONER_NAME "affinity_partitioner"
  using tbb_partitioner = tbb::affinity_partitioner;
#elif defined(PARTITIONER_STATIC)
  #define PARTITIONER_NAME "static_partitioner"
  using tbb_partitioner = tbb::static_partitioner;
#elif defined(PARTITIONER_SIMPLE)
  #define PARTITIONER_NAME "simple_partitioner"
  using tbb_partitioner = tbb::simple_partitioner;
#else
  // Fallback when no recognised PARTITIONER_* macro is defined
  #define PARTITIONER_NAME "auto_partitioner"
  using tbb_partitioner = tbb::auto_partitioner;
#endif
// Stream<T> implementation whose kernels are declared here and built on TBB.
// Holds the partitioner, the iteration range and the three benchmark arrays.
template <class T>
class TBBStream : public Stream<T>
{
  protected:
    // Partitioner instance of the type selected at compile time
    tbb_partitioner partitioner;
    // Index range the kernels iterate over
    tbb::blocked_range<size_t> range;

    // Benchmark arrays, stored as std::vector
    std::vector<T> a;
    std::vector<T> b;
    std::vector<T> c;

  public:
    // Arguments: array size, device index
    TBBStream(const int, int);
    ~TBBStream() = default;

    // Benchmark kernels
    void copy() override;
    void add() override;
    void mul() override;
    void triad() override;
    void nstream() override;
    T dot() override;

    // Array initialisation and read-back into caller-provided vectors
    void init_arrays(T initA, T initB, T initC) override;
    void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
};

View File

@ -208,6 +208,20 @@ setup_raja() {
check_size check_size
} }
# Downloads and unpacks the pinned oneTBB release, then exports TBB_LIB as the
# path of the unpacked directory under the current working directory.
setup_tbb() {
  echo "Preparing TBB"
  local tbb_ver="2021.2.0"
  local tbb_dir="oneapi-tbb-${tbb_ver}"
  local tarball="${tbb_dir}-lin.tgz"
  local url="https://github.com/oneapi-src/oneTBB/releases/download/v${tbb_ver}/${tarball}"
  # local url="http://localhost:8000/oneapi-tbb-$tbb_ver-lin.tgz"

  get_and_untar "$tarball" "$url"
  export_var TBB_LIB "$PWD/${tbb_dir}"
  verify_dir_exists "$TBB_LIB"
  check_size
}
setup_clang_gcc() { setup_clang_gcc() {
echo "deb http://archive.ubuntu.com/ubuntu focal main universe" | sudo tee -a /etc/apt/sources.list echo "deb http://archive.ubuntu.com/ubuntu focal main universe" | sudo tee -a /etc/apt/sources.list
@ -354,6 +368,7 @@ if [ "$PARALLEL" = true ]; then
setup_dpcpp & setup_dpcpp &
setup_kokkos & setup_kokkos &
setup_raja & setup_raja &
setup_tbb &
wait wait
else else
setup_cmake setup_cmake
@ -364,6 +379,7 @@ else
setup_dpcpp setup_dpcpp
setup_kokkos setup_kokkos
setup_raja setup_raja
setup_tbb
# these need apt # these need apt
setup_clang_gcc setup_clang_gcc
setup_rocm setup_rocm

View File

@ -86,38 +86,40 @@ run_build() {
} }
### ###
#KOKKOS_SRC="/home/tom/Downloads/kokkos-3.3.00" # KOKKOS_SRC="/home/tom/Downloads/kokkos-3.3.00"
#RAJA_SRC="/home/tom/Downloads/RAJA-v0.13.0" # RAJA_SRC="/home/tom/Downloads/RAJA-v0.13.0"
#
#GCC_CXX="/usr/bin/g++" # GCC_CXX="/usr/bin/g++"
#CLANG_CXX="/usr/bin/clang++" # CLANG_CXX="/usr/bin/clang++"
#
#NVSDK="/home/tom/Downloads/nvhpc_2021_212_Linux_x86_64_cuda_11.2/install_components/Linux_x86_64/21.2/" # NVSDK="/home/tom/Downloads/nvhpc_2021_212_Linux_x86_64_cuda_11.2/install_components/Linux_x86_64/21.2/"
#NVHPC_NVCXX="$NVSDK/compilers/bin/nvc++" # NVHPC_NVCXX="$NVSDK/compilers/bin/nvc++"
#NVHPC_NVCC="$NVSDK/cuda/11.2/bin/nvcc" # NVHPC_NVCC="$NVSDK/cuda/11.2/bin/nvcc"
#NVHPC_CUDA_DIR="$NVSDK/cuda/11.2" # NVHPC_CUDA_DIR="$NVSDK/cuda/11.2"
#"$NVSDK/compilers/bin/makelocalrc" "$NVSDK/compilers/bin/" -x # "$NVSDK/compilers/bin/makelocalrc" "$NVSDK/compilers/bin/" -x
#
#AOCC_CXX="/opt/AMD/aocc-compiler-2.3.0/bin/clang++" # AOCC_CXX="/opt/AMD/aocc-compiler-2.3.0/bin/clang++"
#AOMP_CXX="/usr/lib/aomp/bin/clang++" # AOMP_CXX="/usr/lib/aomp/bin/clang++"
#OCL_LIB="/home/tom/Downloads/oclcpuexp-2020.11.11.0.04_rel/x64/libOpenCL.so" # OCL_LIB="/home/tom/Downloads/oclcpuexp-2020.11.11.0.04_rel/x64/libOpenCL.so"
#
## AMD needs this rocm_path thing exported... # # AMD needs this rocm_path thing exported...
#export ROCM_PATH="/opt/rocm-4.0.0" # export ROCM_PATH="/opt/rocm-4.0.0"
#HIP_CXX="/opt/rocm-4.0.0/bin/hipcc" # HIP_CXX="/opt/rocm-4.0.0/bin/hipcc"
#COMPUTECPP_DIR="/home/tom/Desktop/computecpp_archive/ComputeCpp-CE-2.3.0-x86_64-linux-gnu" # COMPUTECPP_DIR="/home/tom/Desktop/computecpp_archive/ComputeCpp-CE-2.3.0-x86_64-linux-gnu"
#DPCPP_DIR="/home/tom/Downloads/dpcpp_compiler" # DPCPP_DIR="/home/tom/Downloads/dpcpp_compiler"
#HIPSYCL_DIR="/opt/hipsycl/cff515c/" # HIPSYCL_DIR="/opt/hipsycl/cff515c/"
#
#ICPX_CXX="/opt/intel/oneapi/compiler/2021.1.2/linux/bin/icpx" # ICPX_CXX="/opt/intel/oneapi/compiler/2021.1.2/linux/bin/icpx"
#ICPC_CXX="/opt/intel/oneapi/compiler/2021.1.2/linux/bin/intel64/icpc" # ICPC_CXX="/opt/intel/oneapi/compiler/2021.1.2/linux/bin/intel64/icpc"
#
#GCC_STD_PAR_LIB="tbb" # TBB_LIB="/home/tom/Downloads/oneapi-tbb-2021.1.1/"
#CLANG_STD_PAR_LIB="tbb"
#GCC_OMP_OFFLOAD_AMD=false # GCC_STD_PAR_LIB="tbb"
#GCC_OMP_OFFLOAD_NVIDIA=true # CLANG_STD_PAR_LIB="tbb"
#CLANG_OMP_OFFLOAD_AMD=false # GCC_OMP_OFFLOAD_AMD=false
#CLANG_OMP_OFFLOAD_NVIDIA=false # GCC_OMP_OFFLOAD_NVIDIA=true
# CLANG_OMP_OFFLOAD_AMD=false
# CLANG_OMP_OFFLOAD_NVIDIA=false
### ###
AMD_ARCH="gfx_903" AMD_ARCH="gfx_903"
@ -139,6 +141,9 @@ build_gcc() {
run_build $name "${GCC_CXX:?}" STD "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}" run_build $name "${GCC_CXX:?}" STD "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}"
run_build $name "${GCC_CXX:?}" STD20 "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}" run_build $name "${GCC_CXX:?}" STD20 "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}"
run_build $name "${GCC_CXX:?}" TBB "$cxx -DONE_TBB_DIR=$TBB_LIB"
run_build $name "${GCC_CXX:?}" TBB "$cxx" # build TBB again with the system TBB
if [ "${GCC_OMP_OFFLOAD_AMD:-false}" != "false" ]; then if [ "${GCC_OMP_OFFLOAD_AMD:-false}" != "false" ]; then
run_build "amd_$name" "${GCC_CXX:?}" ACC "$cxx -DCXX_EXTRA_FLAGS=-foffload=amdgcn-amdhsa" run_build "amd_$name" "${GCC_CXX:?}" ACC "$cxx -DCXX_EXTRA_FLAGS=-foffload=amdgcn-amdhsa"
run_build "amd_$name" "${GCC_CXX:?}" OMP "$cxx -DOFFLOAD=AMD:$AMD_ARCH" run_build "amd_$name" "${GCC_CXX:?}" OMP "$cxx -DOFFLOAD=AMD:$AMD_ARCH"
@ -188,6 +193,10 @@ build_clang() {
run_build $name "${CLANG_CXX:?}" OCL "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}" run_build $name "${CLANG_CXX:?}" OCL "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}"
run_build $name "${CLANG_CXX:?}" STD "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" run_build $name "${CLANG_CXX:?}" STD "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}"
# run_build $name "${LANG_CXX:?}" STD20 "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" # not yet supported # run_build $name "${LANG_CXX:?}" STD20 "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" # not yet supported
run_build $name "${CLANG_CXX:?}" TBB "$cxx -DONE_TBB_DIR=$TBB_LIB"
run_build $name "${CLANG_CXX:?}" TBB "$cxx" # build TBB again with the system TBB
run_build $name "${CLANG_CXX:?}" RAJA "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?}" run_build $name "${CLANG_CXX:?}" RAJA "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?}"
# no clang /w RAJA+cuda because it needs nvcc which needs gcc # no clang /w RAJA+cuda because it needs nvcc which needs gcc
} }

View File

@ -25,6 +25,8 @@
#include "STDStream.h" #include "STDStream.h"
#elif defined(STD20) #elif defined(STD20)
#include "STD20Stream.hpp" #include "STD20Stream.hpp"
#elif defined(TBB)
#include "TBBStream.hpp"
#elif defined(HIP) #elif defined(HIP)
#include "HIPStream.h" #include "HIPStream.h"
#elif defined(HC) #elif defined(HC)
@ -266,6 +268,10 @@ void run()
// Use the C++20 implementation // Use the C++20 implementation
stream = new STD20Stream<T>(ARRAY_SIZE, deviceIndex); stream = new STD20Stream<T>(ARRAY_SIZE, deviceIndex);
#elif defined(TBB)
// Use the TBB implementation
stream = new TBBStream<T>(ARRAY_SIZE, deviceIndex);
#elif defined(ACC) #elif defined(ACC)
// Use the OpenACC implementation // Use the OpenACC implementation
stream = new ACCStream<T>(ARRAY_SIZE, deviceIndex); stream = new ACCStream<T>(ARRAY_SIZE, deviceIndex);