Merge pull request #105 from UoB-HPC/tbb

Initial TBB implementation
2021-06-03 16:07:38 +01:00 · 2021-06-03 16:07:38 +01:00 · dd90598e20
commit dd90598e20
parent 2ab68ab39e 0e3727d8f8
10 changed files with 348 additions and 33 deletions
--- a/.gitignore
+++ b/.gitignore
@ -8,6 +8,7 @@ kokkos-stream
 std-stream
 sycl-stream
 hip-stream
+tbb-stream

 *.o
 *.bc
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -112,6 +112,7 @@ register_model(SYCL SYCL SYCLStream.cpp)
 register_model(ACC ACC ACCStream.cpp)
 # defining RAJA collides with the RAJA namespace so USE_RAJA
 register_model(RAJA USE_RAJA RAJAStream.cpp)
+register_model(TBB TBB TBBStream.cpp)


 set(USAGE ON CACHE BOOL "Whether to print all custom flags for the selected model")
--- a/README.md
+++ b/README.md
@ -19,6 +19,7 @@ Currently implemented are:
  - Kokkos
  - RAJA
  - SYCL
+  - TBB

 This code was previously called GPU-STREAM.

@ -90,7 +91,7 @@ For example:
        Use this for linking extra libraries (e.g `-lmylib`, or simply `mylib`) 
 -- CXX_EXTRA_LINKER_FLAGS: 
        Append to linker flags (i.e GCC's `-Wl` or equivalent)
-- Available models:  OMP;OCL;STD;STD20;HIP;CUDA;KOKKOS;SYCL;ACC;RAJA
+-- Available models:  OMP;OCL;STD;STD20;HIP;CUDA;KOKKOS;SYCL;ACC;RAJA;TBB
 -- Selected model  :  OCL
 -- Supported flags:

--- a/TBB.cmake
+++ b/TBB.cmake
@ -0,0 +1,29 @@
+
+register_flag_optional(ONE_TBB_DIR
+        "Absolute path to oneTBB (with header `onetbb/tbb.h`) distribution, the directory should contain at least `include/` and `lib/.
+         If unspecified, the system TBB (with header `tbb/tbb.h`) will be used via CMake's find_package(TBB)." 
+        "")
+
+
+register_flag_optional(PARTITIONER
+        "Partitioner specifies how a loop template should partition its work among threads.
+         Possible values are:
+            AUTO     - Optimize range subdivision based on work-stealing events.
+            AFFINITY - Proportional splitting that optimizes for cache affinity.
+            STATIC   - Distribute work uniformly with no additional load balancing.
+            SIMPLE   - Recursively split its range until it cannot be further subdivided.
+            See https://spec.oneapi.com/versions/latest/elements/oneTBB/source/algorithms.html#partitioners for more details."
+        "AUTO")
+
+macro(setup)
+    if(ONE_TBB_DIR)
+        set(TBB_ROOT "${ONE_TBB_DIR}") # see https://github.com/Kitware/VTK/blob/0a31a9a3c1531ae238ac96a372fec4be42282863/CMake/FindTBB.cmake#L34
+        # docs on Intel's website refers to TBB_DIR which is not correct
+    endif()
+    
+
+    # see https://github.com/oneapi-src/oneTBB/blob/master/cmake/README.md#tbbconfig---integration-of-binary-packages
+    find_package(TBB REQUIRED)
+    register_link_library(TBB::tbb)
+    register_definitions(PARTITIONER_${PARTITIONER})
+endmacro()
--- a/TBB.make
+++ b/TBB.make
@ -0,0 +1,56 @@
+
+ifndef COMPILER
+define compiler_help
+Set COMPILER to change flags (defaulting to GNU).
+Available compilers are:
+  GNU INTEL INTEL_LEGACY
+
+endef
+$(info $(compiler_help))
+COMPILER=GNU
+endif
+
+
+CXX_GNU          = g++
+CXX_INTEL        = icpx
+CXX_INTEL_LEGACY = icpc
+CXX = $(COMPILER_$(COMPILER))
+
+CXXFLAGS_GNU          = -march=native
+CXXFLAGS_INTEL        = -march=native
+CXXFLAGS_INTEL_LEGACY = -qopt-streaming-stores=always
+
+CXX = $(CXX_$(COMPILER))
+CXXFLAGS = -std=c++11 -O3 $(CXXFLAGS_$(COMPILER))
+
+
+
+ifndef PARTITIONER
+define partitioner_help
+Set PARTITIONER to select TBB's partitioner.
+Partitioner specifies how a loop template should partition its work among threads.
+
+Available options:
+  AUTO     - Optimize range subdivision based on work-stealing events.
+  AFFINITY - Proportional splitting that optimizes for cache affinity.
+  STATIC   - Distribute work uniformly with no additional load balancing.
+  SIMPLE   - Recursively split its range until it cannot be further subdivided.
+
+See https://spec.oneapi.com/versions/latest/elements/oneTBB/source/algorithms.html#partitioners
+for more details.
+
+endef
+$(info $(partitioner_help))
+PARTITIONER=AUTO
+endif
+
+PARTITIONER_MODE = -DPARTITIONER_$(PARTITIONER)
+
+
+tbb-stream: main.cpp TBBStream.cpp
+	$(CXX) -DTBB $(PARTITIONER_MODE) $(CXXFLAGS) $^ $(EXTRA_FLAGS) -I$(TBB_DIR)/include -Wl,-rpath,$(TBB_DIR)/lib/intel64/gcc4.8 $(TBB_DIR)/lib/intel64/gcc4.8/libtbb.so -o $@
+
+.PHONY: clean
+clean:
+	rm -f tbb-stream
+
--- a/TBBStream.cpp
+++ b/TBBStream.cpp
@ -0,0 +1,134 @@
+// Copyright (c) 2020 Tom Deakin
+// University of Bristol HPC
+//
+// For full license terms please see the LICENSE file distributed with this
+// source code
+
+#include "TBBStream.hpp"
+
+template <class T>
+TBBStream<T>::TBBStream(const int ARRAY_SIZE, int device)
+ : partitioner(), range(0, ARRAY_SIZE), a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE)
+{
+  if(device != 0){
+    throw std::runtime_error("Device != 0 is not supported by TBB");
+  }
+  std::cout << "Using TBB partitioner: " PARTITIONER_NAME << std::endl;
+}
+
+
+template <class T>
+void TBBStream<T>::init_arrays(T initA, T initB, T initC)
+{
+
+  tbb::parallel_for(range, [&](const tbb::blocked_range<size_t>& r) {
+    for (size_t i = r.begin(); i < r.end(); ++i) {
+      a[i] = initA;
+      b[i] = initB;
+      c[i] = initC;
+    }
+  }, partitioner);
+
+}
+
+template <class T>
+void TBBStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
+{
+  // Element-wise copy.
+  h_a = a;
+  h_b = b;
+  h_c = c;
+}
+
+template <class T>
+void TBBStream<T>::copy()
+{
+  tbb::parallel_for(range, [&](const tbb::blocked_range<size_t>& r) {
+    for (size_t i = r.begin(); i < r.end(); ++i) {
+       c[i] = a[i];
+    }
+  }, partitioner);
+}
+
+template <class T>
+void TBBStream<T>::mul()
+{
+  const T scalar = startScalar;
+
+  tbb::parallel_for(range, [&](const tbb::blocked_range<size_t>& r) {
+    for (size_t i = r.begin(); i < r.end(); ++i) {
+       b[i] = scalar * c[i];
+    }
+  }, partitioner);
+
+}
+
+template <class T>
+void TBBStream<T>::add()
+{
+
+  tbb::parallel_for(range, [&](const tbb::blocked_range<size_t>& r) {
+    for (size_t i = r.begin(); i < r.end(); ++i) {
+       c[i] = a[i] + b[i];
+    }
+  }, partitioner);
+
+}
+
+template <class T>
+void TBBStream<T>::triad()
+{
+  const T scalar = startScalar;
+
+  tbb::parallel_for(range, [&](const tbb::blocked_range<size_t>& r) {
+    for (size_t i = r.begin(); i < r.end(); ++i) {
+       a[i] = b[i] + scalar * c[i];
+    }
+  }, partitioner);
+
+}
+
+template <class T>
+void TBBStream<T>::nstream()
+{
+  const T scalar = startScalar;
+
+  tbb::parallel_for(range, [&](const tbb::blocked_range<size_t>& r) {
+    for (size_t i = r.begin(); i < r.end(); ++i) {
+       a[i] += b[i] + scalar * c[i];
+    }
+  }, partitioner);
+
+}
+
+template <class T>
+T TBBStream<T>::dot()
+{
+  // sum += a[i] * b[i];
+  return
+    tbb::parallel_reduce(range, T{}, [&](const tbb::blocked_range<size_t>& r, T acc) {
+      for (size_t i = r.begin(); i < r.end(); ++i) {
+        acc += a[i] * b[i];
+      }
+      return acc;
+    }, std::plus<T>(), partitioner);
+}
+
+void listDevices(void)
+{
+   std::cout << "Listing devices is not supported by TBB" << std::endl;
+}
+
+std::string getDeviceName(const int device)
+{
+  return std::string("Device name unavailable");
+}
+
+std::string getDeviceDriver(const int)
+{
+  return std::string("Device driver unavailable");
+}
+
+template class TBBStream<float>;
+template class TBBStream<double>;
+
--- a/TBBStream.hpp
+++ b/TBBStream.hpp
@ -0,0 +1,62 @@
+// Copyright (c) 2020 Tom Deakin
+// University of Bristol HPC
+//
+// For full license terms please see the LICENSE file distributed with this
+// source code
+
+#pragma once
+
+#include <iostream>
+#include <vector>
+#include "tbb/tbb.h"
+#include "Stream.h"
+
+#define IMPLEMENTATION_STRING "TBB"
+
+#if defined(PARTITIONER_AUTO)
+using tbb_partitioner = tbb::auto_partitioner;
+#define PARTITIONER_NAME  "auto_partitioner"
+#elif defined(PARTITIONER_AFFINITY)
+using tbb_partitioner = tbb::affinity_partitioner;
+#define PARTITIONER_NAME  "affinity_partitioner"
+#elif defined(PARTITIONER_STATIC)
+using tbb_partitioner = tbb::static_partitioner;
+#define PARTITIONER_NAME  "static_partitioner"
+#elif defined(PARTITIONER_SIMPLE)
+using tbb_partitioner = tbb::simple_partitioner;
+#define PARTITIONER_NAME  "simple_partitioner"
+#else
+// default to auto
+using tbb_partitioner = tbb::auto_partitioner;
+#define PARTITIONER_NAME  "auto_partitioner"
+#endif
+
+
+template <class T>
+class TBBStream : public Stream<T>
+{
+  protected:
+  
+    tbb_partitioner partitioner;
+    tbb::blocked_range<size_t> range;
+    // Device side pointers
+    std::vector<T> a;
+    std::vector<T> b;
+    std::vector<T> c;
+ 
+  public:
+    TBBStream(const int, int);
+    ~TBBStream() = default;
+
+    virtual void copy() override;
+    virtual void add() override;
+    virtual void mul() override;
+    virtual void triad() override;
+    virtual void nstream() override;
+    virtual T dot() override;
+
+    virtual void init_arrays(T initA, T initB, T initC) override;
+    virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
+
+};
+
--- a/ci-prepare-bionic.sh
+++ b/ci-prepare-bionic.sh
@ -208,6 +208,20 @@ setup_raja() {
  check_size
 }

+setup_tbb() {
+  echo "Preparing TBB"
+  local tbb_ver="2021.2.0"
+  local tarball="oneapi-tbb-$tbb_ver-lin.tgz"
+
+  local url="https://github.com/oneapi-src/oneTBB/releases/download/v$tbb_ver/oneapi-tbb-$tbb_ver-lin.tgz"
+  # local url="http://localhost:8000/oneapi-tbb-$tbb_ver-lin.tgz"
+
+  get_and_untar "$tarball" "$url"
+  export_var TBB_LIB "$PWD/oneapi-tbb-$tbb_ver"
+  verify_dir_exists "$TBB_LIB"
+  check_size
+}
+
 setup_clang_gcc() {

  echo "deb http://archive.ubuntu.com/ubuntu focal main universe" | sudo tee -a /etc/apt/sources.list
@ -354,6 +368,7 @@ if [ "$PARALLEL" = true ]; then
  setup_dpcpp &
  setup_kokkos &
  setup_raja &
+  setup_tbb &
  wait
 else
  setup_cmake
@ -364,6 +379,7 @@ else
  setup_dpcpp
  setup_kokkos
  setup_raja
+  setup_tbb
  # these need apt
  setup_clang_gcc
  setup_rocm
--- a/ci-test-compile.sh
+++ b/ci-test-compile.sh
@ -86,38 +86,40 @@ run_build() {
 }

 ###
-#KOKKOS_SRC="/home/tom/Downloads/kokkos-3.3.00"
-#RAJA_SRC="/home/tom/Downloads/RAJA-v0.13.0"
-#
-#GCC_CXX="/usr/bin/g++"
-#CLANG_CXX="/usr/bin/clang++"
-#
-#NVSDK="/home/tom/Downloads/nvhpc_2021_212_Linux_x86_64_cuda_11.2/install_components/Linux_x86_64/21.2/"
-#NVHPC_NVCXX="$NVSDK/compilers/bin/nvc++"
-#NVHPC_NVCC="$NVSDK/cuda/11.2/bin/nvcc"
-#NVHPC_CUDA_DIR="$NVSDK/cuda/11.2"
-#"$NVSDK/compilers/bin/makelocalrc" "$NVSDK/compilers/bin/" -x
-#
-#AOCC_CXX="/opt/AMD/aocc-compiler-2.3.0/bin/clang++"
-#AOMP_CXX="/usr/lib/aomp/bin/clang++"
-#OCL_LIB="/home/tom/Downloads/oclcpuexp-2020.11.11.0.04_rel/x64/libOpenCL.so"
-#
-## AMD needs this rocm_path thing exported...
-#export ROCM_PATH="/opt/rocm-4.0.0"
-#HIP_CXX="/opt/rocm-4.0.0/bin/hipcc"
-#COMPUTECPP_DIR="/home/tom/Desktop/computecpp_archive/ComputeCpp-CE-2.3.0-x86_64-linux-gnu"
-#DPCPP_DIR="/home/tom/Downloads/dpcpp_compiler"
-#HIPSYCL_DIR="/opt/hipsycl/cff515c/"
-#
-#ICPX_CXX="/opt/intel/oneapi/compiler/2021.1.2/linux/bin/icpx"
-#ICPC_CXX="/opt/intel/oneapi/compiler/2021.1.2/linux/bin/intel64/icpc"
-#
-#GCC_STD_PAR_LIB="tbb"
-#CLANG_STD_PAR_LIB="tbb"
-#GCC_OMP_OFFLOAD_AMD=false
-#GCC_OMP_OFFLOAD_NVIDIA=true
-#CLANG_OMP_OFFLOAD_AMD=false
-#CLANG_OMP_OFFLOAD_NVIDIA=false
+# KOKKOS_SRC="/home/tom/Downloads/kokkos-3.3.00"
+# RAJA_SRC="/home/tom/Downloads/RAJA-v0.13.0"
+
+# GCC_CXX="/usr/bin/g++"
+# CLANG_CXX="/usr/bin/clang++"
+
+# NVSDK="/home/tom/Downloads/nvhpc_2021_212_Linux_x86_64_cuda_11.2/install_components/Linux_x86_64/21.2/"
+# NVHPC_NVCXX="$NVSDK/compilers/bin/nvc++"
+# NVHPC_NVCC="$NVSDK/cuda/11.2/bin/nvcc"
+# NVHPC_CUDA_DIR="$NVSDK/cuda/11.2"
+# "$NVSDK/compilers/bin/makelocalrc" "$NVSDK/compilers/bin/" -x
+
+# AOCC_CXX="/opt/AMD/aocc-compiler-2.3.0/bin/clang++"
+# AOMP_CXX="/usr/lib/aomp/bin/clang++"
+# OCL_LIB="/home/tom/Downloads/oclcpuexp-2020.11.11.0.04_rel/x64/libOpenCL.so"
+
+# # AMD needs this rocm_path thing exported...
+# export ROCM_PATH="/opt/rocm-4.0.0"
+# HIP_CXX="/opt/rocm-4.0.0/bin/hipcc"
+# COMPUTECPP_DIR="/home/tom/Desktop/computecpp_archive/ComputeCpp-CE-2.3.0-x86_64-linux-gnu"
+# DPCPP_DIR="/home/tom/Downloads/dpcpp_compiler"
+# HIPSYCL_DIR="/opt/hipsycl/cff515c/"
+
+# ICPX_CXX="/opt/intel/oneapi/compiler/2021.1.2/linux/bin/icpx"
+# ICPC_CXX="/opt/intel/oneapi/compiler/2021.1.2/linux/bin/intel64/icpc"
+
+# TBB_LIB="/home/tom/Downloads/oneapi-tbb-2021.1.1/"
+
+# GCC_STD_PAR_LIB="tbb"
+# CLANG_STD_PAR_LIB="tbb"
+# GCC_OMP_OFFLOAD_AMD=false
+# GCC_OMP_OFFLOAD_NVIDIA=true
+# CLANG_OMP_OFFLOAD_AMD=false
+# CLANG_OMP_OFFLOAD_NVIDIA=false
 ###

 AMD_ARCH="gfx_903"
@ -139,6 +141,9 @@ build_gcc() {
  run_build $name "${GCC_CXX:?}" STD "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}"
  run_build $name "${GCC_CXX:?}" STD20 "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}"

+  run_build $name "${GCC_CXX:?}" TBB "$cxx -DONE_TBB_DIR=$TBB_LIB"
+  run_build $name "${GCC_CXX:?}" TBB "$cxx" # build TBB again with the system TBB
+
  if [ "${GCC_OMP_OFFLOAD_AMD:-false}" != "false" ]; then
    run_build "amd_$name" "${GCC_CXX:?}" ACC "$cxx -DCXX_EXTRA_FLAGS=-foffload=amdgcn-amdhsa"
    run_build "amd_$name" "${GCC_CXX:?}" OMP "$cxx -DOFFLOAD=AMD:$AMD_ARCH"
@ -188,6 +193,10 @@ build_clang() {
  run_build $name "${CLANG_CXX:?}" OCL "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}"
  run_build $name "${CLANG_CXX:?}" STD "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}"
  # run_build $name "${LANG_CXX:?}" STD20 "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" # not yet supported
+  
+  run_build $name "${CLANG_CXX:?}" TBB "$cxx -DONE_TBB_DIR=$TBB_LIB"
+  run_build $name "${CLANG_CXX:?}" TBB "$cxx" # build TBB again with the system TBB
+
  run_build $name "${CLANG_CXX:?}" RAJA "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?}"
  # no clang /w RAJA+cuda because it needs nvcc which needs gcc
 }
--- a/main.cpp
+++ b/main.cpp
@ -25,6 +25,8 @@
 #include "STDStream.h"
 #elif defined(STD20)
 #include "STD20Stream.hpp"
+#elif defined(TBB)
+#include "TBBStream.hpp"
 #elif defined(HIP)
 #include "HIPStream.h"
 #elif defined(HC)
@ -266,6 +268,10 @@ void run()
  // Use the C++20 implementation
  stream = new STD20Stream<T>(ARRAY_SIZE, deviceIndex);

+#elif defined(TBB)
+  // Use the C++20 implementation
+  stream = new TBBStream<T>(ARRAY_SIZE, deviceIndex);
+
 #elif defined(ACC)
  // Use the OpenACC implementation
  stream = new ACCStream<T>(ARRAY_SIZE, deviceIndex);