Merge branch 'main' into rust

2021-06-10 05:37:03 +01:00 · 2021-06-10 05:37:03 +01:00 · c70a5da45b
commit c70a5da45b
parent 2ff883f2f7 25e021caa3
14 changed files with 383 additions and 40 deletions
--- a/.gitignore
+++ b/.gitignore
@ -8,6 +8,7 @@ kokkos-stream
 std-stream
 sycl-stream
 hip-stream
 tbb-stream
 *.o
 *.bc
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -13,6 +13,8 @@ All notable changes to this project will be documented in this file.
 - Support for CUDA Managed Memory and Page Fault memory.
 - Added nstream kernel from PRK with associated command line option.
 - CMake build system added for all models.
 - SYCL device check for FP64 support.
 - New implementation using TBB.
 ### Changed
 - Default branch renamed from `master` to `main`.
@ -29,6 +31,7 @@ All notable changes to this project will be documented in this file.
 - Unified run function in driver code to reduce code duplication, output should be unaffected.
 - Normalise sum result by expected value to help false negative errors.
 - HC version deprecated and moved to a legacy directory.
 - Update RAJA to v0.13.0 (w/ code changes as this is a source incompatible update).
 ### Removed
 - Pre-building of kernels in SYCL version to ensure compatibility with SYCL 1.2.1.
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -112,6 +112,7 @@ register_model(SYCL SYCL SYCLStream.cpp)
 register_model(ACC ACC ACCStream.cpp)
 # defining RAJA collides with the RAJA namespace so USE_RAJA
 register_model(RAJA USE_RAJA RAJAStream.cpp)
 register_model(TBB TBB TBBStream.cpp)
 set(USAGE ON CACHE BOOL "Whether to print all custom flags for the selected model")
@ -188,3 +189,5 @@ target_link_options(${EXE_NAME} PUBLIC ${LINK_FLAGS} ${CXX_EXTRA_LINK_FLAGS})
 if (COMMAND setup_target)
    setup_target(${EXE_NAME})
 endif ()
 install (TARGETS ${EXE_NAME} DESTINATION bin)
--- a/OMPStream.cpp
+++ b/OMPStream.cpp
@ -5,6 +5,7 @@
 // For full license terms please see the LICENSE file distributed with this
 // source code
 #include <cstdlib>  // For aligned_alloc
 #include "OMPStream.h"
 #ifndef ALIGNMENT
--- a/RAJAStream.cpp
+++ b/RAJAStream.cpp
@ -5,6 +5,7 @@
 // For full license terms please see the LICENSE file distributed with this
 // source code
 #include <cstdlib>  // For aligned_alloc
 #include <stdexcept>
 #include "RAJAStream.hpp"
--- a/README.md
+++ b/README.md
@ -19,6 +19,7 @@ Currently implemented are:
  - Kokkos
  - RAJA
  - SYCL
  - TBB
 This code was previously called GPU-STREAM.
@ -90,7 +91,7 @@ For example:
        Use this for linking extra libraries (e.g `-lmylib`, or simply `mylib`) 
 -- CXX_EXTRA_LINKER_FLAGS: 
        Append to linker flags (i.e GCC's `-Wl` or equivalent)
-- Available models:  OMP;OCL;STD;STD20;HIP;CUDA;KOKKOS;SYCL;ACC;RAJA
+-- Available models:  OMP;OCL;STD;STD20;HIP;CUDA;KOKKOS;SYCL;ACC;RAJA;TBB
 -- Selected model  :  OCL
 -- Supported flags:
--- a/SYCLStream.cpp
+++ b/SYCLStream.cpp
@ -28,6 +28,14 @@ SYCLStream<T>::SYCLStream(const int ARRAY_SIZE, const int device_index)
    throw std::runtime_error("Invalid device index");
  device dev = devices[device_index];
  // Check device can support FP64 if needed
  if (sizeof(T) == sizeof(double))
  {
    if (dev.get_info<info::device::double_fp_config>().size() == 0) {
      throw std::runtime_error("Device does not support double precision, please use --float");
    }
  }
  // Determine sensible dot kernel NDRange configuration
  if (dev.is_cpu())
  {
--- a/TBB.cmake
+++ b/TBB.cmake
@ -0,0 +1,29 @@
 # ONE_TBB_DIR: optional root directory of a standalone oneTBB distribution.
 # The default is the empty string; setup() only overrides TBB_ROOT when this is set,
 # otherwise find_package(TBB) is left to locate TBB on its own.
 register_flag_optional(ONE_TBB_DIR
        "Absolute path to oneTBB (with header `oneapi/tbb.h`) distribution, the directory should contain at least `include/` and `lib/`.
         If unspecified, the system TBB (with header `tbb/tbb.h`) will be used via CMake's find_package(TBB)."
        "")
 # PARTITIONER: name of the TBB partitioner to build with; defaults to AUTO.
 # setup() turns the value into the definition PARTITIONER_<value>, which
 # TBBStream.hpp tests to pick the matching tbb::*_partitioner type.
 register_flag_optional(PARTITIONER
        "Partitioner specifies how a loop template should partition its work among threads.
         Possible values are:
            AUTO     - Optimize range subdivision based on work-stealing events.
            AFFINITY - Proportional splitting that optimizes for cache affinity.
            STATIC   - Distribute work uniformly with no additional load balancing.
            SIMPLE   - Recursively split its range until it cannot be further subdivided.
            See https://spec.oneapi.com/versions/latest/elements/oneTBB/source/algorithms.html#partitioners for more details."
        "AUTO")
 # Configures the TBB model: locates TBB, registers TBB::tbb for linking and registers
 # the definition PARTITIONER_<NAME> that TBBStream.hpp tests to choose the partitioner type.
 # NOTE(review): presumably invoked by the top-level CMakeLists.txt once MODEL=TBB is selected - confirm.
 macro(setup)
    if(ONE_TBB_DIR)
        # Point find_package at the user-supplied distribution,
        # see https://github.com/Kitware/VTK/blob/0a31a9a3c1531ae238ac96a372fec4be42282863/CMake/FindTBB.cmake#L34
        set(TBB_ROOT "${ONE_TBB_DIR}")
        # docs on Intel's website refers to TBB_DIR which is not correct
    endif()
    # TBBStream.hpp only recognises the four names below and silently falls back to
    # tbb::auto_partitioner for anything else, so surface typos (e.g. lowercase values)
    # at configure time. A warning rather than an error keeps existing builds working.
    if(NOT "${PARTITIONER}" MATCHES "^(AUTO|AFFINITY|STATIC|SIMPLE)$")
        message(WARNING "Unknown PARTITIONER '${PARTITIONER}': expected one of AUTO, AFFINITY, STATIC or SIMPLE; the build will fall back to auto_partitioner")
    endif()
    # see https://github.com/oneapi-src/oneTBB/blob/master/cmake/README.md#tbbconfig---integration-of-binary-packages
    find_package(TBB REQUIRED)
    register_link_library(TBB::tbb)
    register_definitions(PARTITIONER_${PARTITIONER})
 endmacro()
--- a/TBB.make
+++ b/TBB.make
@ -0,0 +1,56 @@
 # Compiler selection. COMPILER names one of the entries below; when the caller did
 # not set it, a help message is printed and GNU is used.
 ifndef COMPILER
 define compiler_help
 Set COMPILER to change flags (defaulting to GNU).
 Available compilers are:
  GNU INTEL INTEL_LEGACY
 endef
 $(info $(compiler_help))
 COMPILER=GNU
 endif
 # Compiler binary for each supported COMPILER value.
 CXX_GNU          = g++
 CXX_INTEL        = icpx
 CXX_INTEL_LEGACY = icpc
 # Extra tuning flags for each supported COMPILER value.
 CXXFLAGS_GNU          = -march=native
 CXXFLAGS_INTEL        = -march=native
 CXXFLAGS_INTEL_LEGACY = -qopt-streaming-stores=always
 # Resolve the per-compiler entries above by name. A stray earlier assignment through
 # the never-defined COMPILER_<name> family was removed; it was overridden here anyway.
 CXX = $(CXX_$(COMPILER))
 CXXFLAGS = -std=c++11 -O3 $(CXXFLAGS_$(COMPILER))
 # Partitioner selection. When PARTITIONER is not set, the help text below is printed
 # and AUTO is used.
 ifndef PARTITIONER
 define partitioner_help
 Set PARTITIONER to select TBB's partitioner.
 Partitioner specifies how a loop template should partition its work among threads.
 Available options:
  AUTO     - Optimize range subdivision based on work-stealing events.
  AFFINITY - Proportional splitting that optimizes for cache affinity.
  STATIC   - Distribute work uniformly with no additional load balancing.
  SIMPLE   - Recursively split its range until it cannot be further subdivided.
 See https://spec.oneapi.com/versions/latest/elements/oneTBB/source/algorithms.html#partitioners
 for more details.
 endef
 $(info $(partitioner_help))
 PARTITIONER=AUTO
 endif
 # Preprocessor definition handed to the compiler; TBBStream.hpp checks for
 # PARTITIONER_AUTO / _AFFINITY / _STATIC / _SIMPLE to choose the partitioner type.
 PARTITIONER_MODE = -DPARTITIONER_$(PARTITIONER)
 # Builds the tbb-stream executable from main.cpp and TBBStream.cpp with -DTBB.
 # Headers come from $(TBB_DIR)/include and libtbb.so from $(TBB_DIR)/lib/intel64/gcc4.8
 # (also embedded as an rpath). TBB_DIR is not assigned in the visible part of this
 # file, so it presumably has to be supplied by the caller - confirm.
 tbb-stream: main.cpp TBBStream.cpp
 	$(CXX) -DTBB $(PARTITIONER_MODE) $(CXXFLAGS) $^ $(EXTRA_FLAGS) -I$(TBB_DIR)/include -Wl,-rpath,$(TBB_DIR)/lib/intel64/gcc4.8 $(TBB_DIR)/lib/intel64/gcc4.8/libtbb.so -o $@
 # clean is not a file; it only removes the built executable.
 .PHONY: clean
 clean:
 	rm -f tbb-stream
--- a/TBBStream.cpp
+++ b/TBBStream.cpp
@ -0,0 +1,134 @@
 // Copyright (c) 2020 Tom Deakin
 // University of Bristol HPC
 //
 // For full license terms please see the LICENSE file distributed with this
 // source code
 #include <functional>  // For std::plus
 #include <stdexcept>   // For std::runtime_error
 #include <string>
 #include "TBBStream.hpp"
 // Sets up the partitioner, the index range [0, ARRAY_SIZE) and the three arrays of
 // ARRAY_SIZE elements each. Only device index 0 is accepted; any other value throws
 // std::runtime_error. The selected partitioner name is printed to stdout.
 template <class T>
 TBBStream<T>::TBBStream(const int ARRAY_SIZE, int device)
   : partitioner(),
     range(0, ARRAY_SIZE),
     a(ARRAY_SIZE),
     b(ARRAY_SIZE),
     c(ARRAY_SIZE)
 {
   const bool unsupported_device = (device != 0);
   if (unsupported_device)
   {
     throw std::runtime_error("Device != 0 is not supported by TBB");
   }
   std::cout << "Using TBB partitioner: " << PARTITIONER_NAME << std::endl;
 }
 // Sets every element of a, b and c to initA, initB and initC respectively.
 template <class T>
 void TBBStream<T>::init_arrays(T initA, T initB, T initC)
 {
   // The lambda fills whichever sub-range tbb::parallel_for passes to it.
   const auto fill = [&](const tbb::blocked_range<size_t>& chunk) {
     for (size_t idx = chunk.begin(); idx < chunk.end(); ++idx) {
       a[idx] = initA;
       b[idx] = initB;
       c[idx] = initC;
     }
   };
   tbb::parallel_for(range, fill, partitioner);
 }
 // Copies the contents of the internal arrays a, b and c into the caller's vectors.
 template <class T>
 void TBBStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
 {
   // Each destination ends up with the same size and elements as its source.
   h_a.assign(a.begin(), a.end());
   h_b.assign(b.begin(), b.end());
   h_c.assign(c.begin(), c.end());
 }
 // Copy kernel: c[i] = a[i] over the whole index range.
 template <class T>
 void TBBStream<T>::copy()
 {
   const auto kernel = [&](const tbb::blocked_range<size_t>& chunk) {
     for (size_t idx = chunk.begin(); idx < chunk.end(); ++idx) {
       c[idx] = a[idx];
     }
   };
   tbb::parallel_for(range, kernel, partitioner);
 }
 // Mul kernel: b[i] = startScalar * c[i] over the whole index range.
 template <class T>
 void TBBStream<T>::mul()
 {
   const T scalar = startScalar;
   const auto kernel = [&](const tbb::blocked_range<size_t>& chunk) {
     for (size_t idx = chunk.begin(); idx < chunk.end(); ++idx) {
       b[idx] = scalar * c[idx];
     }
   };
   tbb::parallel_for(range, kernel, partitioner);
 }
 // Add kernel: c[i] = a[i] + b[i] over the whole index range.
 template <class T>
 void TBBStream<T>::add()
 {
   const auto kernel = [&](const tbb::blocked_range<size_t>& chunk) {
     for (size_t idx = chunk.begin(); idx < chunk.end(); ++idx) {
       c[idx] = a[idx] + b[idx];
     }
   };
   tbb::parallel_for(range, kernel, partitioner);
 }
 // Triad kernel: a[i] = b[i] + startScalar * c[i] over the whole index range.
 template <class T>
 void TBBStream<T>::triad()
 {
   const T scalar = startScalar;
   const auto kernel = [&](const tbb::blocked_range<size_t>& chunk) {
     for (size_t idx = chunk.begin(); idx < chunk.end(); ++idx) {
       a[idx] = b[idx] + scalar * c[idx];
     }
   };
   tbb::parallel_for(range, kernel, partitioner);
 }
 // Nstream kernel: a[i] += b[i] + startScalar * c[i] over the whole index range.
 template <class T>
 void TBBStream<T>::nstream()
 {
   const T scalar = startScalar;
   const auto kernel = [&](const tbb::blocked_range<size_t>& chunk) {
     for (size_t idx = chunk.begin(); idx < chunk.end(); ++idx) {
       a[idx] += b[idx] + scalar * c[idx];
     }
   };
   tbb::parallel_for(range, kernel, partitioner);
 }
 // Dot kernel: returns the sum of a[i] * b[i] over the whole index range.
 template <class T>
 T TBBStream<T>::dot()
 {
   // Adds the products of one sub-range onto the running value it is handed.
   const auto accumulate_chunk = [&](const tbb::blocked_range<size_t>& chunk, T running) {
     for (size_t idx = chunk.begin(); idx < chunk.end(); ++idx) {
       running += a[idx] * b[idx];
     }
     return running;
   };
   // T{} is the identity value; std::plus<T> joins the partial results.
   return tbb::parallel_reduce(range, T{}, accumulate_chunk, std::plus<T>(), partitioner);
 }
 // Prints a notice to stdout that this implementation cannot list devices.
 void listDevices(void)
 {
   const char* const notice = "Listing devices is not supported by TBB";
   std::cout << notice << std::endl;
 }
 // Returns a fixed placeholder string; the device index is ignored.
 std::string getDeviceName(const int /* device */)
 {
   const std::string placeholder("Device name unavailable");
   return placeholder;
 }
 // Returns a fixed placeholder string; the device index is ignored.
 std::string getDeviceDriver(const int)
 {
   const std::string placeholder("Device driver unavailable");
   return placeholder;
 }
 // Explicit instantiations: the member definitions above live in this .cpp file,
 // so the float and double variants are generated here.
 template class TBBStream<float>;
 template class TBBStream<double>;
--- a/TBBStream.hpp
+++ b/TBBStream.hpp
@ -0,0 +1,62 @@
 // Copyright (c) 2020 Tom Deakin
 // University of Bristol HPC
 //
 // For full license terms please see the LICENSE file distributed with this
 // source code
 #pragma once
 #include <iostream>
 #include <vector>
 #include "tbb/tbb.h"
 #include "Stream.h"
 #define IMPLEMENTATION_STRING "TBB"
 // Compile-time partitioner selection. The first PARTITIONER_* macro found to be
 // defined, checked in the order AUTO, AFFINITY, STATIC, SIMPLE, fixes both the
 // tbb_partitioner type alias and PARTITIONER_NAME, the string the TBBStream
 // constructor prints.
 #if defined(PARTITIONER_AUTO)
 using tbb_partitioner = tbb::auto_partitioner;
 #define PARTITIONER_NAME  "auto_partitioner"
 #elif defined(PARTITIONER_AFFINITY)
 using tbb_partitioner = tbb::affinity_partitioner;
 #define PARTITIONER_NAME  "affinity_partitioner"
 #elif defined(PARTITIONER_STATIC)
 using tbb_partitioner = tbb::static_partitioner;
 #define PARTITIONER_NAME  "static_partitioner"
 #elif defined(PARTITIONER_SIMPLE)
 using tbb_partitioner = tbb::simple_partitioner;
 #define PARTITIONER_NAME  "simple_partitioner"
 #else
 // None of the four macros is defined (this includes an unrecognised
 // PARTITIONER_<x> definition): default to auto
 using tbb_partitioner = tbb::auto_partitioner;
 #define PARTITIONER_NAME  "auto_partitioner"
 #endif
 // Stream<T> implementation whose kernels are defined in TBBStream.cpp on top of
 // tbb::parallel_for and tbb::parallel_reduce.
 template <class T>
 class TBBStream : public Stream<T>
 {
   protected:
     // Partitioner instance passed to every parallel algorithm call
     tbb_partitioner partitioner;
     // Index range the kernels iterate over
     tbb::blocked_range<size_t> range;
     // Benchmark arrays, held as ordinary std::vectors
     std::vector<T> a, b, c;

   public:
     TBBStream(const int, int);
     ~TBBStream() = default;

     // Kernels
     void copy() override;
     void add() override;
     void mul() override;
     void triad() override;
     void nstream() override;
     T dot() override;

     // Array initialisation and read-back
     void init_arrays(T initA, T initB, T initC) override;
     void read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c) override;
 };
--- a/ci-prepare-bionic.sh
+++ b/ci-prepare-bionic.sh
@ -208,6 +208,20 @@ setup_raja() {
  check_size
 }
 # Fetches and unpacks the oneTBB Linux release tarball, then records the unpacked
 # directory as TBB_LIB through export_var and checks that it exists.
 setup_tbb() {
  echo "Preparing TBB"
  local tbb_ver="2021.2.0"
  local tarball="oneapi-tbb-${tbb_ver}-lin.tgz"
  # For offline testing the URL can point at a local mirror instead,
  # e.g. http://localhost:8000/oneapi-tbb-$tbb_ver-lin.tgz
  local url="https://github.com/oneapi-src/oneTBB/releases/download/v${tbb_ver}/${tarball}"
  get_and_untar "${tarball}" "${url}"
  export_var TBB_LIB "${PWD}/oneapi-tbb-${tbb_ver}"
  verify_dir_exists "${TBB_LIB}"
  check_size
 }
 setup_clang_gcc() {
  echo "deb http://archive.ubuntu.com/ubuntu focal main universe" | sudo tee -a /etc/apt/sources.list
@ -240,8 +254,7 @@ setup_rocm() {
  echo 'deb [arch=amd64] https://repo.radeon.com/rocm/apt/debian/ xenial main' | sudo tee /etc/apt/sources.list.d/rocm.list
  sudo apt-get update -qq
  sudo apt-get install -y -qq rocm-dev
-  # AMD needs this rocm_path thing exported...
+  export_var ROCM_PATH "/opt/rocm"
  export_var ROCM_PATH "/opt/rocm-4.1.0"
  export_var HIP_CXX "$ROCM_PATH/bin/hipcc"
  verify_bin_exists "$HIP_CXX"
  "$HIP_CXX" --version
@ -355,6 +368,7 @@ if [ "$PARALLEL" = true ]; then
  setup_dpcpp &
  setup_kokkos &
  setup_raja &
  setup_tbb &
  wait
 else
  setup_cmake
@ -365,6 +379,7 @@ else
  setup_dpcpp
  setup_kokkos
  setup_raja
  setup_tbb
  # these need apt
  setup_clang_gcc
  setup_rocm
--- a/ci-test-compile.sh
+++ b/ci-test-compile.sh
@ -44,21 +44,26 @@ run_build() {
  rm -rf "$build"
  set +e
  local install_dir="$build/install"
  # shellcheck disable=SC2086
  "$CMAKE_BIN" -B"$build" -H. \
    -DCMAKE_BUILD_TYPE=Release \
    -DCMAKE_VERBOSE_MAKEFILE=ON \
    -DCMAKE_INSTALL_PREFIX="$install_dir" \
    -DMODEL="$model" $flags &>>"$log"
  local model_lower=$(echo "$model" | awk '{print tolower($0)}')
  local cmake_code=$?
  "$CMAKE_BIN" --build "$build" -j "$(nproc)" &>>"$log"
  "$CMAKE_BIN" --build "$build" --target install  -j "$(nproc)" &>>"$log"
  local cmake_code=$?
  set -e
  local bin="./$build/$model_lower-stream"
  local installed_bin="./$install_dir/bin/$model_lower-stream"
  echo "Checking for final executable: $bin"
  if [[ -f "$bin" ]]; then
    echo "$(tput setaf 2)[PASS!]($model->$build)$(tput sgr0): -DMODEL=$model $flags"
@ -66,6 +71,11 @@ run_build() {
    cat "$log" | sed '/^--/d' | grep -i "/bin/nvcc" | sed 's/^/    /'
    cat "$log" | sed '/^--/d' | grep -i "$grep_kw" | sed 's/^/    /'
    cat "$log" | sed '/^--/d' | grep -i "warning" | sed "s/.*/    $(tput setaf 3)&$(tput sgr0)/"
    if [[ ! -f "$installed_bin" ]]; then
      echo "$(tput setaf 1)[ERR!] looking for $installed_bin from --target install but it's not there!$(tput sgr0)"
      cat "$log"
      exit 1
    fi
  else
    echo "$(tput setaf 1)[FAIL!]($model->$build)$(tput sgr0): -DMODEL=$model $flags"
    echo "      $(tput setaf 1)CMake exited with code $cmake_code, see full build log at $log, reproduced below:$(tput sgr0)"
@ -78,30 +88,32 @@ run_build() {
 ###
 # KOKKOS_SRC="/home/tom/Downloads/kokkos-3.3.00"
 # RAJA_SRC="/home/tom/Downloads/RAJA-v0.13.0"
-#
+
 # GCC_CXX="/usr/bin/g++"
 # CLANG_CXX="/usr/bin/clang++"
-#
+
 # NVSDK="/home/tom/Downloads/nvhpc_2021_212_Linux_x86_64_cuda_11.2/install_components/Linux_x86_64/21.2/"
 # NVHPC_NVCXX="$NVSDK/compilers/bin/nvc++"
 # NVHPC_NVCC="$NVSDK/cuda/11.2/bin/nvcc"
 # NVHPC_CUDA_DIR="$NVSDK/cuda/11.2"
 # "$NVSDK/compilers/bin/makelocalrc" "$NVSDK/compilers/bin/" -x
-#
+
 # AOCC_CXX="/opt/AMD/aocc-compiler-2.3.0/bin/clang++"
 # AOMP_CXX="/usr/lib/aomp/bin/clang++"
 # OCL_LIB="/home/tom/Downloads/oclcpuexp-2020.11.11.0.04_rel/x64/libOpenCL.so"
-#
+
 # # AMD needs this rocm_path thing exported...
 # export ROCM_PATH="/opt/rocm-4.0.0"
 # HIP_CXX="/opt/rocm-4.0.0/bin/hipcc"
 # COMPUTECPP_DIR="/home/tom/Desktop/computecpp_archive/ComputeCpp-CE-2.3.0-x86_64-linux-gnu"
 # DPCPP_DIR="/home/tom/Downloads/dpcpp_compiler"
 # HIPSYCL_DIR="/opt/hipsycl/cff515c/"
-#
+
 # ICPX_CXX="/opt/intel/oneapi/compiler/2021.1.2/linux/bin/icpx"
 # ICPC_CXX="/opt/intel/oneapi/compiler/2021.1.2/linux/bin/intel64/icpc"
-#
+
 # TBB_LIB="/home/tom/Downloads/oneapi-tbb-2021.1.1/"
 # GCC_STD_PAR_LIB="tbb"
 # CLANG_STD_PAR_LIB="tbb"
 # GCC_OMP_OFFLOAD_AMD=false
@ -129,6 +141,9 @@ build_gcc() {
  run_build $name "${GCC_CXX:?}" STD "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}"
  run_build $name "${GCC_CXX:?}" STD20 "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}"
  run_build $name "${GCC_CXX:?}" TBB "$cxx -DONE_TBB_DIR=$TBB_LIB"
  run_build $name "${GCC_CXX:?}" TBB "$cxx" # build TBB again with the system TBB
  if [ "${GCC_OMP_OFFLOAD_AMD:-false}" != "false" ]; then
    run_build "amd_$name" "${GCC_CXX:?}" ACC "$cxx -DCXX_EXTRA_FLAGS=-foffload=amdgcn-amdhsa"
    run_build "amd_$name" "${GCC_CXX:?}" OMP "$cxx -DOFFLOAD=AMD:$AMD_ARCH"
@ -146,11 +161,15 @@ build_gcc() {
  run_build "cuda_$name" "${GCC_CXX:?}" KOKKOS "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON"
  run_build $name "${GCC_CXX:?}" OCL "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}"
  run_build $name "${GCC_CXX:?}" RAJA "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?}"
-  run_build "cuda_$name" "${GCC_CXX:?}" RAJA "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?} \
+
-  -DENABLE_CUDA=ON \
+#  FIXME fails due to https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100102
-  -DTARGET=NVIDIA \
+#  FIXME we also got https://github.com/NVIDIA/nccl/issues/494
-  -DCUDA_TOOLKIT_ROOT_DIR=${NVHPC_CUDA_DIR:?} \
+
-  -DCUDA_ARCH=$NV_ARCH"
+#  run_build "cuda_$name" "${GCC_CXX:?}" RAJA "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?} \
 #  -DENABLE_CUDA=ON \
 #  -DTARGET=NVIDIA \
 #  -DCUDA_TOOLKIT_ROOT_DIR=${NVHPC_CUDA_DIR:?} \
 #  -DCUDA_ARCH=$NV_ARCH"
 }
@ -174,6 +193,10 @@ build_clang() {
  run_build $name "${CLANG_CXX:?}" OCL "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}"
  run_build $name "${CLANG_CXX:?}" STD "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}"
  # run_build $name "${CLANG_CXX:?}" STD20 "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" # not yet supported
  run_build $name "${CLANG_CXX:?}" TBB "$cxx -DONE_TBB_DIR=$TBB_LIB"
  run_build $name "${CLANG_CXX:?}" TBB "$cxx" # build TBB again with the system TBB
  run_build $name "${CLANG_CXX:?}" RAJA "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?}"
  # no clang /w RAJA+cuda because it needs nvcc which needs gcc
 }
--- a/main.cpp
+++ b/main.cpp
@ -25,6 +25,8 @@
 #include "STDStream.h"
 #elif defined(STD20)
 #include "STD20Stream.hpp"
 #elif defined(TBB)
 #include "TBBStream.hpp"
 #elif defined(HIP)
 #include "HIPStream.h"
 #elif defined(HC)
@ -266,6 +268,10 @@ void run()
  // Use the C++20 implementation
  stream = new STD20Stream<T>(ARRAY_SIZE, deviceIndex);
 #elif defined(TBB)
  // Use the TBB implementation
  stream = new TBBStream<T>(ARRAY_SIZE, deviceIndex);
 #elif defined(ACC)
  // Use the OpenACC implementation
  stream = new ACCStream<T>(ARRAY_SIZE, deviceIndex);