Merge branch 'main' into rust

This commit is contained in:
Tom Lin 2021-06-10 05:37:03 +01:00
commit c70a5da45b
14 changed files with 383 additions and 40 deletions

1
.gitignore vendored
View File

@ -8,6 +8,7 @@ kokkos-stream
std-stream std-stream
sycl-stream sycl-stream
hip-stream hip-stream
tbb-stream
*.o *.o
*.bc *.bc

View File

@ -13,6 +13,8 @@ All notable changes to this project will be documented in this file.
- Support for CUDA Managed Memory and Page Fault memory. - Support for CUDA Managed Memory and Page Fault memory.
- Added nstream kernel from PRK with associated command line option. - Added nstream kernel from PRK with associated command line option.
- CMake build system added for all models. - CMake build system added for all models.
- SYCL device check for FP64 support.
- New implementation using TBB.
### Changed ### Changed
- Default branch renamed from `master` to `main`. - Default branch renamed from `master` to `main`.
@ -29,6 +31,7 @@ All notable changes to this project will be documented in this file.
- Unified run function in driver code to reduce code duplication, output should be unaffected. - Unified run function in driver code to reduce code duplication, output should be unaffected.
- Normalise sum result by expected value to help false negative errors. - Normalise sum result by expected value to help false negative errors.
- HC version deprecated and moved to a legacy directory. - HC version deprecated and moved to a legacy directory.
- Update RAJA to v0.13.0 (w/ code changes as this is a source incompatible update).
### Removed ### Removed
- Pre-building of kernels in SYCL version to ensure compatibility with SYCL 1.2.1. - Pre-building of kernels in SYCL version to ensure compatibility with SYCL 1.2.1.

View File

@ -112,6 +112,7 @@ register_model(SYCL SYCL SYCLStream.cpp)
register_model(ACC ACC ACCStream.cpp) register_model(ACC ACC ACCStream.cpp)
# defining RAJA collides with the RAJA namespace so USE_RAJA # defining RAJA collides with the RAJA namespace so USE_RAJA
register_model(RAJA USE_RAJA RAJAStream.cpp) register_model(RAJA USE_RAJA RAJAStream.cpp)
register_model(TBB TBB TBBStream.cpp)
set(USAGE ON CACHE BOOL "Whether to print all custom flags for the selected model") set(USAGE ON CACHE BOOL "Whether to print all custom flags for the selected model")
@ -188,3 +189,5 @@ target_link_options(${EXE_NAME} PUBLIC ${LINK_FLAGS} ${CXX_EXTRA_LINK_FLAGS})
if (COMMAND setup_target) if (COMMAND setup_target)
setup_target(${EXE_NAME}) setup_target(${EXE_NAME})
endif () endif ()
install (TARGETS ${EXE_NAME} DESTINATION bin)

View File

@ -5,6 +5,7 @@
// For full license terms please see the LICENSE file distributed with this // For full license terms please see the LICENSE file distributed with this
// source code // source code
#include <cstdlib> // For aligned_alloc
#include "OMPStream.h" #include "OMPStream.h"
#ifndef ALIGNMENT #ifndef ALIGNMENT

View File

@ -5,6 +5,7 @@
// For full license terms please see the LICENSE file distributed with this // For full license terms please see the LICENSE file distributed with this
// source code // source code
#include <cstdlib> // For aligned_alloc
#include <stdexcept> #include <stdexcept>
#include "RAJAStream.hpp" #include "RAJAStream.hpp"

View File

@ -19,6 +19,7 @@ Currently implemented are:
- Kokkos - Kokkos
- RAJA - RAJA
- SYCL - SYCL
- TBB
This code was previously called GPU-STREAM. This code was previously called GPU-STREAM.
@ -90,7 +91,7 @@ For example:
Use this for linking extra libraries (e.g `-lmylib`, or simply `mylib`) Use this for linking extra libraries (e.g `-lmylib`, or simply `mylib`)
-- CXX_EXTRA_LINKER_FLAGS: -- CXX_EXTRA_LINKER_FLAGS:
Append to linker flags (i.e GCC's `-Wl` or equivalent) Append to linker flags (i.e GCC's `-Wl` or equivalent)
-- Available models: OMP;OCL;STD;STD20;HIP;CUDA;KOKKOS;SYCL;ACC;RAJA -- Available models: OMP;OCL;STD;STD20;HIP;CUDA;KOKKOS;SYCL;ACC;RAJA;TBB
-- Selected model : OCL -- Selected model : OCL
-- Supported flags: -- Supported flags:

View File

@ -28,6 +28,14 @@ SYCLStream<T>::SYCLStream(const int ARRAY_SIZE, const int device_index)
throw std::runtime_error("Invalid device index"); throw std::runtime_error("Invalid device index");
device dev = devices[device_index]; device dev = devices[device_index];
// Check device can support FP64 if needed
if (sizeof(T) == sizeof(double))
{
if (dev.get_info<info::device::double_fp_config>().size() == 0) {
throw std::runtime_error("Device does not support double precision, please use --float");
}
}
// Determine sensible dot kernel NDRange configuration // Determine sensible dot kernel NDRange configuration
if (dev.is_cpu()) if (dev.is_cpu())
{ {

29
TBB.cmake Normal file
View File

@ -0,0 +1,29 @@
# Optional flag: root of a standalone oneTBB distribution. When it is left
# empty, setup() relies on whatever TBB package find_package(TBB) can locate.
# The help text is printed to the user, so the closing backtick after `lib/`
# and the header path matter for copy-paste.
register_flag_optional(ONE_TBB_DIR
        "Absolute path to oneTBB (with header `oneapi/tbb.h`) distribution, the directory should contain at least `include/` and `lib/`.
If unspecified, the system TBB (with header `tbb/tbb.h`) will be used via CMake's find_package(TBB)."
        "")
# Optional flag: which TBB partitioner the kernels are compiled with.
# setup() turns the value into a PARTITIONER_<value> compile definition;
# the default is AUTO.
register_flag_optional(PARTITIONER
"Partitioner specifies how a loop template should partition its work among threads.
Possible values are:
AUTO - Optimize range subdivision based on work-stealing events.
AFFINITY - Proportional splitting that optimizes for cache affinity.
STATIC - Distribute work uniformly with no additional load balancing.
SIMPLE - Recursively split its range until it cannot be further subdivided.
See https://spec.oneapi.com/versions/latest/elements/oneTBB/source/algorithms.html#partitioners for more details."
"AUTO")
# Model hook: locates TBB, registers it as a link dependency and emits the
# PARTITIONER_<value> compile definition read by TBBStream.hpp.
#
# This is a macro, so every variable set here (TBB_ROOT and the TBB_STREAM_*
# helpers) is created in the caller's scope.
macro(setup)
    # Reject unknown partitioners up front. An unrecognised PARTITIONER_<value>
    # definition is not an error for the compiler: the header would quietly
    # fall back to the auto partitioner and the benchmark would report results
    # for a configuration the user did not ask for.
    set(TBB_STREAM_PARTITIONERS AUTO AFFINITY STATIC SIMPLE)
    list(FIND TBB_STREAM_PARTITIONERS "${PARTITIONER}" TBB_STREAM_PARTITIONER_INDEX)
    if(TBB_STREAM_PARTITIONER_INDEX EQUAL -1)
        message(FATAL_ERROR
                "Unknown PARTITIONER `${PARTITIONER}`, expected one of: ${TBB_STREAM_PARTITIONERS}")
    endif()

    if(ONE_TBB_DIR)
        # see https://github.com/Kitware/VTK/blob/0a31a9a3c1531ae238ac96a372fec4be42282863/CMake/FindTBB.cmake#L34
        # docs on Intel's website refer to TBB_DIR, which is not correct
        set(TBB_ROOT "${ONE_TBB_DIR}")
    endif()

    # see https://github.com/oneapi-src/oneTBB/blob/master/cmake/README.md#tbbconfig---integration-of-binary-packages
    find_package(TBB REQUIRED)
    register_link_library(TBB::tbb)
    register_definitions(PARTITIONER_${PARTITIONER})
endmacro()

56
TBB.make Normal file
View File

@ -0,0 +1,56 @@
# Builds tbb-stream from main.cpp and TBBStream.cpp.
#
# Variables that may be set on the command line or in the environment:
#   COMPILER     GNU (default), INTEL or INTEL_LEGACY; selects CXX and extra flags.
#   PARTITIONER  AUTO (default), AFFINITY, STATIC or SIMPLE; passed as -DPARTITIONER_<value>.
#   TBB_DIR      Root of the TBB distribution; headers are read from $(TBB_DIR)/include.
#   TBB_LIB_DIR  Directory holding libtbb.so; defaults to $(TBB_DIR)/lib/intel64/gcc4.8.
#   EXTRA_FLAGS  Appended verbatim to the compile command.

ifndef COMPILER
define compiler_help
Set COMPILER to change flags (defaulting to GNU).
Available compilers are:
GNU INTEL INTEL_LEGACY
endef
$(info $(compiler_help))
COMPILER=GNU
endif

# Compiler executable for each supported COMPILER value.
CXX_GNU = g++
CXX_INTEL = icpx
CXX_INTEL_LEGACY = icpc

# Extra flags for each supported COMPILER value.
CXXFLAGS_GNU = -march=native
CXXFLAGS_INTEL = -march=native
CXXFLAGS_INTEL_LEGACY = -qopt-streaming-stores=always

# Resolve the active compiler and flags through the CXX_* / CXXFLAGS_* tables.
CXX = $(CXX_$(COMPILER))
CXXFLAGS = -std=c++11 -O3 $(CXXFLAGS_$(COMPILER))

ifndef PARTITIONER
define partitioner_help
Set PARTITIONER to select TBB's partitioner.
Partitioner specifies how a loop template should partition its work among threads.
Available options:
AUTO - Optimize range subdivision based on work-stealing events.
AFFINITY - Proportional splitting that optimizes for cache affinity.
STATIC - Distribute work uniformly with no additional load balancing.
SIMPLE - Recursively split its range until it cannot be further subdivided.
See https://spec.oneapi.com/versions/latest/elements/oneTBB/source/algorithms.html#partitioners
for more details.
endef
$(info $(partitioner_help))
PARTITIONER=AUTO
endif

PARTITIONER_MODE = -DPARTITIONER_$(PARTITIONER)

# Directory containing libtbb.so. Override it when the TBB distribution does
# not use the lib/intel64/gcc4.8 layout.
TBB_LIB_DIR ?= $(TBB_DIR)/lib/intel64/gcc4.8

tbb-stream: main.cpp TBBStream.cpp
ifndef TBB_DIR
	$(error TBB_DIR is not set; point it at the root of a TBB distribution)
endif
	$(CXX) -DTBB $(PARTITIONER_MODE) $(CXXFLAGS) $^ $(EXTRA_FLAGS) -I$(TBB_DIR)/include -Wl,-rpath,$(TBB_LIB_DIR) $(TBB_LIB_DIR)/libtbb.so -o $@

.PHONY: clean
clean:
	rm -f tbb-stream

134
TBBStream.cpp Normal file
View File

@ -0,0 +1,134 @@
// Copyright (c) 2020 Tom Deakin
// University of Bristol HPC
//
// For full license terms please see the LICENSE file distributed with this
// source code
#include "TBBStream.hpp"

#include <functional> // For std::plus
#include <stdexcept>  // For std::runtime_error
#include <string>     // For std::string
// Allocates the three arrays with ARRAY_SIZE elements each and sets up the
// iteration range [0, ARRAY_SIZE). Only device index 0 is accepted; any other
// value raises std::runtime_error.
template <class T>
TBBStream<T>::TBBStream(const int ARRAY_SIZE, int device)
  : partitioner(),
    range(0, ARRAY_SIZE),
    a(ARRAY_SIZE),
    b(ARRAY_SIZE),
    c(ARRAY_SIZE)
{
  const bool unsupportedDevice = (device != 0);
  if (unsupportedDevice)
  {
    throw std::runtime_error("Device != 0 is not supported by TBB");
  }

  // PARTITIONER_NAME expands to a string literal, so it is concatenated with
  // the prefix by the compiler.
  std::cout << "Using TBB partitioner: " PARTITIONER_NAME << std::endl;
}
// Sets every element of a, b and c to initA, initB and initC respectively,
// processing the index range in parallel chunks.
template <class T>
void TBBStream<T>::init_arrays(T initA, T initB, T initC)
{
  tbb::parallel_for(range, [&](const tbb::blocked_range<size_t>& chunk) {
    const size_t stop = chunk.end();
    for (size_t idx = chunk.begin(); idx < stop; ++idx) {
      a[idx] = initA;
      b[idx] = initB;
      c[idx] = initC;
    }
  }, partitioner);
}
// Copies the current contents of a, b and c into the caller's vectors,
// replacing whatever they held before.
template <class T>
void TBBStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
{
  h_a.assign(a.begin(), a.end());
  h_b.assign(b.begin(), b.end());
  h_c.assign(c.begin(), c.end());
}
// Copy kernel: c[i] = a[i]
template <class T>
void TBBStream<T>::copy()
{
  tbb::parallel_for(range, [&](const tbb::blocked_range<size_t>& chunk) {
    const size_t stop = chunk.end();
    for (size_t idx = chunk.begin(); idx < stop; ++idx) {
      c[idx] = a[idx];
    }
  }, partitioner);
}
// Mul kernel: b[i] = scalar * c[i]
template <class T>
void TBBStream<T>::mul()
{
  const T scalar = startScalar;

  tbb::parallel_for(range, [&](const tbb::blocked_range<size_t>& chunk) {
    const size_t stop = chunk.end();
    for (size_t idx = chunk.begin(); idx < stop; ++idx) {
      b[idx] = scalar * c[idx];
    }
  }, partitioner);
}
// Add kernel: c[i] = a[i] + b[i]
template <class T>
void TBBStream<T>::add()
{
  tbb::parallel_for(range, [&](const tbb::blocked_range<size_t>& chunk) {
    const size_t stop = chunk.end();
    for (size_t idx = chunk.begin(); idx < stop; ++idx) {
      c[idx] = a[idx] + b[idx];
    }
  }, partitioner);
}
// Triad kernel: a[i] = b[i] + scalar * c[i]
template <class T>
void TBBStream<T>::triad()
{
  const T scalar = startScalar;

  tbb::parallel_for(range, [&](const tbb::blocked_range<size_t>& chunk) {
    const size_t stop = chunk.end();
    for (size_t idx = chunk.begin(); idx < stop; ++idx) {
      a[idx] = b[idx] + scalar * c[idx];
    }
  }, partitioner);
}
// Nstream kernel: a[i] += b[i] + scalar * c[i]
template <class T>
void TBBStream<T>::nstream()
{
  const T scalar = startScalar;

  tbb::parallel_for(range, [&](const tbb::blocked_range<size_t>& chunk) {
    const size_t stop = chunk.end();
    for (size_t idx = chunk.begin(); idx < stop; ++idx) {
      a[idx] += b[idx] + scalar * c[idx];
    }
  }, partitioner);
}
// Dot kernel: returns the sum over i of a[i] * b[i].
template <class T>
T TBBStream<T>::dot()
{
  // Per-chunk body: continues the running sum it is handed with the products
  // of the chunk's elements and returns the updated sum.
  const auto accumulateChunk = [&](const tbb::blocked_range<size_t>& chunk, T partial) -> T {
    for (size_t idx = chunk.begin(); idx < chunk.end(); ++idx) {
      partial += a[idx] * b[idx];
    }
    return partial;
  };

  // T{} is the starting value; partial sums are combined with std::plus.
  return tbb::parallel_reduce(range, T{}, accumulateChunk, std::plus<T>(), partitioner);
}
// Prints a notice that device listing is unavailable for this implementation.
void listDevices(void)
{
  const char* const notice = "Listing devices is not supported by TBB";
  std::cout << notice << std::endl;
}
// Returns a fixed placeholder; the device index is ignored.
std::string getDeviceName(const int /*device*/)
{
  return "Device name unavailable";
}
// Returns a fixed placeholder; the device index is ignored.
std::string getDeviceDriver(const int /*device*/)
{
  return "Device driver unavailable";
}
// Explicit instantiations: the member templates are defined in this file, so
// the float and double variants are emitted from this translation unit.
template class TBBStream<float>;
template class TBBStream<double>;

62
TBBStream.hpp Normal file
View File

@ -0,0 +1,62 @@
// Copyright (c) 2020 Tom Deakin
// University of Bristol HPC
//
// For full license terms please see the LICENSE file distributed with this
// source code
#pragma once
#include <iostream>
#include <vector>
#include "tbb/tbb.h"
#include "Stream.h"
// Identifier string for this implementation (consumed outside this header).
#define IMPLEMENTATION_STRING "TBB"
// Compile-time partitioner selection. The first PARTITIONER_<X> macro found
// defined, checked in the order AUTO, AFFINITY, STATIC, SIMPLE, fixes both the
// tbb_partitioner type alias and the PARTITIONER_NAME string literal.
#if defined(PARTITIONER_AUTO)
using tbb_partitioner = tbb::auto_partitioner;
#define PARTITIONER_NAME "auto_partitioner"
#elif defined(PARTITIONER_AFFINITY)
using tbb_partitioner = tbb::affinity_partitioner;
#define PARTITIONER_NAME "affinity_partitioner"
#elif defined(PARTITIONER_STATIC)
using tbb_partitioner = tbb::static_partitioner;
#define PARTITIONER_NAME "static_partitioner"
#elif defined(PARTITIONER_SIMPLE)
using tbb_partitioner = tbb::simple_partitioner;
#define PARTITIONER_NAME "simple_partitioner"
#else
// None of the four macros is defined (this also covers an unrecognised
// PARTITIONER_<X> name): default to auto
using tbb_partitioner = tbb::auto_partitioner;
#define PARTITIONER_NAME "auto_partitioner"
#endif
// Stream implementation whose kernels are expressed with TBB parallel
// algorithms. The partitioner type is the tbb_partitioner alias chosen at
// compile time.
template <class T>
class TBBStream : public Stream<T>
{
  protected:
    // Partitioner instance handed to each TBB algorithm call
    tbb_partitioner partitioner;
    // Index range covered by every kernel
    tbb::blocked_range<size_t> range;
    // The three benchmark arrays
    std::vector<T> a, b, c;

  public:
    TBBStream(const int, int);
    ~TBBStream() = default;

    // Kernels
    void copy() override;
    void add() override;
    void mul() override;
    void triad() override;
    void nstream() override;
    T dot() override;

    // Array initialisation and read-back
    void init_arrays(T initA, T initB, T initC) override;
    void read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c) override;
};

View File

@ -208,6 +208,20 @@ setup_raja() {
check_size check_size
} }
# Downloads and unpacks the oneTBB Linux release tarball, then exports TBB_LIB
# pointing at the unpacked directory under the current working directory.
setup_tbb() {
  echo "Preparing TBB"
  local tbb_release="2021.2.0"
  local tbb_archive="oneapi-tbb-$tbb_release-lin.tgz"
  # The release asset has the same file name as the local tarball.
  local tbb_url="https://github.com/oneapi-src/oneTBB/releases/download/v$tbb_release/$tbb_archive"
  get_and_untar "$tbb_archive" "$tbb_url"
  export_var TBB_LIB "$PWD/oneapi-tbb-$tbb_release"
  verify_dir_exists "$TBB_LIB"
  check_size
}
setup_clang_gcc() { setup_clang_gcc() {
echo "deb http://archive.ubuntu.com/ubuntu focal main universe" | sudo tee -a /etc/apt/sources.list echo "deb http://archive.ubuntu.com/ubuntu focal main universe" | sudo tee -a /etc/apt/sources.list
@ -240,8 +254,7 @@ setup_rocm() {
echo 'deb [arch=amd64] https://repo.radeon.com/rocm/apt/debian/ xenial main' | sudo tee /etc/apt/sources.list.d/rocm.list echo 'deb [arch=amd64] https://repo.radeon.com/rocm/apt/debian/ xenial main' | sudo tee /etc/apt/sources.list.d/rocm.list
sudo apt-get update -qq sudo apt-get update -qq
sudo apt-get install -y -qq rocm-dev sudo apt-get install -y -qq rocm-dev
# AMD needs this rocm_path thing exported... export_var ROCM_PATH "/opt/rocm"
export_var ROCM_PATH "/opt/rocm-4.1.0"
export_var HIP_CXX "$ROCM_PATH/bin/hipcc" export_var HIP_CXX "$ROCM_PATH/bin/hipcc"
verify_bin_exists "$HIP_CXX" verify_bin_exists "$HIP_CXX"
"$HIP_CXX" --version "$HIP_CXX" --version
@ -355,6 +368,7 @@ if [ "$PARALLEL" = true ]; then
setup_dpcpp & setup_dpcpp &
setup_kokkos & setup_kokkos &
setup_raja & setup_raja &
setup_tbb &
wait wait
else else
setup_cmake setup_cmake
@ -365,6 +379,7 @@ else
setup_dpcpp setup_dpcpp
setup_kokkos setup_kokkos
setup_raja setup_raja
setup_tbb
# these need apt # these need apt
setup_clang_gcc setup_clang_gcc
setup_rocm setup_rocm

View File

@ -44,21 +44,26 @@ run_build() {
rm -rf "$build" rm -rf "$build"
set +e set +e
local install_dir="$build/install"
# shellcheck disable=SC2086 # shellcheck disable=SC2086
"$CMAKE_BIN" -B"$build" -H. \ "$CMAKE_BIN" -B"$build" -H. \
-DCMAKE_BUILD_TYPE=Release \ -DCMAKE_BUILD_TYPE=Release \
-DCMAKE_VERBOSE_MAKEFILE=ON \ -DCMAKE_VERBOSE_MAKEFILE=ON \
-DCMAKE_INSTALL_PREFIX="$install_dir" \
-DMODEL="$model" $flags &>>"$log" -DMODEL="$model" $flags &>>"$log"
local model_lower=$(echo "$model" | awk '{print tolower($0)}') local model_lower=$(echo "$model" | awk '{print tolower($0)}')
local cmake_code=$? local cmake_code=$?
"$CMAKE_BIN" --build "$build" -j "$(nproc)" &>>"$log" "$CMAKE_BIN" --build "$build" -j "$(nproc)" &>>"$log"
"$CMAKE_BIN" --build "$build" --target install -j "$(nproc)" &>>"$log"
local cmake_code=$? local cmake_code=$?
set -e set -e
local bin="./$build/$model_lower-stream" local bin="./$build/$model_lower-stream"
local installed_bin="./$install_dir/bin/$model_lower-stream"
echo "Checking for final executable: $bin" echo "Checking for final executable: $bin"
if [[ -f "$bin" ]]; then if [[ -f "$bin" ]]; then
echo "$(tput setaf 2)[PASS!]($model->$build)$(tput sgr0): -DMODEL=$model $flags" echo "$(tput setaf 2)[PASS!]($model->$build)$(tput sgr0): -DMODEL=$model $flags"
@ -66,6 +71,11 @@ run_build() {
cat "$log" | sed '/^--/d' | grep -i "/bin/nvcc" | sed 's/^/ /' cat "$log" | sed '/^--/d' | grep -i "/bin/nvcc" | sed 's/^/ /'
cat "$log" | sed '/^--/d' | grep -i "$grep_kw" | sed 's/^/ /' cat "$log" | sed '/^--/d' | grep -i "$grep_kw" | sed 's/^/ /'
cat "$log" | sed '/^--/d' | grep -i "warning" | sed "s/.*/ $(tput setaf 3)&$(tput sgr0)/" cat "$log" | sed '/^--/d' | grep -i "warning" | sed "s/.*/ $(tput setaf 3)&$(tput sgr0)/"
if [[ ! -f "$installed_bin" ]]; then
echo "$(tput setaf 1)[ERR!] looking for $installed_bin from --target install but it's not there!$(tput sgr0)"
cat "$log"
exit 1
fi
else else
echo "$(tput setaf 1)[FAIL!]($model->$build)$(tput sgr0): -DMODEL=$model $flags" echo "$(tput setaf 1)[FAIL!]($model->$build)$(tput sgr0): -DMODEL=$model $flags"
echo " $(tput setaf 1)CMake exited with code $cmake_code, see full build log at $log, reproduced below:$(tput sgr0)" echo " $(tput setaf 1)CMake exited with code $cmake_code, see full build log at $log, reproduced below:$(tput sgr0)"
@ -78,30 +88,32 @@ run_build() {
### ###
# KOKKOS_SRC="/home/tom/Downloads/kokkos-3.3.00" # KOKKOS_SRC="/home/tom/Downloads/kokkos-3.3.00"
# RAJA_SRC="/home/tom/Downloads/RAJA-v0.13.0" # RAJA_SRC="/home/tom/Downloads/RAJA-v0.13.0"
#
# GCC_CXX="/usr/bin/g++" # GCC_CXX="/usr/bin/g++"
# CLANG_CXX="/usr/bin/clang++" # CLANG_CXX="/usr/bin/clang++"
#
# NVSDK="/home/tom/Downloads/nvhpc_2021_212_Linux_x86_64_cuda_11.2/install_components/Linux_x86_64/21.2/" # NVSDK="/home/tom/Downloads/nvhpc_2021_212_Linux_x86_64_cuda_11.2/install_components/Linux_x86_64/21.2/"
# NVHPC_NVCXX="$NVSDK/compilers/bin/nvc++" # NVHPC_NVCXX="$NVSDK/compilers/bin/nvc++"
# NVHPC_NVCC="$NVSDK/cuda/11.2/bin/nvcc" # NVHPC_NVCC="$NVSDK/cuda/11.2/bin/nvcc"
# NVHPC_CUDA_DIR="$NVSDK/cuda/11.2" # NVHPC_CUDA_DIR="$NVSDK/cuda/11.2"
# "$NVSDK/compilers/bin/makelocalrc" "$NVSDK/compilers/bin/" -x # "$NVSDK/compilers/bin/makelocalrc" "$NVSDK/compilers/bin/" -x
#
# AOCC_CXX="/opt/AMD/aocc-compiler-2.3.0/bin/clang++" # AOCC_CXX="/opt/AMD/aocc-compiler-2.3.0/bin/clang++"
# AOMP_CXX="/usr/lib/aomp/bin/clang++" # AOMP_CXX="/usr/lib/aomp/bin/clang++"
# OCL_LIB="/home/tom/Downloads/oclcpuexp-2020.11.11.0.04_rel/x64/libOpenCL.so" # OCL_LIB="/home/tom/Downloads/oclcpuexp-2020.11.11.0.04_rel/x64/libOpenCL.so"
#
# # AMD needs this rocm_path thing exported... # # AMD needs this rocm_path thing exported...
# export ROCM_PATH="/opt/rocm-4.0.0" # export ROCM_PATH="/opt/rocm-4.0.0"
# HIP_CXX="/opt/rocm-4.0.0/bin/hipcc" # HIP_CXX="/opt/rocm-4.0.0/bin/hipcc"
# COMPUTECPP_DIR="/home/tom/Desktop/computecpp_archive/ComputeCpp-CE-2.3.0-x86_64-linux-gnu" # COMPUTECPP_DIR="/home/tom/Desktop/computecpp_archive/ComputeCpp-CE-2.3.0-x86_64-linux-gnu"
# DPCPP_DIR="/home/tom/Downloads/dpcpp_compiler" # DPCPP_DIR="/home/tom/Downloads/dpcpp_compiler"
# HIPSYCL_DIR="/opt/hipsycl/cff515c/" # HIPSYCL_DIR="/opt/hipsycl/cff515c/"
#
# ICPX_CXX="/opt/intel/oneapi/compiler/2021.1.2/linux/bin/icpx" # ICPX_CXX="/opt/intel/oneapi/compiler/2021.1.2/linux/bin/icpx"
# ICPC_CXX="/opt/intel/oneapi/compiler/2021.1.2/linux/bin/intel64/icpc" # ICPC_CXX="/opt/intel/oneapi/compiler/2021.1.2/linux/bin/intel64/icpc"
#
# TBB_LIB="/home/tom/Downloads/oneapi-tbb-2021.1.1/"
# GCC_STD_PAR_LIB="tbb" # GCC_STD_PAR_LIB="tbb"
# CLANG_STD_PAR_LIB="tbb" # CLANG_STD_PAR_LIB="tbb"
# GCC_OMP_OFFLOAD_AMD=false # GCC_OMP_OFFLOAD_AMD=false
@ -129,6 +141,9 @@ build_gcc() {
run_build $name "${GCC_CXX:?}" STD "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}" run_build $name "${GCC_CXX:?}" STD "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}"
run_build $name "${GCC_CXX:?}" STD20 "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}" run_build $name "${GCC_CXX:?}" STD20 "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}"
run_build $name "${GCC_CXX:?}" TBB "$cxx -DONE_TBB_DIR=$TBB_LIB"
run_build $name "${GCC_CXX:?}" TBB "$cxx" # build TBB again with the system TBB
if [ "${GCC_OMP_OFFLOAD_AMD:-false}" != "false" ]; then if [ "${GCC_OMP_OFFLOAD_AMD:-false}" != "false" ]; then
run_build "amd_$name" "${GCC_CXX:?}" ACC "$cxx -DCXX_EXTRA_FLAGS=-foffload=amdgcn-amdhsa" run_build "amd_$name" "${GCC_CXX:?}" ACC "$cxx -DCXX_EXTRA_FLAGS=-foffload=amdgcn-amdhsa"
run_build "amd_$name" "${GCC_CXX:?}" OMP "$cxx -DOFFLOAD=AMD:$AMD_ARCH" run_build "amd_$name" "${GCC_CXX:?}" OMP "$cxx -DOFFLOAD=AMD:$AMD_ARCH"
@ -146,11 +161,15 @@ build_gcc() {
run_build "cuda_$name" "${GCC_CXX:?}" KOKKOS "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON" run_build "cuda_$name" "${GCC_CXX:?}" KOKKOS "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON"
run_build $name "${GCC_CXX:?}" OCL "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}" run_build $name "${GCC_CXX:?}" OCL "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}"
run_build $name "${GCC_CXX:?}" RAJA "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?}" run_build $name "${GCC_CXX:?}" RAJA "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?}"
run_build "cuda_$name" "${GCC_CXX:?}" RAJA "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?} \
-DENABLE_CUDA=ON \ # FIXME fails due to https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100102
-DTARGET=NVIDIA \ # FIXME we also got https://github.com/NVIDIA/nccl/issues/494
-DCUDA_TOOLKIT_ROOT_DIR=${NVHPC_CUDA_DIR:?} \
-DCUDA_ARCH=$NV_ARCH" # run_build "cuda_$name" "${GCC_CXX:?}" RAJA "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?} \
# -DENABLE_CUDA=ON \
# -DTARGET=NVIDIA \
# -DCUDA_TOOLKIT_ROOT_DIR=${NVHPC_CUDA_DIR:?} \
# -DCUDA_ARCH=$NV_ARCH"
} }
@ -174,6 +193,10 @@ build_clang() {
run_build $name "${CLANG_CXX:?}" OCL "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}" run_build $name "${CLANG_CXX:?}" OCL "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}"
run_build $name "${CLANG_CXX:?}" STD "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" run_build $name "${CLANG_CXX:?}" STD "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}"
# run_build $name "${CLANG_CXX:?}" STD20 "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" # not yet supported # run_build $name "${CLANG_CXX:?}" STD20 "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" # not yet supported
run_build $name "${CLANG_CXX:?}" TBB "$cxx -DONE_TBB_DIR=$TBB_LIB"
run_build $name "${CLANG_CXX:?}" TBB "$cxx" # build TBB again with the system TBB
run_build $name "${CLANG_CXX:?}" RAJA "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?}" run_build $name "${CLANG_CXX:?}" RAJA "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?}"
# no clang /w RAJA+cuda because it needs nvcc which needs gcc # no clang /w RAJA+cuda because it needs nvcc which needs gcc
} }

View File

@ -25,6 +25,8 @@
#include "STDStream.h" #include "STDStream.h"
#elif defined(STD20) #elif defined(STD20)
#include "STD20Stream.hpp" #include "STD20Stream.hpp"
#elif defined(TBB)
#include "TBBStream.hpp"
#elif defined(HIP) #elif defined(HIP)
#include "HIPStream.h" #include "HIPStream.h"
#elif defined(HC) #elif defined(HC)
@ -266,6 +268,10 @@ void run()
// Use the C++20 implementation // Use the C++20 implementation
stream = new STD20Stream<T>(ARRAY_SIZE, deviceIndex); stream = new STD20Stream<T>(ARRAY_SIZE, deviceIndex);
#elif defined(TBB)
// Use the TBB implementation
stream = new TBBStream<T>(ARRAY_SIZE, deviceIndex);
#elif defined(ACC) #elif defined(ACC)
// Use the OpenACC implementation // Use the OpenACC implementation
stream = new ACCStream<T>(ARRAY_SIZE, deviceIndex); stream = new ACCStream<T>(ARRAY_SIZE, deviceIndex);