Merge branch 'main' into rust
This commit is contained in:
commit
c70a5da45b
1
.gitignore
vendored
1
.gitignore
vendored
@ -8,6 +8,7 @@ kokkos-stream
|
|||||||
std-stream
|
std-stream
|
||||||
sycl-stream
|
sycl-stream
|
||||||
hip-stream
|
hip-stream
|
||||||
|
tbb-stream
|
||||||
|
|
||||||
*.o
|
*.o
|
||||||
*.bc
|
*.bc
|
||||||
|
|||||||
@ -13,6 +13,8 @@ All notable changes to this project will be documented in this file.
|
|||||||
- Support for CUDA Managed Memory and Page Fault memory.
|
- Support for CUDA Managed Memory and Page Fault memory.
|
||||||
- Added nstream kernel from PRK with associated command line option.
|
- Added nstream kernel from PRK with associated command line option.
|
||||||
- CMake build system added for all models.
|
- CMake build system added for all models.
|
||||||
|
- SYCL device check for FP64 support.
|
||||||
|
- New implementation using TBB.
|
||||||
|
|
||||||
### Changed
|
### Changed
|
||||||
- Default branch renamed from `master` to `main`.
|
- Default branch renamed from `master` to `main`.
|
||||||
@ -29,6 +31,7 @@ All notable changes to this project will be documented in this file.
|
|||||||
- Unified run function in driver code to reduce code duplication, output should be unaffected.
|
- Unified run function in driver code to reduce code duplication, output should be unaffected.
|
||||||
- Normalise sum result by expected value to help false negative errors.
|
- Normalise sum result by expected value to help false negative errors.
|
||||||
- HC version deprecated and moved to a legacy directory.
|
- HC version deprecated and moved to a legacy directory.
|
||||||
|
- Update RAJA to v0.13.0 (w/ code changes as this is a source incompatible update).
|
||||||
|
|
||||||
### Removed
|
### Removed
|
||||||
- Pre-building of kernels in SYCL version to ensure compatibility with SYCL 1.2.1.
|
- Pre-building of kernels in SYCL version to ensure compatibility with SYCL 1.2.1.
|
||||||
|
|||||||
@ -112,6 +112,7 @@ register_model(SYCL SYCL SYCLStream.cpp)
|
|||||||
register_model(ACC ACC ACCStream.cpp)
|
register_model(ACC ACC ACCStream.cpp)
|
||||||
# defining RAJA collides with the RAJA namespace so USE_RAJA
|
# defining RAJA collides with the RAJA namespace so USE_RAJA
|
||||||
register_model(RAJA USE_RAJA RAJAStream.cpp)
|
register_model(RAJA USE_RAJA RAJAStream.cpp)
|
||||||
|
register_model(TBB TBB TBBStream.cpp)
|
||||||
|
|
||||||
|
|
||||||
set(USAGE ON CACHE BOOL "Whether to print all custom flags for the selected model")
|
set(USAGE ON CACHE BOOL "Whether to print all custom flags for the selected model")
|
||||||
@ -188,3 +189,5 @@ target_link_options(${EXE_NAME} PUBLIC ${LINK_FLAGS} ${CXX_EXTRA_LINK_FLAGS})
|
|||||||
if (COMMAND setup_target)
|
if (COMMAND setup_target)
|
||||||
setup_target(${EXE_NAME})
|
setup_target(${EXE_NAME})
|
||||||
endif ()
|
endif ()
|
||||||
|
|
||||||
|
install (TARGETS ${EXE_NAME} DESTINATION bin)
|
||||||
@ -5,6 +5,7 @@
|
|||||||
// For full license terms please see the LICENSE file distributed with this
|
// For full license terms please see the LICENSE file distributed with this
|
||||||
// source code
|
// source code
|
||||||
|
|
||||||
|
#include <cstdlib> // For aligned_alloc
|
||||||
#include "OMPStream.h"
|
#include "OMPStream.h"
|
||||||
|
|
||||||
#ifndef ALIGNMENT
|
#ifndef ALIGNMENT
|
||||||
|
|||||||
@ -5,6 +5,7 @@
|
|||||||
// For full license terms please see the LICENSE file distributed with this
|
// For full license terms please see the LICENSE file distributed with this
|
||||||
// source code
|
// source code
|
||||||
|
|
||||||
|
#include <cstdlib> // For aligned_alloc
|
||||||
#include <stdexcept>
|
#include <stdexcept>
|
||||||
#include "RAJAStream.hpp"
|
#include "RAJAStream.hpp"
|
||||||
|
|
||||||
|
|||||||
@ -19,6 +19,7 @@ Currently implemented are:
|
|||||||
- Kokkos
|
- Kokkos
|
||||||
- RAJA
|
- RAJA
|
||||||
- SYCL
|
- SYCL
|
||||||
|
- TBB
|
||||||
|
|
||||||
This code was previously called GPU-STREAM.
|
This code was previously called GPU-STREAM.
|
||||||
|
|
||||||
@ -90,7 +91,7 @@ For example:
|
|||||||
Use this for linking extra libraries (e.g `-lmylib`, or simply `mylib`)
|
Use this for linking extra libraries (e.g `-lmylib`, or simply `mylib`)
|
||||||
-- CXX_EXTRA_LINKER_FLAGS:
|
-- CXX_EXTRA_LINKER_FLAGS:
|
||||||
Append to linker flags (i.e GCC's `-Wl` or equivalent)
|
Append to linker flags (i.e GCC's `-Wl` or equivalent)
|
||||||
-- Available models: OMP;OCL;STD;STD20;HIP;CUDA;KOKKOS;SYCL;ACC;RAJA
|
-- Available models: OMP;OCL;STD;STD20;HIP;CUDA;KOKKOS;SYCL;ACC;RAJA;TBB
|
||||||
-- Selected model : OCL
|
-- Selected model : OCL
|
||||||
-- Supported flags:
|
-- Supported flags:
|
||||||
|
|
||||||
|
|||||||
@ -28,6 +28,14 @@ SYCLStream<T>::SYCLStream(const int ARRAY_SIZE, const int device_index)
|
|||||||
throw std::runtime_error("Invalid device index");
|
throw std::runtime_error("Invalid device index");
|
||||||
device dev = devices[device_index];
|
device dev = devices[device_index];
|
||||||
|
|
||||||
|
// Check device can support FP64 if needed
|
||||||
|
if (sizeof(T) == sizeof(double))
|
||||||
|
{
|
||||||
|
if (dev.get_info<info::device::double_fp_config>().size() == 0) {
|
||||||
|
throw std::runtime_error("Device does not support double precision, please use --float");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Determine sensible dot kernel NDRange configuration
|
// Determine sensible dot kernel NDRange configuration
|
||||||
if (dev.is_cpu())
|
if (dev.is_cpu())
|
||||||
{
|
{
|
||||||
|
|||||||
29
TBB.cmake
Normal file
29
TBB.cmake
Normal file
@ -0,0 +1,29 @@
|
|||||||
|
|
||||||
|
register_flag_optional(ONE_TBB_DIR
|
||||||
|
"Absolute path to oneTBB (with header `onetbb/tbb.h`) distribution, the directory should contain at least `include/` and `lib/`.
|
||||||
|
If unspecified, the system TBB (with header `tbb/tbb.h`) will be used via CMake's find_package(TBB)."
|
||||||
|
"")
|
||||||
|
|
||||||
|
|
||||||
|
register_flag_optional(PARTITIONER
|
||||||
|
"Partitioner specifies how a loop template should partition its work among threads.
|
||||||
|
Possible values are:
|
||||||
|
AUTO - Optimize range subdivision based on work-stealing events.
|
||||||
|
AFFINITY - Proportional splitting that optimizes for cache affinity.
|
||||||
|
STATIC - Distribute work uniformly with no additional load balancing.
|
||||||
|
SIMPLE - Recursively split its range until it cannot be further subdivided.
|
||||||
|
See https://spec.oneapi.com/versions/latest/elements/oneTBB/source/algorithms.html#partitioners for more details."
|
||||||
|
"AUTO")
|
||||||
|
|
||||||
|
macro(setup)
|
||||||
|
if(ONE_TBB_DIR)
|
||||||
|
set(TBB_ROOT "${ONE_TBB_DIR}") # see https://github.com/Kitware/VTK/blob/0a31a9a3c1531ae238ac96a372fec4be42282863/CMake/FindTBB.cmake#L34
|
||||||
|
# The docs on Intel's website refer to TBB_DIR, which is not correct
|
||||||
|
endif()
|
||||||
|
|
||||||
|
|
||||||
|
# see https://github.com/oneapi-src/oneTBB/blob/master/cmake/README.md#tbbconfig---integration-of-binary-packages
|
||||||
|
find_package(TBB REQUIRED)
|
||||||
|
register_link_library(TBB::tbb)
|
||||||
|
register_definitions(PARTITIONER_${PARTITIONER})
|
||||||
|
endmacro()
|
||||||
56
TBB.make
Normal file
56
TBB.make
Normal file
@ -0,0 +1,56 @@
|
|||||||
|
|
||||||
|
ifndef COMPILER
|
||||||
|
define compiler_help
|
||||||
|
Set COMPILER to change flags (defaulting to GNU).
|
||||||
|
Available compilers are:
|
||||||
|
GNU INTEL INTEL_LEGACY
|
||||||
|
|
||||||
|
endef
|
||||||
|
$(info $(compiler_help))
|
||||||
|
COMPILER=GNU
|
||||||
|
endif
|
||||||
|
|
||||||
|
|
||||||
|
CXX_GNU = g++
|
||||||
|
CXX_INTEL = icpx
|
||||||
|
CXX_INTEL_LEGACY = icpc
|
||||||
|
# Select the compiler binary for the chosen COMPILER (CXX_GNU, CXX_INTEL, CXX_INTEL_LEGACY).
CXX = $(CXX_$(COMPILER))
|
||||||
|
|
||||||
|
CXXFLAGS_GNU = -march=native
|
||||||
|
CXXFLAGS_INTEL = -march=native
|
||||||
|
CXXFLAGS_INTEL_LEGACY = -qopt-streaming-stores=always
|
||||||
|
|
||||||
|
CXX = $(CXX_$(COMPILER))
|
||||||
|
CXXFLAGS = -std=c++11 -O3 $(CXXFLAGS_$(COMPILER))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
ifndef PARTITIONER
|
||||||
|
define partitioner_help
|
||||||
|
Set PARTITIONER to select TBB's partitioner.
|
||||||
|
Partitioner specifies how a loop template should partition its work among threads.
|
||||||
|
|
||||||
|
Available options:
|
||||||
|
AUTO - Optimize range subdivision based on work-stealing events.
|
||||||
|
AFFINITY - Proportional splitting that optimizes for cache affinity.
|
||||||
|
STATIC - Distribute work uniformly with no additional load balancing.
|
||||||
|
SIMPLE - Recursively split its range until it cannot be further subdivided.
|
||||||
|
|
||||||
|
See https://spec.oneapi.com/versions/latest/elements/oneTBB/source/algorithms.html#partitioners
|
||||||
|
for more details.
|
||||||
|
|
||||||
|
endef
|
||||||
|
$(info $(partitioner_help))
|
||||||
|
PARTITIONER=AUTO
|
||||||
|
endif
|
||||||
|
|
||||||
|
PARTITIONER_MODE = -DPARTITIONER_$(PARTITIONER)
|
||||||
|
|
||||||
|
|
||||||
|
tbb-stream: main.cpp TBBStream.cpp
|
||||||
|
$(CXX) -DTBB $(PARTITIONER_MODE) $(CXXFLAGS) $^ $(EXTRA_FLAGS) -I$(TBB_DIR)/include -Wl,-rpath,$(TBB_DIR)/lib/intel64/gcc4.8 $(TBB_DIR)/lib/intel64/gcc4.8/libtbb.so -o $@
|
||||||
|
|
||||||
|
.PHONY: clean
|
||||||
|
clean:
|
||||||
|
rm -f tbb-stream
|
||||||
|
|
||||||
134
TBBStream.cpp
Normal file
134
TBBStream.cpp
Normal file
@ -0,0 +1,134 @@
|
|||||||
|
// Copyright (c) 2020 Tom Deakin
|
||||||
|
// University of Bristol HPC
|
||||||
|
//
|
||||||
|
// For full license terms please see the LICENSE file distributed with this
|
||||||
|
// source code
|
||||||
|
|
||||||
|
#include "TBBStream.hpp"
|
||||||
|
|
||||||
|
template <class T>
|
||||||
|
TBBStream<T>::TBBStream(const int ARRAY_SIZE, int device)
|
||||||
|
: partitioner(), range(0, ARRAY_SIZE), a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE)
|
||||||
|
{
|
||||||
|
if(device != 0){
|
||||||
|
throw std::runtime_error("Device != 0 is not supported by TBB");
|
||||||
|
}
|
||||||
|
std::cout << "Using TBB partitioner: " PARTITIONER_NAME << std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
template <class T>
|
||||||
|
void TBBStream<T>::init_arrays(T initA, T initB, T initC)
|
||||||
|
{
|
||||||
|
|
||||||
|
tbb::parallel_for(range, [&](const tbb::blocked_range<size_t>& r) {
|
||||||
|
for (size_t i = r.begin(); i < r.end(); ++i) {
|
||||||
|
a[i] = initA;
|
||||||
|
b[i] = initB;
|
||||||
|
c[i] = initC;
|
||||||
|
}
|
||||||
|
}, partitioner);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class T>
|
||||||
|
void TBBStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
|
||||||
|
{
|
||||||
|
// Element-wise copy.
|
||||||
|
h_a = a;
|
||||||
|
h_b = b;
|
||||||
|
h_c = c;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class T>
|
||||||
|
void TBBStream<T>::copy()
|
||||||
|
{
|
||||||
|
tbb::parallel_for(range, [&](const tbb::blocked_range<size_t>& r) {
|
||||||
|
for (size_t i = r.begin(); i < r.end(); ++i) {
|
||||||
|
c[i] = a[i];
|
||||||
|
}
|
||||||
|
}, partitioner);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class T>
|
||||||
|
void TBBStream<T>::mul()
|
||||||
|
{
|
||||||
|
const T scalar = startScalar;
|
||||||
|
|
||||||
|
tbb::parallel_for(range, [&](const tbb::blocked_range<size_t>& r) {
|
||||||
|
for (size_t i = r.begin(); i < r.end(); ++i) {
|
||||||
|
b[i] = scalar * c[i];
|
||||||
|
}
|
||||||
|
}, partitioner);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class T>
|
||||||
|
void TBBStream<T>::add()
|
||||||
|
{
|
||||||
|
|
||||||
|
tbb::parallel_for(range, [&](const tbb::blocked_range<size_t>& r) {
|
||||||
|
for (size_t i = r.begin(); i < r.end(); ++i) {
|
||||||
|
c[i] = a[i] + b[i];
|
||||||
|
}
|
||||||
|
}, partitioner);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class T>
|
||||||
|
void TBBStream<T>::triad()
|
||||||
|
{
|
||||||
|
const T scalar = startScalar;
|
||||||
|
|
||||||
|
tbb::parallel_for(range, [&](const tbb::blocked_range<size_t>& r) {
|
||||||
|
for (size_t i = r.begin(); i < r.end(); ++i) {
|
||||||
|
a[i] = b[i] + scalar * c[i];
|
||||||
|
}
|
||||||
|
}, partitioner);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class T>
|
||||||
|
void TBBStream<T>::nstream()
|
||||||
|
{
|
||||||
|
const T scalar = startScalar;
|
||||||
|
|
||||||
|
tbb::parallel_for(range, [&](const tbb::blocked_range<size_t>& r) {
|
||||||
|
for (size_t i = r.begin(); i < r.end(); ++i) {
|
||||||
|
a[i] += b[i] + scalar * c[i];
|
||||||
|
}
|
||||||
|
}, partitioner);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class T>
|
||||||
|
T TBBStream<T>::dot()
|
||||||
|
{
|
||||||
|
// sum += a[i] * b[i];
|
||||||
|
return
|
||||||
|
tbb::parallel_reduce(range, T{}, [&](const tbb::blocked_range<size_t>& r, T acc) {
|
||||||
|
for (size_t i = r.begin(); i < r.end(); ++i) {
|
||||||
|
acc += a[i] * b[i];
|
||||||
|
}
|
||||||
|
return acc;
|
||||||
|
}, std::plus<T>(), partitioner);
|
||||||
|
}
|
||||||
|
|
||||||
|
void listDevices(void)
|
||||||
|
{
|
||||||
|
std::cout << "Listing devices is not supported by TBB" << std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string getDeviceName(const int device)
|
||||||
|
{
|
||||||
|
return std::string("Device name unavailable");
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string getDeviceDriver(const int)
|
||||||
|
{
|
||||||
|
return std::string("Device driver unavailable");
|
||||||
|
}
|
||||||
|
|
||||||
|
template class TBBStream<float>;
|
||||||
|
template class TBBStream<double>;
|
||||||
|
|
||||||
62
TBBStream.hpp
Normal file
62
TBBStream.hpp
Normal file
@ -0,0 +1,62 @@
|
|||||||
|
// Copyright (c) 2020 Tom Deakin
|
||||||
|
// University of Bristol HPC
|
||||||
|
//
|
||||||
|
// For full license terms please see the LICENSE file distributed with this
|
||||||
|
// source code
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <iostream>
|
||||||
|
#include <vector>
|
||||||
|
#include "tbb/tbb.h"
|
||||||
|
#include "Stream.h"
|
||||||
|
|
||||||
|
#define IMPLEMENTATION_STRING "TBB"
|
||||||
|
|
||||||
|
#if defined(PARTITIONER_AUTO)
|
||||||
|
using tbb_partitioner = tbb::auto_partitioner;
|
||||||
|
#define PARTITIONER_NAME "auto_partitioner"
|
||||||
|
#elif defined(PARTITIONER_AFFINITY)
|
||||||
|
using tbb_partitioner = tbb::affinity_partitioner;
|
||||||
|
#define PARTITIONER_NAME "affinity_partitioner"
|
||||||
|
#elif defined(PARTITIONER_STATIC)
|
||||||
|
using tbb_partitioner = tbb::static_partitioner;
|
||||||
|
#define PARTITIONER_NAME "static_partitioner"
|
||||||
|
#elif defined(PARTITIONER_SIMPLE)
|
||||||
|
using tbb_partitioner = tbb::simple_partitioner;
|
||||||
|
#define PARTITIONER_NAME "simple_partitioner"
|
||||||
|
#else
|
||||||
|
// default to auto
|
||||||
|
using tbb_partitioner = tbb::auto_partitioner;
|
||||||
|
#define PARTITIONER_NAME "auto_partitioner"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
template <class T>
|
||||||
|
class TBBStream : public Stream<T>
|
||||||
|
{
|
||||||
|
protected:
|
||||||
|
|
||||||
|
tbb_partitioner partitioner;
|
||||||
|
tbb::blocked_range<size_t> range;
|
||||||
|
// Array storage (std::vector, not raw device pointers)
|
||||||
|
std::vector<T> a;
|
||||||
|
std::vector<T> b;
|
||||||
|
std::vector<T> c;
|
||||||
|
|
||||||
|
public:
|
||||||
|
TBBStream(const int, int);
|
||||||
|
~TBBStream() = default;
|
||||||
|
|
||||||
|
virtual void copy() override;
|
||||||
|
virtual void add() override;
|
||||||
|
virtual void mul() override;
|
||||||
|
virtual void triad() override;
|
||||||
|
virtual void nstream() override;
|
||||||
|
virtual T dot() override;
|
||||||
|
|
||||||
|
virtual void init_arrays(T initA, T initB, T initC) override;
|
||||||
|
virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
|
||||||
|
|
||||||
|
};
|
||||||
|
|
||||||
@ -208,6 +208,20 @@ setup_raja() {
|
|||||||
check_size
|
check_size
|
||||||
}
|
}
|
||||||
|
|
||||||
|
setup_tbb() {
|
||||||
|
echo "Preparing TBB"
|
||||||
|
local tbb_ver="2021.2.0"
|
||||||
|
local tarball="oneapi-tbb-$tbb_ver-lin.tgz"
|
||||||
|
|
||||||
|
local url="https://github.com/oneapi-src/oneTBB/releases/download/v$tbb_ver/oneapi-tbb-$tbb_ver-lin.tgz"
|
||||||
|
# local url="http://localhost:8000/oneapi-tbb-$tbb_ver-lin.tgz"
|
||||||
|
|
||||||
|
get_and_untar "$tarball" "$url"
|
||||||
|
export_var TBB_LIB "$PWD/oneapi-tbb-$tbb_ver"
|
||||||
|
verify_dir_exists "$TBB_LIB"
|
||||||
|
check_size
|
||||||
|
}
|
||||||
|
|
||||||
setup_clang_gcc() {
|
setup_clang_gcc() {
|
||||||
|
|
||||||
echo "deb http://archive.ubuntu.com/ubuntu focal main universe" | sudo tee -a /etc/apt/sources.list
|
echo "deb http://archive.ubuntu.com/ubuntu focal main universe" | sudo tee -a /etc/apt/sources.list
|
||||||
@ -240,8 +254,7 @@ setup_rocm() {
|
|||||||
echo 'deb [arch=amd64] https://repo.radeon.com/rocm/apt/debian/ xenial main' | sudo tee /etc/apt/sources.list.d/rocm.list
|
echo 'deb [arch=amd64] https://repo.radeon.com/rocm/apt/debian/ xenial main' | sudo tee /etc/apt/sources.list.d/rocm.list
|
||||||
sudo apt-get update -qq
|
sudo apt-get update -qq
|
||||||
sudo apt-get install -y -qq rocm-dev
|
sudo apt-get install -y -qq rocm-dev
|
||||||
# AMD needs this rocm_path thing exported...
|
export_var ROCM_PATH "/opt/rocm"
|
||||||
export_var ROCM_PATH "/opt/rocm-4.1.0"
|
|
||||||
export_var HIP_CXX "$ROCM_PATH/bin/hipcc"
|
export_var HIP_CXX "$ROCM_PATH/bin/hipcc"
|
||||||
verify_bin_exists "$HIP_CXX"
|
verify_bin_exists "$HIP_CXX"
|
||||||
"$HIP_CXX" --version
|
"$HIP_CXX" --version
|
||||||
@ -355,6 +368,7 @@ if [ "$PARALLEL" = true ]; then
|
|||||||
setup_dpcpp &
|
setup_dpcpp &
|
||||||
setup_kokkos &
|
setup_kokkos &
|
||||||
setup_raja &
|
setup_raja &
|
||||||
|
setup_tbb &
|
||||||
wait
|
wait
|
||||||
else
|
else
|
||||||
setup_cmake
|
setup_cmake
|
||||||
@ -365,6 +379,7 @@ else
|
|||||||
setup_dpcpp
|
setup_dpcpp
|
||||||
setup_kokkos
|
setup_kokkos
|
||||||
setup_raja
|
setup_raja
|
||||||
|
setup_tbb
|
||||||
# these need apt
|
# these need apt
|
||||||
setup_clang_gcc
|
setup_clang_gcc
|
||||||
setup_rocm
|
setup_rocm
|
||||||
|
|||||||
@ -44,21 +44,26 @@ run_build() {
|
|||||||
|
|
||||||
rm -rf "$build"
|
rm -rf "$build"
|
||||||
set +e
|
set +e
|
||||||
|
local install_dir="$build/install"
|
||||||
|
|
||||||
# shellcheck disable=SC2086
|
# shellcheck disable=SC2086
|
||||||
"$CMAKE_BIN" -B"$build" -H. \
|
"$CMAKE_BIN" -B"$build" -H. \
|
||||||
-DCMAKE_BUILD_TYPE=Release \
|
-DCMAKE_BUILD_TYPE=Release \
|
||||||
-DCMAKE_VERBOSE_MAKEFILE=ON \
|
-DCMAKE_VERBOSE_MAKEFILE=ON \
|
||||||
|
-DCMAKE_INSTALL_PREFIX="$install_dir" \
|
||||||
-DMODEL="$model" $flags &>>"$log"
|
-DMODEL="$model" $flags &>>"$log"
|
||||||
local model_lower=$(echo "$model" | awk '{print tolower($0)}')
|
local model_lower=$(echo "$model" | awk '{print tolower($0)}')
|
||||||
|
|
||||||
local cmake_code=$?
|
local cmake_code=$?
|
||||||
|
|
||||||
"$CMAKE_BIN" --build "$build" -j "$(nproc)" &>>"$log"
|
"$CMAKE_BIN" --build "$build" -j "$(nproc)" &>>"$log"
|
||||||
|
"$CMAKE_BIN" --build "$build" --target install -j "$(nproc)" &>>"$log"
|
||||||
local cmake_code=$?
|
local cmake_code=$?
|
||||||
set -e
|
set -e
|
||||||
|
|
||||||
local bin="./$build/$model_lower-stream"
|
local bin="./$build/$model_lower-stream"
|
||||||
|
local installed_bin="./$install_dir/bin/$model_lower-stream"
|
||||||
|
|
||||||
echo "Checking for final executable: $bin"
|
echo "Checking for final executable: $bin"
|
||||||
if [[ -f "$bin" ]]; then
|
if [[ -f "$bin" ]]; then
|
||||||
echo "$(tput setaf 2)[PASS!]($model->$build)$(tput sgr0): -DMODEL=$model $flags"
|
echo "$(tput setaf 2)[PASS!]($model->$build)$(tput sgr0): -DMODEL=$model $flags"
|
||||||
@ -66,6 +71,11 @@ run_build() {
|
|||||||
cat "$log" | sed '/^--/d' | grep -i "/bin/nvcc" | sed 's/^/ /'
|
cat "$log" | sed '/^--/d' | grep -i "/bin/nvcc" | sed 's/^/ /'
|
||||||
cat "$log" | sed '/^--/d' | grep -i "$grep_kw" | sed 's/^/ /'
|
cat "$log" | sed '/^--/d' | grep -i "$grep_kw" | sed 's/^/ /'
|
||||||
cat "$log" | sed '/^--/d' | grep -i "warning" | sed "s/.*/ $(tput setaf 3)&$(tput sgr0)/"
|
cat "$log" | sed '/^--/d' | grep -i "warning" | sed "s/.*/ $(tput setaf 3)&$(tput sgr0)/"
|
||||||
|
if [[ ! -f "$installed_bin" ]]; then
|
||||||
|
echo "$(tput setaf 1)[ERR!] looking for $installed_bin from --target install but it's not there!$(tput sgr0)"
|
||||||
|
cat "$log"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
else
|
else
|
||||||
echo "$(tput setaf 1)[FAIL!]($model->$build)$(tput sgr0): -DMODEL=$model $flags"
|
echo "$(tput setaf 1)[FAIL!]($model->$build)$(tput sgr0): -DMODEL=$model $flags"
|
||||||
echo " $(tput setaf 1)CMake exited with code $cmake_code, see full build log at $log, reproduced below:$(tput sgr0)"
|
echo " $(tput setaf 1)CMake exited with code $cmake_code, see full build log at $log, reproduced below:$(tput sgr0)"
|
||||||
@ -76,38 +86,40 @@ run_build() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
###
|
###
|
||||||
#KOKKOS_SRC="/home/tom/Downloads/kokkos-3.3.00"
|
# KOKKOS_SRC="/home/tom/Downloads/kokkos-3.3.00"
|
||||||
#RAJA_SRC="/home/tom/Downloads/RAJA-v0.13.0"
|
# RAJA_SRC="/home/tom/Downloads/RAJA-v0.13.0"
|
||||||
#
|
|
||||||
#GCC_CXX="/usr/bin/g++"
|
# GCC_CXX="/usr/bin/g++"
|
||||||
#CLANG_CXX="/usr/bin/clang++"
|
# CLANG_CXX="/usr/bin/clang++"
|
||||||
#
|
|
||||||
#NVSDK="/home/tom/Downloads/nvhpc_2021_212_Linux_x86_64_cuda_11.2/install_components/Linux_x86_64/21.2/"
|
# NVSDK="/home/tom/Downloads/nvhpc_2021_212_Linux_x86_64_cuda_11.2/install_components/Linux_x86_64/21.2/"
|
||||||
#NVHPC_NVCXX="$NVSDK/compilers/bin/nvc++"
|
# NVHPC_NVCXX="$NVSDK/compilers/bin/nvc++"
|
||||||
#NVHPC_NVCC="$NVSDK/cuda/11.2/bin/nvcc"
|
# NVHPC_NVCC="$NVSDK/cuda/11.2/bin/nvcc"
|
||||||
#NVHPC_CUDA_DIR="$NVSDK/cuda/11.2"
|
# NVHPC_CUDA_DIR="$NVSDK/cuda/11.2"
|
||||||
#"$NVSDK/compilers/bin/makelocalrc" "$NVSDK/compilers/bin/" -x
|
# "$NVSDK/compilers/bin/makelocalrc" "$NVSDK/compilers/bin/" -x
|
||||||
#
|
|
||||||
#AOCC_CXX="/opt/AMD/aocc-compiler-2.3.0/bin/clang++"
|
# AOCC_CXX="/opt/AMD/aocc-compiler-2.3.0/bin/clang++"
|
||||||
#AOMP_CXX="/usr/lib/aomp/bin/clang++"
|
# AOMP_CXX="/usr/lib/aomp/bin/clang++"
|
||||||
#OCL_LIB="/home/tom/Downloads/oclcpuexp-2020.11.11.0.04_rel/x64/libOpenCL.so"
|
# OCL_LIB="/home/tom/Downloads/oclcpuexp-2020.11.11.0.04_rel/x64/libOpenCL.so"
|
||||||
#
|
|
||||||
## AMD needs this rocm_path thing exported...
|
# # AMD needs this rocm_path thing exported...
|
||||||
#export ROCM_PATH="/opt/rocm-4.0.0"
|
# export ROCM_PATH="/opt/rocm-4.0.0"
|
||||||
#HIP_CXX="/opt/rocm-4.0.0/bin/hipcc"
|
# HIP_CXX="/opt/rocm-4.0.0/bin/hipcc"
|
||||||
#COMPUTECPP_DIR="/home/tom/Desktop/computecpp_archive/ComputeCpp-CE-2.3.0-x86_64-linux-gnu"
|
# COMPUTECPP_DIR="/home/tom/Desktop/computecpp_archive/ComputeCpp-CE-2.3.0-x86_64-linux-gnu"
|
||||||
#DPCPP_DIR="/home/tom/Downloads/dpcpp_compiler"
|
# DPCPP_DIR="/home/tom/Downloads/dpcpp_compiler"
|
||||||
#HIPSYCL_DIR="/opt/hipsycl/cff515c/"
|
# HIPSYCL_DIR="/opt/hipsycl/cff515c/"
|
||||||
#
|
|
||||||
#ICPX_CXX="/opt/intel/oneapi/compiler/2021.1.2/linux/bin/icpx"
|
# ICPX_CXX="/opt/intel/oneapi/compiler/2021.1.2/linux/bin/icpx"
|
||||||
#ICPC_CXX="/opt/intel/oneapi/compiler/2021.1.2/linux/bin/intel64/icpc"
|
# ICPC_CXX="/opt/intel/oneapi/compiler/2021.1.2/linux/bin/intel64/icpc"
|
||||||
#
|
|
||||||
#GCC_STD_PAR_LIB="tbb"
|
# TBB_LIB="/home/tom/Downloads/oneapi-tbb-2021.1.1/"
|
||||||
#CLANG_STD_PAR_LIB="tbb"
|
|
||||||
#GCC_OMP_OFFLOAD_AMD=false
|
# GCC_STD_PAR_LIB="tbb"
|
||||||
#GCC_OMP_OFFLOAD_NVIDIA=true
|
# CLANG_STD_PAR_LIB="tbb"
|
||||||
#CLANG_OMP_OFFLOAD_AMD=false
|
# GCC_OMP_OFFLOAD_AMD=false
|
||||||
#CLANG_OMP_OFFLOAD_NVIDIA=false
|
# GCC_OMP_OFFLOAD_NVIDIA=true
|
||||||
|
# CLANG_OMP_OFFLOAD_AMD=false
|
||||||
|
# CLANG_OMP_OFFLOAD_NVIDIA=false
|
||||||
###
|
###
|
||||||
|
|
||||||
AMD_ARCH="gfx_903"
|
AMD_ARCH="gfx_903"
|
||||||
@ -129,6 +141,9 @@ build_gcc() {
|
|||||||
run_build $name "${GCC_CXX:?}" STD "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}"
|
run_build $name "${GCC_CXX:?}" STD "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}"
|
||||||
run_build $name "${GCC_CXX:?}" STD20 "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}"
|
run_build $name "${GCC_CXX:?}" STD20 "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}"
|
||||||
|
|
||||||
|
# Build against the oneTBB distribution at TBB_LIB; abort with a clear error if it is unset.
run_build $name "${GCC_CXX:?}" TBB "$cxx -DONE_TBB_DIR=${TBB_LIB:?}"
|
||||||
|
run_build $name "${GCC_CXX:?}" TBB "$cxx" # build TBB again with the system TBB
|
||||||
|
|
||||||
if [ "${GCC_OMP_OFFLOAD_AMD:-false}" != "false" ]; then
|
if [ "${GCC_OMP_OFFLOAD_AMD:-false}" != "false" ]; then
|
||||||
run_build "amd_$name" "${GCC_CXX:?}" ACC "$cxx -DCXX_EXTRA_FLAGS=-foffload=amdgcn-amdhsa"
|
run_build "amd_$name" "${GCC_CXX:?}" ACC "$cxx -DCXX_EXTRA_FLAGS=-foffload=amdgcn-amdhsa"
|
||||||
run_build "amd_$name" "${GCC_CXX:?}" OMP "$cxx -DOFFLOAD=AMD:$AMD_ARCH"
|
run_build "amd_$name" "${GCC_CXX:?}" OMP "$cxx -DOFFLOAD=AMD:$AMD_ARCH"
|
||||||
@ -146,11 +161,15 @@ build_gcc() {
|
|||||||
run_build "cuda_$name" "${GCC_CXX:?}" KOKKOS "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON"
|
run_build "cuda_$name" "${GCC_CXX:?}" KOKKOS "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON"
|
||||||
run_build $name "${GCC_CXX:?}" OCL "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}"
|
run_build $name "${GCC_CXX:?}" OCL "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}"
|
||||||
run_build $name "${GCC_CXX:?}" RAJA "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?}"
|
run_build $name "${GCC_CXX:?}" RAJA "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?}"
|
||||||
run_build "cuda_$name" "${GCC_CXX:?}" RAJA "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?} \
|
|
||||||
-DENABLE_CUDA=ON \
|
# FIXME fails due to https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100102
|
||||||
-DTARGET=NVIDIA \
|
# FIXME we also got https://github.com/NVIDIA/nccl/issues/494
|
||||||
-DCUDA_TOOLKIT_ROOT_DIR=${NVHPC_CUDA_DIR:?} \
|
|
||||||
-DCUDA_ARCH=$NV_ARCH"
|
# run_build "cuda_$name" "${GCC_CXX:?}" RAJA "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?} \
|
||||||
|
# -DENABLE_CUDA=ON \
|
||||||
|
# -DTARGET=NVIDIA \
|
||||||
|
# -DCUDA_TOOLKIT_ROOT_DIR=${NVHPC_CUDA_DIR:?} \
|
||||||
|
# -DCUDA_ARCH=$NV_ARCH"
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -174,6 +193,10 @@ build_clang() {
|
|||||||
run_build $name "${CLANG_CXX:?}" OCL "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}"
|
run_build $name "${CLANG_CXX:?}" OCL "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}"
|
||||||
run_build $name "${CLANG_CXX:?}" STD "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}"
|
run_build $name "${CLANG_CXX:?}" STD "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}"
|
||||||
# run_build $name "${CLANG_CXX:?}" STD20 "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" # not yet supported
|
# run_build $name "${CLANG_CXX:?}" STD20 "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" # not yet supported
|
||||||
|
|
||||||
|
# Build against the oneTBB distribution at TBB_LIB; abort with a clear error if it is unset.
run_build $name "${CLANG_CXX:?}" TBB "$cxx -DONE_TBB_DIR=${TBB_LIB:?}"
|
||||||
|
run_build $name "${CLANG_CXX:?}" TBB "$cxx" # build TBB again with the system TBB
|
||||||
|
|
||||||
run_build $name "${CLANG_CXX:?}" RAJA "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?}"
|
run_build $name "${CLANG_CXX:?}" RAJA "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?}"
|
||||||
# no clang /w RAJA+cuda because it needs nvcc which needs gcc
|
# no clang /w RAJA+cuda because it needs nvcc which needs gcc
|
||||||
}
|
}
|
||||||
|
|||||||
6
main.cpp
6
main.cpp
@ -25,6 +25,8 @@
|
|||||||
#include "STDStream.h"
|
#include "STDStream.h"
|
||||||
#elif defined(STD20)
|
#elif defined(STD20)
|
||||||
#include "STD20Stream.hpp"
|
#include "STD20Stream.hpp"
|
||||||
|
#elif defined(TBB)
|
||||||
|
#include "TBBStream.hpp"
|
||||||
#elif defined(HIP)
|
#elif defined(HIP)
|
||||||
#include "HIPStream.h"
|
#include "HIPStream.h"
|
||||||
#elif defined(HC)
|
#elif defined(HC)
|
||||||
@ -266,6 +268,10 @@ void run()
|
|||||||
// Use the C++20 implementation
|
// Use the C++20 implementation
|
||||||
stream = new STD20Stream<T>(ARRAY_SIZE, deviceIndex);
|
stream = new STD20Stream<T>(ARRAY_SIZE, deviceIndex);
|
||||||
|
|
||||||
|
#elif defined(TBB)
|
||||||
|
// Use the TBB implementation
|
||||||
|
stream = new TBBStream<T>(ARRAY_SIZE, deviceIndex);
|
||||||
|
|
||||||
#elif defined(ACC)
|
#elif defined(ACC)
|
||||||
// Use the OpenACC implementation
|
// Use the OpenACC implementation
|
||||||
stream = new ACCStream<T>(ARRAY_SIZE, deviceIndex);
|
stream = new ACCStream<T>(ARRAY_SIZE, deviceIndex);
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user