parent
3dcafd1af1
commit
72be9f6980
84
.github/workflows/main.yaml
vendored
84
.github/workflows/main.yaml
vendored
@ -12,7 +12,7 @@ on:
|
||||
jobs:
|
||||
|
||||
test-rust:
|
||||
runs-on: ubuntu-18.04
|
||||
runs-on: ubuntu-22.04
|
||||
defaults:
|
||||
run:
|
||||
working-directory: ./src/rust/rust-stream
|
||||
@ -28,7 +28,7 @@ jobs:
|
||||
run: ./target/release/rust-stream --arraysize 2048
|
||||
|
||||
test-java:
|
||||
runs-on: ubuntu-18.04
|
||||
runs-on: ubuntu-22.04
|
||||
defaults:
|
||||
run:
|
||||
working-directory: ./src/java/java-stream
|
||||
@ -41,7 +41,7 @@ jobs:
|
||||
run: java -jar target/java-stream.jar --arraysize 2048
|
||||
|
||||
test-julia:
|
||||
runs-on: ubuntu-18.04
|
||||
runs-on: ubuntu-22.04
|
||||
defaults:
|
||||
run:
|
||||
working-directory: ./src/julia/JuliaStream.jl
|
||||
@ -69,8 +69,24 @@ jobs:
|
||||
run: julia --project src/AMDGPUStream.jl --list
|
||||
|
||||
|
||||
setup-cpp:
|
||||
runs-on: ubuntu-22.04
|
||||
steps:
|
||||
- name: Cache compiler
|
||||
# if: ${{ !env.ACT }}
|
||||
id: prepare-compilers
|
||||
uses: actions/cache@v2
|
||||
with:
|
||||
path: ./compilers
|
||||
key: ${{ runner.os }}-${{ hashFiles('./src/ci-prepare-bionic.sh') }}
|
||||
|
||||
- name: Prepare compilers
|
||||
if: steps.prepare-compilers.outputs.cache-hit != 'true'
|
||||
run: source ./src/ci-prepare-bionic.sh ./compilers SETUP false || true
|
||||
|
||||
test-cpp:
|
||||
runs-on: ubuntu-18.04
|
||||
needs: setup-cpp
|
||||
runs-on: ubuntu-22.04
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
|
||||
@ -84,15 +100,15 @@ jobs:
|
||||
|
||||
- name: Prepare compilers
|
||||
if: steps.prepare-compilers.outputs.cache-hit != 'true'
|
||||
run: source ./src/ci-prepare-bionic.sh ./compilers SETUP true || true
|
||||
run: source ./src/ci-prepare-bionic.sh ./compilers SETUP false || true
|
||||
|
||||
- name: Setup test environment
|
||||
run: source ./src/ci-prepare-bionic.sh ./compilers VARS false || true
|
||||
|
||||
# Enable tmate debugging of manually-triggered workflows if the input option was provided
|
||||
- name: Setup tmate session
|
||||
uses: mxschmitt/action-tmate@v3
|
||||
if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.debug_enabled }}
|
||||
# - name: Setup tmate session
|
||||
# uses: mxschmitt/action-tmate@v3
|
||||
# if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.debug_enabled }}
|
||||
|
||||
- name: Test compile gcc @ CMake 3.13
|
||||
if: ${{ ! cancelled() }}
|
||||
@ -167,4 +183,54 @@ jobs:
|
||||
run: ./src/ci-test-compile.sh ./build dpcpp all ${{ env.CMAKE_3_18_BIN }}
|
||||
- name: Test compile hipsycl @ CMake 3.18
|
||||
if: ${{ ! cancelled() }}
|
||||
run: ./src/ci-test-compile.sh ./build hipsycl all ${{ env.CMAKE_3_18_BIN }}
|
||||
run: ./src/ci-test-compile.sh ./build hipsycl all ${{ env.CMAKE_3_18_BIN }}
|
||||
|
||||
- name: Test compile gcc @ CMake 3.20
|
||||
if: ${{ ! cancelled() }}
|
||||
run: ./src/ci-test-compile.sh ./build gcc all ${{ env.CMAKE_3_20_BIN }}
|
||||
- name: Test compile clang @ CMake 3.20
|
||||
if: ${{ ! cancelled() }}
|
||||
run: ./src/ci-test-compile.sh ./build clang all ${{ env.CMAKE_3_20_BIN }}
|
||||
- name: Test compile nvhpc @ CMake 3.20
|
||||
if: ${{ ! cancelled() }}
|
||||
run: ./src/ci-test-compile.sh ./build nvhpc all ${{ env.CMAKE_3_20_BIN }}
|
||||
- name: Test compile aocc @ CMake 3.20
|
||||
if: ${{ ! cancelled() }}
|
||||
run: ./src/ci-test-compile.sh ./build aocc all ${{ env.CMAKE_3_20_BIN }}
|
||||
- name: Test compile aomp @ CMake 3.20
|
||||
if: ${{ ! cancelled() }}
|
||||
run: ./src/ci-test-compile.sh ./build aomp all ${{ env.CMAKE_3_20_BIN }}
|
||||
- name: Test compile hip @ CMake 3.20
|
||||
if: ${{ ! cancelled() }}
|
||||
run: ./src/ci-test-compile.sh ./build hip all ${{ env.CMAKE_3_20_BIN }}
|
||||
- name: Test compile dpcpp @ CMake 3.20
|
||||
if: ${{ ! cancelled() }}
|
||||
run: ./src/ci-test-compile.sh ./build dpcpp all ${{ env.CMAKE_3_20_BIN }}
|
||||
- name: Test compile hipsycl @ CMake 3.20
|
||||
if: ${{ ! cancelled() }}
|
||||
run: ./src/ci-test-compile.sh ./build hipsycl all ${{ env.CMAKE_3_20_BIN }}
|
||||
|
||||
- name: Test compile gcc @ CMake 3.24
|
||||
if: ${{ ! cancelled() }}
|
||||
run: ./src/ci-test-compile.sh ./build gcc all ${{ env.CMAKE_3_24_BIN }}
|
||||
- name: Test compile clang @ CMake 3.24
|
||||
if: ${{ ! cancelled() }}
|
||||
run: ./src/ci-test-compile.sh ./build clang all ${{ env.CMAKE_3_24_BIN }}
|
||||
- name: Test compile nvhpc @ CMake 3.24
|
||||
if: ${{ ! cancelled() }}
|
||||
run: ./src/ci-test-compile.sh ./build nvhpc all ${{ env.CMAKE_3_24_BIN }}
|
||||
- name: Test compile aocc @ CMake 3.24
|
||||
if: ${{ ! cancelled() }}
|
||||
run: ./src/ci-test-compile.sh ./build aocc all ${{ env.CMAKE_3_24_BIN }}
|
||||
- name: Test compile aomp @ CMake 3.24
|
||||
if: ${{ ! cancelled() }}
|
||||
run: ./src/ci-test-compile.sh ./build aomp all ${{ env.CMAKE_3_24_BIN }}
|
||||
- name: Test compile hip @ CMake 3.24
|
||||
if: ${{ ! cancelled() }}
|
||||
run: ./src/ci-test-compile.sh ./build hip all ${{ env.CMAKE_3_24_BIN }}
|
||||
- name: Test compile dpcpp @ CMake 3.24
|
||||
if: ${{ ! cancelled() }}
|
||||
run: ./src/ci-test-compile.sh ./build dpcpp all ${{ env.CMAKE_3_24_BIN }}
|
||||
- name: Test compile hipsycl @ CMake 3.24
|
||||
if: ${{ ! cancelled() }}
|
||||
run: ./src/ci-test-compile.sh ./build hipsycl all ${{ env.CMAKE_3_24_BIN }}
|
||||
@ -1,5 +1,9 @@
|
||||
cmake_minimum_required(VERSION 3.13 FATAL_ERROR)
|
||||
|
||||
if (CMAKE_VERSION VERSION_GREATER_EQUAL "3.24.0")
|
||||
cmake_policy(SET CMP0135 NEW)
|
||||
endif ()
|
||||
|
||||
project(BabelStream VERSION 4.0 LANGUAGES CXX)
|
||||
|
||||
# uncomment for debugging build issues:
|
||||
@ -71,15 +75,19 @@ hint_flag(CXX_EXTRA_LINKER_FLAGS "
|
||||
# Honor user's CXX_EXTRA_LINK_FLAGS
|
||||
set(CXX_EXTRA_LINK_FLAGS ${CXX_EXTRA_FLAGS} ${CXX_EXTRA_LINK_FLAGS})
|
||||
|
||||
option(USE_TBB "Enable oneTBB library for *supported* models. Enabling this on models that
|
||||
option(USE_TBB "Enable the oneTBB library for *supported* models. Enabling this on models that
|
||||
don't explicitly link against TBB is a no-op, see description of your selected
|
||||
model on how this is used." OFF)
|
||||
|
||||
if (USE_TBB)
|
||||
option(FETCH_TBB "Fetch (download) the oneTBB library for *supported* models. This uses CMake's
|
||||
FetchContent feature. Specify version by setting FETCH_TBB_VERSION" OFF)
|
||||
set(FETCH_TBB_VERSION "v2021.10.0" CACHE STRING "Specify version of oneTBB to use if FETCH_TBB is ON")
|
||||
|
||||
if (FETCH_TBB)
|
||||
FetchContent_Declare(
|
||||
TBB
|
||||
GIT_REPOSITORY https://github.com/oneapi-src/oneTBB.git
|
||||
GIT_TAG v2021.9.0
|
||||
GIT_TAG "${FETCH_TBB_VERSION}"
|
||||
)
|
||||
# Don't fail builds on warnings (TBB enables -Wall while not being free of warnings from unused symbols...)
|
||||
set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)
|
||||
@ -92,15 +100,19 @@ if (USE_TBB)
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
option(USE_ONEDPL "Enable oneDPL library for *supported* models. Enabling this on models that
|
||||
option(USE_ONEDPL "Enable the oneDPL library for *supported* models. Enabling this on models that
|
||||
don't explicitly link against DPL is a no-op, see description of your selected
|
||||
model on how this is used." OFF)
|
||||
|
||||
if (USE_ONEDPL)
|
||||
option(FETCH_ONEDPL "Fetch (download) the oneDPL library for *supported* models. This uses CMake's
|
||||
FetchContent feature. Specify version by setting FETCH_ONEDPL_VERSION" OFF)
|
||||
set(FETCH_ONEDPL_VERSION "oneDPL-2022.2.0-rc1" CACHE STRING "Specify version of oneTBB to use if FETCH_ONEDPL is ON")
|
||||
|
||||
if (FETCH_ONEDPL)
|
||||
FetchContent_Declare(
|
||||
oneDPL
|
||||
GIT_REPOSITORY https://github.com/oneapi-src/oneDPL.git
|
||||
GIT_TAG oneDPL-2022.2.0-rc1
|
||||
GIT_TAG "${FETCH_ONEDPL_VERSION}"
|
||||
)
|
||||
string(TOLOWER ${USE_ONEDPL} ONEDPL_BACKEND)
|
||||
# XXX oneDPL looks for omp instead of openmp, which mismatches(!) with ONEDPL_PAR_BACKEND if using find_package
|
||||
|
||||
@ -83,6 +83,8 @@ get() {
|
||||
if [ ! -f "$name" ] || [ "$FORCE_DOWNLOAD" = true ]; then
|
||||
echo "$name not found, downloading..."
|
||||
wget -q --show-progress --progress=bar:force:noscroll "$pkg_url" -O "$name"
|
||||
else
|
||||
echo "$name found, skipping download..."
|
||||
fi
|
||||
fi
|
||||
}
|
||||
@ -92,13 +94,15 @@ get_and_untar() {
|
||||
local pkg_url="$2"
|
||||
if [ "$SETUP" = true ]; then
|
||||
if [ ! -f "$name" ] || [ "$FORCE_DOWNLOAD" = true ]; then
|
||||
echo "$name not found, downloading..."
|
||||
echo "$name not found, downloading ($pkg_url)..."
|
||||
wget -q --show-progress --progress=bar:force:noscroll "$pkg_url" -O "$name"
|
||||
fi
|
||||
echo "Preparing to extract $name ..."
|
||||
tar -xf "$name"
|
||||
echo "$name extracted, deleting archive ..."
|
||||
rm -f "$name" # delete for space
|
||||
else
|
||||
echo "Skipping setup for $name ($pkg_url)..."
|
||||
fi
|
||||
}
|
||||
|
||||
@ -119,10 +123,10 @@ verify_dir_exists() {
|
||||
setup_aocc() {
|
||||
echo "Preparing AOCC"
|
||||
|
||||
local aocc_ver="2.3.0"
|
||||
local aocc_ver="4.0.0"
|
||||
local tarball="aocc-$aocc_ver.tar.xz"
|
||||
# XXX it's actually XZ compressed, so it should be tar.xz
|
||||
local AOCC_URL="http://developer.amd.com/wordpress/media/files/aocc-compiler-2.3.0.tar"
|
||||
local AOCC_URL="https://download.amd.com/developer/eula/aocc-compiler/aocc-compiler-${aocc_ver}.tar"
|
||||
# local AOCC_URL="http://localhost:8000/aocc-compiler-2.3.0.tar"
|
||||
|
||||
get_and_untar "$tarball" "$AOCC_URL"
|
||||
@ -133,10 +137,10 @@ setup_aocc() {
|
||||
}
|
||||
|
||||
setup_nvhpc() {
|
||||
echo "Preparing Nvidia HPC SDK"
|
||||
local nvhpc_ver="22.3"
|
||||
local nvhpc_release="2022_223"
|
||||
local cuda_ver="11.6"
|
||||
echo "Preparing Nvidia HPC SDK"
|
||||
local nvhpc_ver="23.1" # TODO FIXME > 23.1 has a bug with -A
|
||||
local nvhpc_release="2023_231"
|
||||
local cuda_ver="12.0"
|
||||
|
||||
local tarball="nvhpc_$nvhpc_ver.tar.gz"
|
||||
|
||||
@ -145,7 +149,7 @@ setup_nvhpc() {
|
||||
|
||||
local sdk_dir="$PWD/nvhpc_${nvhpc_release}_Linux_x86_64_cuda_$cuda_ver/install_components/Linux_x86_64/$nvhpc_ver"
|
||||
local bin_dir="$sdk_dir/compilers/bin"
|
||||
"$bin_dir/makelocalrc" "$bin_dir" -x
|
||||
"$bin_dir/makelocalrc" -d "$bin_dir" -x -gpp g++-12 -gcc gcc-12 -g77 gfortran-12
|
||||
|
||||
export_var NVHPC_SDK_DIR "$sdk_dir"
|
||||
export_var NVHPC_CUDA_DIR "$sdk_dir/cuda/$cuda_ver"
|
||||
@ -166,7 +170,8 @@ setup_nvhpc() {
|
||||
|
||||
setup_aomp() {
|
||||
echo "Preparing AOMP"
|
||||
local AOMP_URL="https://github.com/ROCm-Developer-Tools/aomp/releases/download/rel_11.12-0/aomp_Ubuntu1804_11.12-0_amd64.deb"
|
||||
local aomp_ver="18.0-0"
|
||||
local AOMP_URL="https://github.com/ROCm-Developer-Tools/aomp/releases/download/rel_${aomp_ver}/aomp_Ubuntu2204_${aomp_ver}_amd64.deb"
|
||||
# local AOMP_URL="http://0.0.0.0:8000/aomp_Ubuntu1804_11.12-0_amd64.deb"
|
||||
get_and_install_deb "aomp" "aomp" "$AOMP_URL"
|
||||
|
||||
@ -189,9 +194,10 @@ setup_oclcpu() {
|
||||
|
||||
setup_kokkos() {
|
||||
echo "Preparing Kokkos"
|
||||
local kokkos_ver="3.3.01"
|
||||
local kokkos_ver="4.1.00"
|
||||
local tarball="kokkos-$kokkos_ver.tar.gz"
|
||||
|
||||
|
||||
local url="https://github.com/kokkos/kokkos/archive/$kokkos_ver.tar.gz"
|
||||
# local url="http://localhost:8000/$kokkos_ver.tar.gz"
|
||||
|
||||
@ -203,10 +209,10 @@ setup_kokkos() {
|
||||
|
||||
setup_raja() {
|
||||
echo "Preparing RAJA"
|
||||
local raja_ver="0.13.0"
|
||||
local raja_ver="2023.06.1"
|
||||
local tarball="raja-$raja_ver.tar.gz"
|
||||
|
||||
local url="https://github.com/LLNL/RAJA/releases/download/v0.13.0/RAJA-v$raja_ver.tar.gz"
|
||||
local url="https://github.com/LLNL/RAJA/releases/download/v$raja_ver/RAJA-v$raja_ver.tar.gz"
|
||||
# local url="http://localhost:8000/RAJA-v$raja_ver.tar.gz"
|
||||
|
||||
get_and_untar "$tarball" "$url"
|
||||
@ -217,7 +223,7 @@ setup_raja() {
|
||||
|
||||
setup_tbb() {
|
||||
echo "Preparing TBB"
|
||||
local tbb_ver="2021.2.0"
|
||||
local tbb_ver="2021.9.0"
|
||||
local tarball="oneapi-tbb-$tbb_ver-lin.tgz"
|
||||
|
||||
local url="https://github.com/oneapi-src/oneTBB/releases/download/v$tbb_ver/oneapi-tbb-$tbb_ver-lin.tgz"
|
||||
@ -231,9 +237,9 @@ setup_tbb() {
|
||||
|
||||
setup_clang_gcc() {
|
||||
|
||||
sudo apt-get install -y -qq gcc-10-offload-nvptx gcc-10-offload-amdgcn libtbb2 libtbb-dev g++-10 clang libomp-dev
|
||||
sudo apt-get install -y -qq gcc-12-offload-nvptx gcc-12-offload-amdgcn libtbb2 libtbb-dev g++-12 clang libomp-dev libc6
|
||||
|
||||
export_var GCC_CXX "$(which g++-10)"
|
||||
export_var GCC_CXX "$(which g++-12)"
|
||||
verify_bin_exists "$GCC_CXX"
|
||||
"$GCC_CXX" --version
|
||||
|
||||
@ -254,7 +260,7 @@ setup_clang_gcc() {
|
||||
}
|
||||
|
||||
setup_rocm() {
|
||||
sudo apt-get install -y -qq rocm-dev rocthrust-dev
|
||||
sudo apt-get install -y rocm-dev rocthrust-dev
|
||||
export_var ROCM_PATH "/opt/rocm"
|
||||
export_var PATH "$ROCM_PATH/bin:$PATH" # ROCm needs this for many of their libraries' CMake build to work
|
||||
export_var HIP_CXX "$ROCM_PATH/bin/hipcc"
|
||||
@ -265,7 +271,7 @@ setup_rocm() {
|
||||
|
||||
setup_dpcpp() {
|
||||
|
||||
local nightly="20210106"
|
||||
local nightly="20230615"
|
||||
local tarball="dpcpp-$nightly.tar.gz"
|
||||
|
||||
local url="https://github.com/intel/llvm/releases/download/sycl-nightly/$nightly/dpcpp-compiler.tar.gz"
|
||||
@ -282,22 +288,22 @@ setup_dpcpp() {
|
||||
setup_hipsycl() {
|
||||
|
||||
sudo apt-get install -y -qq libboost-fiber-dev libboost-context-dev
|
||||
local hipsycl_ver="0.9.0"
|
||||
local hipsycl_ver="0.9.1"
|
||||
local tarball="v$hipsycl_ver.tar.gz"
|
||||
local install_dir="$PWD/hipsycl_dist_$hipsycl_ver"
|
||||
|
||||
local url="https://github.com/illuhad/hipSYCL/archive/v$hipsycl_ver.tar.gz"
|
||||
# local url="http://localhost:8000/hipSYCL-$hipsycl_ver.tar.gz"
|
||||
local url="https://github.com/AdaptiveCpp/AdaptiveCpp/archive/v$hipsycl_ver.tar.gz"
|
||||
# local url="http://localhost:8000/AdaptiveCpp-$hipsycl_ver.tar.gz"
|
||||
|
||||
get_and_untar "$tarball" "$url"
|
||||
|
||||
if [ "$SETUP" = true ]; then
|
||||
local src="$PWD/hipSYCL-$hipsycl_ver"
|
||||
local src="$PWD/AdaptiveCpp-$hipsycl_ver"
|
||||
rm -rf "$src/build"
|
||||
rm -rf "$install_dir"
|
||||
cmake "-B$src/build" "-H$src" \
|
||||
-DCMAKE_C_COMPILER="$(which gcc-10)" \
|
||||
-DCMAKE_CXX_COMPILER="$(which g++-10)" \
|
||||
-DCMAKE_C_COMPILER="$(which gcc-12)" \
|
||||
-DCMAKE_CXX_COMPILER="$(which g++-12)" \
|
||||
-DCMAKE_INSTALL_PREFIX="$install_dir" \
|
||||
-DWITH_ROCM_BACKEND=OFF \
|
||||
-DWITH_CUDA_BACKEND=OFF \
|
||||
@ -312,25 +318,20 @@ setup_hipsycl() {
|
||||
check_size
|
||||
}
|
||||
|
||||
setup_computecpp() {
|
||||
echo "TODO ComputeCpp requires registration+login to download"
|
||||
}
|
||||
|
||||
if [ "${GITHUB_ACTIONS:-false}" = true ]; then
|
||||
echo "Running in GitHub Actions, defaulting to special export"
|
||||
TERM=xterm
|
||||
export TERM=xterm
|
||||
|
||||
# drop the lock in case we got one from a failed run
|
||||
rm /var/lib/dpkg/lock-frontend || true
|
||||
rm /var/cache/apt/archives/lock || true
|
||||
|
||||
wget -q -O - "https://repo.radeon.com/rocm/rocm.gpg.key" | sudo apt-key add -
|
||||
echo "deb http://archive.ubuntu.com/ubuntu focal main universe" | sudo tee -a /etc/apt/sources.list
|
||||
echo 'deb [arch=amd64] https://repo.radeon.com/rocm/apt/4.5 ubuntu main' | sudo tee /etc/apt/sources.list.d/rocm.list
|
||||
rm -rf /var/lib/dpkg/lock-frontend || true
|
||||
rm -rf /var/cache/apt/archives/lock || true
|
||||
|
||||
mkdir --parents --mode=0755 /etc/apt/keyrings
|
||||
wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | gpg --dearmor | sudo tee /etc/apt/keyrings/rocm.gpg > /dev/null
|
||||
echo 'deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/5.7 jammy main' | sudo tee /etc/apt/sources.list.d/rocm.list
|
||||
echo -e 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600' | sudo tee /etc/apt/preferences.d/rocm-pin-600
|
||||
sudo apt-get update -qq
|
||||
sudo apt-get install -y -qq cmake
|
||||
sudo apt-get install cmake gcc g++ libelf-dev libdrm-amdgpu1 libnuma-dev
|
||||
|
||||
if [ "$SETUP" = true ]; then
|
||||
echo "Deleting extra packages for space in 2 seconds..."
|
||||
@ -340,6 +341,7 @@ if [ "${GITHUB_ACTIONS:-false}" = true ]; then
|
||||
sudo apt-get autoremove -y
|
||||
check_size
|
||||
fi
|
||||
sudo apt-get upgrade -qq
|
||||
else
|
||||
echo "Running locally, defaulting to standard export"
|
||||
fi
|
||||
@ -368,6 +370,18 @@ setup_cmake() {
|
||||
verify_bin_exists "$CMAKE_3_18_BIN"
|
||||
"$CMAKE_3_18_BIN" --version
|
||||
|
||||
get "cmake-3.20.sh" "$cmake_release/v3.20.4/cmake-3.20.4-linux-x86_64.sh"
|
||||
chmod +x "./cmake-3.20.sh" && "./cmake-3.20.sh" --skip-license --include-subdir
|
||||
export_var CMAKE_3_20_BIN "$PWD/cmake-3.20.4-linux-x86_64/bin/cmake"
|
||||
verify_bin_exists "$CMAKE_3_20_BIN"
|
||||
"$CMAKE_3_20_BIN" --version
|
||||
|
||||
get "cmake-3.24.sh" "$cmake_release/v3.24.4/cmake-3.24.4-linux-x86_64.sh"
|
||||
chmod +x "./cmake-3.24.sh" && "./cmake-3.24.sh" --skip-license --include-subdir
|
||||
export_var CMAKE_3_24_BIN "$PWD/cmake-3.24.4-linux-x86_64/bin/cmake"
|
||||
verify_bin_exists "$CMAKE_3_24_BIN"
|
||||
"$CMAKE_3_24_BIN" --version
|
||||
|
||||
check_size
|
||||
|
||||
}
|
||||
@ -385,6 +399,10 @@ if [ "$PARALLEL" = true ]; then
|
||||
setup_tbb &
|
||||
wait
|
||||
else
|
||||
# these need apt
|
||||
setup_clang_gcc
|
||||
setup_rocm
|
||||
setup_hipsycl
|
||||
setup_cmake
|
||||
setup_aocc
|
||||
setup_oclcpu
|
||||
@ -394,10 +412,6 @@ else
|
||||
setup_kokkos
|
||||
setup_raja
|
||||
setup_tbb
|
||||
# these need apt
|
||||
setup_clang_gcc
|
||||
setup_rocm
|
||||
setup_hipsycl
|
||||
fi
|
||||
|
||||
echo "Done!"
|
||||
|
||||
@ -120,10 +120,21 @@ run_build() {
|
||||
# CLANG_OMP_OFFLOAD_NVIDIA=false
|
||||
###
|
||||
|
||||
NV_ARCH_CC="70"
|
||||
AMD_ARCH="gfx_903"
|
||||
NV_ARCH="sm_70"
|
||||
NV_ARCH="sm_${NV_ARCH_CC}"
|
||||
NV_ARCH_CCXY="cuda${NVHPC_CUDA_VER:?},cc80"
|
||||
|
||||
# Checks whether the CMake binary selected via $CMAKE_BIN is at least a given version.
#
# Arguments:
#   $1 - minimum required version, e.g. "3.16.0"
# Globals:
#   CMAKE_BIN - path of the cmake executable whose version is queried
# Returns:
#   0 if the reported version is >= $1 (compared with `sort -V`), 1 otherwise,
#   including when no version could be read from "$CMAKE_BIN --version".
check_cmake_ver() {
  local required=$1
  # Declared separately from the assignment so that `local` does not mask a
  # failure of the command substitution.
  local current
  # The version is taken as the third space-separated field of the first output
  # line (expected to look like "cmake version X.Y.Z").
  current=$("$CMAKE_BIN" --version | head -n 1 | cut -d ' ' -f3) || current=""
  if [ -z "$current" ]; then
    # Without a version there is nothing to compare; report it instead of
    # silently treating the binary as too old.
    echo "check_cmake_ver: unable to determine CMake version from '${CMAKE_BIN:-}'" >&2
    return 1
  fi
  # The requirement is met when the required version sorts first (or ties).
  [ "$(printf '%s\n' "$required" "$current" | sort -V | head -n1)" = "$required" ]
}
|
||||
|
||||
build_gcc() {
|
||||
local name="gcc_build"
|
||||
local cxx="-DCMAKE_CXX_COMPILER=${GCC_CXX:?}"
|
||||
@ -138,14 +149,12 @@ build_gcc() {
|
||||
for use_onedpl in OFF OPENMP TBB; do
|
||||
case "$use_onedpl" in
|
||||
OFF) dpl_conditional_flags="-DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}" ;;
|
||||
*) dpl_conditional_flags="-DUSE_TBB=ON -DCXX_EXTRA_FLAGS=-D_GLIBCXX_USE_TBB_PAR_BACKEND=0" ;;
|
||||
*) dpl_conditional_flags="-DFETCH_ONEDPL=ON -DFETCH_TBB=ON -DUSE_TBB=ON -DCXX_EXTRA_FLAGS=-D_GLIBCXX_USE_TBB_PAR_BACKEND=0" ;;
|
||||
esac
|
||||
for use_vector in OFF ON; do
|
||||
# some distributions, like Ubuntu bionic, implement std par with TBB, so conditionally link it here
|
||||
run_build $name "${GCC_CXX:?}" std-data "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl -DUSE_VECTOR=$use_vector"
|
||||
run_build $name "${GCC_CXX:?}" std-indices "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl -DUSE_VECTOR=$use_vector"
|
||||
run_build $name "${GCC_CXX:?}" std-ranges "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl -DUSE_VECTOR=$use_vector"
|
||||
done
|
||||
# some distributions, like Ubuntu bionic, implement std par with TBB, so conditionally link it here
|
||||
run_build $name "${GCC_CXX:?}" std-data "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl"
|
||||
run_build $name "${GCC_CXX:?}" std-indices "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl"
|
||||
run_build $name "${GCC_CXX:?}" std-ranges "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl"
|
||||
done
|
||||
|
||||
run_build $name "${GCC_CXX:?}" tbb "$cxx -DONE_TBB_DIR=$TBB_LIB"
|
||||
@ -153,40 +162,45 @@ build_gcc() {
|
||||
run_build $name "${GCC_CXX:?}" tbb "$cxx -DUSE_VECTOR=ON" # build with vectors
|
||||
|
||||
if [ "${GCC_OMP_OFFLOAD_AMD:-false}" != "false" ]; then
|
||||
run_build "amd_$name" "${GCC_CXX:?}" acc "$cxx -DCXX_EXTRA_FLAGS=-foffload=amdgcn-amdhsa"
|
||||
run_build "amd_$name" "${GCC_CXX:?}" acc "$cxx -DCXX_EXTRA_FLAGS=-foffload=amdgcn-amdhsa;-fno-stack-protector;-fcf-protection=none"
|
||||
run_build "amd_$name" "${GCC_CXX:?}" omp "$cxx -DOFFLOAD=AMD:$AMD_ARCH"
|
||||
fi
|
||||
|
||||
if [ "${GCC_OMP_OFFLOAD_NVIDIA:-false}" != "false" ]; then
|
||||
run_build "nvidia_$name" "${GCC_CXX:?}" acc "$cxx -DCXX_EXTRA_FLAGS=-foffload=nvptx-none"
|
||||
run_build "nvidia_$name" "${GCC_CXX:?}" acc "$cxx -DCXX_EXTRA_FLAGS=-foffload=nvptx-none;-fno-stack-protector;-fcf-protection=none"
|
||||
run_build "nvidia_$name" "${GCC_CXX:?}" omp "$cxx -DOFFLOAD=NVIDIA:$NV_ARCH"
|
||||
fi
|
||||
|
||||
run_build $name "${GCC_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH"
|
||||
run_build $name "${GCC_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DMEM=MANAGED"
|
||||
run_build $name "${GCC_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DMEM=PAGEFAULT"
|
||||
# run_build $name "${CC_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_CUDA=ON"
|
||||
run_build "cuda_$name" "${GCC_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON"
|
||||
if check_cmake_ver "3.16.0"; then
|
||||
# run_build $name "${CC_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_CUDA=ON"
|
||||
run_build "cuda_$name" "${GCC_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON"
|
||||
else
|
||||
echo "Skipping Kokkos models due to CMake version requirement"
|
||||
fi
|
||||
run_build $name "${GCC_CXX:?}" ocl "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}"
|
||||
run_build $name "${GCC_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?}"
|
||||
if check_cmake_ver "3.20.0"; then
|
||||
run_build $name "${GCC_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?} -DENABLE_OPENMP=ON"
|
||||
else
|
||||
echo "Skipping RAJA models due to CMake version requirement"
|
||||
fi
|
||||
|
||||
# FIXME fails due to https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100102
|
||||
# FIXME we also got https://github.com/NVIDIA/nccl/issues/494
|
||||
if check_cmake_ver "3.20.0"; then
|
||||
run_build "cuda_$name" "${GCC_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?} \
|
||||
-DENABLE_CUDA=ON \
|
||||
-DTARGET=NVIDIA \
|
||||
-DCUDA_TOOLKIT_ROOT_DIR=${NVHPC_CUDA_DIR:?} \
|
||||
-DCUDA_ARCH=$NV_ARCH"
|
||||
else
|
||||
echo "Skipping RAJA models due to CMake version requirement"
|
||||
fi
|
||||
|
||||
# run_build "cuda_$name" "${GCC_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?} \
|
||||
# -DENABLE_CUDA=ON \
|
||||
# -DTARGET=NVIDIA \
|
||||
# -DCUDA_TOOLKIT_ROOT_DIR=${NVHPC_CUDA_DIR:?} \
|
||||
# -DCUDA_ARCH=$NV_ARCH"
|
||||
|
||||
|
||||
# CMake >= 3.15 only due to Nvidia's Thrust CMake requirements
|
||||
local current=$("$CMAKE_BIN" --version | head -n 1 | cut -d ' ' -f3)
|
||||
local required="3.15.0"
|
||||
if [ "$(printf '%s\n' "$required" "$current" | sort -V | head -n1)" = "$required" ]; then
|
||||
run_build $name "${GCC_CXX:?}" thrust "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/lib64/cmake -DTHRUST_IMPL=CUDA -DBACKEND=CUDA"
|
||||
run_build $name "${GCC_CXX:?}" thrust "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/lib64/cmake -DTHRUST_IMPL=CUDA -DBACKEND=OMP"
|
||||
run_build $name "${GCC_CXX:?}" thrust "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/lib64/cmake -DTHRUST_IMPL=CUDA -DBACKEND=CPP"
|
||||
if check_cmake_ver "3.18.0"; then # CMake >= 3.15 only due to Nvidia's Thrust CMake requirements
|
||||
run_build $name "${GCC_CXX:?}" thrust "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH_CC -DSDK_DIR=$NVHPC_CUDA_DIR/lib64/cmake -DTHRUST_IMPL=CUDA -DBACKEND=CUDA"
|
||||
# run_build $name "${GCC_CXX:?}" thrust "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/lib64/cmake -DTHRUST_IMPL=CUDA -DBACKEND=OMP" # FIXME
|
||||
run_build $name "${GCC_CXX:?}" thrust "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH_CC -DSDK_DIR=$NVHPC_CUDA_DIR/lib64/cmake -DTHRUST_IMPL=CUDA -DBACKEND=CPP"
|
||||
|
||||
# FIXME CUDA Thrust + TBB throws the following error:
|
||||
# /usr/lib/gcc/x86_64-linux-gnu/9/include/avx512fintrin.h(9146): error: identifier "__builtin_ia32_rndscaless_round" is undefined
|
||||
@ -198,7 +212,7 @@ build_gcc() {
|
||||
|
||||
# run_build $name "${GCC_CXX:?}" THRUST "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/lib64/cmake -DTHRUST_IMPL=CUDA -DBACKEND=TBB"
|
||||
else
|
||||
echo "CMake version ${current} < ${required}, skipping Thrust models"
|
||||
echo "Skipping Thrust models due to CMake version requirement"
|
||||
fi
|
||||
|
||||
}
|
||||
@ -216,30 +230,39 @@ build_clang() {
|
||||
run_build "nvidia_$name" "${GCC_CXX:?}" omp "$cxx -DOFFLOAD=NVIDIA:$NV_ARCH"
|
||||
fi
|
||||
|
||||
run_build $name "${CLANG_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?}"
|
||||
if check_cmake_ver "3.20.0"; then
|
||||
run_build $name "${CLANG_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?} -DENABLE_OPENMP=ON"
|
||||
else
|
||||
echo "Skipping RAJA models due to CMake version requirement"
|
||||
fi
|
||||
run_build $name "${CLANG_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH"
|
||||
run_build $name "${CLANG_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DMEM=MANAGED"
|
||||
run_build $name "${CLANG_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DMEM=PAGEFAULT"
|
||||
run_build $name "${CLANG_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON"
|
||||
if check_cmake_ver "3.16.0"; then
|
||||
run_build $name "${CLANG_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON"
|
||||
else
|
||||
echo "Skipping Kokkos models due to CMake version requirement"
|
||||
fi
|
||||
run_build $name "${CLANG_CXX:?}" ocl "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}"
|
||||
|
||||
for use_onedpl in OFF OPENMP TBB; do
|
||||
for use_vector in OFF ON; do
|
||||
case "$use_onedpl" in
|
||||
OFF) dpl_conditional_flags="-DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" ;;
|
||||
*) dpl_conditional_flags="-DUSE_TBB=ON -DCXX_EXTRA_FLAGS=-D_GLIBCXX_USE_TBB_PAR_BACKEND=0" ;;
|
||||
esac
|
||||
run_build $name "${CLANG_CXX:?}" std-data "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl -DUSE_VECTOR=$use_vector "
|
||||
run_build $name "${CLANG_CXX:?}" std-indices "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl -DUSE_VECTOR=$use_vector"
|
||||
# run_build $name "${CLANG_CXX:?}" std-ranges "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl -DUSE_VECTOR=$use_vector" # not yet supported
|
||||
done
|
||||
case "$use_onedpl" in
|
||||
OFF) dpl_conditional_flags="-DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" ;;
|
||||
*) dpl_conditional_flags="-DFETCH_ONEDPL=ON -DFETCH_TBB=ON -DUSE_TBB=ON -DCXX_EXTRA_FLAGS=-D_GLIBCXX_USE_TBB_PAR_BACKEND=0" ;;
|
||||
esac
|
||||
run_build $name "${CLANG_CXX:?}" std-data "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl"
|
||||
run_build $name "${CLANG_CXX:?}" std-indices "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl"
|
||||
# run_build $name "${CLANG_CXX:?}" std-ranges "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl" # not yet supported
|
||||
done
|
||||
|
||||
run_build $name "${CLANG_CXX:?}" tbb "$cxx -DONE_TBB_DIR=$TBB_LIB"
|
||||
run_build $name "${CLANG_CXX:?}" tbb "$cxx" # build TBB again with the system TBB
|
||||
run_build $name "${CLANG_CXX:?}" tbb "$cxx -DUSE_VECTOR=ON" # build with vectors
|
||||
|
||||
run_build $name "${CLANG_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?}"
|
||||
if check_cmake_ver "3.20.0"; then
|
||||
run_build $name "${CLANG_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?} -DENABLE_OPENMP=ON"
|
||||
else
|
||||
echo "Skipping RAJA models due to CMake version requirement"
|
||||
fi
|
||||
# no clang with RAJA+CUDA because it needs nvcc, which needs gcc
|
||||
}
|
||||
|
||||
@ -249,10 +272,6 @@ build_nvhpc() {
|
||||
run_build $name "${NVHPC_NVCXX:?}" std-data "$cxx -DNVHPC_OFFLOAD=$NV_ARCH_CCXY"
|
||||
run_build $name "${NVHPC_NVCXX:?}" std-indices "$cxx -DNVHPC_OFFLOAD=$NV_ARCH_CCXY"
|
||||
|
||||
# std again but with vectors
|
||||
run_build $name "${NVHPC_NVCXX:?}" std-data "$cxx -DNVHPC_OFFLOAD=$NV_ARCH_CCXY -DUSE_VECTOR=ON"
|
||||
run_build $name "${NVHPC_NVCXX:?}" std-indices "$cxx -DNVHPC_OFFLOAD=$NV_ARCH_CCXY -DUSE_VECTOR=ON"
|
||||
|
||||
run_build $name "${NVHPC_NVCXX:?}" acc "$cxx -DTARGET_DEVICE=gpu -DTARGET_PROCESSOR=px -DCUDA_ARCH=$NV_ARCH_CCXY"
|
||||
run_build $name "${NVHPC_NVCXX:?}" acc "$cxx -DTARGET_DEVICE=multicore -DTARGET_PROCESSOR=zen"
|
||||
}
|
||||
@ -291,15 +310,18 @@ build_icpc() {
|
||||
local cxx="-DCMAKE_CXX_COMPILER=${ICPC_CXX:?}"
|
||||
run_build $name "${ICPC_CXX:?}" omp "$cxx"
|
||||
run_build $name "${ICPC_CXX:?}" ocl "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}"
|
||||
run_build $name "${ICPC_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?}"
|
||||
run_build $name "${ICPC_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON"
|
||||
}
|
||||
if check_cmake_ver "3.20.0"; then
|
||||
run_build $name "${ICPC_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?} -DENABLE_OPENMP=ON"
|
||||
else
|
||||
echo "Skipping RAJA models due to CMake version requirement"
|
||||
fi
|
||||
|
||||
if check_cmake_ver "3.16.0"; then
|
||||
run_build $name "${ICPC_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON"
|
||||
else
|
||||
echo "Skipping Kokkos models due to CMake version requirement"
|
||||
fi
|
||||
|
||||
build_computecpp() {
|
||||
run_build computecpp_build "compute++" sycl "-DCMAKE_CXX_COMPILER=${GCC_CXX:?} \
|
||||
-DSYCL_COMPILER=COMPUTECPP \
|
||||
-DSYCL_COMPILER_DIR=${COMPUTECPP_DIR:?} \
|
||||
-DOpenCL_LIBRARY=${OCL_LIB:?}"
|
||||
}
|
||||
|
||||
build_dpcpp() {
|
||||
|
||||
@ -8,8 +8,6 @@ register_flag_optional(RAJA_IN_TREE
|
||||
Make sure to use the release version of RAJA or clone RAJA recursively with submodules.
|
||||
Remember to append RAJA specific flags as well, for example:
|
||||
-DRAJA_IN_TREE=... -DENABLE_OPENMP=ON -DENABLE_CUDA=ON ...
|
||||
For RAJA >= v2022.03.0, remember to use the RAJA prefixed CMake options:
|
||||
-DRAJA_IN_TREE=... -DRAJA_ENABLE_OPENMP=ON -DRAJA_ENABLE_CUDA=ON ...
|
||||
See https://github.com/LLNL/RAJA/blob/08cbbafd2d21589ebf341f7275c229412d0fe903/CMakeLists.txt#L44 for all available options
|
||||
" "")
|
||||
|
||||
|
||||
@ -6,22 +6,10 @@
|
||||
|
||||
#include "STDDataStream.h"
|
||||
|
||||
#ifdef USE_VECTOR
|
||||
#define BEGIN(x) (x).begin()
|
||||
#define END(x) (x).end()
|
||||
#else
|
||||
#define BEGIN(x) (x)
|
||||
#define END(x) ((x) + array_size)
|
||||
#endif
|
||||
|
||||
template <class T>
|
||||
STDDataStream<T>::STDDataStream(const int ARRAY_SIZE, int device)
|
||||
noexcept : array_size{ARRAY_SIZE},
|
||||
#ifdef USE_VECTOR
|
||||
a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE)
|
||||
#else
|
||||
a(alloc_raw<T>(ARRAY_SIZE)), b(alloc_raw<T>(ARRAY_SIZE)), c(alloc_raw<T>(ARRAY_SIZE))
|
||||
#endif
|
||||
{
|
||||
std::cout << "Backing storage typeid: " << typeid(a).name() << std::endl;
|
||||
#ifdef USE_ONEDPL
|
||||
@ -41,55 +29,53 @@ STDDataStream<T>::STDDataStream(const int ARRAY_SIZE, int device)
|
||||
|
||||
template<class T>
|
||||
STDDataStream<T>::~STDDataStream() {
|
||||
#ifndef USE_VECTOR
|
||||
dealloc_raw(a);
|
||||
dealloc_raw(b);
|
||||
dealloc_raw(c);
|
||||
#endif
|
||||
dealloc_raw(a);
|
||||
dealloc_raw(b);
|
||||
dealloc_raw(c);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void STDDataStream<T>::init_arrays(T initA, T initB, T initC)
|
||||
{
|
||||
std::fill(exe_policy, BEGIN(a), END(a), initA);
|
||||
std::fill(exe_policy, BEGIN(b), END(b), initB);
|
||||
std::fill(exe_policy, BEGIN(c), END(c), initC);
|
||||
std::fill(exe_policy, a, a + array_size, initA);
|
||||
std::fill(exe_policy, b, b + array_size, initB);
|
||||
std::fill(exe_policy, c, c + array_size, initC);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void STDDataStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
|
||||
{
|
||||
std::copy(BEGIN(a), END(a), h_a.begin());
|
||||
std::copy(BEGIN(b), END(b), h_b.begin());
|
||||
std::copy(BEGIN(c), END(c), h_c.begin());
|
||||
std::copy(a, a + array_size, h_a.begin());
|
||||
std::copy(b, b + array_size, h_b.begin());
|
||||
std::copy(c, c + array_size, h_c.begin());
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void STDDataStream<T>::copy()
|
||||
{
|
||||
// c[i] = a[i]
|
||||
std::copy(exe_policy, BEGIN(a), END(a), BEGIN(c));
|
||||
std::copy(exe_policy, a, a + array_size, c);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void STDDataStream<T>::mul()
|
||||
{
|
||||
// b[i] = scalar * c[i];
|
||||
std::transform(exe_policy, BEGIN(c), END(c), BEGIN(b), [scalar = startScalar](T ci){ return scalar*ci; });
|
||||
std::transform(exe_policy, c, c + array_size, b, [scalar = startScalar](T ci){ return scalar*ci; });
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void STDDataStream<T>::add()
|
||||
{
|
||||
// c[i] = a[i] + b[i];
|
||||
std::transform(exe_policy, BEGIN(a), END(a), BEGIN(b), BEGIN(c), std::plus<T>());
|
||||
std::transform(exe_policy, a, a + array_size, b, c, std::plus<T>());
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void STDDataStream<T>::triad()
|
||||
{
|
||||
// a[i] = b[i] + scalar * c[i];
|
||||
std::transform(exe_policy, BEGIN(b), END(b), BEGIN(c), BEGIN(a), [scalar = startScalar](T bi, T ci){ return bi+scalar*ci; });
|
||||
std::transform(exe_policy, b, b + array_size, c, a, [scalar = startScalar](T bi, T ci){ return bi+scalar*ci; });
|
||||
}
|
||||
|
||||
template <class T>
|
||||
@ -99,8 +85,8 @@ void STDDataStream<T>::nstream()
|
||||
// Need to do in two stages with C++11 STL.
|
||||
// 1: a[i] += b[i]
|
||||
// 2: a[i] += scalar * c[i];
|
||||
std::transform(exe_policy, BEGIN(a), END(a), BEGIN(b), BEGIN(a), [](T ai, T bi){ return ai + bi; });
|
||||
std::transform(exe_policy, BEGIN(a), END(a), BEGIN(c), BEGIN(a), [scalar = startScalar](T ai, T ci){ return ai + scalar*ci; });
|
||||
std::transform(exe_policy, a, a + array_size, b, a, [](T ai, T bi){ return ai + bi; });
|
||||
std::transform(exe_policy, a, a + array_size, c, a, [scalar = startScalar](T ai, T ci){ return ai + scalar*ci; });
|
||||
}
|
||||
|
||||
|
||||
@ -108,7 +94,7 @@ template <class T>
|
||||
T STDDataStream<T>::dot()
|
||||
{
|
||||
// sum = 0; sum += a[i]*b[i]; return sum;
|
||||
return std::transform_reduce(exe_policy, BEGIN(a), END(a), BEGIN(b), 0.0);
|
||||
return std::transform_reduce(exe_policy, a, a + array_size, b, 0.0);
|
||||
}
|
||||
|
||||
void listDevices(void)
|
||||
@ -127,6 +113,3 @@ std::string getDeviceDriver(const int)
|
||||
}
|
||||
template class STDDataStream<float>;
|
||||
template class STDDataStream<double>;
|
||||
|
||||
#undef BEGIN
|
||||
#undef END
|
||||
|
||||
@ -22,12 +22,7 @@ class STDDataStream : public Stream<T>
|
||||
int array_size;
|
||||
|
||||
// Device side pointers
|
||||
#ifdef USE_VECTOR
|
||||
std::vector<T> a, b, c;
|
||||
#else
|
||||
T *a, *b, *c;
|
||||
#endif
|
||||
|
||||
|
||||
public:
|
||||
STDDataStream(const int, int) noexcept;
|
||||
|
||||
@ -3,10 +3,6 @@ register_flag_optional(CMAKE_CXX_COMPILER
|
||||
"Any CXX compiler that is supported by CMake detection"
|
||||
"c++")
|
||||
|
||||
register_flag_optional(USE_VECTOR
|
||||
"Whether to use std::vector<T> for storage or use aligned_alloc. C++ vectors are *zero* initialised where as aligned_alloc is uninitialised before first use."
|
||||
"OFF")
|
||||
|
||||
register_flag_optional(NVHPC_OFFLOAD
|
||||
"Enable offloading support (via the non-standard `-stdpar`) for the new NVHPC SDK.
|
||||
The values are Nvidia architectures in ccXY format will be passed in via `-gpu=` (e.g `cc70`)
|
||||
@ -47,9 +43,6 @@ macro(setup)
|
||||
register_append_cxx_flags(ANY ${NVHPC_FLAGS})
|
||||
register_append_link_flags(${NVHPC_FLAGS})
|
||||
endif ()
|
||||
if (USE_VECTOR)
|
||||
register_definitions(USE_VECTOR)
|
||||
endif ()
|
||||
if (USE_TBB)
|
||||
register_link_library(TBB::tbb)
|
||||
endif ()
|
||||
|
||||
@ -10,32 +10,10 @@
|
||||
#define ALIGNMENT (2*1024*1024) // 2MB
|
||||
#endif
|
||||
|
||||
#ifdef USE_VECTOR
|
||||
#define BEGIN(x) (x).begin()
|
||||
#define END(x) (x).end()
|
||||
#else
|
||||
#define BEGIN(x) (x)
|
||||
#define END(x) ((x) + array_size)
|
||||
#endif
|
||||
|
||||
#ifdef USE_VECTOR
|
||||
#if (defined(__NVCOMPILER) || defined(__NVCOMPILER_LLVM__))
|
||||
#error "std::vector *is* supported in NVHPC if we capture `this`, however, oneDPL (via SYCL2020) only works correctly with explicit *value* captures."
|
||||
#endif
|
||||
|
||||
#if defined(USE_ONEDPL)
|
||||
#error "std::vector is unspported: oneDPL (via SYCL2020) only works correctly with explicit *value* captures"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
template <class T>
|
||||
STDIndicesStream<T>::STDIndicesStream(const int ARRAY_SIZE, int device)
|
||||
noexcept : array_size{ARRAY_SIZE}, range(0, array_size),
|
||||
#ifdef USE_VECTOR
|
||||
a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE)
|
||||
#else
|
||||
a(alloc_raw<T>(ARRAY_SIZE)), b(alloc_raw<T>(ARRAY_SIZE)), c(alloc_raw<T>(ARRAY_SIZE))
|
||||
#endif
|
||||
{
|
||||
std::cout << "Backing storage typeid: " << typeid(a).name() << std::endl;
|
||||
#ifdef USE_ONEDPL
|
||||
@ -55,41 +33,39 @@ noexcept : array_size{ARRAY_SIZE}, range(0, array_size),
|
||||
|
||||
template<class T>
|
||||
STDIndicesStream<T>::~STDIndicesStream() {
|
||||
#ifndef USE_VECTOR
|
||||
dealloc_raw(a);
|
||||
dealloc_raw(b);
|
||||
dealloc_raw(c);
|
||||
#endif
|
||||
dealloc_raw(a);
|
||||
dealloc_raw(b);
|
||||
dealloc_raw(c);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void STDIndicesStream<T>::init_arrays(T initA, T initB, T initC)
|
||||
{
|
||||
std::fill(exe_policy, BEGIN(a), END(a), initA);
|
||||
std::fill(exe_policy, BEGIN(b), END(b), initB);
|
||||
std::fill(exe_policy, BEGIN(c), END(c), initC);
|
||||
std::fill(exe_policy, a, a + array_size, initA);
|
||||
std::fill(exe_policy, b, b + array_size, initB);
|
||||
std::fill(exe_policy, c, c + array_size, initC);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void STDIndicesStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
|
||||
{
|
||||
std::copy(BEGIN(a), END(a), h_a.begin());
|
||||
std::copy(BEGIN(b), END(b), h_b.begin());
|
||||
std::copy(BEGIN(c), END(c), h_c.begin());
|
||||
std::copy(a, a + array_size, h_a.begin());
|
||||
std::copy(b, b + array_size, h_b.begin());
|
||||
std::copy(c, c + array_size, h_c.begin());
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void STDIndicesStream<T>::copy()
|
||||
{
|
||||
// c[i] = a[i]
|
||||
std::copy(exe_policy, BEGIN(a), END(a), BEGIN(c));
|
||||
std::copy(exe_policy, a, a + array_size, c);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void STDIndicesStream<T>::mul()
|
||||
{
|
||||
// b[i] = scalar * c[i];
|
||||
std::transform(exe_policy, range.begin(), range.end(), BEGIN(b), [c = this->c, scalar = startScalar](int i) {
|
||||
std::transform(exe_policy, range.begin(), range.end(), b, [c = this->c, scalar = startScalar](int i) {
|
||||
return scalar * c[i];
|
||||
});
|
||||
}
|
||||
@ -98,7 +74,7 @@ template <class T>
|
||||
void STDIndicesStream<T>::add()
|
||||
{
|
||||
// c[i] = a[i] + b[i];
|
||||
std::transform(exe_policy, range.begin(), range.end(), BEGIN(c), [a = this->a, b = this->b](int i) {
|
||||
std::transform(exe_policy, range.begin(), range.end(), c, [a = this->a, b = this->b](int i) {
|
||||
return a[i] + b[i];
|
||||
});
|
||||
}
|
||||
@ -107,7 +83,7 @@ template <class T>
|
||||
void STDIndicesStream<T>::triad()
|
||||
{
|
||||
// a[i] = b[i] + scalar * c[i];
|
||||
std::transform(exe_policy, range.begin(), range.end(), BEGIN(a), [b = this->b, c = this->c, scalar = startScalar](int i) {
|
||||
std::transform(exe_policy, range.begin(), range.end(), a, [b = this->b, c = this->c, scalar = startScalar](int i) {
|
||||
return b[i] + scalar * c[i];
|
||||
});
|
||||
}
|
||||
@ -119,7 +95,7 @@ void STDIndicesStream<T>::nstream()
|
||||
// Need to do in two stages with C++11 STL.
|
||||
// 1: a[i] += b[i]
|
||||
// 2: a[i] += scalar * c[i];
|
||||
std::transform(exe_policy, range.begin(), range.end(), BEGIN(a), [a = this->a, b = this->b, c = this->c, scalar = startScalar](int i) {
|
||||
std::transform(exe_policy, range.begin(), range.end(), a, [a = this->a, b = this->b, c = this->c, scalar = startScalar](int i) {
|
||||
return a[i] + b[i] + scalar * c[i];
|
||||
});
|
||||
}
|
||||
@ -129,7 +105,7 @@ template <class T>
|
||||
T STDIndicesStream<T>::dot()
|
||||
{
|
||||
// sum = 0; sum += a[i]*b[i]; return sum;
|
||||
return std::transform_reduce(exe_policy, BEGIN(a), END(a), BEGIN(b), 0.0);
|
||||
return std::transform_reduce(exe_policy, a, a + array_size, b, 0.0);
|
||||
}
|
||||
|
||||
void listDevices(void)
|
||||
@ -148,6 +124,3 @@ std::string getDeviceDriver(const int)
|
||||
}
|
||||
template class STDIndicesStream<float>;
|
||||
template class STDIndicesStream<double>;
|
||||
|
||||
#undef BEGIN
|
||||
#undef END
|
||||
|
||||
@ -77,12 +77,7 @@ class STDIndicesStream : public Stream<T>
|
||||
ranged<int> range;
|
||||
|
||||
// Device side pointers
|
||||
#ifdef USE_VECTOR
|
||||
std::vector<T> a, b, c;
|
||||
#else
|
||||
T *a, *b, *c;
|
||||
#endif
|
||||
|
||||
|
||||
public:
|
||||
STDIndicesStream(const int, int) noexcept;
|
||||
|
||||
@ -3,10 +3,6 @@ register_flag_optional(CMAKE_CXX_COMPILER
|
||||
"Any CXX compiler that is supported by CMake detection"
|
||||
"c++")
|
||||
|
||||
register_flag_optional(USE_VECTOR
|
||||
"Whether to use std::vector<T> for storage or use aligned_alloc. C++ vectors are *zero* initialised where as aligned_alloc is uninitialised before first use."
|
||||
"OFF")
|
||||
|
||||
register_flag_optional(NVHPC_OFFLOAD
|
||||
"Enable offloading support (via the non-standard `-stdpar`) for the new NVHPC SDK.
|
||||
The values are Nvidia architectures in ccXY format will be passed in via `-gpu=` (e.g `cc70`)
|
||||
@ -47,9 +43,6 @@ macro(setup)
|
||||
register_append_cxx_flags(ANY ${NVHPC_FLAGS})
|
||||
register_append_link_flags(${NVHPC_FLAGS})
|
||||
endif ()
|
||||
if (USE_VECTOR)
|
||||
register_definitions(USE_VECTOR)
|
||||
endif ()
|
||||
if (USE_TBB)
|
||||
register_link_library(TBB::tbb)
|
||||
endif ()
|
||||
|
||||
@ -5,27 +5,16 @@
|
||||
// source code
|
||||
|
||||
#include "STDRangesStream.hpp"
|
||||
#include <ranges>
|
||||
|
||||
#ifndef ALIGNMENT
|
||||
#define ALIGNMENT (2*1024*1024) // 2MB
|
||||
#endif
|
||||
|
||||
#ifdef USE_VECTOR
|
||||
#define BEGIN(x) (x).begin()
|
||||
#define END(x) (x).end()
|
||||
#else
|
||||
#define BEGIN(x) (x)
|
||||
#define END(x) ((x) + array_size)
|
||||
#endif
|
||||
|
||||
template <class T>
|
||||
STDRangesStream<T>::STDRangesStream(const int ARRAY_SIZE, int device)
|
||||
noexcept : array_size{ARRAY_SIZE},
|
||||
#ifdef USE_VECTOR
|
||||
a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE)
|
||||
#else
|
||||
a(alloc_raw<T>(ARRAY_SIZE)), b(alloc_raw<T>(ARRAY_SIZE)), c(alloc_raw<T>(ARRAY_SIZE))
|
||||
#endif
|
||||
{
|
||||
std::cout << "Backing storage typeid: " << typeid(a).name() << std::endl;
|
||||
#ifdef USE_ONEDPL
|
||||
@ -45,11 +34,9 @@ noexcept : array_size{ARRAY_SIZE},
|
||||
|
||||
template<class T>
|
||||
STDRangesStream<T>::~STDRangesStream() {
|
||||
#ifndef USE_VECTOR
|
||||
dealloc_raw(a);
|
||||
dealloc_raw(b);
|
||||
dealloc_raw(c);
|
||||
#endif
|
||||
dealloc_raw(a);
|
||||
dealloc_raw(b);
|
||||
dealloc_raw(c);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
@ -70,9 +57,9 @@ template <class T>
|
||||
void STDRangesStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
|
||||
{
|
||||
// Element-wise copy.
|
||||
std::copy(BEGIN(a), END(a), h_a.begin());
|
||||
std::copy(BEGIN(b), END(b), h_b.begin());
|
||||
std::copy(BEGIN(c), END(c), h_c.begin());
|
||||
std::copy(a, a + array_size, h_a.begin());
|
||||
std::copy(b, b + array_size, h_b.begin());
|
||||
std::copy(c, c + array_size, h_c.begin());
|
||||
}
|
||||
|
||||
template <class T>
|
||||
@ -148,7 +135,7 @@ T STDRangesStream<T>::dot()
|
||||
return
|
||||
std::transform_reduce(
|
||||
exe_policy,
|
||||
BEGIN(a), END(a), BEGIN(b), 0.0);
|
||||
a, a + array_size, b, 0.0);
|
||||
}
|
||||
|
||||
void listDevices(void)
|
||||
@ -168,6 +155,3 @@ std::string getDeviceDriver(const int)
|
||||
|
||||
template class STDRangesStream<float>;
|
||||
template class STDRangesStream<double>;
|
||||
|
||||
#undef BEGIN
|
||||
#undef END
|
||||
|
||||
@ -21,11 +21,7 @@ class STDRangesStream : public Stream<T>
|
||||
int array_size;
|
||||
|
||||
// Device side pointers
|
||||
#ifdef USE_VECTOR
|
||||
std::vector<T> a, b, c;
|
||||
#else
|
||||
T *a, *b, *c;
|
||||
#endif
|
||||
|
||||
public:
|
||||
STDRangesStream(const int, int) noexcept;
|
||||
|
||||
@ -3,10 +3,6 @@ register_flag_optional(CMAKE_CXX_COMPILER
|
||||
"Any CXX compiler that is supported by CMake detection and supports C++20 Ranges"
|
||||
"c++")
|
||||
|
||||
register_flag_optional(USE_VECTOR
|
||||
"Whether to use std::vector<T> for storage or use aligned_alloc. C++ vectors are *zero* initialised where as aligned_alloc is uninitialised before first use."
|
||||
"OFF")
|
||||
|
||||
register_flag_optional(USE_TBB
|
||||
"No-op if ONE_TBB_DIR is set. Link against an in-tree oneTBB via FetchContent_Declare, see top level CMakeLists.txt for details."
|
||||
"OFF")
|
||||
@ -32,10 +28,7 @@ macro(setup)
|
||||
set(CMAKE_CXX_STANDARD_REQUIRED OFF)
|
||||
unset(CMAKE_CXX_STANDARD) # drop any existing standard we have set by default
|
||||
# and append our own:
|
||||
register_append_cxx_flags(ANY -std=c++2a)
|
||||
if (USE_VECTOR)
|
||||
register_definitions(USE_VECTOR)
|
||||
endif ()
|
||||
register_append_cxx_flags(ANY -std=c++20)
|
||||
if (USE_TBB)
|
||||
register_link_library(TBB::tbb)
|
||||
endif ()
|
||||
@ -44,3 +37,10 @@ macro(setup)
|
||||
register_link_library(oneDPL)
|
||||
endif ()
|
||||
endmacro()
|
||||
|
||||
macro(setup_target NAME)
|
||||
if (USE_ONEDPL)
|
||||
target_compile_features(${NAME} INTERFACE cxx_std_20)
|
||||
target_compile_features(oneDPL INTERFACE cxx_std_20)
|
||||
endif ()
|
||||
endmacro()
|
||||
|
||||
@ -46,11 +46,12 @@ macro(setup)
|
||||
# see CUDA.cmake, we're only adding a few Thrust related libraries here
|
||||
|
||||
if (POLICY CMP0104)
|
||||
cmake_policy(SET CMP0104 OLD)
|
||||
cmake_policy(SET CMP0104 NEW)
|
||||
endif ()
|
||||
|
||||
set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH})
|
||||
# add -forward-unknown-to-host-compiler for compatibility reasons
|
||||
set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "--expt-extended-lambda -forward-unknown-to-host-compiler -arch=${CUDA_ARCH}" ${CUDA_EXTRA_FLAGS})
|
||||
set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "--expt-extended-lambda " ${CUDA_EXTRA_FLAGS})
|
||||
enable_language(CUDA)
|
||||
# CMake defaults to -O2 for CUDA at Release, let's wipe that and use the global RELEASE_FLAG
|
||||
# appended later
|
||||
@ -63,6 +64,7 @@ macro(setup)
|
||||
# XXX NVHPC >= 22.3 has cub-config in `Linux_x86_64/22.3/cuda/11.6/lib64/cmake/cub/`
|
||||
# same thing for thrust
|
||||
if (SDK_DIR)
|
||||
list(APPEND CMAKE_PREFIX_PATH ${SDK_DIR})
|
||||
find_package(CUB REQUIRED CONFIG PATHS ${SDK_DIR}/cub)
|
||||
find_package(Thrust REQUIRED CONFIG PATHS ${SDK_DIR}/thrust)
|
||||
else ()
|
||||
@ -73,9 +75,11 @@ macro(setup)
|
||||
message(STATUS "Using Thrust backend: ${BACKEND}")
|
||||
|
||||
# this creates the interface that we can link to
|
||||
thrust_create_target(Thrust HOST CPP DEVICE ${BACKEND})
|
||||
thrust_create_target(Thrust${BACKEND}
|
||||
HOST CPP
|
||||
DEVICE ${BACKEND})
|
||||
|
||||
register_link_library(Thrust)
|
||||
register_link_library(Thrust${BACKEND})
|
||||
elseif (${THRUST_IMPL} STREQUAL "ROCM")
|
||||
if (SDK_DIR)
|
||||
find_package(rocprim REQUIRED CONFIG PATHS ${SDK_DIR}/rocprim)
|
||||
|
||||
Loading…
Reference in New Issue
Block a user