Fix up CI, resolves #145, supersedes #154

Drop vector for std-* models
Tom Lin 2023-09-24 21:11:35 +01:00
parent 3dcafd1af1
commit 72be9f6980
15 changed files with 278 additions and 250 deletions

View File

@ -12,7 +12,7 @@ on:
jobs:
test-rust:
runs-on: ubuntu-18.04
runs-on: ubuntu-22.04
defaults:
run:
working-directory: ./src/rust/rust-stream
@ -28,7 +28,7 @@ jobs:
run: ./target/release/rust-stream --arraysize 2048
test-java:
runs-on: ubuntu-18.04
runs-on: ubuntu-22.04
defaults:
run:
working-directory: ./src/java/java-stream
@ -41,7 +41,7 @@ jobs:
run: java -jar target/java-stream.jar --arraysize 2048
test-julia:
runs-on: ubuntu-18.04
runs-on: ubuntu-22.04
defaults:
run:
working-directory: ./src/julia/JuliaStream.jl
@ -69,8 +69,24 @@ jobs:
run: julia --project src/AMDGPUStream.jl --list
setup-cpp:
runs-on: ubuntu-22.04
steps:
- name: Cache compiler
# if: ${{ !env.ACT }}
id: prepare-compilers
uses: actions/cache@v2
with:
path: ./compilers
key: ${{ runner.os }}-${{ hashFiles('./src/ci-prepare-bionic.sh') }}
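# any edit to ci-prepare-bionic.sh changes this key, so the cached ./compilers directory is rebuilt whenever the setup script changes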
- name: Prepare compilers
if: steps.prepare-compilers.outputs.cache-hit != 'true'
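# sourced so exported variables persist; `|| true` keeps the step green even if the script exits non-zero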
run: source ./src/ci-prepare-bionic.sh ./compilers SETUP false || true
test-cpp:
runs-on: ubuntu-18.04
needs: setup-cpp
runs-on: ubuntu-22.04
steps:
- uses: actions/checkout@v2
@ -84,15 +100,15 @@ jobs:
- name: Prepare compilers
if: steps.prepare-compilers.outputs.cache-hit != 'true'
run: source ./src/ci-prepare-bionic.sh ./compilers SETUP true || true
run: source ./src/ci-prepare-bionic.sh ./compilers SETUP false || true
- name: Setup test environment
run: source ./src/ci-prepare-bionic.sh ./compilers VARS false || true
# Enable tmate debugging of manually-triggered workflows if the input option was provided
- name: Setup tmate session
uses: mxschmitt/action-tmate@v3
if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.debug_enabled }}
# - name: Setup tmate session
# uses: mxschmitt/action-tmate@v3
# if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.debug_enabled }}
- name: Test compile gcc @ CMake 3.13
if: ${{ ! cancelled() }}
@ -167,4 +183,54 @@ jobs:
run: ./src/ci-test-compile.sh ./build dpcpp all ${{ env.CMAKE_3_18_BIN }}
- name: Test compile hipsycl @ CMake 3.18
if: ${{ ! cancelled() }}
run: ./src/ci-test-compile.sh ./build hipsycl all ${{ env.CMAKE_3_18_BIN }}
run: ./src/ci-test-compile.sh ./build hipsycl all ${{ env.CMAKE_3_18_BIN }}
- name: Test compile gcc @ CMake 3.20
if: ${{ ! cancelled() }}
run: ./src/ci-test-compile.sh ./build gcc all ${{ env.CMAKE_3_20_BIN }}
- name: Test compile clang @ CMake 3.20
if: ${{ ! cancelled() }}
run: ./src/ci-test-compile.sh ./build clang all ${{ env.CMAKE_3_20_BIN }}
- name: Test compile nvhpc @ CMake 3.20
if: ${{ ! cancelled() }}
run: ./src/ci-test-compile.sh ./build nvhpc all ${{ env.CMAKE_3_20_BIN }}
- name: Test compile aocc @ CMake 3.20
if: ${{ ! cancelled() }}
run: ./src/ci-test-compile.sh ./build aocc all ${{ env.CMAKE_3_20_BIN }}
- name: Test compile aomp @ CMake 3.20
if: ${{ ! cancelled() }}
run: ./src/ci-test-compile.sh ./build aomp all ${{ env.CMAKE_3_20_BIN }}
- name: Test compile hip @ CMake 3.20
if: ${{ ! cancelled() }}
run: ./src/ci-test-compile.sh ./build hip all ${{ env.CMAKE_3_20_BIN }}
- name: Test compile dpcpp @ CMake 3.20
if: ${{ ! cancelled() }}
run: ./src/ci-test-compile.sh ./build dpcpp all ${{ env.CMAKE_3_20_BIN }}
- name: Test compile hipsycl @ CMake 3.20
if: ${{ ! cancelled() }}
run: ./src/ci-test-compile.sh ./build hipsycl all ${{ env.CMAKE_3_20_BIN }}
- name: Test compile gcc @ CMake 3.24
if: ${{ ! cancelled() }}
run: ./src/ci-test-compile.sh ./build gcc all ${{ env.CMAKE_3_24_BIN }}
- name: Test compile clang @ CMake 3.24
if: ${{ ! cancelled() }}
run: ./src/ci-test-compile.sh ./build clang all ${{ env.CMAKE_3_24_BIN }}
- name: Test compile nvhpc @ CMake 3.24
if: ${{ ! cancelled() }}
run: ./src/ci-test-compile.sh ./build nvhpc all ${{ env.CMAKE_3_24_BIN }}
- name: Test compile aocc @ CMake 3.24
if: ${{ ! cancelled() }}
run: ./src/ci-test-compile.sh ./build aocc all ${{ env.CMAKE_3_24_BIN }}
- name: Test compile aomp @ CMake 3.24
if: ${{ ! cancelled() }}
run: ./src/ci-test-compile.sh ./build aomp all ${{ env.CMAKE_3_24_BIN }}
- name: Test compile hip @ CMake 3.24
if: ${{ ! cancelled() }}
run: ./src/ci-test-compile.sh ./build hip all ${{ env.CMAKE_3_24_BIN }}
- name: Test compile dpcpp @ CMake 3.24
if: ${{ ! cancelled() }}
run: ./src/ci-test-compile.sh ./build dpcpp all ${{ env.CMAKE_3_24_BIN }}
- name: Test compile hipsycl @ CMake 3.24
if: ${{ ! cancelled() }}
run: ./src/ci-test-compile.sh ./build hipsycl all ${{ env.CMAKE_3_24_BIN }}

View File

@ -1,5 +1,9 @@
cmake_minimum_required(VERSION 3.13 FATAL_ERROR)
if (CMAKE_VERSION VERSION_GREATER_EQUAL "3.24.0")
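# CMP0135 (new in CMake 3.24) stamps URL-downloaded FetchContent/ExternalProject archives with their extraction time; opting in to NEW avoids the policy warning on 3.24+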
cmake_policy(SET CMP0135 NEW)
endif ()
project(BabelStream VERSION 4.0 LANGUAGES CXX)
# uncomment for debugging build issues:
@ -71,15 +75,19 @@ hint_flag(CXX_EXTRA_LINKER_FLAGS "
# Honor user's CXX_EXTRA_LINK_FLAGS
set(CXX_EXTRA_LINK_FLAGS ${CXX_EXTRA_FLAGS} ${CXX_EXTRA_LINK_FLAGS})
option(USE_TBB "Enable oneTBB library for *supported* models. Enabling this on models that
option(USE_TBB "Enable the oneTBB library for *supported* models. Enabling this on models that
don't explicitly link against TBB is a no-op, see description of your selected
model on how this is used." OFF)
if (USE_TBB)
option(FETCH_TBB "Fetch (download) the oneTBB library for *supported* models. This uses CMake's
FetchContent feature. Specify version by setting FETCH_TBB_VERSION" OFF)
set(FETCH_TBB_VERSION "v2021.10.0" CACHE STRING "Specify version of oneTBB to use if FETCH_TBB is ON")
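# a CACHE variable, so the pinned tag can be overridden at configure time, e.g. -DFETCH_TBB_VERSION=v2021.9.0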
if (FETCH_TBB)
FetchContent_Declare(
TBB
GIT_REPOSITORY https://github.com/oneapi-src/oneTBB.git
GIT_TAG v2021.9.0
GIT_TAG "${FETCH_TBB_VERSION}"
)
# Don't fail builds on warnings (TBB has -Wall while not being free of warnings from unused symbols...)
set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)
@ -92,15 +100,19 @@ if (USE_TBB)
endif ()
endif ()
option(USE_ONEDPL "Enable oneDPL library for *supported* models. Enabling this on models that
option(USE_ONEDPL "Enable the oneDPL library for *supported* models. Enabling this on models that
don't explicitly link against DPL is a no-op, see description of your selected
model on how this is used." OFF)
if (USE_ONEDPL)
option(FETCH_ONEDPL "Fetch (download) the oneDPL library for *supported* models. This uses CMake's
FetchContent feature. Specify version by setting FETCH_ONEDPL_VERSION" OFF)
set(FETCH_ONEDPL_VERSION "oneDPL-2022.2.0-rc1" CACHE STRING "Specify version of oneDPL to use if FETCH_ONEDPL is ON")
if (FETCH_ONEDPL)
FetchContent_Declare(
oneDPL
GIT_REPOSITORY https://github.com/oneapi-src/oneDPL.git
GIT_TAG oneDPL-2022.2.0-rc1
GIT_TAG "${FETCH_ONEDPL_VERSION}"
)
string(TOLOWER ${USE_ONEDPL} ONEDPL_BACKEND)
# XXX oneDPL looks for omp instead of openmp, which mismatches(!) with ONEDPL_PAR_BACKEND if using find_package

View File

@ -83,6 +83,8 @@ get() {
if [ ! -f "$name" ] || [ "$FORCE_DOWNLOAD" = true ]; then
echo "$name not found, downloading..."
wget -q --show-progress --progress=bar:force:noscroll "$pkg_url" -O "$name"
else
echo "$name found, skipping download..."
fi
fi
}
@ -92,13 +94,15 @@ get_and_untar() {
local pkg_url="$2"
if [ "$SETUP" = true ]; then
if [ ! -f "$name" ] || [ "$FORCE_DOWNLOAD" = true ]; then
echo "$name not found, downloading..."
echo "$name not found, downloading ($pkg_url)..."
wget -q --show-progress --progress=bar:force:noscroll "$pkg_url" -O "$name"
fi
echo "Preparing to extract $name ..."
tar -xf "$name"
echo "$name extracted, deleting archive ..."
rm -f "$name" # delete for space
else
echo "Skipping setup for $name ($pkg_url)..."
fi
}
@ -119,10 +123,10 @@ verify_dir_exists() {
setup_aocc() {
echo "Preparing AOCC"
local aocc_ver="2.3.0"
local aocc_ver="4.0.0"
local tarball="aocc-$aocc_ver.tar.xz"
# XXX it's actually XZ compressed, so it should be tar.xz
local AOCC_URL="http://developer.amd.com/wordpress/media/files/aocc-compiler-2.3.0.tar"
local AOCC_URL="https://download.amd.com/developer/eula/aocc-compiler/aocc-compiler-${aocc_ver}.tar"
# local AOCC_URL="http://localhost:8000/aocc-compiler-2.3.0.tar"
get_and_untar "$tarball" "$AOCC_URL"
@ -133,10 +137,10 @@ setup_aocc() {
}
setup_nvhpc() {
echo "Preparing Nvidia HPC SDK"
local nvhpc_ver="22.3"
local nvhpc_release="2022_223"
local cuda_ver="11.6"
echo "Preparing Nvidia HPC SDK"
local nvhpc_ver="23.1" # TODO FIXME > 23.1 has a bug with -A
local nvhpc_release="2023_231"
local cuda_ver="12.0"
local tarball="nvhpc_$nvhpc_ver.tar.gz"
@ -145,7 +149,7 @@ setup_nvhpc() {
local sdk_dir="$PWD/nvhpc_${nvhpc_release}_Linux_x86_64_cuda_$cuda_ver/install_components/Linux_x86_64/$nvhpc_ver"
local bin_dir="$sdk_dir/compilers/bin"
"$bin_dir/makelocalrc" "$bin_dir" -x
"$bin_dir/makelocalrc" -d "$bin_dir" -x -gpp g++-12 -gcc gcc-12 -g77 gfortran-12
export_var NVHPC_SDK_DIR "$sdk_dir"
export_var NVHPC_CUDA_DIR "$sdk_dir/cuda/$cuda_ver"
@ -166,7 +170,8 @@ setup_nvhpc() {
setup_aomp() {
echo "Preparing AOMP"
local AOMP_URL="https://github.com/ROCm-Developer-Tools/aomp/releases/download/rel_11.12-0/aomp_Ubuntu1804_11.12-0_amd64.deb"
local aomp_ver="18.0-0"
local AOMP_URL="https://github.com/ROCm-Developer-Tools/aomp/releases/download/rel_${aomp_ver}/aomp_Ubuntu2204_${aomp_ver}_amd64.deb"
# local AOMP_URL="http://0.0.0.0:8000/aomp_Ubuntu1804_11.12-0_amd64.deb"
get_and_install_deb "aomp" "aomp" "$AOMP_URL"
@ -189,9 +194,10 @@ setup_oclcpu() {
setup_kokkos() {
echo "Preparing Kokkos"
local kokkos_ver="3.3.01"
local kokkos_ver="4.1.00"
local tarball="kokkos-$kokkos_ver.tar.gz"
local url="https://github.com/kokkos/kokkos/archive/$kokkos_ver.tar.gz"
# local url="http://localhost:8000/$kokkos_ver.tar.gz"
@ -203,10 +209,10 @@ setup_kokkos() {
setup_raja() {
echo "Preparing RAJA"
local raja_ver="0.13.0"
local raja_ver="2023.06.1"
local tarball="raja-$raja_ver.tar.gz"
local url="https://github.com/LLNL/RAJA/releases/download/v0.13.0/RAJA-v$raja_ver.tar.gz"
local url="https://github.com/LLNL/RAJA/releases/download/v$raja_ver/RAJA-v$raja_ver.tar.gz"
# local url="http://localhost:8000/RAJA-v$raja_ver.tar.gz"
get_and_untar "$tarball" "$url"
@ -217,7 +223,7 @@ setup_raja() {
setup_tbb() {
echo "Preparing TBB"
local tbb_ver="2021.2.0"
local tbb_ver="2021.9.0"
local tarball="oneapi-tbb-$tbb_ver-lin.tgz"
local url="https://github.com/oneapi-src/oneTBB/releases/download/v$tbb_ver/oneapi-tbb-$tbb_ver-lin.tgz"
@ -231,9 +237,9 @@ setup_tbb() {
setup_clang_gcc() {
sudo apt-get install -y -qq gcc-10-offload-nvptx gcc-10-offload-amdgcn libtbb2 libtbb-dev g++-10 clang libomp-dev
sudo apt-get install -y -qq gcc-12-offload-nvptx gcc-12-offload-amdgcn libtbb2 libtbb-dev g++-12 clang libomp-dev libc6
export_var GCC_CXX "$(which g++-10)"
export_var GCC_CXX "$(which g++-12)"
verify_bin_exists "$GCC_CXX"
"$GCC_CXX" --version
@ -254,7 +260,7 @@ setup_clang_gcc() {
}
setup_rocm() {
sudo apt-get install -y -qq rocm-dev rocthrust-dev
sudo apt-get install -y rocm-dev rocthrust-dev
export_var ROCM_PATH "/opt/rocm"
export_var PATH "$ROCM_PATH/bin:$PATH" # ROCm needs this for many of their libraries' CMake build to work
export_var HIP_CXX "$ROCM_PATH/bin/hipcc"
@ -265,7 +271,7 @@ setup_rocm() {
setup_dpcpp() {
local nightly="20210106"
local nightly="20230615"
local tarball="dpcpp-$nightly.tar.gz"
local url="https://github.com/intel/llvm/releases/download/sycl-nightly/$nightly/dpcpp-compiler.tar.gz"
@ -282,22 +288,22 @@ setup_dpcpp() {
setup_hipsycl() {
sudo apt-get install -y -qq libboost-fiber-dev libboost-context-dev
local hipsycl_ver="0.9.0"
local hipsycl_ver="0.9.1"
local tarball="v$hipsycl_ver.tar.gz"
local install_dir="$PWD/hipsycl_dist_$hipsycl_ver"
local url="https://github.com/illuhad/hipSYCL/archive/v$hipsycl_ver.tar.gz"
# local url="http://localhost:8000/hipSYCL-$hipsycl_ver.tar.gz"
local url="https://github.com/AdaptiveCpp/AdaptiveCpp/archive/v$hipsycl_ver.tar.gz"
# local url="http://localhost:8000/AdaptiveCpp-$hipsycl_ver.tar.gz"
get_and_untar "$tarball" "$url"
if [ "$SETUP" = true ]; then
local src="$PWD/hipSYCL-$hipsycl_ver"
local src="$PWD/AdaptiveCpp-$hipsycl_ver"
rm -rf "$src/build"
rm -rf "$install_dir"
cmake "-B$src/build" "-H$src" \
-DCMAKE_C_COMPILER="$(which gcc-10)" \
-DCMAKE_CXX_COMPILER="$(which g++-10)" \
-DCMAKE_C_COMPILER="$(which gcc-12)" \
-DCMAKE_CXX_COMPILER="$(which g++-12)" \
-DCMAKE_INSTALL_PREFIX="$install_dir" \
-DWITH_ROCM_BACKEND=OFF \
-DWITH_CUDA_BACKEND=OFF \
@ -312,25 +318,20 @@ setup_hipsycl() {
check_size
}
setup_computecpp() {
echo "TODO ComputeCpp requires registration+login to download"
}
if [ "${GITHUB_ACTIONS:-false}" = true ]; then
echo "Running in GitHub Actions, defaulting to special export"
TERM=xterm
export TERM=xterm
# drop the lock in case we got one from a failed run
rm /var/lib/dpkg/lock-frontend || true
rm /var/cache/apt/archives/lock || true
wget -q -O - "https://repo.radeon.com/rocm/rocm.gpg.key" | sudo apt-key add -
echo "deb http://archive.ubuntu.com/ubuntu focal main universe" | sudo tee -a /etc/apt/sources.list
echo 'deb [arch=amd64] https://repo.radeon.com/rocm/apt/4.5 ubuntu main' | sudo tee /etc/apt/sources.list.d/rocm.list
rm -rf /var/lib/dpkg/lock-frontend || true
rm -rf /var/cache/apt/archives/lock || true
mkdir --parents --mode=0755 /etc/apt/keyrings
wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | gpg --dearmor | sudo tee /etc/apt/keyrings/rocm.gpg > /dev/null
echo 'deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/5.7 jammy main' | sudo tee /etc/apt/sources.list.d/rocm.list
echo -e 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600' | sudo tee /etc/apt/preferences.d/rocm-pin-600
sudo apt-get update -qq
sudo apt-get install -y -qq cmake
sudo apt-get install cmake gcc g++ libelf-dev libdrm-amdgpu1 libnuma-dev
if [ "$SETUP" = true ]; then
echo "Deleting extra packages for space in 2 seconds..."
@ -340,6 +341,7 @@ if [ "${GITHUB_ACTIONS:-false}" = true ]; then
sudo apt-get autoremove -y
check_size
fi
sudo apt-get upgrade -qq
else
echo "Running locally, defaulting to standard export"
fi
@ -368,6 +370,18 @@ setup_cmake() {
verify_bin_exists "$CMAKE_3_18_BIN"
"$CMAKE_3_18_BIN" --version
get "cmake-3.20.sh" "$cmake_release/v3.20.4/cmake-3.20.4-linux-x86_64.sh"
chmod +x "./cmake-3.20.sh" && "./cmake-3.20.sh" --skip-license --include-subdir
export_var CMAKE_3_20_BIN "$PWD/cmake-3.20.4-linux-x86_64/bin/cmake"
verify_bin_exists "$CMAKE_3_20_BIN"
"$CMAKE_3_20_BIN" --version
get "cmake-3.24.sh" "$cmake_release/v3.24.4/cmake-3.24.4-linux-x86_64.sh"
chmod +x "./cmake-3.24.sh" && "./cmake-3.24.sh" --skip-license --include-subdir
export_var CMAKE_3_24_BIN "$PWD/cmake-3.24.4-linux-x86_64/bin/cmake"
verify_bin_exists "$CMAKE_3_24_BIN"
"$CMAKE_3_24_BIN" --version
check_size
}
@ -385,6 +399,10 @@ if [ "$PARALLEL" = true ]; then
setup_tbb &
wait
else
# these need apt
setup_clang_gcc
setup_rocm
setup_hipsycl
setup_cmake
setup_aocc
setup_oclcpu
@ -394,10 +412,6 @@ else
setup_kokkos
setup_raja
setup_tbb
# these need apt
setup_clang_gcc
setup_rocm
setup_hipsycl
fi
echo "Done!"

View File

@ -120,10 +120,21 @@ run_build() {
# CLANG_OMP_OFFLOAD_NVIDIA=false
###
NV_ARCH_CC="70"
AMD_ARCH="gfx_903"
NV_ARCH="sm_70"
NV_ARCH="sm_${NV_ARCH_CC}"
NV_ARCH_CCXY="cuda${NVHPC_CUDA_VER:?},cc80"
check_cmake_ver(){
local current=$("$CMAKE_BIN" --version | head -n 1 | cut -d ' ' -f3)
local required=$1
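# sort -V orders version strings; if $required sorts first (or ties), then $current >= $required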
if [ "$(printf '%s\n' "$required" "$current" | sort -V | head -n1)" = "$required" ]; then
return 0
else
return 1
fi
}
build_gcc() {
local name="gcc_build"
local cxx="-DCMAKE_CXX_COMPILER=${GCC_CXX:?}"
@ -138,14 +149,12 @@ build_gcc() {
for use_onedpl in OFF OPENMP TBB; do
case "$use_onedpl" in
OFF) dpl_conditional_flags="-DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}" ;;
*) dpl_conditional_flags="-DUSE_TBB=ON -DCXX_EXTRA_FLAGS=-D_GLIBCXX_USE_TBB_PAR_BACKEND=0" ;;
*) dpl_conditional_flags="-DFETCH_ONEDPL=ON -DFETCH_TBB=ON -DUSE_TBB=ON -DCXX_EXTRA_FLAGS=-D_GLIBCXX_USE_TBB_PAR_BACKEND=0" ;;
esac
for use_vector in OFF ON; do
# some distributions like Ubuntu bionic implement std par with TBB, so conditionally link it here
run_build $name "${GCC_CXX:?}" std-data "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl -DUSE_VECTOR=$use_vector"
run_build $name "${GCC_CXX:?}" std-indices "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl -DUSE_VECTOR=$use_vector"
run_build $name "${GCC_CXX:?}" std-ranges "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl -DUSE_VECTOR=$use_vector"
done
# some distributions like Ubuntu bionic implement std par with TBB, so conditionally link it here
run_build $name "${GCC_CXX:?}" std-data "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl"
run_build $name "${GCC_CXX:?}" std-indices "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl"
run_build $name "${GCC_CXX:?}" std-ranges "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl"
done
run_build $name "${GCC_CXX:?}" tbb "$cxx -DONE_TBB_DIR=$TBB_LIB"
@ -153,40 +162,45 @@ build_gcc() {
run_build $name "${GCC_CXX:?}" tbb "$cxx -DUSE_VECTOR=ON" # build with vectors
if [ "${GCC_OMP_OFFLOAD_AMD:-false}" != "false" ]; then
run_build "amd_$name" "${GCC_CXX:?}" acc "$cxx -DCXX_EXTRA_FLAGS=-foffload=amdgcn-amdhsa"
run_build "amd_$name" "${GCC_CXX:?}" acc "$cxx -DCXX_EXTRA_FLAGS=-foffload=amdgcn-amdhsa;-fno-stack-protector;-fcf-protection=none"
run_build "amd_$name" "${GCC_CXX:?}" omp "$cxx -DOFFLOAD=AMD:$AMD_ARCH"
fi
if [ "${GCC_OMP_OFFLOAD_NVIDIA:-false}" != "false" ]; then
run_build "nvidia_$name" "${GCC_CXX:?}" acc "$cxx -DCXX_EXTRA_FLAGS=-foffload=nvptx-none"
run_build "nvidia_$name" "${GCC_CXX:?}" acc "$cxx -DCXX_EXTRA_FLAGS=-foffload=nvptx-none;-fno-stack-protector;-fcf-protection=none"
run_build "nvidia_$name" "${GCC_CXX:?}" omp "$cxx -DOFFLOAD=NVIDIA:$NV_ARCH"
fi
run_build $name "${GCC_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH"
run_build $name "${GCC_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DMEM=MANAGED"
run_build $name "${GCC_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DMEM=PAGEFAULT"
# run_build $name "${CC_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_CUDA=ON"
run_build "cuda_$name" "${GCC_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON"
if check_cmake_ver "3.16.0"; then
# run_build $name "${CC_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_CUDA=ON"
run_build "cuda_$name" "${GCC_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON"
else
echo "Skipping Kokkos models due to CMake version requirement"
fi
run_build $name "${GCC_CXX:?}" ocl "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}"
run_build $name "${GCC_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?}"
if check_cmake_ver "3.20.0"; then
run_build $name "${GCC_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?} -DENABLE_OPENMP=ON"
else
echo "Skipping RAJA models due to CMake version requirement"
fi
# FIXME fails due to https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100102
# FIXME we also got https://github.com/NVIDIA/nccl/issues/494
if check_cmake_ver "3.20.0"; then
run_build "cuda_$name" "${GCC_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?} \
-DENABLE_CUDA=ON \
-DTARGET=NVIDIA \
-DCUDA_TOOLKIT_ROOT_DIR=${NVHPC_CUDA_DIR:?} \
-DCUDA_ARCH=$NV_ARCH"
else
echo "Skipping RAJA models due to CMake version requirement"
fi
# run_build "cuda_$name" "${GCC_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?} \
# -DENABLE_CUDA=ON \
# -DTARGET=NVIDIA \
# -DCUDA_TOOLKIT_ROOT_DIR=${NVHPC_CUDA_DIR:?} \
# -DCUDA_ARCH=$NV_ARCH"
# CMake >= 3.15 only due to Nvidia's Thrust CMake requirements
local current=$("$CMAKE_BIN" --version | head -n 1 | cut -d ' ' -f3)
local required="3.15.0"
if [ "$(printf '%s\n' "$required" "$current" | sort -V | head -n1)" = "$required" ]; then
run_build $name "${GCC_CXX:?}" thrust "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/lib64/cmake -DTHRUST_IMPL=CUDA -DBACKEND=CUDA"
run_build $name "${GCC_CXX:?}" thrust "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/lib64/cmake -DTHRUST_IMPL=CUDA -DBACKEND=OMP"
run_build $name "${GCC_CXX:?}" thrust "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/lib64/cmake -DTHRUST_IMPL=CUDA -DBACKEND=CPP"
if check_cmake_ver "3.18.0"; then # CMake >= 3.15 only due to Nvidia's Thrust CMake requirements
run_build $name "${GCC_CXX:?}" thrust "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH_CC -DSDK_DIR=$NVHPC_CUDA_DIR/lib64/cmake -DTHRUST_IMPL=CUDA -DBACKEND=CUDA"
# run_build $name "${GCC_CXX:?}" thrust "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/lib64/cmake -DTHRUST_IMPL=CUDA -DBACKEND=OMP" # FIXME
run_build $name "${GCC_CXX:?}" thrust "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH_CC -DSDK_DIR=$NVHPC_CUDA_DIR/lib64/cmake -DTHRUST_IMPL=CUDA -DBACKEND=CPP"
# FIXME CUDA Thrust + TBB throws the following error:
# /usr/lib/gcc/x86_64-linux-gnu/9/include/avx512fintrin.h(9146): error: identifier "__builtin_ia32_rndscaless_round" is undefined
@ -198,7 +212,7 @@ build_gcc() {
# run_build $name "${GCC_CXX:?}" THRUST "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/lib64/cmake -DTHRUST_IMPL=CUDA -DBACKEND=TBB"
else
echo "CMake version ${current} < ${required}, skipping Thrust models"
echo "Skipping Thrust models due to CMake version requirement"
fi
}
@ -216,30 +230,39 @@ build_clang() {
run_build "nvidia_$name" "${GCC_CXX:?}" omp "$cxx -DOFFLOAD=NVIDIA:$NV_ARCH"
fi
run_build $name "${CLANG_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?}"
if check_cmake_ver "3.20.0"; then
run_build $name "${CLANG_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?} -DENABLE_OPENMP=ON"
else
echo "Skipping RAJA models due to CMake version requirement"
fi
run_build $name "${CLANG_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH"
run_build $name "${CLANG_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DMEM=MANAGED"
run_build $name "${CLANG_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DMEM=PAGEFAULT"
run_build $name "${CLANG_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON"
if check_cmake_ver "3.16.0"; then
run_build $name "${CLANG_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON"
else
echo "Skipping Kokkos models due to CMake version requirement"
fi
run_build $name "${CLANG_CXX:?}" ocl "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}"
for use_onedpl in OFF OPENMP TBB; do
for use_vector in OFF ON; do
case "$use_onedpl" in
OFF) dpl_conditional_flags="-DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" ;;
*) dpl_conditional_flags="-DUSE_TBB=ON -DCXX_EXTRA_FLAGS=-D_GLIBCXX_USE_TBB_PAR_BACKEND=0" ;;
esac
run_build $name "${CLANG_CXX:?}" std-data "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl -DUSE_VECTOR=$use_vector "
run_build $name "${CLANG_CXX:?}" std-indices "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl -DUSE_VECTOR=$use_vector"
# run_build $name "${CLANG_CXX:?}" std-ranges "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl -DUSE_VECTOR=$use_vector" # not yet supported
done
case "$use_onedpl" in
OFF) dpl_conditional_flags="-DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" ;;
*) dpl_conditional_flags="-DFETCH_ONEDPL=ON -DFETCH_TBB=ON -DUSE_TBB=ON -DCXX_EXTRA_FLAGS=-D_GLIBCXX_USE_TBB_PAR_BACKEND=0" ;;
esac
run_build $name "${CLANG_CXX:?}" std-data "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl"
run_build $name "${CLANG_CXX:?}" std-indices "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl"
# run_build $name "${CLANG_CXX:?}" std-ranges "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl" # not yet supported
done
run_build $name "${CLANG_CXX:?}" tbb "$cxx -DONE_TBB_DIR=$TBB_LIB"
run_build $name "${CLANG_CXX:?}" tbb "$cxx" # build TBB again with the system TBB
run_build $name "${CLANG_CXX:?}" tbb "$cxx -DUSE_VECTOR=ON" # build with vectors
run_build $name "${CLANG_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?}"
if check_cmake_ver "3.20.0"; then
run_build $name "${CLANG_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?} -DENABLE_OPENMP=ON"
else
echo "Skipping RAJA models due to CMake version requirement"
fi
# no clang /w RAJA+cuda because it needs nvcc which needs gcc
}
@ -249,10 +272,6 @@ build_nvhpc() {
run_build $name "${NVHPC_NVCXX:?}" std-data "$cxx -DNVHPC_OFFLOAD=$NV_ARCH_CCXY"
run_build $name "${NVHPC_NVCXX:?}" std-indices "$cxx -DNVHPC_OFFLOAD=$NV_ARCH_CCXY"
# std again but with vectors
run_build $name "${NVHPC_NVCXX:?}" std-data "$cxx -DNVHPC_OFFLOAD=$NV_ARCH_CCXY -DUSE_VECTOR=ON"
run_build $name "${NVHPC_NVCXX:?}" std-indices "$cxx -DNVHPC_OFFLOAD=$NV_ARCH_CCXY -DUSE_VECTOR=ON"
run_build $name "${NVHPC_NVCXX:?}" acc "$cxx -DTARGET_DEVICE=gpu -DTARGET_PROCESSOR=px -DCUDA_ARCH=$NV_ARCH_CCXY"
run_build $name "${NVHPC_NVCXX:?}" acc "$cxx -DTARGET_DEVICE=multicore -DTARGET_PROCESSOR=zen"
}
@ -291,15 +310,18 @@ build_icpc() {
local cxx="-DCMAKE_CXX_COMPILER=${ICPC_CXX:?}"
run_build $name "${ICPC_CXX:?}" omp "$cxx"
run_build $name "${ICPC_CXX:?}" ocl "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}"
run_build $name "${ICPC_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?}"
run_build $name "${ICPC_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON"
}
if check_cmake_ver "3.20.0"; then
run_build $name "${ICPC_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?} -DENABLE_OPENMP=ON"
else
echo "Skipping RAJA models due to CMake version requirement"
fi
if check_cmake_ver "3.16.0"; then
run_build $name "${ICPC_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON"
else
echo "Skipping Kokkos models due to CMake version requirement"
fi
build_computecpp() {
run_build computecpp_build "compute++" sycl "-DCMAKE_CXX_COMPILER=${GCC_CXX:?} \
-DSYCL_COMPILER=COMPUTECPP \
-DSYCL_COMPILER_DIR=${COMPUTECPP_DIR:?} \
-DOpenCL_LIBRARY=${OCL_LIB:?}"
}
build_dpcpp() {

View File

@ -8,8 +8,6 @@ register_flag_optional(RAJA_IN_TREE
Make sure to use the release version of RAJA or clone RAJA recursively with submodules.
Remember to append RAJA specific flags as well, for example:
-DRAJA_IN_TREE=... -DENABLE_OPENMP=ON -DENABLE_CUDA=ON ...
For RAJA >= v2022.03.0, remember to use the RAJA prefixed CMake options:
-DRAJA_IN_TREE=... -DRAJA_ENABLE_OPENMP=ON -DRAJA_ENABLE_CUDA=ON ...
See https://github.com/LLNL/RAJA/blob/08cbbafd2d21589ebf341f7275c229412d0fe903/CMakeLists.txt#L44 for all available options
" "")

View File

@ -6,22 +6,10 @@
#include "STDDataStream.h"
#ifdef USE_VECTOR
#define BEGIN(x) (x).begin()
#define END(x) (x).end()
#else
#define BEGIN(x) (x)
#define END(x) ((x) + array_size)
#endif
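The BEGIN/END macros above only papered over the vector-vs-pointer split, so dropping std::vector lets them go as well. Had both storage kinds been kept, a macro-free alternative would have been std::span, which views either one; a minimal C++20 sketch for illustration, not part of this commit:

#include <algorithm>
#include <execution>
#include <span>

// std::span<T> converts from std::vector<T>& as well as from {T*, count},
// so a single begin()/end() pair covers both storage choices.
template <typename T>
void fill_view(std::span<T> s, T v) {
  std::fill(std::execution::par_unseq, s.begin(), s.end(), v);
}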
template <class T>
STDDataStream<T>::STDDataStream(const int ARRAY_SIZE, int device)
noexcept : array_size{ARRAY_SIZE},
#ifdef USE_VECTOR
a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE)
#else
a(alloc_raw<T>(ARRAY_SIZE)), b(alloc_raw<T>(ARRAY_SIZE)), c(alloc_raw<T>(ARRAY_SIZE))
#endif
{
std::cout << "Backing storage typeid: " << typeid(a).name() << std::endl;
#ifdef USE_ONEDPL
@ -41,55 +29,53 @@ STDDataStream<T>::STDDataStream(const int ARRAY_SIZE, int device)
template<class T>
STDDataStream<T>::~STDDataStream() {
#ifndef USE_VECTOR
dealloc_raw(a);
dealloc_raw(b);
dealloc_raw(c);
#endif
dealloc_raw(a);
dealloc_raw(b);
dealloc_raw(c);
}
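The alloc_raw/dealloc_raw helpers are defined elsewhere in the tree; the sketch below is only an assumption consistent with these call sites and the 2 MB ALIGNMENT macro in the stream sources, not the project's exact implementation:

#include <cstddef>
#include <cstdlib>
#include <new>

template <typename T>
T *alloc_raw(size_t n) {
  size_t bytes = sizeof(T) * n;
  // std::aligned_alloc requires the size to be a multiple of the alignment
  bytes = (bytes + ALIGNMENT - 1) / ALIGNMENT * ALIGNMENT;
  void *p = std::aligned_alloc(ALIGNMENT, bytes);
  if (!p) throw std::bad_alloc{};
  return static_cast<T *>(p);
}

template <typename T>
void dealloc_raw(T *ptr) { std::free(ptr); }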
template <class T>
void STDDataStream<T>::init_arrays(T initA, T initB, T initC)
{
std::fill(exe_policy, BEGIN(a), END(a), initA);
std::fill(exe_policy, BEGIN(b), END(b), initB);
std::fill(exe_policy, BEGIN(c), END(c), initC);
std::fill(exe_policy, a, a + array_size, initA);
std::fill(exe_policy, b, b + array_size, initB);
std::fill(exe_policy, c, c + array_size, initC);
}
template <class T>
void STDDataStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
{
std::copy(BEGIN(a), END(a), h_a.begin());
std::copy(BEGIN(b), END(b), h_b.begin());
std::copy(BEGIN(c), END(c), h_c.begin());
std::copy(a, a + array_size, h_a.begin());
std::copy(b, b + array_size, h_b.begin());
std::copy(c, c + array_size, h_c.begin());
}
template <class T>
void STDDataStream<T>::copy()
{
// c[i] = a[i]
std::copy(exe_policy, BEGIN(a), END(a), BEGIN(c));
std::copy(exe_policy, a, a + array_size, c);
}
template <class T>
void STDDataStream<T>::mul()
{
// b[i] = scalar * c[i];
std::transform(exe_policy, BEGIN(c), END(c), BEGIN(b), [scalar = startScalar](T ci){ return scalar*ci; });
std::transform(exe_policy, c, c + array_size, b, [scalar = startScalar](T ci){ return scalar*ci; });
}
template <class T>
void STDDataStream<T>::add()
{
// c[i] = a[i] + b[i];
std::transform(exe_policy, BEGIN(a), END(a), BEGIN(b), BEGIN(c), std::plus<T>());
std::transform(exe_policy, a, a + array_size, b, c, std::plus<T>());
}
template <class T>
void STDDataStream<T>::triad()
{
// a[i] = b[i] + scalar * c[i];
std::transform(exe_policy, BEGIN(b), END(b), BEGIN(c), BEGIN(a), [scalar = startScalar](T bi, T ci){ return bi+scalar*ci; });
std::transform(exe_policy, b, b + array_size, c, a, [scalar = startScalar](T bi, T ci){ return bi+scalar*ci; });
}
template <class T>
@ -99,8 +85,8 @@ void STDDataStream<T>::nstream()
// Need to do in two stages with C++11 STL.
// 1: a[i] += b[i]
// 2: a[i] += scalar * c[i];
std::transform(exe_policy, BEGIN(a), END(a), BEGIN(b), BEGIN(a), [](T ai, T bi){ return ai + bi; });
std::transform(exe_policy, BEGIN(a), END(a), BEGIN(c), BEGIN(a), [scalar = startScalar](T ai, T ci){ return ai + scalar*ci; });
std::transform(exe_policy, a, a + array_size, b, a, [](T ai, T bi){ return ai + bi; });
std::transform(exe_policy, a, a + array_size, c, a, [scalar = startScalar](T ai, T ci){ return ai + scalar*ci; });
}
@ -108,7 +94,7 @@ template <class T>
T STDDataStream<T>::dot()
{
// sum = 0; sum += a[i]*b[i]; return sum;
return std::transform_reduce(exe_policy, BEGIN(a), END(a), BEGIN(b), 0.0);
return std::transform_reduce(exe_policy, a, a + array_size, b, 0.0);
}
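One subtlety survives the rewrite: the 0.0 literal fixes the reduction's accumulator type to double even when T is float. Accumulating in T instead would be spelled as follows (hypothetical variant, not what the code does):

return std::transform_reduce(exe_policy, a, a + array_size, b, T{0});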
void listDevices(void)
@ -127,6 +113,3 @@ std::string getDeviceDriver(const int)
}
template class STDDataStream<float>;
template class STDDataStream<double>;
#undef BEGIN
#undef END

View File

@ -22,12 +22,7 @@ class STDDataStream : public Stream<T>
int array_size;
// Device side pointers
#ifdef USE_VECTOR
std::vector<T> a, b, c;
#else
T *a, *b, *c;
#endif
public:
STDDataStream(const int, int) noexcept;

View File

@ -3,10 +3,6 @@ register_flag_optional(CMAKE_CXX_COMPILER
"Any CXX compiler that is supported by CMake detection"
"c++")
register_flag_optional(USE_VECTOR
"Whether to use std::vector<T> for storage or use aligned_alloc. C++ vectors are *zero* initialised where as aligned_alloc is uninitialised before first use."
"OFF")
register_flag_optional(NVHPC_OFFLOAD
"Enable offloading support (via the non-standard `-stdpar`) for the new NVHPC SDK.
The values are Nvidia architectures in ccXY format and will be passed in via `-gpu=` (e.g. `cc70`)
@ -47,9 +43,6 @@ macro(setup)
register_append_cxx_flags(ANY ${NVHPC_FLAGS})
register_append_link_flags(${NVHPC_FLAGS})
endif ()
if (USE_VECTOR)
register_definitions(USE_VECTOR)
endif ()
if (USE_TBB)
register_link_library(TBB::tbb)
endif ()

View File

@ -10,32 +10,10 @@
#define ALIGNMENT (2*1024*1024) // 2MB
#endif
#ifdef USE_VECTOR
#define BEGIN(x) (x).begin()
#define END(x) (x).end()
#else
#define BEGIN(x) (x)
#define END(x) ((x) + array_size)
#endif
#ifdef USE_VECTOR
#if (defined(__NVCOMPILER) || defined(__NVCOMPILER_LLVM__))
#error "std::vector *is* supported in NVHPC if we capture `this`, however, oneDPL (via SYCL2020) only works correctly with explicit *value* captures."
#endif
#if defined(USE_ONEDPL)
#error "std::vector is unspported: oneDPL (via SYCL2020) only works correctly with explicit *value* captures"
#endif
#endif
template <class T>
STDIndicesStream<T>::STDIndicesStream(const int ARRAY_SIZE, int device)
noexcept : array_size{ARRAY_SIZE}, range(0, array_size),
#ifdef USE_VECTOR
a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE)
#else
a(alloc_raw<T>(ARRAY_SIZE)), b(alloc_raw<T>(ARRAY_SIZE)), c(alloc_raw<T>(ARRAY_SIZE))
#endif
{
std::cout << "Backing storage typeid: " << typeid(a).name() << std::endl;
#ifdef USE_ONEDPL
@ -55,41 +33,39 @@ noexcept : array_size{ARRAY_SIZE}, range(0, array_size),
template<class T>
STDIndicesStream<T>::~STDIndicesStream() {
#ifndef USE_VECTOR
dealloc_raw(a);
dealloc_raw(b);
dealloc_raw(c);
#endif
dealloc_raw(a);
dealloc_raw(b);
dealloc_raw(c);
}
template <class T>
void STDIndicesStream<T>::init_arrays(T initA, T initB, T initC)
{
std::fill(exe_policy, BEGIN(a), END(a), initA);
std::fill(exe_policy, BEGIN(b), END(b), initB);
std::fill(exe_policy, BEGIN(c), END(c), initC);
std::fill(exe_policy, a, a + array_size, initA);
std::fill(exe_policy, b, b + array_size, initB);
std::fill(exe_policy, c, c + array_size, initC);
}
template <class T>
void STDIndicesStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
{
std::copy(BEGIN(a), END(a), h_a.begin());
std::copy(BEGIN(b), END(b), h_b.begin());
std::copy(BEGIN(c), END(c), h_c.begin());
std::copy(a, a + array_size, h_a.begin());
std::copy(b, b + array_size, h_b.begin());
std::copy(c, c + array_size, h_c.begin());
}
template <class T>
void STDIndicesStream<T>::copy()
{
// c[i] = a[i]
std::copy(exe_policy, BEGIN(a), END(a), BEGIN(c));
std::copy(exe_policy, a, a + array_size, c);
}
template <class T>
void STDIndicesStream<T>::mul()
{
// b[i] = scalar * c[i];
std::transform(exe_policy, range.begin(), range.end(), BEGIN(b), [c = this->c, scalar = startScalar](int i) {
std::transform(exe_policy, range.begin(), range.end(), b, [c = this->c, scalar = startScalar](int i) {
return scalar * c[i];
});
}
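Here range is the ranged<int> member constructed as range(0, array_size): its iterators yield consecutive indices, letting std::transform loop over positions rather than elements. The real helper lives in a shared header; the sketch below is an assumed minimal version matching these call sites (formally only an input iterator, since it yields by value):

#include <cstddef>
#include <iterator>

template <typename N>
class ranged {
  N lo, hi;
public:
  ranged(N lo, N hi) : lo(lo), hi(hi) {}
  class iterator {
    N i;
  public:
    using iterator_category = std::forward_iterator_tag;
    using value_type = N;
    using difference_type = std::ptrdiff_t;
    using pointer = const N *;
    using reference = N; // indices are produced by value
    explicit iterator(N i) : i(i) {}
    N operator*() const { return i; }
    iterator &operator++() { ++i; return *this; }
    iterator operator++(int) { iterator t = *this; ++i; return t; }
    bool operator==(const iterator &o) const { return i == o.i; }
    bool operator!=(const iterator &o) const { return i != o.i; }
  };
  iterator begin() const { return iterator(lo); }
  iterator end() const { return iterator(hi); }
};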
@ -98,7 +74,7 @@ template <class T>
void STDIndicesStream<T>::add()
{
// c[i] = a[i] + b[i];
std::transform(exe_policy, range.begin(), range.end(), BEGIN(c), [a = this->a, b = this->b](int i) {
std::transform(exe_policy, range.begin(), range.end(), c, [a = this->a, b = this->b](int i) {
return a[i] + b[i];
});
}
@ -107,7 +83,7 @@ template <class T>
void STDIndicesStream<T>::triad()
{
// a[i] = b[i] + scalar * c[i];
std::transform(exe_policy, range.begin(), range.end(), BEGIN(a), [b = this->b, c = this->c, scalar = startScalar](int i) {
std::transform(exe_policy, range.begin(), range.end(), a, [b = this->b, c = this->c, scalar = startScalar](int i) {
return b[i] + scalar * c[i];
});
}
@ -119,7 +95,7 @@ void STDIndicesStream<T>::nstream()
// Need to do in two stages with C++11 STL.
// 1: a[i] += b[i]
// 2: a[i] += scalar * c[i];
std::transform(exe_policy, range.begin(), range.end(), BEGIN(a), [a = this->a, b = this->b, c = this->c, scalar = startScalar](int i) {
std::transform(exe_policy, range.begin(), range.end(), a, [a = this->a, b = this->b, c = this->c, scalar = startScalar](int i) {
return a[i] + b[i] + scalar * c[i];
});
}
@ -129,7 +105,7 @@ template <class T>
T STDIndicesStream<T>::dot()
{
// sum = 0; sum += a[i]*b[i]; return sum;
return std::transform_reduce(exe_policy, BEGIN(a), END(a), BEGIN(b), 0.0);
return std::transform_reduce(exe_policy, a, a + array_size, b, 0.0);
}
void listDevices(void)
@ -148,6 +124,3 @@ std::string getDeviceDriver(const int)
}
template class STDIndicesStream<float>;
template class STDIndicesStream<double>;
#undef BEGIN
#undef END

View File

@ -77,12 +77,7 @@ class STDIndicesStream : public Stream<T>
ranged<int> range;
// Device side pointers
#ifdef USE_VECTOR
std::vector<T> a, b, c;
#else
T *a, *b, *c;
#endif
public:
STDIndicesStream(const int, int) noexcept;

View File

@ -3,10 +3,6 @@ register_flag_optional(CMAKE_CXX_COMPILER
"Any CXX compiler that is supported by CMake detection"
"c++")
register_flag_optional(USE_VECTOR
"Whether to use std::vector<T> for storage or use aligned_alloc. C++ vectors are *zero* initialised where as aligned_alloc is uninitialised before first use."
"OFF")
register_flag_optional(NVHPC_OFFLOAD
"Enable offloading support (via the non-standard `-stdpar`) for the new NVHPC SDK.
The values are Nvidia architectures in ccXY format and will be passed in via `-gpu=` (e.g. `cc70`)
@ -47,9 +43,6 @@ macro(setup)
register_append_cxx_flags(ANY ${NVHPC_FLAGS})
register_append_link_flags(${NVHPC_FLAGS})
endif ()
if (USE_VECTOR)
register_definitions(USE_VECTOR)
endif ()
if (USE_TBB)
register_link_library(TBB::tbb)
endif ()

View File

@ -5,27 +5,16 @@
// source code
#include "STDRangesStream.hpp"
#include <ranges>
#ifndef ALIGNMENT
#define ALIGNMENT (2*1024*1024) // 2MB
#endif
#ifdef USE_VECTOR
#define BEGIN(x) (x).begin()
#define END(x) (x).end()
#else
#define BEGIN(x) (x)
#define END(x) ((x) + array_size)
#endif
template <class T>
STDRangesStream<T>::STDRangesStream(const int ARRAY_SIZE, int device)
noexcept : array_size{ARRAY_SIZE},
#ifdef USE_VECTOR
a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE)
#else
a(alloc_raw<T>(ARRAY_SIZE)), b(alloc_raw<T>(ARRAY_SIZE)), c(alloc_raw<T>(ARRAY_SIZE))
#endif
{
std::cout << "Backing storage typeid: " << typeid(a).name() << std::endl;
#ifdef USE_ONEDPL
@ -45,11 +34,9 @@ noexcept : array_size{ARRAY_SIZE},
template<class T>
STDRangesStream<T>::~STDRangesStream() {
#ifndef USE_VECTOR
dealloc_raw(a);
dealloc_raw(b);
dealloc_raw(c);
#endif
dealloc_raw(a);
dealloc_raw(b);
dealloc_raw(c);
}
template <class T>
@ -70,9 +57,9 @@ template <class T>
void STDRangesStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
{
// Element-wise copy.
std::copy(BEGIN(a), END(a), h_a.begin());
std::copy(BEGIN(b), END(b), h_b.begin());
std::copy(BEGIN(c), END(c), h_c.begin());
std::copy(a, a + array_size, h_a.begin());
std::copy(b, b + array_size, h_b.begin());
std::copy(c, c + array_size, h_c.begin());
}
template <class T>
@ -148,7 +135,7 @@ T STDRangesStream<T>::dot()
return
std::transform_reduce(
exe_policy,
BEGIN(a), END(a), BEGIN(b), 0.0);
a, a + array_size, b, 0.0);
}
void listDevices(void)
@ -168,6 +155,3 @@ std::string getDeviceDriver(const int)
template class STDRangesStream<float>;
template class STDRangesStream<double>;
#undef BEGIN
#undef END

View File

@ -21,11 +21,7 @@ class STDRangesStream : public Stream<T>
int array_size;
// Device side pointers
#ifdef USE_VECTOR
std::vector<T> a, b, c;
#else
T *a, *b, *c;
#endif
public:
STDRangesStream(const int, int) noexcept;

View File

@ -3,10 +3,6 @@ register_flag_optional(CMAKE_CXX_COMPILER
"Any CXX compiler that is supported by CMake detection and supports C++20 Ranges"
"c++")
register_flag_optional(USE_VECTOR
"Whether to use std::vector<T> for storage or use aligned_alloc. C++ vectors are *zero* initialised where as aligned_alloc is uninitialised before first use."
"OFF")
register_flag_optional(USE_TBB
"No-op if ONE_TBB_DIR is set. Link against an in-tree oneTBB via FetchContent_Declare, see top level CMakeLists.txt for details."
"OFF")
@ -32,10 +28,7 @@ macro(setup)
set(CMAKE_CXX_STANDARD_REQUIRED OFF)
unset(CMAKE_CXX_STANDARD) # drop any existing standard we have set by default
# and append our own:
register_append_cxx_flags(ANY -std=c++2a)
if (USE_VECTOR)
register_definitions(USE_VECTOR)
endif ()
register_append_cxx_flags(ANY -std=c++20)
if (USE_TBB)
register_link_library(TBB::tbb)
endif ()
@ -44,3 +37,10 @@ macro(setup)
register_link_library(oneDPL)
endif ()
endmacro()
macro(setup_target NAME)
if (USE_ONEDPL)
target_compile_features(${NAME} INTERFACE cxx_std_20)
target_compile_features(oneDPL INTERFACE cxx_std_20)
endif ()
endmacro()

View File

@ -46,11 +46,12 @@ macro(setup)
# see CUDA.cmake, we're only adding a few Thrust related libraries here
if (POLICY CMP0104)
cmake_policy(SET CMP0104 OLD)
cmake_policy(SET CMP0104 NEW)
endif ()
set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH})
# add -forward-unknown-to-host-compiler for compatibility reasons
set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "--expt-extended-lambda -forward-unknown-to-host-compiler -arch=${CUDA_ARCH}" ${CUDA_EXTRA_FLAGS})
set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "--expt-extended-lambda " ${CUDA_EXTRA_FLAGS})
enable_language(CUDA)
# CMake defaults to -O2 for CUDA at Release, let's wipe that and use the global RELEASE_FLAG
# appended later
@ -63,6 +64,7 @@ macro(setup)
# XXX NVHPC >= 22.3 has cub-config in `Linux_x86_64/22.3/cuda/11.6/lib64/cmake/cub/`
# same thing for thrust
if (SDK_DIR)
list(APPEND CMAKE_PREFIX_PATH ${SDK_DIR})
find_package(CUB REQUIRED CONFIG PATHS ${SDK_DIR}/cub)
find_package(Thrust REQUIRED CONFIG PATHS ${SDK_DIR}/thrust)
else ()
@ -73,9 +75,11 @@ macro(setup)
message(STATUS "Using Thrust backend: ${BACKEND}")
# this creates the interface that we can link to
thrust_create_target(Thrust HOST CPP DEVICE ${BACKEND})
thrust_create_target(Thrust${BACKEND}
HOST CPP
DEVICE ${BACKEND})
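# the interface target is now named per backend (e.g. ThrustCUDA), so the link call below must match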
register_link_library(Thrust)
register_link_library(Thrust${BACKEND})
elseif (${THRUST_IMPL} STREQUAL "ROCM")
if (SDK_DIR)
find_package(rocprim REQUIRED CONFIG PATHS ${SDK_DIR}/rocprim)