From 72be9f698035b542bc1734dcbbd14543db55e244 Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Sun, 24 Sep 2023 21:11:35 +0100 Subject: [PATCH] Fix up CI, resolves #145, supersedes #154 Drop vector for std-* models --- .github/workflows/main.yaml | 84 +++++++++++++++-- CMakeLists.txt | 24 +++-- src/ci-prepare-bionic.sh | 92 +++++++++++-------- src/ci-test-compile.sh | 132 ++++++++++++++++----------- src/raja/model.cmake | 2 - src/std-data/STDDataStream.cpp | 49 ++++------ src/std-data/STDDataStream.h | 5 - src/std-data/model.cmake | 7 -- src/std-indices/STDIndicesStream.cpp | 57 +++--------- src/std-indices/STDIndicesStream.h | 5 - src/std-indices/model.cmake | 7 -- src/std-ranges/STDRangesStream.cpp | 32 ++----- src/std-ranges/STDRangesStream.hpp | 4 - src/std-ranges/model.cmake | 16 ++-- src/thrust/model.cmake | 12 ++- 15 files changed, 278 insertions(+), 250 deletions(-) diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml index 08eed2d..8dc6905 100644 --- a/.github/workflows/main.yaml +++ b/.github/workflows/main.yaml @@ -12,7 +12,7 @@ on: jobs: test-rust: - runs-on: ubuntu-18.04 + runs-on: ubuntu-22.04 defaults: run: working-directory: ./src/rust/rust-stream @@ -28,7 +28,7 @@ jobs: run: ./target/release/rust-stream --arraysize 2048 test-java: - runs-on: ubuntu-18.04 + runs-on: ubuntu-22.04 defaults: run: working-directory: ./src/java/java-stream @@ -41,7 +41,7 @@ jobs: run: java -jar target/java-stream.jar --arraysize 2048 test-julia: - runs-on: ubuntu-18.04 + runs-on: ubuntu-22.04 defaults: run: working-directory: ./src/julia/JuliaStream.jl @@ -69,8 +69,24 @@ jobs: run: julia --project src/AMDGPUStream.jl --list + setup-cpp: + runs-on: ubuntu-22.04 + steps: + - name: Cache compiler + # if: ${{ !env.ACT }} + id: prepare-compilers + uses: actions/cache@v2 + with: + path: ./compilers + key: ${{ runner.os }}-${{ hashFiles('./src/ci-prepare-bionic.sh') }} + + - name: Prepare compilers + if: steps.prepare-compilers.outputs.cache-hit != 'true' + run: source ./src/ci-prepare-bionic.sh ./compilers SETUP false || true + test-cpp: - runs-on: ubuntu-18.04 + needs: setup-cpp + runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v2 @@ -84,15 +100,15 @@ jobs: - name: Prepare compilers if: steps.prepare-compilers.outputs.cache-hit != 'true' - run: source ./src/ci-prepare-bionic.sh ./compilers SETUP true || true + run: source ./src/ci-prepare-bionic.sh ./compilers SETUP false || true - name: Setup test environment run: source ./src/ci-prepare-bionic.sh ./compilers VARS false || true # Enable tmate debugging of manually-triggered workflows if the input option was provided - - name: Setup tmate session - uses: mxschmitt/action-tmate@v3 - if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.debug_enabled }} + # - name: Setup tmate session + # uses: mxschmitt/action-tmate@v3 + # if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.debug_enabled }} - name: Test compile gcc @ CMake 3.13 if: ${{ ! cancelled() }} @@ -167,4 +183,54 @@ jobs: run: ./src/ci-test-compile.sh ./build dpcpp all ${{ env.CMAKE_3_18_BIN }} - name: Test compile hipsycl @ CMake 3.18 if: ${{ ! cancelled() }} - run: ./src/ci-test-compile.sh ./build hipsycl all ${{ env.CMAKE_3_18_BIN }} \ No newline at end of file + run: ./src/ci-test-compile.sh ./build hipsycl all ${{ env.CMAKE_3_18_BIN }} + + - name: Test compile gcc @ CMake 3.20 + if: ${{ ! cancelled() }} + run: ./src/ci-test-compile.sh ./build gcc all ${{ env.CMAKE_3_20_BIN }} + - name: Test compile clang @ CMake 3.20 + if: ${{ ! 
cancelled() }} + run: ./src/ci-test-compile.sh ./build clang all ${{ env.CMAKE_3_20_BIN }} + - name: Test compile nvhpc @ CMake 3.20 + if: ${{ ! cancelled() }} + run: ./src/ci-test-compile.sh ./build nvhpc all ${{ env.CMAKE_3_20_BIN }} + - name: Test compile aocc @ CMake 3.20 + if: ${{ ! cancelled() }} + run: ./src/ci-test-compile.sh ./build aocc all ${{ env.CMAKE_3_20_BIN }} + - name: Test compile aomp @ CMake 3.20 + if: ${{ ! cancelled() }} + run: ./src/ci-test-compile.sh ./build aomp all ${{ env.CMAKE_3_20_BIN }} + - name: Test compile hip @ CMake 3.20 + if: ${{ ! cancelled() }} + run: ./src/ci-test-compile.sh ./build hip all ${{ env.CMAKE_3_20_BIN }} + - name: Test compile dpcpp @ CMake 3.20 + if: ${{ ! cancelled() }} + run: ./src/ci-test-compile.sh ./build dpcpp all ${{ env.CMAKE_3_20_BIN }} + - name: Test compile hipsycl @ CMake 3.20 + if: ${{ ! cancelled() }} + run: ./src/ci-test-compile.sh ./build hipsycl all ${{ env.CMAKE_3_20_BIN }} + + - name: Test compile gcc @ CMake 3.24 + if: ${{ ! cancelled() }} + run: ./src/ci-test-compile.sh ./build gcc all ${{ env.CMAKE_3_24_BIN }} + - name: Test compile clang @ CMake 3.24 + if: ${{ ! cancelled() }} + run: ./src/ci-test-compile.sh ./build clang all ${{ env.CMAKE_3_24_BIN }} + - name: Test compile nvhpc @ CMake 3.24 + if: ${{ ! cancelled() }} + run: ./src/ci-test-compile.sh ./build nvhpc all ${{ env.CMAKE_3_24_BIN }} + - name: Test compile aocc @ CMake 3.24 + if: ${{ ! cancelled() }} + run: ./src/ci-test-compile.sh ./build aocc all ${{ env.CMAKE_3_24_BIN }} + - name: Test compile aomp @ CMake 3.24 + if: ${{ ! cancelled() }} + run: ./src/ci-test-compile.sh ./build aomp all ${{ env.CMAKE_3_24_BIN }} + - name: Test compile hip @ CMake 3.24 + if: ${{ ! cancelled() }} + run: ./src/ci-test-compile.sh ./build hip all ${{ env.CMAKE_3_24_BIN }} + - name: Test compile dpcpp @ CMake 3.24 + if: ${{ ! cancelled() }} + run: ./src/ci-test-compile.sh ./build dpcpp all ${{ env.CMAKE_3_24_BIN }} + - name: Test compile hipsycl @ CMake 3.24 + if: ${{ ! cancelled() }} + run: ./src/ci-test-compile.sh ./build hipsycl all ${{ env.CMAKE_3_24_BIN }} \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index da112a4..879e463 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,5 +1,9 @@ cmake_minimum_required(VERSION 3.13 FATAL_ERROR) +if (CMAKE_VERSION VERSION_GREATER_EQUAL "3.24.0") + cmake_policy(SET CMP0135 NEW) +endif () + project(BabelStream VERSION 4.0 LANGUAGES CXX) # uncomment for debugging build issues: @@ -71,15 +75,19 @@ hint_flag(CXX_EXTRA_LINKER_FLAGS " # Honor user's CXX_EXTRA_LINK_FLAGS set(CXX_EXTRA_LINK_FLAGS ${CXX_EXTRA_FLAGS} ${CXX_EXTRA_LINK_FLAGS}) -option(USE_TBB "Enable oneTBB library for *supported* models. Enabling this on models that +option(USE_TBB "Enable the oneTBB library for *supported* models. Enabling this on models that don't explicitly link against TBB is a no-op, see description of your selected model on how this is used." OFF) -if (USE_TBB) +option(FETCH_TBB "Fetch (download) the oneTBB library for *supported* models. This uses CMake's + FetchContent feature. Specify version by setting FETCH_TBB_VERSION" OFF) +set(FETCH_TBB_VERSION "v2021.10.0" CACHE STRING "Specify version of oneTBB to use if FETCH_TBB is ON") + +if (FETCH_TBB) FetchContent_Declare( TBB GIT_REPOSITORY https://github.com/oneapi-src/oneTBB.git - GIT_TAG v2021.9.0 + GIT_TAG "${FETCH_TBB_VERSION}" ) # Don't fail builds on waring (TBB has -Wall while not being free of warnings from unused symbols...) 
    set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)
@@ -92,15 +100,19 @@ if (USE_TBB)
     endif ()
 endif ()
 
-option(USE_ONEDPL "Enable oneDPL library for *supported* models. Enabling this on models that
+option(USE_ONEDPL "Enable the oneDPL library for *supported* models. Enabling this on models that
         don't explicitly link against DPL is a no-op, see description of your selected
         model on how this is used." OFF)
 
-if (USE_ONEDPL)
+option(FETCH_ONEDPL "Fetch (download) the oneDPL library for *supported* models. This uses CMake's
+        FetchContent feature. Specify version by setting FETCH_ONEDPL_VERSION" OFF)
+set(FETCH_ONEDPL_VERSION "oneDPL-2022.2.0-rc1" CACHE STRING "Specify version of oneDPL to use if FETCH_ONEDPL is ON")
+
+if (FETCH_ONEDPL)
     FetchContent_Declare(
             oneDPL
             GIT_REPOSITORY https://github.com/oneapi-src/oneDPL.git
-            GIT_TAG oneDPL-2022.2.0-rc1
+            GIT_TAG "${FETCH_ONEDPL_VERSION}"
     )
     string(TOLOWER ${USE_ONEDPL} ONEDPL_BACKEND)
     # XXX oneDPL looks for omp instead of openmp, which mismatches(!) with ONEDPL_PAR_BACKEND if using find_package
diff --git a/src/ci-prepare-bionic.sh b/src/ci-prepare-bionic.sh
index 78bbd33..6a1a959 100755
--- a/src/ci-prepare-bionic.sh
+++ b/src/ci-prepare-bionic.sh
@@ -83,6 +83,8 @@ get() {
     if [ ! -f "$name" ] || [ "$FORCE_DOWNLOAD" = true ]; then
       echo "$name not found, downloading..."
       wget -q --show-progress --progress=bar:force:noscroll "$pkg_url" -O "$name"
+    else
+      echo "$name found, skipping download..."
     fi
   fi
 }
@@ -92,13 +94,15 @@ get_and_untar() {
   local pkg_url="$2"
   if [ "$SETUP" = true ]; then
     if [ ! -f "$name" ] || [ "$FORCE_DOWNLOAD" = true ]; then
-      echo "$name not found, downloading..."
+      echo "$name not found, downloading ($pkg_url)..."
       wget -q --show-progress --progress=bar:force:noscroll "$pkg_url" -O "$name"
     fi
     echo "Preparing to extract $name ..."
     tar -xf "$name"
     echo "$name extracted, deleting archive ..."
     rm -f "$name" # delete for space
+  else
+    echo "Skipping setup for $name ($pkg_url)..."
fi } @@ -119,10 +123,10 @@ verify_dir_exists() { setup_aocc() { echo "Preparing AOCC" - local aocc_ver="2.3.0" + local aocc_ver="4.0.0" local tarball="aocc-$aocc_ver.tar.xz" # XXX it's actually XZ compressed, so it should be tar.xz - local AOCC_URL="http://developer.amd.com/wordpress/media/files/aocc-compiler-2.3.0.tar" + local AOCC_URL="https://download.amd.com/developer/eula/aocc-compiler/aocc-compiler-${aocc_ver}.tar" # local AOCC_URL="http://localhost:8000/aocc-compiler-2.3.0.tar" get_and_untar "$tarball" "$AOCC_URL" @@ -133,10 +137,10 @@ setup_aocc() { } setup_nvhpc() { - echo "Preparing Nvidia HPC SDK" - local nvhpc_ver="22.3" - local nvhpc_release="2022_223" - local cuda_ver="11.6" + echo "Preparing Nvidia HPC SDK" + local nvhpc_ver="23.1" # TODO FIXME > 23.1 has a bug with -A + local nvhpc_release="2023_231" + local cuda_ver="12.0" local tarball="nvhpc_$nvhpc_ver.tar.gz" @@ -145,7 +149,7 @@ setup_nvhpc() { local sdk_dir="$PWD/nvhpc_${nvhpc_release}_Linux_x86_64_cuda_$cuda_ver/install_components/Linux_x86_64/$nvhpc_ver" local bin_dir="$sdk_dir/compilers/bin" - "$bin_dir/makelocalrc" "$bin_dir" -x + "$bin_dir/makelocalrc" -d "$bin_dir" -x -gpp g++-12 -gcc gcc-12 -g77 gfortran-12 export_var NVHPC_SDK_DIR "$sdk_dir" export_var NVHPC_CUDA_DIR "$sdk_dir/cuda/$cuda_ver" @@ -166,7 +170,8 @@ setup_nvhpc() { setup_aomp() { echo "Preparing AOMP" - local AOMP_URL="https://github.com/ROCm-Developer-Tools/aomp/releases/download/rel_11.12-0/aomp_Ubuntu1804_11.12-0_amd64.deb" + local aomp_ver="18.0-0" + local AOMP_URL="https://github.com/ROCm-Developer-Tools/aomp/releases/download/rel_${aomp_ver}/aomp_Ubuntu2204_${aomp_ver}_amd64.deb" # local AOMP_URL="http://0.0.0.0:8000/aomp_Ubuntu1804_11.12-0_amd64.deb" get_and_install_deb "aomp" "aomp" "$AOMP_URL" @@ -189,9 +194,10 @@ setup_oclcpu() { setup_kokkos() { echo "Preparing Kokkos" - local kokkos_ver="3.3.01" + local kokkos_ver="4.1.00" local tarball="kokkos-$kokkos_ver.tar.gz" + local url="https://github.com/kokkos/kokkos/archive/$kokkos_ver.tar.gz" # local url="http://localhost:8000/$kokkos_ver.tar.gz" @@ -203,10 +209,10 @@ setup_kokkos() { setup_raja() { echo "Preparing RAJA" - local raja_ver="0.13.0" + local raja_ver="2023.06.1" local tarball="raja-$raja_ver.tar.gz" - local url="https://github.com/LLNL/RAJA/releases/download/v0.13.0/RAJA-v$raja_ver.tar.gz" + local url="https://github.com/LLNL/RAJA/releases/download/v$raja_ver/RAJA-v$raja_ver.tar.gz" # local url="http://localhost:8000/RAJA-v$raja_ver.tar.gz" get_and_untar "$tarball" "$url" @@ -217,7 +223,7 @@ setup_raja() { setup_tbb() { echo "Preparing TBB" - local tbb_ver="2021.2.0" + local tbb_ver="2021.9.0" local tarball="oneapi-tbb-$tbb_ver-lin.tgz" local url="https://github.com/oneapi-src/oneTBB/releases/download/v$tbb_ver/oneapi-tbb-$tbb_ver-lin.tgz" @@ -231,9 +237,9 @@ setup_tbb() { setup_clang_gcc() { - sudo apt-get install -y -qq gcc-10-offload-nvptx gcc-10-offload-amdgcn libtbb2 libtbb-dev g++-10 clang libomp-dev + sudo apt-get install -y -qq gcc-12-offload-nvptx gcc-12-offload-amdgcn libtbb2 libtbb-dev g++-12 clang libomp-dev libc6 - export_var GCC_CXX "$(which g++-10)" + export_var GCC_CXX "$(which g++-12)" verify_bin_exists "$GCC_CXX" "$GCC_CXX" --version @@ -254,7 +260,7 @@ setup_clang_gcc() { } setup_rocm() { - sudo apt-get install -y -qq rocm-dev rocthrust-dev + sudo apt-get install -y rocm-dev rocthrust-dev export_var ROCM_PATH "/opt/rocm" export_var PATH "$ROCM_PATH/bin:$PATH" # ROCm needs this for many of their libraries' CMake build to work export_var HIP_CXX 
"$ROCM_PATH/bin/hipcc" @@ -265,7 +271,7 @@ setup_rocm() { setup_dpcpp() { - local nightly="20210106" + local nightly="20230615" local tarball="dpcpp-$nightly.tar.gz" local url="https://github.com/intel/llvm/releases/download/sycl-nightly/$nightly/dpcpp-compiler.tar.gz" @@ -282,22 +288,22 @@ setup_dpcpp() { setup_hipsycl() { sudo apt-get install -y -qq libboost-fiber-dev libboost-context-dev - local hipsycl_ver="0.9.0" + local hipsycl_ver="0.9.1" local tarball="v$hipsycl_ver.tar.gz" local install_dir="$PWD/hipsycl_dist_$hipsycl_ver" - local url="https://github.com/illuhad/hipSYCL/archive/v$hipsycl_ver.tar.gz" - # local url="http://localhost:8000/hipSYCL-$hipsycl_ver.tar.gz" + local url="https://github.com/AdaptiveCpp/AdaptiveCpp/archive/v$hipsycl_ver.tar.gz" + # local url="http://localhost:8000/AdaptiveCpp-$hipsycl_ver.tar.gz" get_and_untar "$tarball" "$url" if [ "$SETUP" = true ]; then - local src="$PWD/hipSYCL-$hipsycl_ver" + local src="$PWD/AdaptiveCpp-$hipsycl_ver" rm -rf "$src/build" rm -rf "$install_dir" cmake "-B$src/build" "-H$src" \ - -DCMAKE_C_COMPILER="$(which gcc-10)" \ - -DCMAKE_CXX_COMPILER="$(which g++-10)" \ + -DCMAKE_C_COMPILER="$(which gcc-12)" \ + -DCMAKE_CXX_COMPILER="$(which g++-12)" \ -DCMAKE_INSTALL_PREFIX="$install_dir" \ -DWITH_ROCM_BACKEND=OFF \ -DWITH_CUDA_BACKEND=OFF \ @@ -312,25 +318,20 @@ setup_hipsycl() { check_size } -setup_computecpp() { - echo "TODO ComputeCpp requires registration+login to download" -} - if [ "${GITHUB_ACTIONS:-false}" = true ]; then echo "Running in GitHub Actions, defaulting to special export" TERM=xterm export TERM=xterm - # drop the lock in case we got one from a failed run - rm /var/lib/dpkg/lock-frontend || true - rm /var/cache/apt/archives/lock || true - - wget -q -O - "https://repo.radeon.com/rocm/rocm.gpg.key" | sudo apt-key add - - echo "deb http://archive.ubuntu.com/ubuntu focal main universe" | sudo tee -a /etc/apt/sources.list - echo 'deb [arch=amd64] https://repo.radeon.com/rocm/apt/4.5 ubuntu main' | sudo tee /etc/apt/sources.list.d/rocm.list + rm -rf /var/lib/dpkg/lock-frontend || true + rm -rf /var/cache/apt/archives/lock || true + mkdir --parents --mode=0755 /etc/apt/keyrings + wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | gpg --dearmor | sudo tee /etc/apt/keyrings/rocm.gpg > /dev/null + echo 'deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/5.7 jammy main' | sudo tee /etc/apt/sources.list.d/rocm.list + echo -e 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600' | sudo tee /etc/apt/preferences.d/rocm-pin-600 sudo apt-get update -qq - sudo apt-get install -y -qq cmake + sudo apt-get install cmake gcc g++ libelf-dev libdrm-amdgpu1 libnuma-dev if [ "$SETUP" = true ]; then echo "Deleting extra packages for space in 2 seconds..." 
@@ -340,6 +341,7 @@ if [ "${GITHUB_ACTIONS:-false}" = true ]; then sudo apt-get autoremove -y check_size fi + sudo apt-get upgrade -qq else echo "Running locally, defaulting to standard export" fi @@ -368,6 +370,18 @@ setup_cmake() { verify_bin_exists "$CMAKE_3_18_BIN" "$CMAKE_3_18_BIN" --version + get "cmake-3.20.sh" "$cmake_release/v3.20.4/cmake-3.20.4-linux-x86_64.sh" + chmod +x "./cmake-3.20.sh" && "./cmake-3.20.sh" --skip-license --include-subdir + export_var CMAKE_3_20_BIN "$PWD/cmake-3.20.4-linux-x86_64/bin/cmake" + verify_bin_exists "$CMAKE_3_20_BIN" + "$CMAKE_3_20_BIN" --version + + get "cmake-3.24.sh" "$cmake_release/v3.24.4/cmake-3.24.4-linux-x86_64.sh" + chmod +x "./cmake-3.24.sh" && "./cmake-3.24.sh" --skip-license --include-subdir + export_var CMAKE_3_24_BIN "$PWD/cmake-3.24.4-linux-x86_64/bin/cmake" + verify_bin_exists "$CMAKE_3_24_BIN" + "$CMAKE_3_24_BIN" --version + check_size } @@ -385,6 +399,10 @@ if [ "$PARALLEL" = true ]; then setup_tbb & wait else + # these need apt + setup_clang_gcc + setup_rocm + setup_hipsycl setup_cmake setup_aocc setup_oclcpu @@ -394,10 +412,6 @@ else setup_kokkos setup_raja setup_tbb - # these need apt - setup_clang_gcc - setup_rocm - setup_hipsycl fi echo "Done!" diff --git a/src/ci-test-compile.sh b/src/ci-test-compile.sh index d3fc5b7..610c3f0 100755 --- a/src/ci-test-compile.sh +++ b/src/ci-test-compile.sh @@ -120,10 +120,21 @@ run_build() { # CLANG_OMP_OFFLOAD_NVIDIA=false ### +NV_ARCH_CC="70" AMD_ARCH="gfx_903" -NV_ARCH="sm_70" +NV_ARCH="sm_${NV_ARCH_CC}" NV_ARCH_CCXY="cuda${NVHPC_CUDA_VER:?},cc80" +check_cmake_ver(){ + local current=$("$CMAKE_BIN" --version | head -n 1 | cut -d ' ' -f3) + local required=$1 + if [ "$(printf '%s\n' "$required" "$current" | sort -V | head -n1)" = "$required" ]; then + return 0 + else + return 1 + fi +} + build_gcc() { local name="gcc_build" local cxx="-DCMAKE_CXX_COMPILER=${GCC_CXX:?}" @@ -138,14 +149,12 @@ build_gcc() { for use_onedpl in OFF OPENMP TBB; do case "$use_onedpl" in OFF) dpl_conditional_flags="-DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}" ;; - *) dpl_conditional_flags="-DUSE_TBB=ON -DCXX_EXTRA_FLAGS=-D_GLIBCXX_USE_TBB_PAR_BACKEND=0" ;; + *) dpl_conditional_flags="-DFETCH_ONEDPL=ON -DFETCH_TBB=ON -DUSE_TBB=ON -DCXX_EXTRA_FLAGS=-D_GLIBCXX_USE_TBB_PAR_BACKEND=0" ;; esac - for use_vector in OFF ON; do - # some distributions like Ubuntu bionic implements std par with TBB, so conditionally link it here - run_build $name "${GCC_CXX:?}" std-data "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl -DUSE_VECTOR=$use_vector" - run_build $name "${GCC_CXX:?}" std-indices "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl -DUSE_VECTOR=$use_vector" - run_build $name "${GCC_CXX:?}" std-ranges "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl -DUSE_VECTOR=$use_vector" - done + # some distributions like Ubuntu bionic implements std par with TBB, so conditionally link it here + run_build $name "${GCC_CXX:?}" std-data "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl" + run_build $name "${GCC_CXX:?}" std-indices "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl" + run_build $name "${GCC_CXX:?}" std-ranges "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl" done run_build $name "${GCC_CXX:?}" tbb "$cxx -DONE_TBB_DIR=$TBB_LIB" @@ -153,40 +162,45 @@ build_gcc() { run_build $name "${GCC_CXX:?}" tbb "$cxx -DUSE_VECTOR=ON" # build with vectors if [ "${GCC_OMP_OFFLOAD_AMD:-false}" != "false" ]; then - run_build "amd_$name" "${GCC_CXX:?}" acc "$cxx -DCXX_EXTRA_FLAGS=-foffload=amdgcn-amdhsa" + run_build 
"amd_$name" "${GCC_CXX:?}" acc "$cxx -DCXX_EXTRA_FLAGS=-foffload=amdgcn-amdhsa;-fno-stack-protector;-fcf-protection=none" run_build "amd_$name" "${GCC_CXX:?}" omp "$cxx -DOFFLOAD=AMD:$AMD_ARCH" fi if [ "${GCC_OMP_OFFLOAD_NVIDIA:-false}" != "false" ]; then - run_build "nvidia_$name" "${GCC_CXX:?}" acc "$cxx -DCXX_EXTRA_FLAGS=-foffload=nvptx-none" + run_build "nvidia_$name" "${GCC_CXX:?}" acc "$cxx -DCXX_EXTRA_FLAGS=-foffload=nvptx-none;-fno-stack-protector;-fcf-protection=none" run_build "nvidia_$name" "${GCC_CXX:?}" omp "$cxx -DOFFLOAD=NVIDIA:$NV_ARCH" fi run_build $name "${GCC_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH" run_build $name "${GCC_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DMEM=MANAGED" run_build $name "${GCC_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DMEM=PAGEFAULT" - # run_build $name "${CC_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_CUDA=ON" - run_build "cuda_$name" "${GCC_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON" + if check_cmake_ver "3.16.0"; then + # run_build $name "${CC_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_CUDA=ON" + run_build "cuda_$name" "${GCC_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON" + else + echo "Skipping Kokkos models due to CMake version requirement" + fi run_build $name "${GCC_CXX:?}" ocl "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}" - run_build $name "${GCC_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?}" + if check_cmake_ver "3.20.0"; then + run_build $name "${GCC_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?} -DENABLE_OPENMP=ON" + else + echo "Skipping RAJA models due to CMake version requirement" + fi -# FIXME fails due to https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100102 -# FIXME we also got https://github.com/NVIDIA/nccl/issues/494 + if check_cmake_ver "3.20.0"; then + run_build "cuda_$name" "${GCC_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?} \ + -DENABLE_CUDA=ON \ + -DTARGET=NVIDIA \ + -DCUDA_TOOLKIT_ROOT_DIR=${NVHPC_CUDA_DIR:?} \ + -DCUDA_ARCH=$NV_ARCH" + else + echo "Skipping RAJA models due to CMake version requirement" + fi -# run_build "cuda_$name" "${GCC_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?} \ -# -DENABLE_CUDA=ON \ -# -DTARGET=NVIDIA \ -# -DCUDA_TOOLKIT_ROOT_DIR=${NVHPC_CUDA_DIR:?} \ -# -DCUDA_ARCH=$NV_ARCH" - - - # CMake >= 3.15 only due to Nvidia's Thrust CMake requirements - local current=$("$CMAKE_BIN" --version | head -n 1 | cut -d ' ' -f3) - local required="3.15.0" - if [ "$(printf '%s\n' "$required" "$current" | sort -V | head -n1)" = "$required" ]; then - run_build $name "${GCC_CXX:?}" thrust "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/lib64/cmake -DTHRUST_IMPL=CUDA -DBACKEND=CUDA" - run_build $name "${GCC_CXX:?}" thrust "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/lib64/cmake -DTHRUST_IMPL=CUDA -DBACKEND=OMP" - run_build $name "${GCC_CXX:?}" thrust "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/lib64/cmake -DTHRUST_IMPL=CUDA -DBACKEND=CPP" + if check_cmake_ver "3.18.0"; then # CMake >= 3.15 only due to Nvidia's Thrust CMake requirements + run_build $name "${GCC_CXX:?}" thrust "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH_CC -DSDK_DIR=$NVHPC_CUDA_DIR/lib64/cmake -DTHRUST_IMPL=CUDA -DBACKEND=CUDA" +# run_build $name "${GCC_CXX:?}" thrust 
"$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/lib64/cmake -DTHRUST_IMPL=CUDA -DBACKEND=OMP" # FIXME + run_build $name "${GCC_CXX:?}" thrust "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH_CC -DSDK_DIR=$NVHPC_CUDA_DIR/lib64/cmake -DTHRUST_IMPL=CUDA -DBACKEND=CPP" # FIXME CUDA Thrust + TBB throws the following error: # /usr/lib/gcc/x86_64-linux-gnu/9/include/avx512fintrin.h(9146): error: identifier "__builtin_ia32_rndscaless_round" is undefined @@ -198,7 +212,7 @@ build_gcc() { # run_build $name "${GCC_CXX:?}" THRUST "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/lib64/cmake -DTHRUST_IMPL=CUDA -DBACKEND=TBB" else - echo "CMake version ${current} < ${required}, skipping Thrust models" + echo "Skipping Thrust models due to CMake version requirement" fi } @@ -216,30 +230,39 @@ build_clang() { run_build "nvidia_$name" "${GCC_CXX:?}" omp "$cxx -DOFFLOAD=NVIDIA:$NV_ARCH" fi - run_build $name "${CLANG_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?}" + if check_cmake_ver "3.20.0"; then + run_build $name "${CLANG_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?} -DENABLE_OPENMP=ON" + else + echo "Skipping RAJA models due to CMake version requirement" + fi run_build $name "${CLANG_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH" run_build $name "${CLANG_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DMEM=MANAGED" run_build $name "${CLANG_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DMEM=PAGEFAULT" - run_build $name "${CLANG_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON" + if check_cmake_ver "3.16.0"; then + run_build $name "${CLANG_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON" + else + echo "Skipping Kokkos models due to CMake version requirement" + fi run_build $name "${CLANG_CXX:?}" ocl "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}" for use_onedpl in OFF OPENMP TBB; do - for use_vector in OFF ON; do - case "$use_onedpl" in - OFF) dpl_conditional_flags="-DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" ;; - *) dpl_conditional_flags="-DUSE_TBB=ON -DCXX_EXTRA_FLAGS=-D_GLIBCXX_USE_TBB_PAR_BACKEND=0" ;; - esac - run_build $name "${CLANG_CXX:?}" std-data "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl -DUSE_VECTOR=$use_vector " - run_build $name "${CLANG_CXX:?}" std-indices "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl -DUSE_VECTOR=$use_vector" - # run_build $name "${CLANG_CXX:?}" std-ranges "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl -DUSE_VECTOR=$use_vector" # not yet supported - done + case "$use_onedpl" in + OFF) dpl_conditional_flags="-DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" ;; + *) dpl_conditional_flags="-DFETCH_ONEDPL=ON -DFETCH_TBB=ON -DUSE_TBB=ON -DCXX_EXTRA_FLAGS=-D_GLIBCXX_USE_TBB_PAR_BACKEND=0" ;; + esac + run_build $name "${CLANG_CXX:?}" std-data "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl" + run_build $name "${CLANG_CXX:?}" std-indices "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl" + # run_build $name "${CLANG_CXX:?}" std-ranges "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl" # not yet supported done run_build $name "${CLANG_CXX:?}" tbb "$cxx -DONE_TBB_DIR=$TBB_LIB" run_build $name "${CLANG_CXX:?}" tbb "$cxx" # build TBB again with the system TBB run_build $name "${CLANG_CXX:?}" tbb "$cxx -DUSE_VECTOR=ON" # build with vectors - - run_build $name "${CLANG_CXX:?}" raja "$cxx 
-DRAJA_IN_TREE=${RAJA_SRC:?}" + if check_cmake_ver "3.20.0"; then + run_build $name "${CLANG_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?} -DENABLE_OPENMP=ON" + else + echo "Skipping RAJA models due to CMake version requirement" + fi # no clang /w RAJA+cuda because it needs nvcc which needs gcc } @@ -249,10 +272,6 @@ build_nvhpc() { run_build $name "${NVHPC_NVCXX:?}" std-data "$cxx -DNVHPC_OFFLOAD=$NV_ARCH_CCXY" run_build $name "${NVHPC_NVCXX:?}" std-indices "$cxx -DNVHPC_OFFLOAD=$NV_ARCH_CCXY" - # std again but with vectors - run_build $name "${NVHPC_NVCXX:?}" std-data "$cxx -DNVHPC_OFFLOAD=$NV_ARCH_CCXY -DUSE_VECTOR=ON" - run_build $name "${NVHPC_NVCXX:?}" std-indices "$cxx -DNVHPC_OFFLOAD=$NV_ARCH_CCXY -DUSE_VECTOR=ON" - run_build $name "${NVHPC_NVCXX:?}" acc "$cxx -DTARGET_DEVICE=gpu -DTARGET_PROCESSOR=px -DCUDA_ARCH=$NV_ARCH_CCXY" run_build $name "${NVHPC_NVCXX:?}" acc "$cxx -DTARGET_DEVICE=multicore -DTARGET_PROCESSOR=zen" } @@ -291,15 +310,18 @@ build_icpc() { local cxx="-DCMAKE_CXX_COMPILER=${ICPC_CXX:?}" run_build $name "${ICPC_CXX:?}" omp "$cxx" run_build $name "${ICPC_CXX:?}" ocl "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}" - run_build $name "${ICPC_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?}" - run_build $name "${ICPC_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON" -} + if check_cmake_ver "3.20.0"; then + run_build $name "${ICPC_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?} -DENABLE_OPENMP=ON" + else + echo "Skipping RAJA models due to CMake version requirement" + fi + + if check_cmake_ver "3.16.0"; then + run_build $name "${ICPC_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON" + else + echo "Skipping Kokkos models due to CMake version requirement" + fi -build_computecpp() { - run_build computecpp_build "compute++" sycl "-DCMAKE_CXX_COMPILER=${GCC_CXX:?} \ - -DSYCL_COMPILER=COMPUTECPP \ - -DSYCL_COMPILER_DIR=${COMPUTECPP_DIR:?} \ - -DOpenCL_LIBRARY=${OCL_LIB:?}" } build_dpcpp() { diff --git a/src/raja/model.cmake b/src/raja/model.cmake index eb4788c..bf30631 100644 --- a/src/raja/model.cmake +++ b/src/raja/model.cmake @@ -8,8 +8,6 @@ register_flag_optional(RAJA_IN_TREE Make sure to use the release version of RAJA or clone RAJA recursively with submodules. Remember to append RAJA specific flags as well, for example: -DRAJA_IN_TREE=... -DENABLE_OPENMP=ON -DENABLE_CUDA=ON ... - For RAJA >= v2022.03.0, remember to use the RAJA prefixed CMake options: - -DRAJA_IN_TREE=... -DRAJA_ENABLE_OPENMP=ON -DRAJA_ENABLE_CUDA=ON ... 
See https://github.com/LLNL/RAJA/blob/08cbbafd2d21589ebf341f7275c229412d0fe903/CMakeLists.txt#L44 for all available options " "") diff --git a/src/std-data/STDDataStream.cpp b/src/std-data/STDDataStream.cpp index 3d7ef18..e426835 100644 --- a/src/std-data/STDDataStream.cpp +++ b/src/std-data/STDDataStream.cpp @@ -6,22 +6,10 @@ #include "STDDataStream.h" -#ifdef USE_VECTOR -#define BEGIN(x) (x).begin() -#define END(x) (x).end() -#else -#define BEGIN(x) (x) -#define END(x) ((x) + array_size) -#endif - template STDDataStream::STDDataStream(const int ARRAY_SIZE, int device) noexcept : array_size{ARRAY_SIZE}, -#ifdef USE_VECTOR - a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE) -#else a(alloc_raw(ARRAY_SIZE)), b(alloc_raw(ARRAY_SIZE)), c(alloc_raw(ARRAY_SIZE)) -#endif { std::cout << "Backing storage typeid: " << typeid(a).name() << std::endl; #ifdef USE_ONEDPL @@ -41,55 +29,53 @@ STDDataStream::STDDataStream(const int ARRAY_SIZE, int device) template STDDataStream::~STDDataStream() { -#ifndef USE_VECTOR - dealloc_raw(a); - dealloc_raw(b); - dealloc_raw(c); -#endif + dealloc_raw(a); + dealloc_raw(b); + dealloc_raw(c); } template void STDDataStream::init_arrays(T initA, T initB, T initC) { - std::fill(exe_policy, BEGIN(a), END(a), initA); - std::fill(exe_policy, BEGIN(b), END(b), initB); - std::fill(exe_policy, BEGIN(c), END(c), initC); + std::fill(exe_policy, a, a + array_size, initA); + std::fill(exe_policy, b, b + array_size, initB); + std::fill(exe_policy, c, c + array_size, initC); } template void STDDataStream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) { - std::copy(BEGIN(a), END(a), h_a.begin()); - std::copy(BEGIN(b), END(b), h_b.begin()); - std::copy(BEGIN(c), END(c), h_c.begin()); + std::copy(a, a + array_size, h_a.begin()); + std::copy(b, b + array_size, h_b.begin()); + std::copy(c, c + array_size, h_c.begin()); } template void STDDataStream::copy() { // c[i] = a[i] - std::copy(exe_policy, BEGIN(a), END(a), BEGIN(c)); + std::copy(exe_policy, a, a + array_size, c); } template void STDDataStream::mul() { // b[i] = scalar * c[i]; - std::transform(exe_policy, BEGIN(c), END(c), BEGIN(b), [scalar = startScalar](T ci){ return scalar*ci; }); + std::transform(exe_policy, c, c + array_size, b, [scalar = startScalar](T ci){ return scalar*ci; }); } template void STDDataStream::add() { // c[i] = a[i] + b[i]; - std::transform(exe_policy, BEGIN(a), END(a), BEGIN(b), BEGIN(c), std::plus()); + std::transform(exe_policy, a, a + array_size, b, c, std::plus()); } template void STDDataStream::triad() { // a[i] = b[i] + scalar * c[i]; - std::transform(exe_policy, BEGIN(b), END(b), BEGIN(c), BEGIN(a), [scalar = startScalar](T bi, T ci){ return bi+scalar*ci; }); + std::transform(exe_policy, b, b + array_size, c, a, [scalar = startScalar](T bi, T ci){ return bi+scalar*ci; }); } template @@ -99,8 +85,8 @@ void STDDataStream::nstream() // Need to do in two stages with C++11 STL. 
// 1: a[i] += b[i] // 2: a[i] += scalar * c[i]; - std::transform(exe_policy, BEGIN(a), END(a), BEGIN(b), BEGIN(a), [](T ai, T bi){ return ai + bi; }); - std::transform(exe_policy, BEGIN(a), END(a), BEGIN(c), BEGIN(a), [scalar = startScalar](T ai, T ci){ return ai + scalar*ci; }); + std::transform(exe_policy, a, a + array_size, b, a, [](T ai, T bi){ return ai + bi; }); + std::transform(exe_policy, a, a + array_size, c, a, [scalar = startScalar](T ai, T ci){ return ai + scalar*ci; }); } @@ -108,7 +94,7 @@ template T STDDataStream::dot() { // sum = 0; sum += a[i]*b[i]; return sum; - return std::transform_reduce(exe_policy, BEGIN(a), END(a), BEGIN(b), 0.0); + return std::transform_reduce(exe_policy, a, a + array_size, b, 0.0); } void listDevices(void) @@ -127,6 +113,3 @@ std::string getDeviceDriver(const int) } template class STDDataStream; template class STDDataStream; - -#undef BEGIN -#undef END diff --git a/src/std-data/STDDataStream.h b/src/std-data/STDDataStream.h index 911a621..65e1ace 100644 --- a/src/std-data/STDDataStream.h +++ b/src/std-data/STDDataStream.h @@ -22,12 +22,7 @@ class STDDataStream : public Stream int array_size; // Device side pointers -#ifdef USE_VECTOR - std::vector a, b, c; -#else T *a, *b, *c; -#endif - public: STDDataStream(const int, int) noexcept; diff --git a/src/std-data/model.cmake b/src/std-data/model.cmake index e1697b6..e9e7099 100644 --- a/src/std-data/model.cmake +++ b/src/std-data/model.cmake @@ -3,10 +3,6 @@ register_flag_optional(CMAKE_CXX_COMPILER "Any CXX compiler that is supported by CMake detection" "c++") -register_flag_optional(USE_VECTOR - "Whether to use std::vector for storage or use aligned_alloc. C++ vectors are *zero* initialised where as aligned_alloc is uninitialised before first use." - "OFF") - register_flag_optional(NVHPC_OFFLOAD "Enable offloading support (via the non-standard `-stdpar`) for the new NVHPC SDK. The values are Nvidia architectures in ccXY format will be passed in via `-gpu=` (e.g `cc70`) @@ -47,9 +43,6 @@ macro(setup) register_append_cxx_flags(ANY ${NVHPC_FLAGS}) register_append_link_flags(${NVHPC_FLAGS}) endif () - if (USE_VECTOR) - register_definitions(USE_VECTOR) - endif () if (USE_TBB) register_link_library(TBB::tbb) endif () diff --git a/src/std-indices/STDIndicesStream.cpp b/src/std-indices/STDIndicesStream.cpp index 6e13597..1cf1ccc 100644 --- a/src/std-indices/STDIndicesStream.cpp +++ b/src/std-indices/STDIndicesStream.cpp @@ -10,32 +10,10 @@ #define ALIGNMENT (2*1024*1024) // 2MB #endif -#ifdef USE_VECTOR -#define BEGIN(x) (x).begin() -#define END(x) (x).end() -#else -#define BEGIN(x) (x) -#define END(x) ((x) + array_size) -#endif - -#ifdef USE_VECTOR -#if (defined(__NVCOMPILER) || defined(__NVCOMPILER_LLVM__)) -#error "std::vector *is* supported in NVHPC if we capture `this`, however, oneDPL (via SYCL2020) only works correctly with explicit *value* captures." 
-#endif - -#if defined(USE_ONEDPL) -#error "std::vector is unspported: oneDPL (via SYCL2020) only works correctly with explicit *value* captures" -#endif -#endif - template STDIndicesStream::STDIndicesStream(const int ARRAY_SIZE, int device) noexcept : array_size{ARRAY_SIZE}, range(0, array_size), -#ifdef USE_VECTOR - a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE) -#else a(alloc_raw(ARRAY_SIZE)), b(alloc_raw(ARRAY_SIZE)), c(alloc_raw(ARRAY_SIZE)) -#endif { std::cout << "Backing storage typeid: " << typeid(a).name() << std::endl; #ifdef USE_ONEDPL @@ -55,41 +33,39 @@ noexcept : array_size{ARRAY_SIZE}, range(0, array_size), template STDIndicesStream::~STDIndicesStream() { -#ifndef USE_VECTOR - dealloc_raw(a); - dealloc_raw(b); - dealloc_raw(c); -#endif + dealloc_raw(a); + dealloc_raw(b); + dealloc_raw(c); } template void STDIndicesStream::init_arrays(T initA, T initB, T initC) { - std::fill(exe_policy, BEGIN(a), END(a), initA); - std::fill(exe_policy, BEGIN(b), END(b), initB); - std::fill(exe_policy, BEGIN(c), END(c), initC); + std::fill(exe_policy, a, a + array_size, initA); + std::fill(exe_policy, b, b + array_size, initB); + std::fill(exe_policy, c, c + array_size, initC); } template void STDIndicesStream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) { - std::copy(BEGIN(a), END(a), h_a.begin()); - std::copy(BEGIN(b), END(b), h_b.begin()); - std::copy(BEGIN(c), END(c), h_c.begin()); + std::copy(a, a + array_size, h_a.begin()); + std::copy(b, b + array_size, h_b.begin()); + std::copy(c, c + array_size, h_c.begin()); } template void STDIndicesStream::copy() { // c[i] = a[i] - std::copy(exe_policy, BEGIN(a), END(a), BEGIN(c)); + std::copy(exe_policy, a, a + array_size, c); } template void STDIndicesStream::mul() { // b[i] = scalar * c[i]; - std::transform(exe_policy, range.begin(), range.end(), BEGIN(b), [c = this->c, scalar = startScalar](int i) { + std::transform(exe_policy, range.begin(), range.end(), b, [c = this->c, scalar = startScalar](int i) { return scalar * c[i]; }); } @@ -98,7 +74,7 @@ template void STDIndicesStream::add() { // c[i] = a[i] + b[i]; - std::transform(exe_policy, range.begin(), range.end(), BEGIN(c), [a = this->a, b = this->b](int i) { + std::transform(exe_policy, range.begin(), range.end(), c, [a = this->a, b = this->b](int i) { return a[i] + b[i]; }); } @@ -107,7 +83,7 @@ template void STDIndicesStream::triad() { // a[i] = b[i] + scalar * c[i]; - std::transform(exe_policy, range.begin(), range.end(), BEGIN(a), [b = this->b, c = this->c, scalar = startScalar](int i) { + std::transform(exe_policy, range.begin(), range.end(), a, [b = this->b, c = this->c, scalar = startScalar](int i) { return b[i] + scalar * c[i]; }); } @@ -119,7 +95,7 @@ void STDIndicesStream::nstream() // Need to do in two stages with C++11 STL. 
// 1: a[i] += b[i] // 2: a[i] += scalar * c[i]; - std::transform(exe_policy, range.begin(), range.end(), BEGIN(a), [a = this->a, b = this->b, c = this->c, scalar = startScalar](int i) { + std::transform(exe_policy, range.begin(), range.end(), a, [a = this->a, b = this->b, c = this->c, scalar = startScalar](int i) { return a[i] + b[i] + scalar * c[i]; }); } @@ -129,7 +105,7 @@ template T STDIndicesStream::dot() { // sum = 0; sum += a[i]*b[i]; return sum; - return std::transform_reduce(exe_policy, BEGIN(a), END(a), BEGIN(b), 0.0); + return std::transform_reduce(exe_policy, a, a + array_size, b, 0.0); } void listDevices(void) @@ -148,6 +124,3 @@ std::string getDeviceDriver(const int) } template class STDIndicesStream; template class STDIndicesStream; - -#undef BEGIN -#undef END diff --git a/src/std-indices/STDIndicesStream.h b/src/std-indices/STDIndicesStream.h index 0916ef2..ffab910 100644 --- a/src/std-indices/STDIndicesStream.h +++ b/src/std-indices/STDIndicesStream.h @@ -77,12 +77,7 @@ class STDIndicesStream : public Stream ranged range; // Device side pointers -#ifdef USE_VECTOR - std::vector a, b, c; -#else T *a, *b, *c; -#endif - public: STDIndicesStream(const int, int) noexcept; diff --git a/src/std-indices/model.cmake b/src/std-indices/model.cmake index c2fef28..60ef575 100644 --- a/src/std-indices/model.cmake +++ b/src/std-indices/model.cmake @@ -3,10 +3,6 @@ register_flag_optional(CMAKE_CXX_COMPILER "Any CXX compiler that is supported by CMake detection" "c++") -register_flag_optional(USE_VECTOR - "Whether to use std::vector for storage or use aligned_alloc. C++ vectors are *zero* initialised where as aligned_alloc is uninitialised before first use." - "OFF") - register_flag_optional(NVHPC_OFFLOAD "Enable offloading support (via the non-standard `-stdpar`) for the new NVHPC SDK. The values are Nvidia architectures in ccXY format will be passed in via `-gpu=` (e.g `cc70`) @@ -47,9 +43,6 @@ macro(setup) register_append_cxx_flags(ANY ${NVHPC_FLAGS}) register_append_link_flags(${NVHPC_FLAGS}) endif () - if (USE_VECTOR) - register_definitions(USE_VECTOR) - endif () if (USE_TBB) register_link_library(TBB::tbb) endif () diff --git a/src/std-ranges/STDRangesStream.cpp b/src/std-ranges/STDRangesStream.cpp index e05a7d1..d497691 100644 --- a/src/std-ranges/STDRangesStream.cpp +++ b/src/std-ranges/STDRangesStream.cpp @@ -5,27 +5,16 @@ // source code #include "STDRangesStream.hpp" +#include #ifndef ALIGNMENT #define ALIGNMENT (2*1024*1024) // 2MB #endif -#ifdef USE_VECTOR -#define BEGIN(x) (x).begin() -#define END(x) (x).end() -#else -#define BEGIN(x) (x) -#define END(x) ((x) + array_size) -#endif - template STDRangesStream::STDRangesStream(const int ARRAY_SIZE, int device) noexcept : array_size{ARRAY_SIZE}, -#ifdef USE_VECTOR - a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE) -#else a(alloc_raw(ARRAY_SIZE)), b(alloc_raw(ARRAY_SIZE)), c(alloc_raw(ARRAY_SIZE)) -#endif { std::cout << "Backing storage typeid: " << typeid(a).name() << std::endl; #ifdef USE_ONEDPL @@ -45,11 +34,9 @@ noexcept : array_size{ARRAY_SIZE}, template STDRangesStream::~STDRangesStream() { -#ifndef USE_VECTOR - dealloc_raw(a); - dealloc_raw(b); - dealloc_raw(c); -#endif + dealloc_raw(a); + dealloc_raw(b); + dealloc_raw(c); } template @@ -70,9 +57,9 @@ template void STDRangesStream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) { // Element-wise copy. 
- std::copy(BEGIN(a), END(a), h_a.begin()); - std::copy(BEGIN(b), END(b), h_b.begin()); - std::copy(BEGIN(c), END(c), h_c.begin()); + std::copy(a, a + array_size, h_a.begin()); + std::copy(b, b + array_size, h_b.begin()); + std::copy(c, c + array_size, h_c.begin()); } template @@ -148,7 +135,7 @@ T STDRangesStream::dot() return std::transform_reduce( exe_policy, - BEGIN(a), END(a), BEGIN(b), 0.0); + a, a + array_size, b, 0.0); } void listDevices(void) @@ -168,6 +155,3 @@ std::string getDeviceDriver(const int) template class STDRangesStream; template class STDRangesStream; - -#undef BEGIN -#undef END diff --git a/src/std-ranges/STDRangesStream.hpp b/src/std-ranges/STDRangesStream.hpp index 9d36d46..6e7c29c 100644 --- a/src/std-ranges/STDRangesStream.hpp +++ b/src/std-ranges/STDRangesStream.hpp @@ -21,11 +21,7 @@ class STDRangesStream : public Stream int array_size; // Device side pointers -#ifdef USE_VECTOR - std::vector a, b, c; -#else T *a, *b, *c; -#endif public: STDRangesStream(const int, int) noexcept; diff --git a/src/std-ranges/model.cmake b/src/std-ranges/model.cmake index 35554c7..8f73501 100644 --- a/src/std-ranges/model.cmake +++ b/src/std-ranges/model.cmake @@ -3,10 +3,6 @@ register_flag_optional(CMAKE_CXX_COMPILER "Any CXX compiler that is supported by CMake detection and supports C++20 Ranges" "c++") -register_flag_optional(USE_VECTOR - "Whether to use std::vector for storage or use aligned_alloc. C++ vectors are *zero* initialised where as aligned_alloc is uninitialised before first use." - "OFF") - register_flag_optional(USE_TBB "No-op if ONE_TBB_DIR is set. Link against an in-tree oneTBB via FetchContent_Declare, see top level CMakeLists.txt for details." "OFF") @@ -32,10 +28,7 @@ macro(setup) set(CMAKE_CXX_STANDARD_REQUIRED OFF) unset(CMAKE_CXX_STANDARD) # drop any existing standard we have set by default # and append our own: - register_append_cxx_flags(ANY -std=c++2a) - if (USE_VECTOR) - register_definitions(USE_VECTOR) - endif () + register_append_cxx_flags(ANY -std=c++20) if (USE_TBB) register_link_library(TBB::tbb) endif () @@ -44,3 +37,10 @@ macro(setup) register_link_library(oneDPL) endif () endmacro() + +macro(setup_target NAME) + if (USE_ONEDPL) + target_compile_features(${NAME} INTERFACE cxx_std_20) + target_compile_features(oneDPL INTERFACE cxx_std_20) + endif () +endmacro() diff --git a/src/thrust/model.cmake b/src/thrust/model.cmake index 91821ef..6b82ef5 100644 --- a/src/thrust/model.cmake +++ b/src/thrust/model.cmake @@ -46,11 +46,12 @@ macro(setup) # see CUDA.cmake, we're only adding a few Thrust related libraries here if (POLICY CMP0104) - cmake_policy(SET CMP0104 OLD) + cmake_policy(SET CMP0104 NEW) endif () + set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH}) # add -forward-unknown-to-host-compiler for compatibility reasons - set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "--expt-extended-lambda -forward-unknown-to-host-compiler -arch=${CUDA_ARCH}" ${CUDA_EXTRA_FLAGS}) + set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "--expt-extended-lambda " ${CUDA_EXTRA_FLAGS}) enable_language(CUDA) # CMake defaults to -O2 for CUDA at Release, let's wipe that and use the global RELEASE_FLAG # appended later @@ -63,6 +64,7 @@ macro(setup) # XXX NVHPC >= 22.3 has cub-config in `Linux_x86_64/22.3/cuda/11.6/lib64/cmake/cub/` # same thing for thrust if (SDK_DIR) + list(APPEND CMAKE_PREFIX_PATH ${SDK_DIR}) find_package(CUB REQUIRED CONFIG PATHS ${SDK_DIR}/cub) find_package(Thrust REQUIRED CONFIG PATHS ${SDK_DIR}/thrust) else () @@ -73,9 +75,11 @@ macro(setup) message(STATUS "Using Thrust 
backend: ${BACKEND}") # this creates the interface that we can link to - thrust_create_target(Thrust HOST CPP DEVICE ${BACKEND}) + thrust_create_target(Thrust${BACKEND} + HOST CPP + DEVICE ${BACKEND}) - register_link_library(Thrust) + register_link_library(Thrust${BACKEND}) elseif (${THRUST_IMPL} STREQUAL "ROCM") if (SDK_DIR) find_package(rocprim REQUIRED CONFIG PATHS ${SDK_DIR}/rocprim)