Fix up CI, resolves #145, supersedes #154

Drop vector for std-* models
2023-09-24 21:11:35 +01:00 · 2023-09-24 21:11:35 +01:00 · 72be9f6980
commit 72be9f6980
parent 3dcafd1af1
15 changed files with 278 additions and 250 deletions
--- a/.github/workflows/main.yaml
+++ b/.github/workflows/main.yaml
@ -12,7 +12,7 @@ on:
 jobs:
  test-rust:
-    runs-on: ubuntu-18.04
+    runs-on: ubuntu-22.04
    defaults:
      run:
        working-directory: ./src/rust/rust-stream
@ -28,7 +28,7 @@ jobs:
        run: ./target/release/rust-stream --arraysize 2048
  test-java:
-    runs-on: ubuntu-18.04
+    runs-on: ubuntu-22.04
    defaults:
      run:
        working-directory: ./src/java/java-stream
@ -41,7 +41,7 @@ jobs:
        run: java -jar target/java-stream.jar --arraysize 2048
  test-julia:
-    runs-on: ubuntu-18.04
+    runs-on: ubuntu-22.04
    defaults:
      run:
        working-directory: ./src/julia/JuliaStream.jl
@ -69,8 +69,24 @@ jobs:
        run: julia             --project src/AMDGPUStream.jl      --list
  setup-cpp:
    runs-on: ubuntu-22.04
    steps:
      - name: Cache compiler
        # if: ${{ !env.ACT }}
        id: prepare-compilers
        uses: actions/cache@v2
        with:
          path: ./compilers
          key: ${{ runner.os }}-${{ hashFiles('./src/ci-prepare-bionic.sh') }}
      - name: Prepare compilers
        if: steps.prepare-compilers.outputs.cache-hit != 'true'
        run: source ./src/ci-prepare-bionic.sh ./compilers SETUP false || true
  test-cpp:
-    runs-on: ubuntu-18.04
+    needs: setup-cpp
    runs-on: ubuntu-22.04
    steps:
      - uses: actions/checkout@v2
@ -84,15 +100,15 @@ jobs:
      - name: Prepare compilers
        if: steps.prepare-compilers.outputs.cache-hit != 'true'
-        run: source ./src/ci-prepare-bionic.sh ./compilers SETUP true || true
+        run: source ./src/ci-prepare-bionic.sh ./compilers SETUP false || true
      - name: Setup test environment
        run: source ./src/ci-prepare-bionic.sh ./compilers VARS false || true
        # Enable tmate debugging of manually-triggered workflows if the input option was provided
-      - name: Setup tmate session
+      #      - name: Setup tmate session
-        uses: mxschmitt/action-tmate@v3
+      #        uses: mxschmitt/action-tmate@v3
-        if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.debug_enabled }}
+      #        if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.debug_enabled }}
      - name: Test compile gcc     @ CMake 3.13
        if: ${{ ! cancelled() }}
@ -167,4 +183,54 @@ jobs:
        run: ./src/ci-test-compile.sh ./build dpcpp all ${{ env.CMAKE_3_18_BIN }}
      - name: Test compile hipsycl @ CMake 3.18
        if: ${{ ! cancelled() }}
-        run: ./src/ci-test-compile.sh ./build hipsycl all ${{ env.CMAKE_3_18_BIN }}
+        run: ./src/ci-test-compile.sh ./build hipsycl all ${{ env.CMAKE_3_18_BIN }}
      - name: Test compile gcc     @ CMake 3.20
        if: ${{ ! cancelled() }}
        run: ./src/ci-test-compile.sh ./build gcc all ${{ env.CMAKE_3_20_BIN }}
      - name: Test compile clang   @ CMake 3.20
        if: ${{ ! cancelled() }}
        run: ./src/ci-test-compile.sh ./build clang all ${{ env.CMAKE_3_20_BIN }}
      - name: Test compile nvhpc   @ CMake 3.20
        if: ${{ ! cancelled() }}
        run: ./src/ci-test-compile.sh ./build nvhpc all ${{ env.CMAKE_3_20_BIN }}
      - name: Test compile aocc    @ CMake 3.20
        if: ${{ ! cancelled() }}
        run: ./src/ci-test-compile.sh ./build aocc all ${{ env.CMAKE_3_20_BIN }}
      - name: Test compile aomp    @ CMake 3.20
        if: ${{ ! cancelled() }}
        run: ./src/ci-test-compile.sh ./build aomp all ${{ env.CMAKE_3_20_BIN }}
      - name: Test compile hip     @ CMake 3.20
        if: ${{ ! cancelled() }}
        run: ./src/ci-test-compile.sh ./build hip all ${{ env.CMAKE_3_20_BIN }}
      - name: Test compile dpcpp   @ CMake 3.20
        if: ${{ ! cancelled() }}
        run: ./src/ci-test-compile.sh ./build dpcpp all ${{ env.CMAKE_3_20_BIN }}
      - name: Test compile hipsycl @ CMake 3.20
        if: ${{ ! cancelled() }}
        run: ./src/ci-test-compile.sh ./build hipsycl all ${{ env.CMAKE_3_20_BIN }}
      - name: Test compile gcc     @ CMake 3.24
        if: ${{ ! cancelled() }}
        run: ./src/ci-test-compile.sh ./build gcc all ${{ env.CMAKE_3_24_BIN }}
      - name: Test compile clang   @ CMake 3.24
        if: ${{ ! cancelled() }}
        run: ./src/ci-test-compile.sh ./build clang all ${{ env.CMAKE_3_24_BIN }}
      - name: Test compile nvhpc   @ CMake 3.24
        if: ${{ ! cancelled() }}
        run: ./src/ci-test-compile.sh ./build nvhpc all ${{ env.CMAKE_3_24_BIN }}
      - name: Test compile aocc    @ CMake 3.24
        if: ${{ ! cancelled() }}
        run: ./src/ci-test-compile.sh ./build aocc all ${{ env.CMAKE_3_24_BIN }}
      - name: Test compile aomp    @ CMake 3.24
        if: ${{ ! cancelled() }}
        run: ./src/ci-test-compile.sh ./build aomp all ${{ env.CMAKE_3_24_BIN }}
      - name: Test compile hip     @ CMake 3.24
        if: ${{ ! cancelled() }}
        run: ./src/ci-test-compile.sh ./build hip all ${{ env.CMAKE_3_24_BIN }}
      - name: Test compile dpcpp   @ CMake 3.24
        if: ${{ ! cancelled() }}
        run: ./src/ci-test-compile.sh ./build dpcpp all ${{ env.CMAKE_3_24_BIN }}
      - name: Test compile hipsycl @ CMake 3.24
        if: ${{ ! cancelled() }}
        run: ./src/ci-test-compile.sh ./build hipsycl all ${{ env.CMAKE_3_24_BIN }}
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -1,5 +1,9 @@
 cmake_minimum_required(VERSION 3.13 FATAL_ERROR)
 if (CMAKE_VERSION VERSION_GREATER_EQUAL "3.24.0")
    cmake_policy(SET CMP0135 NEW)
 endif ()
 project(BabelStream VERSION 4.0 LANGUAGES CXX)
 # uncomment for debugging build issues:
@ -71,15 +75,19 @@ hint_flag(CXX_EXTRA_LINKER_FLAGS "
 # Honor user's CXX_EXTRA_LINK_FLAGS
 set(CXX_EXTRA_LINK_FLAGS ${CXX_EXTRA_FLAGS} ${CXX_EXTRA_LINK_FLAGS})
-option(USE_TBB "Enable oneTBB library for *supported* models. Enabling this on models that
+option(USE_TBB "Enable the oneTBB library for *supported* models. Enabling this on models that
                don't explicitly link against TBB is a no-op, see description of your selected
                model on how this is used." OFF)
-if (USE_TBB)
+option(FETCH_TBB "Fetch (download) the oneTBB library for *supported* models. This uses CMake's
                  FetchContent feature. Specify version by setting FETCH_TBB_VERSION" OFF)
 set(FETCH_TBB_VERSION "v2021.10.0" CACHE STRING "Specify version of oneTBB to use if FETCH_TBB is ON")
 if (FETCH_TBB)
    FetchContent_Declare(
            TBB
            GIT_REPOSITORY https://github.com/oneapi-src/oneTBB.git
-            GIT_TAG v2021.9.0
+            GIT_TAG "${FETCH_TBB_VERSION}"
    )
    # Don't fail builds on warning (TBB has -Wall while not being free of warnings from unused symbols...)
    set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)
@ -92,15 +100,19 @@ if (USE_TBB)
    endif ()
 endif ()
-option(USE_ONEDPL "Enable oneDPL library for *supported* models. Enabling this on models that
+option(USE_ONEDPL "Enable the oneDPL library for *supported* models. Enabling this on models that
                   don't explicitly link against DPL is a no-op, see description of your selected
                   model on how this is used." OFF)
-if (USE_ONEDPL)
+option(FETCH_ONEDPL "Fetch (download) the oneDPL library for *supported* models. This uses CMake's
                  FetchContent feature. Specify version by setting FETCH_ONEDPL_VERSION" OFF)
 # Git tag of oneDPL handed to FetchContent_Declare(GIT_TAG ...) when FETCH_ONEDPL is ON.
 set(FETCH_ONEDPL_VERSION "oneDPL-2022.2.0-rc1" CACHE STRING "Specify version of oneDPL to use if FETCH_ONEDPL is ON")
 if (FETCH_ONEDPL)
    FetchContent_Declare(
            oneDPL
            GIT_REPOSITORY https://github.com/oneapi-src/oneDPL.git
-            GIT_TAG oneDPL-2022.2.0-rc1
+            GIT_TAG "${FETCH_ONEDPL_VERSION}"
    )
    string(TOLOWER ${USE_ONEDPL} ONEDPL_BACKEND)
    # XXX oneDPL looks for omp instead of openmp, which mismatches(!) with ONEDPL_PAR_BACKEND if using find_package
--- a/src/ci-prepare-bionic.sh
+++ b/src/ci-prepare-bionic.sh
@ -83,6 +83,8 @@ get() {
    if [ ! -f "$name" ] || [ "$FORCE_DOWNLOAD" = true ]; then
      echo "$name not found, downloading..."
      wget -q --show-progress --progress=bar:force:noscroll "$pkg_url" -O "$name"
    else
      echo "$name found, skipping download..."
    fi
  fi
 }
@ -92,13 +94,15 @@ get_and_untar() {
  local pkg_url="$2"
  if [ "$SETUP" = true ]; then
    if [ ! -f "$name" ] || [ "$FORCE_DOWNLOAD" = true ]; then
-      echo "$name not found, downloading..."
+      echo "$name not found, downloading ($pkg_url)..."
      wget -q --show-progress --progress=bar:force:noscroll "$pkg_url" -O "$name"
    fi
    echo "Preparing to extract $name ..."
    tar -xf "$name"
    echo "$name extracted, deleting archive ..."
    rm -f "$name" # delete for space
  else
    echo "Skipping setup for $name ($pkg_url)..."
  fi
 }
@ -119,10 +123,10 @@ verify_dir_exists() {
 setup_aocc() {
  echo "Preparing AOCC"
-  local aocc_ver="2.3.0"
+  local aocc_ver="4.0.0"
  local tarball="aocc-$aocc_ver.tar.xz"
  # XXX it's actually XZ compressed, so it should be tar.xz
-  local AOCC_URL="http://developer.amd.com/wordpress/media/files/aocc-compiler-2.3.0.tar"
+  local AOCC_URL="https://download.amd.com/developer/eula/aocc-compiler/aocc-compiler-${aocc_ver}.tar"
  # local AOCC_URL="http://localhost:8000/aocc-compiler-2.3.0.tar"
  get_and_untar "$tarball" "$AOCC_URL"
@ -133,10 +137,10 @@ setup_aocc() {
 }
 setup_nvhpc() {
- echo "Preparing Nvidia HPC SDK"
+  echo "Preparing Nvidia HPC SDK"
-  local nvhpc_ver="22.3"
+  local nvhpc_ver="23.1" # TODO FIXME > 23.1 has a bug with -A
-  local nvhpc_release="2022_223"
+  local nvhpc_release="2023_231"
-  local cuda_ver="11.6"
+  local cuda_ver="12.0"
  local tarball="nvhpc_$nvhpc_ver.tar.gz"
@ -145,7 +149,7 @@ setup_nvhpc() {
  local sdk_dir="$PWD/nvhpc_${nvhpc_release}_Linux_x86_64_cuda_$cuda_ver/install_components/Linux_x86_64/$nvhpc_ver"
  local bin_dir="$sdk_dir/compilers/bin"
-  "$bin_dir/makelocalrc" "$bin_dir" -x
+  "$bin_dir/makelocalrc" -d "$bin_dir" -x -gpp g++-12 -gcc gcc-12 -g77 gfortran-12
  export_var NVHPC_SDK_DIR "$sdk_dir"
  export_var NVHPC_CUDA_DIR "$sdk_dir/cuda/$cuda_ver"
@ -166,7 +170,8 @@ setup_nvhpc() {
 setup_aomp() {
  echo "Preparing AOMP"
-  local AOMP_URL="https://github.com/ROCm-Developer-Tools/aomp/releases/download/rel_11.12-0/aomp_Ubuntu1804_11.12-0_amd64.deb"
+  local aomp_ver="18.0-0"
  local AOMP_URL="https://github.com/ROCm-Developer-Tools/aomp/releases/download/rel_${aomp_ver}/aomp_Ubuntu2204_${aomp_ver}_amd64.deb"
  # local AOMP_URL="http://0.0.0.0:8000/aomp_Ubuntu1804_11.12-0_amd64.deb"
  get_and_install_deb "aomp" "aomp" "$AOMP_URL"
@ -189,9 +194,10 @@ setup_oclcpu() {
 setup_kokkos() {
  echo "Preparing Kokkos"
-  local kokkos_ver="3.3.01"
+  local kokkos_ver="4.1.00"
  local tarball="kokkos-$kokkos_ver.tar.gz"
  local url="https://github.com/kokkos/kokkos/archive/$kokkos_ver.tar.gz"
  # local url="http://localhost:8000/$kokkos_ver.tar.gz"
@ -203,10 +209,10 @@ setup_kokkos() {
 setup_raja() {
  echo "Preparing RAJA"
-  local raja_ver="0.13.0"
+  local raja_ver="2023.06.1"
  local tarball="raja-$raja_ver.tar.gz"
-  local url="https://github.com/LLNL/RAJA/releases/download/v0.13.0/RAJA-v$raja_ver.tar.gz"
+  local url="https://github.com/LLNL/RAJA/releases/download/v$raja_ver/RAJA-v$raja_ver.tar.gz"
  # local url="http://localhost:8000/RAJA-v$raja_ver.tar.gz"
  get_and_untar "$tarball" "$url"
@ -217,7 +223,7 @@ setup_raja() {
 setup_tbb() {
  echo "Preparing TBB"
-  local tbb_ver="2021.2.0"
+  local tbb_ver="2021.9.0"
  local tarball="oneapi-tbb-$tbb_ver-lin.tgz"
  local url="https://github.com/oneapi-src/oneTBB/releases/download/v$tbb_ver/oneapi-tbb-$tbb_ver-lin.tgz"
@ -231,9 +237,9 @@ setup_tbb() {
 setup_clang_gcc() {
-  sudo apt-get install -y -qq gcc-10-offload-nvptx gcc-10-offload-amdgcn libtbb2 libtbb-dev g++-10 clang libomp-dev
+  sudo apt-get install -y -qq gcc-12-offload-nvptx gcc-12-offload-amdgcn libtbb2 libtbb-dev g++-12 clang libomp-dev libc6
-  export_var GCC_CXX "$(which g++-10)"
+  export_var GCC_CXX "$(which g++-12)"
  verify_bin_exists "$GCC_CXX"
  "$GCC_CXX" --version
@ -254,7 +260,7 @@ setup_clang_gcc() {
 }
 setup_rocm() {
-  sudo apt-get install -y -qq rocm-dev rocthrust-dev
+  sudo apt-get install -y rocm-dev rocthrust-dev
  export_var ROCM_PATH "/opt/rocm"
  export_var PATH "$ROCM_PATH/bin:$PATH" # ROCm needs this for many of their libraries' CMake build to work
  export_var HIP_CXX "$ROCM_PATH/bin/hipcc"
@ -265,7 +271,7 @@ setup_rocm() {
 setup_dpcpp() {
-  local nightly="20210106"
+  local nightly="20230615"
  local tarball="dpcpp-$nightly.tar.gz"
  local url="https://github.com/intel/llvm/releases/download/sycl-nightly/$nightly/dpcpp-compiler.tar.gz"
@ -282,22 +288,22 @@ setup_dpcpp() {
 setup_hipsycl() {
  sudo apt-get install -y -qq libboost-fiber-dev libboost-context-dev
-  local hipsycl_ver="0.9.0"
+  local hipsycl_ver="0.9.1"
  local tarball="v$hipsycl_ver.tar.gz"
  local install_dir="$PWD/hipsycl_dist_$hipsycl_ver"
-  local url="https://github.com/illuhad/hipSYCL/archive/v$hipsycl_ver.tar.gz"
+  local url="https://github.com/AdaptiveCpp/AdaptiveCpp/archive/v$hipsycl_ver.tar.gz"
-  # local url="http://localhost:8000/hipSYCL-$hipsycl_ver.tar.gz"
+  # local url="http://localhost:8000/AdaptiveCpp-$hipsycl_ver.tar.gz"
  get_and_untar "$tarball" "$url"
  if [ "$SETUP" = true ]; then
-    local src="$PWD/hipSYCL-$hipsycl_ver"
+    local src="$PWD/AdaptiveCpp-$hipsycl_ver"
    rm -rf "$src/build"
    rm -rf "$install_dir"
    cmake "-B$src/build" "-H$src" \
-      -DCMAKE_C_COMPILER="$(which gcc-10)" \
+      -DCMAKE_C_COMPILER="$(which gcc-12)" \
-      -DCMAKE_CXX_COMPILER="$(which g++-10)" \
+      -DCMAKE_CXX_COMPILER="$(which g++-12)" \
      -DCMAKE_INSTALL_PREFIX="$install_dir" \
      -DWITH_ROCM_BACKEND=OFF \
      -DWITH_CUDA_BACKEND=OFF \
@ -312,25 +318,20 @@ setup_hipsycl() {
  check_size
 }
 # Placeholder for ComputeCpp: nothing is downloaded or installed here, the
 # function only prints a TODO notice explaining why.
 setup_computecpp() {
  echo "TODO ComputeCpp requires registration+login to download"
 }
 if [ "${GITHUB_ACTIONS:-false}" = true ]; then
  echo "Running in GitHub Actions, defaulting to special export"
  TERM=xterm
  export TERM=xterm
  # drop the lock in case we got one from a failed run
-  rm /var/lib/dpkg/lock-frontend || true
+  rm -rf /var/lib/dpkg/lock-frontend || true
-  rm /var/cache/apt/archives/lock || true
+  rm -rf /var/cache/apt/archives/lock || true
  wget -q -O - "https://repo.radeon.com/rocm/rocm.gpg.key" | sudo apt-key add -
  echo "deb http://archive.ubuntu.com/ubuntu focal main universe" | sudo tee -a /etc/apt/sources.list
  echo 'deb [arch=amd64] https://repo.radeon.com/rocm/apt/4.5 ubuntu main' | sudo tee /etc/apt/sources.list.d/rocm.list
  mkdir --parents --mode=0755 /etc/apt/keyrings
  wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | gpg --dearmor | sudo tee /etc/apt/keyrings/rocm.gpg > /dev/null
  echo 'deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/5.7 jammy main' | sudo tee /etc/apt/sources.list.d/rocm.list
  echo -e 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600' | sudo tee /etc/apt/preferences.d/rocm-pin-600
  sudo apt-get update -qq
-  sudo apt-get install -y -qq cmake
+  sudo apt-get install cmake gcc g++ libelf-dev libdrm-amdgpu1 libnuma-dev
  if [ "$SETUP" = true ]; then
    echo "Deleting extra packages for space in 2 seconds..."
@ -340,6 +341,7 @@ if [ "${GITHUB_ACTIONS:-false}" = true ]; then
    sudo apt-get autoremove -y
    check_size
  fi
  sudo apt-get upgrade -qq
 else
  echo "Running locally, defaulting to standard export"
 fi
@ -368,6 +370,18 @@ setup_cmake() {
  verify_bin_exists "$CMAKE_3_18_BIN"
  "$CMAKE_3_18_BIN" --version
  get "cmake-3.20.sh" "$cmake_release/v3.20.4/cmake-3.20.4-linux-x86_64.sh"
  chmod +x "./cmake-3.20.sh" && "./cmake-3.20.sh" --skip-license --include-subdir
  export_var CMAKE_3_20_BIN "$PWD/cmake-3.20.4-linux-x86_64/bin/cmake"
  verify_bin_exists "$CMAKE_3_20_BIN"
  "$CMAKE_3_20_BIN" --version
  get "cmake-3.24.sh" "$cmake_release/v3.24.4/cmake-3.24.4-linux-x86_64.sh"
  chmod +x "./cmake-3.24.sh" && "./cmake-3.24.sh" --skip-license --include-subdir
  export_var CMAKE_3_24_BIN "$PWD/cmake-3.24.4-linux-x86_64/bin/cmake"
  verify_bin_exists "$CMAKE_3_24_BIN"
  "$CMAKE_3_24_BIN" --version
  check_size
 }
@ -385,6 +399,10 @@ if [ "$PARALLEL" = true ]; then
  setup_tbb &
  wait
 else
  # these need apt
  setup_clang_gcc
  setup_rocm
  setup_hipsycl
  setup_cmake
  setup_aocc
  setup_oclcpu
@ -394,10 +412,6 @@ else
  setup_kokkos
  setup_raja
  setup_tbb
  # these need apt
  setup_clang_gcc
  setup_rocm
  setup_hipsycl
 fi
 echo "Done!"
--- a/src/ci-test-compile.sh
+++ b/src/ci-test-compile.sh
@ -120,10 +120,21 @@ run_build() {
 # CLANG_OMP_OFFLOAD_NVIDIA=false
 ###
 NV_ARCH_CC="70"
 AMD_ARCH="gfx_903"
-NV_ARCH="sm_70"
+NV_ARCH="sm_${NV_ARCH_CC}"
 NV_ARCH_CCXY="cuda${NVHPC_CUDA_VER:?},cc80"
 check_cmake_ver(){
  local current=$("$CMAKE_BIN" --version | head -n 1 | cut -d ' ' -f3)
  local required=$1
  if [ "$(printf '%s\n' "$required" "$current" | sort -V | head -n1)" = "$required" ]; then
    return 0
  else
    return 1
  fi
 }
 build_gcc() {
  local name="gcc_build"
  local cxx="-DCMAKE_CXX_COMPILER=${GCC_CXX:?}"
@ -138,14 +149,12 @@ build_gcc() {
  for use_onedpl in OFF OPENMP TBB; do
    case "$use_onedpl" in
      OFF) dpl_conditional_flags="-DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}"  ;;
-      *)   dpl_conditional_flags="-DUSE_TBB=ON -DCXX_EXTRA_FLAGS=-D_GLIBCXX_USE_TBB_PAR_BACKEND=0" ;;
+      *)   dpl_conditional_flags="-DFETCH_ONEDPL=ON -DFETCH_TBB=ON -DUSE_TBB=ON -DCXX_EXTRA_FLAGS=-D_GLIBCXX_USE_TBB_PAR_BACKEND=0" ;;
    esac
-    for use_vector in OFF ON; do
+    # some distributions like Ubuntu bionic implement std par with TBB, so conditionally link it here
-      # some distributions like Ubuntu bionic implements std par with TBB, so conditionally link it here
+    run_build $name "${GCC_CXX:?}" std-data    "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl"
-      run_build $name "${GCC_CXX:?}" std-data    "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl -DUSE_VECTOR=$use_vector"
+    run_build $name "${GCC_CXX:?}" std-indices "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl"
-      run_build $name "${GCC_CXX:?}" std-indices "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl -DUSE_VECTOR=$use_vector"
+    run_build $name "${GCC_CXX:?}" std-ranges  "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl"
      run_build $name "${GCC_CXX:?}" std-ranges  "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl -DUSE_VECTOR=$use_vector"
    done
  done
  run_build $name "${GCC_CXX:?}" tbb "$cxx -DONE_TBB_DIR=$TBB_LIB"
@ -153,40 +162,45 @@ build_gcc() {
  run_build $name "${GCC_CXX:?}" tbb "$cxx -DUSE_VECTOR=ON" # build with vectors
  if [ "${GCC_OMP_OFFLOAD_AMD:-false}" != "false" ]; then
-    run_build "amd_$name" "${GCC_CXX:?}" acc "$cxx -DCXX_EXTRA_FLAGS=-foffload=amdgcn-amdhsa"
+    run_build "amd_$name" "${GCC_CXX:?}" acc "$cxx -DCXX_EXTRA_FLAGS=-foffload=amdgcn-amdhsa;-fno-stack-protector;-fcf-protection=none"
    run_build "amd_$name" "${GCC_CXX:?}" omp "$cxx -DOFFLOAD=AMD:$AMD_ARCH"
  fi
  if [ "${GCC_OMP_OFFLOAD_NVIDIA:-false}" != "false" ]; then
-    run_build "nvidia_$name" "${GCC_CXX:?}" acc "$cxx -DCXX_EXTRA_FLAGS=-foffload=nvptx-none"
+    run_build "nvidia_$name" "${GCC_CXX:?}" acc "$cxx -DCXX_EXTRA_FLAGS=-foffload=nvptx-none;-fno-stack-protector;-fcf-protection=none"
    run_build "nvidia_$name" "${GCC_CXX:?}" omp "$cxx -DOFFLOAD=NVIDIA:$NV_ARCH"
  fi
  run_build $name "${GCC_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH"
  run_build $name "${GCC_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DMEM=MANAGED"
  run_build $name "${GCC_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DMEM=PAGEFAULT"
-  #  run_build $name "${CC_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_CUDA=ON"
+  if check_cmake_ver "3.16.0"; then
-  run_build "cuda_$name" "${GCC_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON"
+    #  run_build $name "${CC_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_CUDA=ON"
    run_build "cuda_$name" "${GCC_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON"
  else
    echo "Skipping Kokkos models due to CMake version requirement"
  fi
  run_build $name "${GCC_CXX:?}" ocl "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}"
-  run_build $name "${GCC_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?}"
+  if check_cmake_ver "3.20.0"; then
    run_build $name "${GCC_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?} -DENABLE_OPENMP=ON"
  else
    echo "Skipping RAJA models due to CMake version requirement"
  fi
-#  FIXME fails due to https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100102
+  if check_cmake_ver "3.20.0"; then
-#  FIXME we also got https://github.com/NVIDIA/nccl/issues/494
+   run_build "cuda_$name" "${GCC_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?} \
     -DENABLE_CUDA=ON \
     -DTARGET=NVIDIA \
     -DCUDA_TOOLKIT_ROOT_DIR=${NVHPC_CUDA_DIR:?} \
     -DCUDA_ARCH=$NV_ARCH"
  else
    echo "Skipping RAJA models due to CMake version requirement"
  fi
-#  run_build "cuda_$name" "${GCC_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?} \
+  if check_cmake_ver "3.18.0"; then # CMake >= 3.15 only due to Nvidia's Thrust CMake requirements
-#  -DENABLE_CUDA=ON \
+    run_build $name "${GCC_CXX:?}" thrust "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH_CC -DSDK_DIR=$NVHPC_CUDA_DIR/lib64/cmake -DTHRUST_IMPL=CUDA -DBACKEND=CUDA"
-#  -DTARGET=NVIDIA \
+#    run_build $name "${GCC_CXX:?}" thrust "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/lib64/cmake -DTHRUST_IMPL=CUDA -DBACKEND=OMP" # FIXME
-#  -DCUDA_TOOLKIT_ROOT_DIR=${NVHPC_CUDA_DIR:?} \
+    run_build $name "${GCC_CXX:?}" thrust "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH_CC -DSDK_DIR=$NVHPC_CUDA_DIR/lib64/cmake -DTHRUST_IMPL=CUDA -DBACKEND=CPP"
 #  -DCUDA_ARCH=$NV_ARCH"
  # CMake >= 3.15 only due to Nvidia's Thrust CMake requirements
  local current=$("$CMAKE_BIN" --version | head -n 1 | cut -d ' ' -f3)
  local required="3.15.0"
  if [ "$(printf '%s\n' "$required" "$current" | sort -V | head -n1)" = "$required" ]; then
    run_build $name "${GCC_CXX:?}" thrust "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/lib64/cmake -DTHRUST_IMPL=CUDA -DBACKEND=CUDA"
    run_build $name "${GCC_CXX:?}" thrust "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/lib64/cmake -DTHRUST_IMPL=CUDA -DBACKEND=OMP"
    run_build $name "${GCC_CXX:?}" thrust "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/lib64/cmake -DTHRUST_IMPL=CUDA -DBACKEND=CPP"
    # FIXME CUDA Thrust + TBB throws the following error:
    #    /usr/lib/gcc/x86_64-linux-gnu/9/include/avx512fintrin.h(9146): error: identifier "__builtin_ia32_rndscaless_round" is undefined
@ -198,7 +212,7 @@ build_gcc() {
    #    run_build $name "${GCC_CXX:?}" THRUST "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/lib64/cmake -DTHRUST_IMPL=CUDA -DBACKEND=TBB"
  else
-    echo "CMake version ${current} < ${required}, skipping Thrust models"
+    echo "Skipping Thrust models due to CMake version requirement"
  fi
 }
@ -216,30 +230,39 @@ build_clang() {
    run_build "nvidia_$name" "${GCC_CXX:?}" omp "$cxx -DOFFLOAD=NVIDIA:$NV_ARCH"
  fi
-  run_build $name "${CLANG_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?}"
+  if check_cmake_ver "3.20.0"; then
    run_build $name "${CLANG_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?} -DENABLE_OPENMP=ON"
  else
    echo "Skipping RAJA models due to CMake version requirement"
  fi
  run_build $name "${CLANG_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH"
  run_build $name "${CLANG_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DMEM=MANAGED"
  run_build $name "${CLANG_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DMEM=PAGEFAULT"
-  run_build $name "${CLANG_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON"
+  if check_cmake_ver "3.16.0"; then
    run_build $name "${CLANG_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON"
  else
    echo "Skipping Kokkos models due to CMake version requirement"
  fi
  run_build $name "${CLANG_CXX:?}" ocl "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}"
  for use_onedpl in OFF OPENMP TBB; do
-    for use_vector in OFF ON; do
+    case "$use_onedpl" in
-      case "$use_onedpl" in
+      OFF) dpl_conditional_flags="-DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" ;;
-        OFF) dpl_conditional_flags="-DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" ;;
+      *)   dpl_conditional_flags="-DFETCH_ONEDPL=ON -DFETCH_TBB=ON -DUSE_TBB=ON -DCXX_EXTRA_FLAGS=-D_GLIBCXX_USE_TBB_PAR_BACKEND=0"  ;;
-        *)   dpl_conditional_flags="-DUSE_TBB=ON -DCXX_EXTRA_FLAGS=-D_GLIBCXX_USE_TBB_PAR_BACKEND=0"  ;;
+    esac
-      esac
+    run_build $name "${CLANG_CXX:?}" std-data     "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl"
-      run_build $name "${CLANG_CXX:?}" std-data     "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl -DUSE_VECTOR=$use_vector "
+    run_build $name "${CLANG_CXX:?}" std-indices  "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl"
-      run_build $name "${CLANG_CXX:?}" std-indices  "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl -DUSE_VECTOR=$use_vector"
+    # run_build $name "${CLANG_CXX:?}" std-ranges "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl" # not yet supported
      # run_build $name "${CLANG_CXX:?}" std-ranges "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl -DUSE_VECTOR=$use_vector" # not yet supported
    done
  done
  run_build $name "${CLANG_CXX:?}" tbb "$cxx -DONE_TBB_DIR=$TBB_LIB"
  run_build $name "${CLANG_CXX:?}" tbb "$cxx" # build TBB again with the system TBB
  run_build $name "${CLANG_CXX:?}" tbb "$cxx -DUSE_VECTOR=ON" # build with vectors
-
+  if check_cmake_ver "3.20.0"; then
-  run_build $name "${CLANG_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?}"
+    run_build $name "${CLANG_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?} -DENABLE_OPENMP=ON"
  else
    echo "Skipping RAJA models due to CMake version requirement"
  fi
  # no clang /w RAJA+cuda because it needs nvcc which needs gcc
 }
@ -249,10 +272,6 @@ build_nvhpc() {
  run_build $name "${NVHPC_NVCXX:?}" std-data "$cxx -DNVHPC_OFFLOAD=$NV_ARCH_CCXY"
  run_build $name "${NVHPC_NVCXX:?}" std-indices "$cxx -DNVHPC_OFFLOAD=$NV_ARCH_CCXY"
  # std again but with vectors
  run_build $name "${NVHPC_NVCXX:?}" std-data "$cxx -DNVHPC_OFFLOAD=$NV_ARCH_CCXY -DUSE_VECTOR=ON"
  run_build $name "${NVHPC_NVCXX:?}" std-indices "$cxx -DNVHPC_OFFLOAD=$NV_ARCH_CCXY -DUSE_VECTOR=ON"
  run_build $name "${NVHPC_NVCXX:?}" acc "$cxx -DTARGET_DEVICE=gpu -DTARGET_PROCESSOR=px -DCUDA_ARCH=$NV_ARCH_CCXY"
  run_build $name "${NVHPC_NVCXX:?}" acc "$cxx -DTARGET_DEVICE=multicore -DTARGET_PROCESSOR=zen"
 }
@ -291,15 +310,18 @@ build_icpc() {
  local cxx="-DCMAKE_CXX_COMPILER=${ICPC_CXX:?}"
  run_build $name "${ICPC_CXX:?}" omp "$cxx"
  run_build $name "${ICPC_CXX:?}" ocl "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}"
-  run_build $name "${ICPC_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?}"
+  if check_cmake_ver "3.20.0"; then
-  run_build $name "${ICPC_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON"
+    run_build $name "${ICPC_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?} -DENABLE_OPENMP=ON"
-}
+  else
    echo "Skipping RAJA models due to CMake version requirement"
  fi
  if check_cmake_ver "3.16.0"; then
    run_build $name "${ICPC_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON"
  else
    echo "Skipping Kokkos models due to CMake version requirement"
  fi
 # Compile-tests the sycl model for ComputeCpp through run_build, using the build
 # name "computecpp_build" and "compute++" as the second argument (the other
 # build_* callers pass a compiler path there -- presumably a compiler identifier).
 # The host compiler passed to CMake is GCC_CXX. Each ${VAR:?} expansion makes the
 # shell fail with an error if GCC_CXX, COMPUTECPP_DIR or OCL_LIB is unset or empty.
 build_computecpp() {
  run_build computecpp_build "compute++" sycl "-DCMAKE_CXX_COMPILER=${GCC_CXX:?} \
  -DSYCL_COMPILER=COMPUTECPP \
  -DSYCL_COMPILER_DIR=${COMPUTECPP_DIR:?} \
  -DOpenCL_LIBRARY=${OCL_LIB:?}"
 }
 build_dpcpp() {
--- a/src/raja/model.cmake
+++ b/src/raja/model.cmake
@ -8,8 +8,6 @@ register_flag_optional(RAJA_IN_TREE
         Make sure to use the release version of RAJA or clone RAJA recursively with submodules.
         Remember to append RAJA specific flags as well, for example:
             -DRAJA_IN_TREE=... -DENABLE_OPENMP=ON -DENABLE_CUDA=ON ...
         For RAJA >= v2022.03.0, remember to use the RAJA prefixed CMake options:
             -DRAJA_IN_TREE=... -DRAJA_ENABLE_OPENMP=ON -DRAJA_ENABLE_CUDA=ON ...
         See https://github.com/LLNL/RAJA/blob/08cbbafd2d21589ebf341f7275c229412d0fe903/CMakeLists.txt#L44 for all available options
 " "")
--- a/src/std-data/STDDataStream.cpp
+++ b/src/std-data/STDDataStream.cpp
@ -6,22 +6,10 @@
 #include "STDDataStream.h"
 #ifdef USE_VECTOR
 #define BEGIN(x) (x).begin()
 #define END(x) (x).end()
 #else
 #define BEGIN(x) (x)
 #define END(x) ((x) + array_size)
 #endif
 template <class T>
 STDDataStream<T>::STDDataStream(const int ARRAY_SIZE, int device)
  noexcept : array_size{ARRAY_SIZE},
 #ifdef USE_VECTOR
  a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE)
 #else
  a(alloc_raw<T>(ARRAY_SIZE)), b(alloc_raw<T>(ARRAY_SIZE)), c(alloc_raw<T>(ARRAY_SIZE))
 #endif
 {
    std::cout << "Backing storage typeid: " << typeid(a).name() << std::endl;
 #ifdef USE_ONEDPL
@ -41,55 +29,53 @@ STDDataStream<T>::STDDataStream(const int ARRAY_SIZE, int device)
 template<class T>
 STDDataStream<T>::~STDDataStream() {
-#ifndef USE_VECTOR
+  dealloc_raw(a);
-    dealloc_raw(a);
+  dealloc_raw(b);
-    dealloc_raw(b);
+  dealloc_raw(c);
    dealloc_raw(c);
 #endif
 }
 template <class T>
 void STDDataStream<T>::init_arrays(T initA, T initB, T initC)
 {
-  std::fill(exe_policy, BEGIN(a), END(a), initA);
+  std::fill(exe_policy, a, a + array_size, initA);
-  std::fill(exe_policy, BEGIN(b), END(b), initB);
+  std::fill(exe_policy, b, b + array_size, initB);
-  std::fill(exe_policy, BEGIN(c), END(c), initC);
+  std::fill(exe_policy, c, c + array_size, initC);
 }
 template <class T>
 void STDDataStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
 {
-  std::copy(BEGIN(a), END(a), h_a.begin());
+  std::copy(a, a + array_size, h_a.begin());
-  std::copy(BEGIN(b), END(b), h_b.begin());
+  std::copy(b, b + array_size, h_b.begin());
-  std::copy(BEGIN(c), END(c), h_c.begin());
+  std::copy(c, c + array_size, h_c.begin());
 }
 template <class T>
 void STDDataStream<T>::copy()
 {
  // c[i] = a[i]
-  std::copy(exe_policy, BEGIN(a), END(a), BEGIN(c));
+  std::copy(exe_policy, a, a + array_size, c);
 }
 template <class T>
 void STDDataStream<T>::mul()
 {
  //  b[i] = scalar * c[i];
-  std::transform(exe_policy, BEGIN(c), END(c), BEGIN(b), [scalar = startScalar](T ci){ return scalar*ci; });
+  std::transform(exe_policy, c, c + array_size, b, [scalar = startScalar](T ci){ return scalar*ci; });
 }
 template <class T>
 void STDDataStream<T>::add()
 {
  //  c[i] = a[i] + b[i];
-  std::transform(exe_policy, BEGIN(a), END(a), BEGIN(b), BEGIN(c), std::plus<T>());
+  std::transform(exe_policy, a, a + array_size, b, c, std::plus<T>());
 }
 template <class T>
 void STDDataStream<T>::triad()
 {
  //  a[i] = b[i] + scalar * c[i];
-  std::transform(exe_policy, BEGIN(b), END(b), BEGIN(c), BEGIN(a), [scalar = startScalar](T bi, T ci){ return bi+scalar*ci; });
+  std::transform(exe_policy, b, b + array_size, c, a, [scalar = startScalar](T bi, T ci){ return bi+scalar*ci; });
 }
 template <class T>
@ -99,8 +85,8 @@ void STDDataStream<T>::nstream()
  //  Need to do in two stages with C++11 STL.
  //  1: a[i] += b[i]
  //  2: a[i] += scalar * c[i];
-  std::transform(exe_policy, BEGIN(a), END(a), BEGIN(b), BEGIN(a), [](T ai, T bi){ return ai + bi; });
+  std::transform(exe_policy, a, a + array_size, b, a, [](T ai, T bi){ return ai + bi; });
-  std::transform(exe_policy, BEGIN(a), END(a), BEGIN(c), BEGIN(a), [scalar = startScalar](T ai, T ci){ return ai + scalar*ci; });
+  std::transform(exe_policy, a, a + array_size, c, a, [scalar = startScalar](T ai, T ci){ return ai + scalar*ci; });
 }
@ -108,7 +94,7 @@ template <class T>
 T STDDataStream<T>::dot()
 {
  // sum = 0; sum += a[i]*b[i]; return sum;
-  return std::transform_reduce(exe_policy, BEGIN(a), END(a), BEGIN(b), 0.0);
+  return std::transform_reduce(exe_policy, a, a + array_size, b, 0.0);
 }
 void listDevices(void)
@ -127,6 +113,3 @@ std::string getDeviceDriver(const int)
 }
 template class STDDataStream<float>;
 template class STDDataStream<double>;
 #undef BEGIN
 #undef END
--- a/src/std-data/STDDataStream.h
+++ b/src/std-data/STDDataStream.h
@ -22,12 +22,7 @@ class STDDataStream : public Stream<T>
    int array_size;
    // Device side pointers
 #ifdef USE_VECTOR
    std::vector<T> a, b, c;
 #else
    T *a, *b, *c;
 #endif
  public:
    STDDataStream(const int, int) noexcept;
--- a/src/std-data/model.cmake
+++ b/src/std-data/model.cmake
@ -3,10 +3,6 @@ register_flag_optional(CMAKE_CXX_COMPILER
        "Any CXX compiler that is supported by CMake detection"
        "c++")
 register_flag_optional(USE_VECTOR
        "Whether to use std::vector<T> for storage or use aligned_alloc. C++ vectors are *zero* initialised where as aligned_alloc is uninitialised before first use."
        "OFF")
 register_flag_optional(NVHPC_OFFLOAD
        "Enable offloading support (via the non-standard `-stdpar`) for the new NVHPC SDK.
         The values are Nvidia architectures in ccXY format will be passed in via `-gpu=` (e.g `cc70`)
@ -47,9 +43,6 @@ macro(setup)
        register_append_cxx_flags(ANY ${NVHPC_FLAGS})
        register_append_link_flags(${NVHPC_FLAGS})
    endif ()
    if (USE_VECTOR)
        register_definitions(USE_VECTOR)
    endif ()
    if (USE_TBB)
        register_link_library(TBB::tbb)
    endif ()
--- a/src/std-indices/STDIndicesStream.cpp
+++ b/src/std-indices/STDIndicesStream.cpp
@ -10,32 +10,10 @@
 #define ALIGNMENT (2*1024*1024) // 2MB
 #endif
 #ifdef USE_VECTOR
 #define BEGIN(x) (x).begin()
 #define END(x) (x).end()
 #else
 #define BEGIN(x) (x)
 #define END(x) ((x) + array_size)
 #endif
 #ifdef USE_VECTOR
 #if (defined(__NVCOMPILER) || defined(__NVCOMPILER_LLVM__))
 #error "std::vector *is* supported in NVHPC if we capture `this`, however, oneDPL (via SYCL2020) only works correctly with explicit *value* captures."
 #endif
 #if defined(USE_ONEDPL)
 #error "std::vector is unspported: oneDPL (via SYCL2020) only works correctly with explicit *value* captures"
 #endif
 #endif
 template <class T>
 STDIndicesStream<T>::STDIndicesStream(const int ARRAY_SIZE, int device)
 noexcept : array_size{ARRAY_SIZE}, range(0, array_size),
 #ifdef USE_VECTOR
  a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE)
 #else
  a(alloc_raw<T>(ARRAY_SIZE)), b(alloc_raw<T>(ARRAY_SIZE)), c(alloc_raw<T>(ARRAY_SIZE))
 #endif
 {
    std::cout << "Backing storage typeid: " << typeid(a).name() << std::endl;
 #ifdef USE_ONEDPL
@ -55,41 +33,39 @@ noexcept : array_size{ARRAY_SIZE}, range(0, array_size),
 template<class T>
 STDIndicesStream<T>::~STDIndicesStream() {
-#ifndef USE_VECTOR
+  dealloc_raw(a);
-    dealloc_raw(a);
+  dealloc_raw(b);
-    dealloc_raw(b);
+  dealloc_raw(c);
    dealloc_raw(c);
 #endif
 }
 template <class T>
 void STDIndicesStream<T>::init_arrays(T initA, T initB, T initC)
 {
-  std::fill(exe_policy, BEGIN(a), END(a), initA);
+  std::fill(exe_policy, a, a + array_size, initA);
-  std::fill(exe_policy, BEGIN(b), END(b), initB);
+  std::fill(exe_policy, b, b + array_size, initB);
-  std::fill(exe_policy, BEGIN(c), END(c), initC);
+  std::fill(exe_policy, c, c + array_size, initC);
 }
 template <class T>
 void STDIndicesStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
 {
-  std::copy(BEGIN(a), END(a), h_a.begin());
+  std::copy(a, a + array_size, h_a.begin());
-  std::copy(BEGIN(b), END(b), h_b.begin());
+  std::copy(b, b + array_size, h_b.begin());
-  std::copy(BEGIN(c), END(c), h_c.begin());
+  std::copy(c, c + array_size, h_c.begin());
 }
 template <class T>
 void STDIndicesStream<T>::copy()
 {
  // c[i] = a[i]
-  std::copy(exe_policy, BEGIN(a), END(a), BEGIN(c));
+  std::copy(exe_policy, a, a + array_size, c);
 }
 template <class T>
 void STDIndicesStream<T>::mul()
 {
  //  b[i] = scalar * c[i];
-  std::transform(exe_policy, range.begin(), range.end(), BEGIN(b), [c = this->c, scalar = startScalar](int i) {
+  std::transform(exe_policy, range.begin(), range.end(), b, [c = this->c, scalar = startScalar](int i) {
    return scalar * c[i];
  });
 }
@ -98,7 +74,7 @@ template <class T>
 void STDIndicesStream<T>::add()
 {
  //  c[i] = a[i] + b[i];
-  std::transform(exe_policy, range.begin(), range.end(), BEGIN(c), [a = this->a, b = this->b](int i) {
+  std::transform(exe_policy, range.begin(), range.end(), c, [a = this->a, b = this->b](int i) {
    return a[i] + b[i];
  });
 }
@ -107,7 +83,7 @@ template <class T>
 void STDIndicesStream<T>::triad()
 {
  //  a[i] = b[i] + scalar * c[i];
-  std::transform(exe_policy, range.begin(), range.end(), BEGIN(a), [b = this->b, c = this->c, scalar = startScalar](int i) {
+  std::transform(exe_policy, range.begin(), range.end(), a, [b = this->b, c = this->c, scalar = startScalar](int i) {
    return b[i] + scalar * c[i];
  });
 }
@ -119,7 +95,7 @@ void STDIndicesStream<T>::nstream()
  //  Need to do in two stages with C++11 STL.
  //  1: a[i] += b[i]
  //  2: a[i] += scalar * c[i];
-  std::transform(exe_policy, range.begin(), range.end(), BEGIN(a), [a = this->a, b = this->b, c = this->c, scalar = startScalar](int i) {
+  std::transform(exe_policy, range.begin(), range.end(), a, [a = this->a, b = this->b, c = this->c, scalar = startScalar](int i) {
    return a[i] + b[i] + scalar * c[i];
  });
 }
@ -129,7 +105,7 @@ template <class T>
 T STDIndicesStream<T>::dot()
 {
  // sum = 0; sum += a[i]*b[i]; return sum;
-  return std::transform_reduce(exe_policy, BEGIN(a), END(a), BEGIN(b), 0.0);
+  return std::transform_reduce(exe_policy, a, a + array_size, b, 0.0);
 }
 void listDevices(void)
@ -148,6 +124,3 @@ std::string getDeviceDriver(const int)
 }
 template class STDIndicesStream<float>;
 template class STDIndicesStream<double>;
 #undef BEGIN
 #undef END
--- a/src/std-indices/STDIndicesStream.h
+++ b/src/std-indices/STDIndicesStream.h
@ -77,12 +77,7 @@ class STDIndicesStream : public Stream<T>
    ranged<int> range;
    // Device side pointers
 #ifdef USE_VECTOR
    std::vector<T> a, b, c;
 #else
    T *a, *b, *c;
 #endif
  public:
    STDIndicesStream(const int, int) noexcept;
--- a/src/std-indices/model.cmake
+++ b/src/std-indices/model.cmake
@ -3,10 +3,6 @@ register_flag_optional(CMAKE_CXX_COMPILER
        "Any CXX compiler that is supported by CMake detection"
        "c++")
 register_flag_optional(USE_VECTOR
        "Whether to use std::vector<T> for storage or use aligned_alloc. C++ vectors are *zero* initialised where as aligned_alloc is uninitialised before first use."
        "OFF")
 register_flag_optional(NVHPC_OFFLOAD
        "Enable offloading support (via the non-standard `-stdpar`) for the new NVHPC SDK.
         The values are Nvidia architectures in ccXY format will be passed in via `-gpu=` (e.g `cc70`)
@ -47,9 +43,6 @@ macro(setup)
        register_append_cxx_flags(ANY ${NVHPC_FLAGS})
        register_append_link_flags(${NVHPC_FLAGS})
    endif ()
    if (USE_VECTOR)
        register_definitions(USE_VECTOR)
    endif ()
    if (USE_TBB)
        register_link_library(TBB::tbb)
    endif ()
--- a/src/std-ranges/STDRangesStream.cpp
+++ b/src/std-ranges/STDRangesStream.cpp
@ -5,27 +5,16 @@
 // source code
 #include "STDRangesStream.hpp"
 #include <ranges>
 #ifndef ALIGNMENT
 #define ALIGNMENT (2*1024*1024) // 2MB
 #endif
 #ifdef USE_VECTOR
 #define BEGIN(x) (x).begin()
 #define END(x) (x).end()
 #else
 #define BEGIN(x) (x)
 #define END(x) ((x) + array_size)
 #endif
 template <class T>
 STDRangesStream<T>::STDRangesStream(const int ARRAY_SIZE, int device)
 noexcept : array_size{ARRAY_SIZE},
 #ifdef USE_VECTOR
  a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE)
 #else
  a(alloc_raw<T>(ARRAY_SIZE)), b(alloc_raw<T>(ARRAY_SIZE)), c(alloc_raw<T>(ARRAY_SIZE))
 #endif
 {
    std::cout << "Backing storage typeid: " << typeid(a).name() << std::endl;
 #ifdef USE_ONEDPL
@ -45,11 +34,9 @@ noexcept : array_size{ARRAY_SIZE},
 template<class T>
 STDRangesStream<T>::~STDRangesStream() {
-#ifndef USE_VECTOR
+  dealloc_raw(a);
-    dealloc_raw(a);
+  dealloc_raw(b);
-    dealloc_raw(b);
+  dealloc_raw(c);
    dealloc_raw(c);
 #endif
 }
 template <class T>
@ -70,9 +57,9 @@ template <class T>
 void STDRangesStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
 {
  // Element-wise copy.
-    std::copy(BEGIN(a), END(a), h_a.begin());
+    std::copy(a, a + array_size, h_a.begin());
-    std::copy(BEGIN(b), END(b), h_b.begin());
+    std::copy(b, b + array_size, h_b.begin());
-    std::copy(BEGIN(c), END(c), h_c.begin());
+    std::copy(c, c + array_size, h_c.begin());
 }
 template <class T>
@ -148,7 +135,7 @@ T STDRangesStream<T>::dot()
  return
    std::transform_reduce(
      exe_policy,
-      BEGIN(a), END(a), BEGIN(b), 0.0);
+      a, a + array_size, b, 0.0);
 }
 void listDevices(void)
@ -168,6 +155,3 @@ std::string getDeviceDriver(const int)
 template class STDRangesStream<float>;
 template class STDRangesStream<double>;
 #undef BEGIN
 #undef END
--- a/src/std-ranges/STDRangesStream.hpp
+++ b/src/std-ranges/STDRangesStream.hpp
@ -21,11 +21,7 @@ class STDRangesStream : public Stream<T>
    int array_size;
    // Device side pointers
 #ifdef USE_VECTOR
    std::vector<T> a, b, c;
 #else
    T *a, *b, *c;
 #endif
  public:
    STDRangesStream(const int, int) noexcept;
--- a/src/std-ranges/model.cmake
+++ b/src/std-ranges/model.cmake
@ -3,10 +3,6 @@ register_flag_optional(CMAKE_CXX_COMPILER
        "Any CXX compiler that is supported by CMake detection and supports C++20 Ranges"
        "c++")
 register_flag_optional(USE_VECTOR
        "Whether to use std::vector<T> for storage or use aligned_alloc. C++ vectors are *zero* initialised where as aligned_alloc is uninitialised before first use."
        "OFF")
 register_flag_optional(USE_TBB
        "No-op if ONE_TBB_DIR is set. Link against an in-tree oneTBB via FetchContent_Declare, see top level CMakeLists.txt for details."
        "OFF")
@ -32,10 +28,7 @@ macro(setup)
    set(CMAKE_CXX_STANDARD_REQUIRED OFF)
    unset(CMAKE_CXX_STANDARD) # drop any existing standard we have set by default
    # and append our own:
-    register_append_cxx_flags(ANY -std=c++2a)
+    register_append_cxx_flags(ANY -std=c++20)
    if (USE_VECTOR)
        register_definitions(USE_VECTOR)
    endif ()
    if (USE_TBB)
        register_link_library(TBB::tbb)
    endif ()
@ -44,3 +37,10 @@ macro(setup)
        register_link_library(oneDPL)
    endif ()
 endmacro()
 macro(setup_target NAME)
    if (USE_ONEDPL)
        target_compile_features(${NAME} INTERFACE cxx_std_20)
        target_compile_features(oneDPL INTERFACE cxx_std_20)
    endif ()
 endmacro()
--- a/src/thrust/model.cmake
+++ b/src/thrust/model.cmake
@ -46,11 +46,12 @@ macro(setup)
        # see CUDA.cmake, we're only adding a few Thrust related libraries here
        if (POLICY CMP0104)
-            cmake_policy(SET CMP0104 OLD)
+            cmake_policy(SET CMP0104 NEW)
        endif ()
        set(CMAKE_CUDA_ARCHITECTURES  ${CUDA_ARCH})
        # add -forward-unknown-to-host-compiler for compatibility reasons
-        set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "--expt-extended-lambda -forward-unknown-to-host-compiler -arch=${CUDA_ARCH}" ${CUDA_EXTRA_FLAGS})
+        set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "--expt-extended-lambda " ${CUDA_EXTRA_FLAGS})
        enable_language(CUDA)
        # CMake defaults to -O2 for CUDA at Release, let's wipe that and use the global RELEASE_FLAG
        # appended later
@ -63,6 +64,7 @@ macro(setup)
        # XXX NVHPC >= 22.3 has cub-config in `Linux_x86_64/22.3/cuda/11.6/lib64/cmake/cub/`
        # same thing for thrust
        if (SDK_DIR)
            list(APPEND CMAKE_PREFIX_PATH ${SDK_DIR})
            find_package(CUB REQUIRED CONFIG PATHS ${SDK_DIR}/cub)
            find_package(Thrust REQUIRED CONFIG PATHS ${SDK_DIR}/thrust)
        else ()
@ -73,9 +75,11 @@ macro(setup)
        message(STATUS "Using Thrust backend: ${BACKEND}")
        # this creates the interface that we can link to
-        thrust_create_target(Thrust HOST CPP DEVICE ${BACKEND})
+        thrust_create_target(Thrust${BACKEND}
                HOST CPP
                DEVICE ${BACKEND})
-        register_link_library(Thrust)
+        register_link_library(Thrust${BACKEND})
    elseif (${THRUST_IMPL} STREQUAL "ROCM")
        if (SDK_DIR)
            find_package(rocprim REQUIRED CONFIG PATHS ${SDK_DIR}/rocprim)