Merge pull request #91 from UoB-HPC/cmake

CMake+CI integration
2021-04-08 16:18:38 +01:00 · 2021-04-08 16:18:38 +01:00 · 9642821afa
commit 9642821afa
parent 3c637cd04d f279f58a72
46 changed files with 10040 additions and 32 deletions
--- a/.github/workflows/main.yaml
+++ b/.github/workflows/main.yaml
@ -0,0 +1,98 @@
 name: CI
 on: [push, pull_request]
 jobs:
  test:
    runs-on: ubuntu-18.04
    steps:
      - uses: actions/checkout@v2
      - name: Cache compiler
        id: prepare-compilers
        uses: actions/cache@v2
        with:
          path: compilers
          key: ${{ runner.os }}-${{ hashFiles('ci-prepare-bionic.sh') }}
      - name: Prepare compilers
        if: steps.prepare-compilers.outputs.cache-hit != 'true'
        run: source ./ci-prepare-bionic.sh ./compilers SETUP true || true
      - name: Setup test environment
        run: source ./ci-prepare-bionic.sh ./compilers VARS false || true
      - name: Test compile gcc     @ CMake 3.13
        if: ${{ ! cancelled() }}
        run: ./ci-test-compile.sh ./build gcc all ${{ env.CMAKE_3_13_BIN }}
      - name: Test compile clang   @ CMake 3.13
        if: ${{ ! cancelled() }}
        run: ./ci-test-compile.sh ./build clang all ${{ env.CMAKE_3_13_BIN }}
      - name: Test compile nvhpc   @ CMake 3.13
        if: ${{ ! cancelled() }}
        run: ./ci-test-compile.sh ./build nvhpc all ${{ env.CMAKE_3_13_BIN }}
      - name: Test compile aocc    @ CMake 3.13
        if: ${{ ! cancelled() }}
        run: ./ci-test-compile.sh ./build aocc all ${{ env.CMAKE_3_13_BIN }}
      - name: Test compile aomp    @ CMake 3.13
        if: ${{ ! cancelled() }}
        run: ./ci-test-compile.sh ./build aomp all ${{ env.CMAKE_3_13_BIN }}
      - name: Test compile hip     @ CMake 3.13
        if: ${{ ! cancelled() }}
        run: ./ci-test-compile.sh ./build hip all ${{ env.CMAKE_3_13_BIN }}
      - name: Test compile dpcpp   @ CMake 3.13
        if: ${{ ! cancelled() }}
        run: ./ci-test-compile.sh ./build dpcpp all ${{ env.CMAKE_3_13_BIN }}
      - name: Test compile hipsycl @ CMake 3.13
        if: ${{ ! cancelled() }}
        run: ./ci-test-compile.sh ./build hipsycl all ${{ env.CMAKE_3_13_BIN }}
      - name: Test compile gcc     @ CMake 3.15
        if: ${{ ! cancelled() }}
        run: ./ci-test-compile.sh ./build gcc all ${{ env.CMAKE_3_15_BIN }}
      - name: Test compile clang   @ CMake 3.15
        if: ${{ ! cancelled() }}
        run: ./ci-test-compile.sh ./build clang all ${{ env.CMAKE_3_15_BIN }}
      - name: Test compile nvhpc   @ CMake 3.15
        if: ${{ ! cancelled() }}
        run: ./ci-test-compile.sh ./build nvhpc all ${{ env.CMAKE_3_15_BIN }}
      - name: Test compile aocc    @ CMake 3.15
        if: ${{ ! cancelled() }}
        run: ./ci-test-compile.sh ./build aocc all ${{ env.CMAKE_3_15_BIN }}
      - name: Test compile aomp    @ CMake 3.15
        if: ${{ ! cancelled() }}
        run: ./ci-test-compile.sh ./build aomp all ${{ env.CMAKE_3_15_BIN }}
      - name: Test compile hip     @ CMake 3.15
        if: ${{ ! cancelled() }}
        run: ./ci-test-compile.sh ./build hip all ${{ env.CMAKE_3_15_BIN }}
      - name: Test compile dpcpp   @ CMake 3.15
        if: ${{ ! cancelled() }}
        run: ./ci-test-compile.sh ./build dpcpp all ${{ env.CMAKE_3_15_BIN }}
      - name: Test compile hipsycl @ CMake 3.15
        if: ${{ ! cancelled() }}
        run: ./ci-test-compile.sh ./build hipsycl all ${{ env.CMAKE_3_15_BIN }}
      - name: Test compile gcc     @ CMake 3.18
        if: ${{ ! cancelled() }}
        run: ./ci-test-compile.sh ./build gcc all ${{ env.CMAKE_3_18_BIN }}
      - name: Test compile clang   @ CMake 3.18
        if: ${{ ! cancelled() }}
        run: ./ci-test-compile.sh ./build clang all ${{ env.CMAKE_3_18_BIN }}
      - name: Test compile nvhpc   @ CMake 3.18
        if: ${{ ! cancelled() }}
        run: ./ci-test-compile.sh ./build nvhpc all ${{ env.CMAKE_3_18_BIN }}
      - name: Test compile aocc    @ CMake 3.18
        if: ${{ ! cancelled() }}
        run: ./ci-test-compile.sh ./build aocc all ${{ env.CMAKE_3_18_BIN }}
      - name: Test compile aomp    @ CMake 3.18
        if: ${{ ! cancelled() }}
        run: ./ci-test-compile.sh ./build aomp all ${{ env.CMAKE_3_18_BIN }}
      - name: Test compile hip     @ CMake 3.18
        if: ${{ ! cancelled() }}
        run: ./ci-test-compile.sh ./build hip all ${{ env.CMAKE_3_18_BIN }}
      - name: Test compile dpcpp   @ CMake 3.18
        if: ${{ ! cancelled() }}
        run: ./ci-test-compile.sh ./build dpcpp all ${{ env.CMAKE_3_18_BIN }}
      - name: Test compile hipsycl @ CMake 3.18
        if: ${{ ! cancelled() }}
        run: ./ci-test-compile.sh ./build hipsycl all ${{ env.CMAKE_3_18_BIN }}
--- a/.gitignore
+++ b/.gitignore
@ -21,3 +21,10 @@ KokkosCore_config.*
 .DS_Store
 Makefile
 build/
 cmake-build-*/
 CMakeFiles/
 .idea/
 .vscode/
 .directory
--- a/ACC.cmake
+++ b/ACC.cmake
@ -0,0 +1,78 @@
 register_flag_optional(CMAKE_CXX_COMPILER
        "Any CXX compiler that supports OpenACC as per CMake detection"
        "c++")
 register_flag_optional(TARGET_DEVICE
        "[PGI/NVHPC only] This sets the `-target` flag, possible values are:
             gpu       - Globally set the target device to an NVIDIA GPU
             multicore - Globally set the target device to the host CPU
         Refer to `nvc++ --help` for the full list"
        "")
 register_flag_optional(CUDA_ARCH
        "[PGI/NVHPC only] Only applicable if `TARGET_DEVICE` is set to `gpu`.
         Nvidia architecture in ccXY format, for example, sm_70 becomes cc70, will be passed in via `-gpu=` (e.g `cc70`)
         Possible values are:
             cc35  - Compile for compute capability 3.5
             cc50  - Compile for compute capability 5.0
             cc60  - Compile for compute capability 6.0
             cc62  - Compile for compute capability 6.2
             cc70  - Compile for compute capability 7.0
             cc72  - Compile for compute capability 7.2
             cc75  - Compile for compute capability 7.5
             cc80  - Compile for compute capability 8.0
             ccall - Compile for all supported compute capabilities
         Refer to `nvc++ --help` for the full list"
        "")
 register_flag_optional(TARGET_PROCESSOR
        "[PGI/NVHPC only] This sets the `-tp` (target processor) flag, possible values are:
             px          - Generic x86 Processor
             bulldozer   - AMD Bulldozer processor
             piledriver  - AMD Piledriver processor
             zen         - AMD Zen architecture (Epyc, Ryzen)
             zen2        - AMD Zen 2 architecture (Ryzen 2)
             sandybridge - Intel SandyBridge processor
             haswell     - Intel Haswell processor
             knl         - Intel Knights Landing processor
             skylake     - Intel Skylake Xeon processor
             host        - Link native version of HPC SDK cpu math library
             native      - Alias for -tp host
        Refer to `nvc++ --help` for the full list"
        "")
 macro(setup)
    find_package(OpenACC REQUIRED)
    if(${CMAKE_VERSION} VERSION_LESS "3.16.0")
        # CMake didn't really implement ACC as a target before 3.16, so we append them manually
        separate_arguments(OpenACC_CXX_FLAGS)
        register_append_cxx_flags(ANY ${OpenACC_CXX_FLAGS})
        register_append_link_flags(${OpenACC_CXX_FLAGS})
    else()
        register_link_library(OpenACC::OpenACC_CXX)
    endif()
    register_definitions(restrict=__restrict)
    # XXX NVHPC is really new so older Cmake thinks it's PGI, which is true
    if ((CMAKE_CXX_COMPILER_ID STREQUAL PGI) OR (CMAKE_CXX_COMPILER_ID STREQUAL NVHPC))
        if (TARGET_DEVICE)
            register_append_cxx_flags(ANY -target=${TARGET_DEVICE})
        endif ()
        if (CUDA_ARCH)
            register_append_cxx_flags(ANY -gpu=${CUDA_ARCH})
        endif ()
        if (TARGET_PROCESSOR)
            register_append_cxx_flags(ANY -tp=${TARGET_PROCESSOR})
        endif ()
    endif ()
 endmacro()
--- a/ACCStream.cpp
+++ b/ACCStream.cpp
@ -16,9 +16,14 @@ ACCStream<T>::ACCStream(const int ARRAY_SIZE, int device)
  array_size = ARRAY_SIZE;
  // Set up data region on device
-  a = new T[array_size];
+  this->a = new T[array_size];
-  b = new T[array_size];
+  this->b = new T[array_size];
-  c = new T[array_size];
+  this->c = new T[array_size];
  T * restrict a = this->a;
  T * restrict b = this->b;
  T * restrict c = this->c;
  #pragma acc enter data create(a[0:array_size], b[0:array_size], c[0:array_size])
  {}
 }
@ -28,6 +33,11 @@ ACCStream<T>::~ACCStream()
 {
  // End data region on device
  int array_size = this->array_size;
  T * restrict a = this->a;
  T * restrict b = this->b;
  T * restrict c = this->c;
  #pragma acc exit data delete(a[0:array_size], b[0:array_size], c[0:array_size])
  {}
--- a/ACCStream.h
+++ b/ACCStream.h
@ -19,9 +19,17 @@
 template <class T>
 class ACCStream : public Stream<T>
 {
 	struct A{
 		T *a;
 		T *b;
 		T *c;
 	};
  protected:
    // Size of arrays
    int array_size;
    A aa;
    // Device side pointers
    T *a;
    T *b;
--- a/CL/cl.h
+++ b/CL/cl.h
--- a/CL/cl_d3d10.h
+++ b/CL/cl_d3d10.h
@ -0,0 +1,117 @@
 /*******************************************************************************
 * Copyright (c) 2008-2020 The Khronos Group Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
 #ifndef __OPENCL_CL_D3D10_H
 #define __OPENCL_CL_D3D10_H
 #include <d3d10.h>
 #include <CL/cl.h>
 #include <CL/cl_platform.h>
 #ifdef __cplusplus
 extern "C" {
 #endif
 /******************************************************************************
 * cl_khr_d3d10_sharing                                                       */
 #define cl_khr_d3d10_sharing 1
 typedef cl_uint cl_d3d10_device_source_khr;
 typedef cl_uint cl_d3d10_device_set_khr;
 /******************************************************************************/
 /* Error Codes */
 #define CL_INVALID_D3D10_DEVICE_KHR                  -1002
 #define CL_INVALID_D3D10_RESOURCE_KHR                -1003
 #define CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR       -1004
 #define CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR           -1005
 /* cl_d3d10_device_source_nv */
 #define CL_D3D10_DEVICE_KHR                          0x4010
 #define CL_D3D10_DXGI_ADAPTER_KHR                    0x4011
 /* cl_d3d10_device_set_nv */
 #define CL_PREFERRED_DEVICES_FOR_D3D10_KHR           0x4012
 #define CL_ALL_DEVICES_FOR_D3D10_KHR                 0x4013
 /* cl_context_info */
 #define CL_CONTEXT_D3D10_DEVICE_KHR                  0x4014
 #define CL_CONTEXT_D3D10_PREFER_SHARED_RESOURCES_KHR 0x402C
 /* cl_mem_info */
 #define CL_MEM_D3D10_RESOURCE_KHR                    0x4015
 /* cl_image_info */
 #define CL_IMAGE_D3D10_SUBRESOURCE_KHR               0x4016
 /* cl_command_type */
 #define CL_COMMAND_ACQUIRE_D3D10_OBJECTS_KHR         0x4017
 #define CL_COMMAND_RELEASE_D3D10_OBJECTS_KHR         0x4018
 /******************************************************************************/
 typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromD3D10KHR_fn)(
    cl_platform_id             platform,
    cl_d3d10_device_source_khr d3d_device_source,
    void *                     d3d_object,
    cl_d3d10_device_set_khr    d3d_device_set,
    cl_uint                    num_entries,
    cl_device_id *             devices,
    cl_uint *                  num_devices) CL_API_SUFFIX__VERSION_1_0;
 typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10BufferKHR_fn)(
    cl_context     context,
    cl_mem_flags   flags,
    ID3D10Buffer * resource,
    cl_int *       errcode_ret) CL_API_SUFFIX__VERSION_1_0;
 typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10Texture2DKHR_fn)(
    cl_context        context,
    cl_mem_flags      flags,
    ID3D10Texture2D * resource,
    UINT              subresource,
    cl_int *          errcode_ret) CL_API_SUFFIX__VERSION_1_0;
 typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10Texture3DKHR_fn)(
    cl_context        context,
    cl_mem_flags      flags,
    ID3D10Texture3D * resource,
    UINT              subresource,
    cl_int *          errcode_ret) CL_API_SUFFIX__VERSION_1_0;
 typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D10ObjectsKHR_fn)(
    cl_command_queue command_queue,
    cl_uint          num_objects,
    const cl_mem *   mem_objects,
    cl_uint          num_events_in_wait_list,
    const cl_event * event_wait_list,
    cl_event *       event) CL_API_SUFFIX__VERSION_1_0;
 typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D10ObjectsKHR_fn)(
    cl_command_queue command_queue,
    cl_uint          num_objects,
    const cl_mem *   mem_objects,
    cl_uint          num_events_in_wait_list,
    const cl_event * event_wait_list,
    cl_event *       event) CL_API_SUFFIX__VERSION_1_0;
 #ifdef __cplusplus
 }
 #endif
 #endif  /* __OPENCL_CL_D3D10_H */
--- a/CL/cl_d3d11.h
+++ b/CL/cl_d3d11.h
@ -0,0 +1,117 @@
 /*******************************************************************************
 * Copyright (c) 2008-2020 The Khronos Group Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
 #ifndef __OPENCL_CL_D3D11_H
 #define __OPENCL_CL_D3D11_H
 #include <d3d11.h>
 #include <CL/cl.h>
 #include <CL/cl_platform.h>
 #ifdef __cplusplus
 extern "C" {
 #endif
 /******************************************************************************
 * cl_khr_d3d11_sharing                                                       */
 #define cl_khr_d3d11_sharing 1
 typedef cl_uint cl_d3d11_device_source_khr;
 typedef cl_uint cl_d3d11_device_set_khr;
 /******************************************************************************/
 /* Error Codes */
 #define CL_INVALID_D3D11_DEVICE_KHR                  -1006
 #define CL_INVALID_D3D11_RESOURCE_KHR                -1007
 #define CL_D3D11_RESOURCE_ALREADY_ACQUIRED_KHR       -1008
 #define CL_D3D11_RESOURCE_NOT_ACQUIRED_KHR           -1009
 /* cl_d3d11_device_source */
 #define CL_D3D11_DEVICE_KHR                          0x4019
 #define CL_D3D11_DXGI_ADAPTER_KHR                    0x401A
 /* cl_d3d11_device_set */
 #define CL_PREFERRED_DEVICES_FOR_D3D11_KHR           0x401B
 #define CL_ALL_DEVICES_FOR_D3D11_KHR                 0x401C
 /* cl_context_info */
 #define CL_CONTEXT_D3D11_DEVICE_KHR                  0x401D
 #define CL_CONTEXT_D3D11_PREFER_SHARED_RESOURCES_KHR 0x402D
 /* cl_mem_info */
 #define CL_MEM_D3D11_RESOURCE_KHR                    0x401E
 /* cl_image_info */
 #define CL_IMAGE_D3D11_SUBRESOURCE_KHR               0x401F
 /* cl_command_type */
 #define CL_COMMAND_ACQUIRE_D3D11_OBJECTS_KHR         0x4020
 #define CL_COMMAND_RELEASE_D3D11_OBJECTS_KHR         0x4021
 /******************************************************************************/
 typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromD3D11KHR_fn)(
    cl_platform_id             platform,
    cl_d3d11_device_source_khr d3d_device_source,
    void *                     d3d_object,
    cl_d3d11_device_set_khr    d3d_device_set,
    cl_uint                    num_entries,
    cl_device_id *             devices,
    cl_uint *                  num_devices) CL_API_SUFFIX__VERSION_1_2;
 typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11BufferKHR_fn)(
    cl_context     context,
    cl_mem_flags   flags,
    ID3D11Buffer * resource,
    cl_int *       errcode_ret) CL_API_SUFFIX__VERSION_1_2;
 typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11Texture2DKHR_fn)(
    cl_context        context,
    cl_mem_flags      flags,
    ID3D11Texture2D * resource,
    UINT              subresource,
    cl_int *          errcode_ret) CL_API_SUFFIX__VERSION_1_2;
 typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11Texture3DKHR_fn)(
    cl_context        context,
    cl_mem_flags      flags,
    ID3D11Texture3D * resource,
    UINT              subresource,
    cl_int *          errcode_ret) CL_API_SUFFIX__VERSION_1_2;
 typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D11ObjectsKHR_fn)(
    cl_command_queue command_queue,
    cl_uint          num_objects,
    const cl_mem *   mem_objects,
    cl_uint          num_events_in_wait_list,
    const cl_event * event_wait_list,
    cl_event *       event) CL_API_SUFFIX__VERSION_1_2;
 typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D11ObjectsKHR_fn)(
    cl_command_queue command_queue,
    cl_uint          num_objects,
    const cl_mem *   mem_objects,
    cl_uint          num_events_in_wait_list,
    const cl_event * event_wait_list,
    cl_event *       event) CL_API_SUFFIX__VERSION_1_2;
 #ifdef __cplusplus
 }
 #endif
 #endif  /* __OPENCL_CL_D3D11_H */
--- a/CL/cl_dx9_media_sharing.h
+++ b/CL/cl_dx9_media_sharing.h
@ -0,0 +1,118 @@
 /*******************************************************************************
 * Copyright (c) 2008-2020 The Khronos Group Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
 #ifndef __OPENCL_CL_DX9_MEDIA_SHARING_H
 #define __OPENCL_CL_DX9_MEDIA_SHARING_H
 #include <CL/cl.h>
 #include <CL/cl_platform.h>
 #ifdef __cplusplus
 extern "C" {
 #endif
 /******************************************************************************/
 /* cl_khr_dx9_media_sharing                                                   */
 #define cl_khr_dx9_media_sharing 1
 typedef cl_uint             cl_dx9_media_adapter_type_khr;
 typedef cl_uint             cl_dx9_media_adapter_set_khr;
 #if defined(_WIN32)
 #include <d3d9.h>
 typedef struct _cl_dx9_surface_info_khr
 {
    IDirect3DSurface9 *resource;
    HANDLE shared_handle;
 } cl_dx9_surface_info_khr;
 #endif
 /******************************************************************************/
 /* Error Codes */
 #define CL_INVALID_DX9_MEDIA_ADAPTER_KHR                -1010
 #define CL_INVALID_DX9_MEDIA_SURFACE_KHR                -1011
 #define CL_DX9_MEDIA_SURFACE_ALREADY_ACQUIRED_KHR       -1012
 #define CL_DX9_MEDIA_SURFACE_NOT_ACQUIRED_KHR           -1013
 /* cl_media_adapter_type_khr */
 #define CL_ADAPTER_D3D9_KHR                              0x2020
 #define CL_ADAPTER_D3D9EX_KHR                            0x2021
 #define CL_ADAPTER_DXVA_KHR                              0x2022
 /* cl_media_adapter_set_khr */
 #define CL_PREFERRED_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR   0x2023
 #define CL_ALL_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR         0x2024
 /* cl_context_info */
 #define CL_CONTEXT_ADAPTER_D3D9_KHR                      0x2025
 #define CL_CONTEXT_ADAPTER_D3D9EX_KHR                    0x2026
 #define CL_CONTEXT_ADAPTER_DXVA_KHR                      0x2027
 /* cl_mem_info */
 #define CL_MEM_DX9_MEDIA_ADAPTER_TYPE_KHR                0x2028
 #define CL_MEM_DX9_MEDIA_SURFACE_INFO_KHR                0x2029
 /* cl_image_info */
 #define CL_IMAGE_DX9_MEDIA_PLANE_KHR                     0x202A
 /* cl_command_type */
 #define CL_COMMAND_ACQUIRE_DX9_MEDIA_SURFACES_KHR        0x202B
 #define CL_COMMAND_RELEASE_DX9_MEDIA_SURFACES_KHR        0x202C
 /******************************************************************************/
 typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromDX9MediaAdapterKHR_fn)(
    cl_platform_id                   platform,
    cl_uint                          num_media_adapters,
    cl_dx9_media_adapter_type_khr *  media_adapter_type,
    void *                           media_adapters,
    cl_dx9_media_adapter_set_khr     media_adapter_set,
    cl_uint                          num_entries,
    cl_device_id *                   devices,
    cl_uint *                        num_devices) CL_API_SUFFIX__VERSION_1_2;
 typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromDX9MediaSurfaceKHR_fn)(
    cl_context                    context,
    cl_mem_flags                  flags,
    cl_dx9_media_adapter_type_khr adapter_type,
    void *                        surface_info,
    cl_uint                       plane,                                                                          
    cl_int *                      errcode_ret) CL_API_SUFFIX__VERSION_1_2;
 typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireDX9MediaSurfacesKHR_fn)(
    cl_command_queue command_queue,
    cl_uint          num_objects,
    const cl_mem *   mem_objects,
    cl_uint          num_events_in_wait_list,
    const cl_event * event_wait_list,
    cl_event *       event) CL_API_SUFFIX__VERSION_1_2;
 typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseDX9MediaSurfacesKHR_fn)(
    cl_command_queue command_queue,
    cl_uint          num_objects,
    const cl_mem *   mem_objects,
    cl_uint          num_events_in_wait_list,
    const cl_event * event_wait_list,
    cl_event *       event) CL_API_SUFFIX__VERSION_1_2;
 #ifdef __cplusplus
 }
 #endif
 #endif  /* __OPENCL_CL_DX9_MEDIA_SHARING_H */
--- a/CL/cl_dx9_media_sharing_intel.h
+++ b/CL/cl_dx9_media_sharing_intel.h
@ -0,0 +1,170 @@
 /*******************************************************************************
 * Copyright (c) 2008-2020 The Khronos Group Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
 /*****************************************************************************\
 Copyright (c) 2013-2019 Intel Corporation All Rights Reserved.
 THESE MATERIALS ARE PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
 CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THESE
 MATERIALS, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 File Name: cl_dx9_media_sharing_intel.h
 Abstract:
 Notes:
 \*****************************************************************************/
 #ifndef __OPENCL_CL_DX9_MEDIA_SHARING_INTEL_H
 #define __OPENCL_CL_DX9_MEDIA_SHARING_INTEL_H
 #include <CL/cl.h>
 #include <CL/cl_platform.h>
 #include <d3d9.h>
 #include <dxvahd.h>
 #include <wtypes.h>
 #include <d3d9types.h>
 #ifdef __cplusplus
 extern "C" {
 #endif
 /***************************************
 * cl_intel_dx9_media_sharing extension *
 ****************************************/
 #define cl_intel_dx9_media_sharing 1
 typedef cl_uint cl_dx9_device_source_intel;
 typedef cl_uint cl_dx9_device_set_intel;
 /* error codes */
 #define CL_INVALID_DX9_DEVICE_INTEL                   -1010
 #define CL_INVALID_DX9_RESOURCE_INTEL                 -1011
 #define CL_DX9_RESOURCE_ALREADY_ACQUIRED_INTEL        -1012
 #define CL_DX9_RESOURCE_NOT_ACQUIRED_INTEL            -1013
 /* cl_dx9_device_source_intel */
 #define CL_D3D9_DEVICE_INTEL                          0x4022
 #define CL_D3D9EX_DEVICE_INTEL                        0x4070
 #define CL_DXVA_DEVICE_INTEL                          0x4071
 /* cl_dx9_device_set_intel */
 #define CL_PREFERRED_DEVICES_FOR_DX9_INTEL            0x4024
 #define CL_ALL_DEVICES_FOR_DX9_INTEL                  0x4025
 /* cl_context_info */
 #define CL_CONTEXT_D3D9_DEVICE_INTEL                  0x4026
 #define CL_CONTEXT_D3D9EX_DEVICE_INTEL                0x4072
 #define CL_CONTEXT_DXVA_DEVICE_INTEL                  0x4073
 /* cl_mem_info */
 #define CL_MEM_DX9_RESOURCE_INTEL                     0x4027
 #define CL_MEM_DX9_SHARED_HANDLE_INTEL                0x4074
 /* cl_image_info */
 #define CL_IMAGE_DX9_PLANE_INTEL                      0x4075
 /* cl_command_type */
 #define CL_COMMAND_ACQUIRE_DX9_OBJECTS_INTEL          0x402A
 #define CL_COMMAND_RELEASE_DX9_OBJECTS_INTEL          0x402B
 /******************************************************************************/
 extern CL_API_ENTRY cl_int CL_API_CALL
 clGetDeviceIDsFromDX9INTEL(
    cl_platform_id              platform,
    cl_dx9_device_source_intel  dx9_device_source,
    void*                       dx9_object,
    cl_dx9_device_set_intel     dx9_device_set,
    cl_uint                     num_entries,
    cl_device_id*               devices,
    cl_uint*                    num_devices) CL_EXT_SUFFIX__VERSION_1_1;
 typedef CL_API_ENTRY cl_int (CL_API_CALL* clGetDeviceIDsFromDX9INTEL_fn)(
    cl_platform_id              platform,
    cl_dx9_device_source_intel  dx9_device_source,
    void*                       dx9_object,
    cl_dx9_device_set_intel     dx9_device_set,
    cl_uint                     num_entries,
    cl_device_id*               devices,
    cl_uint*                    num_devices) CL_EXT_SUFFIX__VERSION_1_1;
 extern CL_API_ENTRY cl_mem CL_API_CALL
 clCreateFromDX9MediaSurfaceINTEL(
    cl_context                  context,
    cl_mem_flags                flags,
    IDirect3DSurface9*          resource,
    HANDLE                      sharedHandle,
    UINT                        plane,
    cl_int*                     errcode_ret) CL_EXT_SUFFIX__VERSION_1_1;
 typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromDX9MediaSurfaceINTEL_fn)(
    cl_context                  context,
    cl_mem_flags                flags,
    IDirect3DSurface9*          resource,
    HANDLE                      sharedHandle,
    UINT                        plane,
    cl_int*                     errcode_ret) CL_EXT_SUFFIX__VERSION_1_1;
 extern CL_API_ENTRY cl_int CL_API_CALL
 clEnqueueAcquireDX9ObjectsINTEL(
    cl_command_queue            command_queue,
    cl_uint                     num_objects,
    const cl_mem*               mem_objects,
    cl_uint                     num_events_in_wait_list,
    const cl_event*             event_wait_list,
    cl_event*                   event) CL_EXT_SUFFIX__VERSION_1_1;
 typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireDX9ObjectsINTEL_fn)(
    cl_command_queue            command_queue,
    cl_uint                     num_objects,
    const cl_mem*               mem_objects,
    cl_uint                     num_events_in_wait_list,
    const cl_event*             event_wait_list,
    cl_event*                   event) CL_EXT_SUFFIX__VERSION_1_1;
 extern CL_API_ENTRY cl_int CL_API_CALL
 clEnqueueReleaseDX9ObjectsINTEL(
    cl_command_queue            command_queue,
    cl_uint                     num_objects,
    cl_mem*                     mem_objects,
    cl_uint                     num_events_in_wait_list,
    const cl_event*             event_wait_list,
    cl_event*                   event) CL_EXT_SUFFIX__VERSION_1_1;
 typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseDX9ObjectsINTEL_fn)(
    cl_command_queue            command_queue,
    cl_uint                     num_objects,
    cl_mem*                     mem_objects,
    cl_uint                     num_events_in_wait_list,
    const cl_event*             event_wait_list,
    cl_event*                   event) CL_EXT_SUFFIX__VERSION_1_1;
 #ifdef __cplusplus
 }
 #endif
 #endif  /* __OPENCL_CL_DX9_MEDIA_SHARING_INTEL_H */
--- a/CL/cl_egl.h
+++ b/CL/cl_egl.h
@ -0,0 +1,120 @@
 /*******************************************************************************
 * Copyright (c) 2008-2020 The Khronos Group Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
 #ifndef __OPENCL_CL_EGL_H
 #define __OPENCL_CL_EGL_H
 #include <CL/cl.h>
 #ifdef __cplusplus
 extern "C" {
 #endif
 /* Command type for events created with clEnqueueAcquireEGLObjectsKHR */
 #define CL_COMMAND_EGL_FENCE_SYNC_OBJECT_KHR  0x202F
 #define CL_COMMAND_ACQUIRE_EGL_OBJECTS_KHR    0x202D
 #define CL_COMMAND_RELEASE_EGL_OBJECTS_KHR    0x202E
 /* Error type for clCreateFromEGLImageKHR */
 #define CL_INVALID_EGL_OBJECT_KHR             -1093
 #define CL_EGL_RESOURCE_NOT_ACQUIRED_KHR      -1092
 /* CLeglImageKHR is an opaque handle to an EGLImage */
 typedef void* CLeglImageKHR;
 /* CLeglDisplayKHR is an opaque handle to an EGLDisplay */
 typedef void* CLeglDisplayKHR;
 /* CLeglSyncKHR is an opaque handle to an EGLSync object */
 typedef void* CLeglSyncKHR;
 /* properties passed to clCreateFromEGLImageKHR */
 typedef intptr_t cl_egl_image_properties_khr;
 #define cl_khr_egl_image 1
 extern CL_API_ENTRY cl_mem CL_API_CALL
 clCreateFromEGLImageKHR(cl_context                  context,
                        CLeglDisplayKHR             egldisplay,
                        CLeglImageKHR               eglimage,
                        cl_mem_flags                flags,
                        const cl_egl_image_properties_khr * properties,
                        cl_int *                    errcode_ret) CL_API_SUFFIX__VERSION_1_0;
 typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromEGLImageKHR_fn)(
    cl_context                  context,
    CLeglDisplayKHR             egldisplay,
    CLeglImageKHR               eglimage,
    cl_mem_flags                flags,
    const cl_egl_image_properties_khr * properties,
    cl_int *                    errcode_ret);
 extern CL_API_ENTRY cl_int CL_API_CALL
 clEnqueueAcquireEGLObjectsKHR(cl_command_queue command_queue,
                              cl_uint          num_objects,
                              const cl_mem *   mem_objects,
                              cl_uint          num_events_in_wait_list,
                              const cl_event * event_wait_list,
                              cl_event *       event) CL_API_SUFFIX__VERSION_1_0;
 typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireEGLObjectsKHR_fn)(
    cl_command_queue command_queue,
    cl_uint          num_objects,
    const cl_mem *   mem_objects,
    cl_uint          num_events_in_wait_list,
    const cl_event * event_wait_list,
    cl_event *       event);
 extern CL_API_ENTRY cl_int CL_API_CALL
 clEnqueueReleaseEGLObjectsKHR(cl_command_queue command_queue,
                              cl_uint          num_objects,
                              const cl_mem *   mem_objects,
                              cl_uint          num_events_in_wait_list,
                              const cl_event * event_wait_list,
                              cl_event *       event) CL_API_SUFFIX__VERSION_1_0;
 typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseEGLObjectsKHR_fn)(
    cl_command_queue command_queue,
    cl_uint          num_objects,
    const cl_mem *   mem_objects,
    cl_uint          num_events_in_wait_list,
    const cl_event * event_wait_list,
    cl_event *       event);
 #define cl_khr_egl_event 1
 extern CL_API_ENTRY cl_event CL_API_CALL
 clCreateEventFromEGLSyncKHR(cl_context      context,
                            CLeglSyncKHR    sync,
                            CLeglDisplayKHR display,
                            cl_int *        errcode_ret) CL_API_SUFFIX__VERSION_1_0;
 typedef CL_API_ENTRY cl_event (CL_API_CALL *clCreateEventFromEGLSyncKHR_fn)(
    cl_context      context,
    CLeglSyncKHR    sync,
    CLeglDisplayKHR display,
    cl_int *        errcode_ret);
 #ifdef __cplusplus
 }
 #endif
 #endif /* __OPENCL_CL_EGL_H */
--- a/CL/cl_ext.h
+++ b/CL/cl_ext.h
@ -0,0 +1,841 @@
 /*******************************************************************************
 * Copyright (c) 2008-2020 The Khronos Group Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
 /* cl_ext.h contains OpenCL extensions which don't have external */
 /* (OpenGL, D3D) dependencies.                                   */
 #ifndef __CL_EXT_H
 #define __CL_EXT_H
 #ifdef __cplusplus
 extern "C" {
 #endif
 #include <CL/cl.h>
 /* cl_khr_fp64 extension - no extension #define since it has no functions  */
 /* CL_DEVICE_DOUBLE_FP_CONFIG is defined in CL.h for OpenCL >= 120 */
 #if CL_TARGET_OPENCL_VERSION <= 110
 #define CL_DEVICE_DOUBLE_FP_CONFIG                       0x1032
 #endif
 /* cl_khr_fp16 extension - no extension #define since it has no functions  */
 #define CL_DEVICE_HALF_FP_CONFIG                    0x1033
 /* Memory object destruction
 *
 * Apple extension for use to manage externally allocated buffers used with cl_mem objects with CL_MEM_USE_HOST_PTR
 *
 * Registers a user callback function that will be called when the memory object is deleted and its resources
 * freed. Each call to clSetMemObjectCallbackFn registers the specified user callback function on a callback
 * stack associated with memobj. The registered user callback functions are called in the reverse order in
 * which they were registered. The user callback functions are called and then the memory object is deleted
 * and its resources freed. This provides a mechanism for the application (and libraries) using memobj to be
 * notified when the memory referenced by host_ptr, specified when the memory object is created and used as
 * the storage bits for the memory object, can be reused or freed.
 *
 * The application may not call CL api's with the cl_mem object passed to the pfn_notify.
 *
 * Please check for the "cl_APPLE_SetMemObjectDestructor" extension using clGetDeviceInfo(CL_DEVICE_EXTENSIONS)
 * before using.
 */
 #define cl_APPLE_SetMemObjectDestructor 1
 cl_int  CL_API_ENTRY clSetMemObjectDestructorAPPLE(  cl_mem memobj,
                                        void (* pfn_notify)(cl_mem memobj, void * user_data),
                                        void * user_data)             CL_EXT_SUFFIX__VERSION_1_0;
 /* Context Logging Functions
 *
 * The next three convenience functions are intended to be used as the pfn_notify parameter to clCreateContext().
 * Please check for the "cl_APPLE_ContextLoggingFunctions" extension using clGetDeviceInfo(CL_DEVICE_EXTENSIONS)
 * before using.
 *
 * clLogMessagesToSystemLog forwards on all log messages to the Apple System Logger
 */
 #define cl_APPLE_ContextLoggingFunctions 1
 extern void CL_API_ENTRY clLogMessagesToSystemLogAPPLE(  const char * errstr,
                                            const void * private_info,
                                            size_t       cb,
                                            void *       user_data)  CL_EXT_SUFFIX__VERSION_1_0;
 /* clLogMessagesToStdout sends all log messages to the file descriptor stdout */
 extern void CL_API_ENTRY clLogMessagesToStdoutAPPLE(   const char * errstr,
                                          const void * private_info,
                                          size_t       cb,
                                          void *       user_data)    CL_EXT_SUFFIX__VERSION_1_0;
 /* clLogMessagesToStderr sends all log messages to the file descriptor stderr */
 extern void CL_API_ENTRY clLogMessagesToStderrAPPLE(   const char * errstr,
                                          const void * private_info,
                                          size_t       cb,
                                          void *       user_data)    CL_EXT_SUFFIX__VERSION_1_0;
 /************************
 * cl_khr_icd extension *
 ************************/
 #define cl_khr_icd 1
 /* cl_platform_info                                                        */
 #define CL_PLATFORM_ICD_SUFFIX_KHR                  0x0920
 /* Additional Error Codes                                                  */
 #define CL_PLATFORM_NOT_FOUND_KHR                   -1001
 extern CL_API_ENTRY cl_int CL_API_CALL
 clIcdGetPlatformIDsKHR(cl_uint          num_entries,
                       cl_platform_id * platforms,
                       cl_uint *        num_platforms);
 typedef CL_API_ENTRY cl_int
 (CL_API_CALL *clIcdGetPlatformIDsKHR_fn)(cl_uint          num_entries,
                                         cl_platform_id * platforms,
                                         cl_uint *        num_platforms);
 /*******************************
 * cl_khr_il_program extension *
 *******************************/
 #define cl_khr_il_program 1
 /* New property to clGetDeviceInfo for retrieving supported intermediate
 * languages
 */
 #define CL_DEVICE_IL_VERSION_KHR                    0x105B
 /* New property to clGetProgramInfo for retrieving for retrieving the IL of a
 * program
 */
 #define CL_PROGRAM_IL_KHR                           0x1169
 extern CL_API_ENTRY cl_program CL_API_CALL
 clCreateProgramWithILKHR(cl_context   context,
                         const void * il,
                         size_t       length,
                         cl_int *     errcode_ret);
 typedef CL_API_ENTRY cl_program
 (CL_API_CALL *clCreateProgramWithILKHR_fn)(cl_context   context,
                                           const void * il,
                                           size_t       length,
                                           cl_int *     errcode_ret) CL_EXT_SUFFIX__VERSION_1_2;
 /* Extension: cl_khr_image2d_from_buffer
 *
 * This extension allows a 2D image to be created from a cl_mem buffer without
 * a copy. The type associated with a 2D image created from a buffer in an
 * OpenCL program is image2d_t. Both the sampler and sampler-less read_image
 * built-in functions are supported for 2D images and 2D images created from
 * a buffer.  Similarly, the write_image built-ins are also supported for 2D
 * images created from a buffer.
 *
 * When the 2D image from buffer is created, the client must specify the
 * width, height, image format (i.e. channel order and channel data type)
 * and optionally the row pitch.
 *
 * The pitch specified must be a multiple of
 * CL_DEVICE_IMAGE_PITCH_ALIGNMENT_KHR pixels.
 * The base address of the buffer must be aligned to
 * CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT_KHR pixels.
 */
 #define CL_DEVICE_IMAGE_PITCH_ALIGNMENT_KHR              0x104A
 #define CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT_KHR       0x104B
 /**************************************
 * cl_khr_initialize_memory extension *
 **************************************/
 #define CL_CONTEXT_MEMORY_INITIALIZE_KHR            0x2030
 /**************************************
 * cl_khr_terminate_context extension *
 **************************************/
 #define CL_CONTEXT_TERMINATED_KHR                   -1121
 #define CL_DEVICE_TERMINATE_CAPABILITY_KHR          0x2031
 #define CL_CONTEXT_TERMINATE_KHR                    0x2032
 #define cl_khr_terminate_context 1
 extern CL_API_ENTRY cl_int CL_API_CALL
 clTerminateContextKHR(cl_context context) CL_EXT_SUFFIX__VERSION_1_2;
 typedef CL_API_ENTRY cl_int
 (CL_API_CALL *clTerminateContextKHR_fn)(cl_context context) CL_EXT_SUFFIX__VERSION_1_2;
 /*
 * Extension: cl_khr_spir
 *
 * This extension adds support to create an OpenCL program object from a
 * Standard Portable Intermediate Representation (SPIR) instance
 */
 #define CL_DEVICE_SPIR_VERSIONS                     0x40E0
 #define CL_PROGRAM_BINARY_TYPE_INTERMEDIATE         0x40E1
 /*****************************************
 * cl_khr_create_command_queue extension *
 *****************************************/
 #define cl_khr_create_command_queue 1
 typedef cl_bitfield cl_queue_properties_khr;
 extern CL_API_ENTRY cl_command_queue CL_API_CALL
 clCreateCommandQueueWithPropertiesKHR(cl_context context,
                                      cl_device_id device,
                                      const cl_queue_properties_khr* properties,
                                      cl_int* errcode_ret) CL_EXT_SUFFIX__VERSION_1_2;
 typedef CL_API_ENTRY cl_command_queue
 (CL_API_CALL *clCreateCommandQueueWithPropertiesKHR_fn)(cl_context context,
                                                        cl_device_id device,
                                                        const cl_queue_properties_khr* properties,
                                                        cl_int* errcode_ret) CL_EXT_SUFFIX__VERSION_1_2;
 /******************************************
 * cl_nv_device_attribute_query extension *
 ******************************************/
 /* cl_nv_device_attribute_query extension - no extension #define since it has no functions */
 #define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV       0x4000
 #define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV       0x4001
 #define CL_DEVICE_REGISTERS_PER_BLOCK_NV            0x4002
 #define CL_DEVICE_WARP_SIZE_NV                      0x4003
 #define CL_DEVICE_GPU_OVERLAP_NV                    0x4004
 #define CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV            0x4005
 #define CL_DEVICE_INTEGRATED_MEMORY_NV              0x4006
 /*********************************
 * cl_amd_device_attribute_query *
 *********************************/
 #define CL_DEVICE_PROFILING_TIMER_OFFSET_AMD            0x4036
 #define CL_DEVICE_TOPOLOGY_AMD                          0x4037
 #define CL_DEVICE_BOARD_NAME_AMD                        0x4038
 #define CL_DEVICE_GLOBAL_FREE_MEMORY_AMD                0x4039
 #define CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD             0x4040
 #define CL_DEVICE_SIMD_WIDTH_AMD                        0x4041
 #define CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD            0x4042
 #define CL_DEVICE_WAVEFRONT_WIDTH_AMD                   0x4043
 #define CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD               0x4044
 #define CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD          0x4045
 #define CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD     0x4046
 #define CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD   0x4047
 #define CL_DEVICE_LOCAL_MEM_BANKS_AMD                   0x4048
 #define CL_DEVICE_THREAD_TRACE_SUPPORTED_AMD            0x4049
 #define CL_DEVICE_GFXIP_MAJOR_AMD                       0x404A
 #define CL_DEVICE_GFXIP_MINOR_AMD                       0x404B
 #define CL_DEVICE_AVAILABLE_ASYNC_QUEUES_AMD            0x404C
 #define CL_DEVICE_PREFERRED_WORK_GROUP_SIZE_AMD         0x4030
 #define CL_DEVICE_MAX_WORK_GROUP_SIZE_AMD               0x4031
 #define CL_DEVICE_PREFERRED_CONSTANT_BUFFER_SIZE_AMD    0x4033
 #define CL_DEVICE_PCIE_ID_AMD                           0x4034
 /*********************************
 * cl_arm_printf extension
 *********************************/
 #define CL_PRINTF_CALLBACK_ARM                      0x40B0
 #define CL_PRINTF_BUFFERSIZE_ARM                    0x40B1
 /***********************************
 * cl_ext_device_fission extension
 ***********************************/
 #define cl_ext_device_fission   1
 extern CL_API_ENTRY cl_int CL_API_CALL
 clReleaseDeviceEXT(cl_device_id device) CL_EXT_SUFFIX__VERSION_1_1;
 typedef CL_API_ENTRY cl_int
 (CL_API_CALL *clReleaseDeviceEXT_fn)(cl_device_id device) CL_EXT_SUFFIX__VERSION_1_1;
 extern CL_API_ENTRY cl_int CL_API_CALL
 clRetainDeviceEXT(cl_device_id device) CL_EXT_SUFFIX__VERSION_1_1;
 typedef CL_API_ENTRY cl_int
 (CL_API_CALL *clRetainDeviceEXT_fn)(cl_device_id device) CL_EXT_SUFFIX__VERSION_1_1;
 typedef cl_ulong  cl_device_partition_property_ext;
 extern CL_API_ENTRY cl_int CL_API_CALL
 clCreateSubDevicesEXT(cl_device_id   in_device,
                      const cl_device_partition_property_ext * properties,
                      cl_uint        num_entries,
                      cl_device_id * out_devices,
                      cl_uint *      num_devices) CL_EXT_SUFFIX__VERSION_1_1;
 typedef CL_API_ENTRY cl_int
 (CL_API_CALL * clCreateSubDevicesEXT_fn)(cl_device_id   in_device,
                                         const cl_device_partition_property_ext * properties,
                                         cl_uint        num_entries,
                                         cl_device_id * out_devices,
                                         cl_uint *      num_devices) CL_EXT_SUFFIX__VERSION_1_1;
 /* cl_device_partition_property_ext */
 #define CL_DEVICE_PARTITION_EQUALLY_EXT             0x4050
 #define CL_DEVICE_PARTITION_BY_COUNTS_EXT           0x4051
 #define CL_DEVICE_PARTITION_BY_NAMES_EXT            0x4052
 #define CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN_EXT  0x4053
 /* clDeviceGetInfo selectors */
 #define CL_DEVICE_PARENT_DEVICE_EXT                 0x4054
 #define CL_DEVICE_PARTITION_TYPES_EXT               0x4055
 #define CL_DEVICE_AFFINITY_DOMAINS_EXT              0x4056
 #define CL_DEVICE_REFERENCE_COUNT_EXT               0x4057
 #define CL_DEVICE_PARTITION_STYLE_EXT               0x4058
 /* error codes */
 #define CL_DEVICE_PARTITION_FAILED_EXT              -1057
 #define CL_INVALID_PARTITION_COUNT_EXT              -1058
 #define CL_INVALID_PARTITION_NAME_EXT               -1059
 /* CL_AFFINITY_DOMAINs */
 #define CL_AFFINITY_DOMAIN_L1_CACHE_EXT             0x1
 #define CL_AFFINITY_DOMAIN_L2_CACHE_EXT             0x2
 #define CL_AFFINITY_DOMAIN_L3_CACHE_EXT             0x3
 #define CL_AFFINITY_DOMAIN_L4_CACHE_EXT             0x4
 #define CL_AFFINITY_DOMAIN_NUMA_EXT                 0x10
 #define CL_AFFINITY_DOMAIN_NEXT_FISSIONABLE_EXT     0x100
 /* cl_device_partition_property_ext list terminators */
 #define CL_PROPERTIES_LIST_END_EXT                  ((cl_device_partition_property_ext) 0)
 #define CL_PARTITION_BY_COUNTS_LIST_END_EXT         ((cl_device_partition_property_ext) 0)
 #define CL_PARTITION_BY_NAMES_LIST_END_EXT          ((cl_device_partition_property_ext) 0 - 1)
 /***********************************
 * cl_ext_migrate_memobject extension definitions
 ***********************************/
 #define cl_ext_migrate_memobject 1
 typedef cl_bitfield cl_mem_migration_flags_ext;
 #define CL_MIGRATE_MEM_OBJECT_HOST_EXT              0x1
 #define CL_COMMAND_MIGRATE_MEM_OBJECT_EXT           0x4040
 extern CL_API_ENTRY cl_int CL_API_CALL
 clEnqueueMigrateMemObjectEXT(cl_command_queue command_queue,
                             cl_uint          num_mem_objects,
                             const cl_mem *   mem_objects,
                             cl_mem_migration_flags_ext flags,
                             cl_uint          num_events_in_wait_list,
                             const cl_event * event_wait_list,
                             cl_event *       event);
 typedef CL_API_ENTRY cl_int
 (CL_API_CALL *clEnqueueMigrateMemObjectEXT_fn)(cl_command_queue command_queue,
                                               cl_uint          num_mem_objects,
                                               const cl_mem *   mem_objects,
                                               cl_mem_migration_flags_ext flags,
                                               cl_uint          num_events_in_wait_list,
                                               const cl_event * event_wait_list,
                                               cl_event *       event);
 /*********************************
 * cl_qcom_ext_host_ptr extension
 *********************************/
 #define cl_qcom_ext_host_ptr 1
 #define CL_MEM_EXT_HOST_PTR_QCOM                  (1 << 29)
 #define CL_DEVICE_EXT_MEM_PADDING_IN_BYTES_QCOM   0x40A0
 #define CL_DEVICE_PAGE_SIZE_QCOM                  0x40A1
 #define CL_IMAGE_ROW_ALIGNMENT_QCOM               0x40A2
 #define CL_IMAGE_SLICE_ALIGNMENT_QCOM             0x40A3
 #define CL_MEM_HOST_UNCACHED_QCOM                 0x40A4
 #define CL_MEM_HOST_WRITEBACK_QCOM                0x40A5
 #define CL_MEM_HOST_WRITETHROUGH_QCOM             0x40A6
 #define CL_MEM_HOST_WRITE_COMBINING_QCOM          0x40A7
 typedef cl_uint                                   cl_image_pitch_info_qcom;
 extern CL_API_ENTRY cl_int CL_API_CALL
 clGetDeviceImageInfoQCOM(cl_device_id             device,
                         size_t                   image_width,
                         size_t                   image_height,
                         const cl_image_format   *image_format,
                         cl_image_pitch_info_qcom param_name,
                         size_t                   param_value_size,
                         void                    *param_value,
                         size_t                  *param_value_size_ret);
 typedef struct _cl_mem_ext_host_ptr
 {
    /* Type of external memory allocation. */
    /* Legal values will be defined in layered extensions. */
    cl_uint  allocation_type;
    /* Host cache policy for this external memory allocation. */
    cl_uint  host_cache_policy;
 } cl_mem_ext_host_ptr;
 /*******************************************
 * cl_qcom_ext_host_ptr_iocoherent extension
 ********************************************/
 /* Cache policy specifying io-coherence */
 #define CL_MEM_HOST_IOCOHERENT_QCOM               0x40A9
 /*********************************
 * cl_qcom_ion_host_ptr extension
 *********************************/
 #define CL_MEM_ION_HOST_PTR_QCOM                  0x40A8
 typedef struct _cl_mem_ion_host_ptr
 {
    /* Type of external memory allocation. */
    /* Must be CL_MEM_ION_HOST_PTR_QCOM for ION allocations. */
    cl_mem_ext_host_ptr  ext_host_ptr;
    /* ION file descriptor */
    int                  ion_filedesc;
    /* Host pointer to the ION allocated memory */
    void*                ion_hostptr;
 } cl_mem_ion_host_ptr;
 /*********************************
 * cl_qcom_android_native_buffer_host_ptr extension
 *********************************/
 #define CL_MEM_ANDROID_NATIVE_BUFFER_HOST_PTR_QCOM                  0x40C6
 typedef struct _cl_mem_android_native_buffer_host_ptr
 {
    /* Type of external memory allocation. */
    /* Must be CL_MEM_ANDROID_NATIVE_BUFFER_HOST_PTR_QCOM for Android native buffers. */
    cl_mem_ext_host_ptr  ext_host_ptr;
    /* Virtual pointer to the android native buffer */
    void*                anb_ptr;
 } cl_mem_android_native_buffer_host_ptr;
 /******************************************
 * cl_img_yuv_image extension *
 ******************************************/
 /* Image formats used in clCreateImage */
 #define CL_NV21_IMG                                 0x40D0
 #define CL_YV12_IMG                                 0x40D1
 /******************************************
 * cl_img_cached_allocations extension *
 ******************************************/
 /* Flag values used by clCreateBuffer */
 #define CL_MEM_USE_UNCACHED_CPU_MEMORY_IMG          (1 << 26)
 #define CL_MEM_USE_CACHED_CPU_MEMORY_IMG            (1 << 27)
 /******************************************
 * cl_img_use_gralloc_ptr extension *
 ******************************************/
 #define cl_img_use_gralloc_ptr 1
 /* Flag values used by clCreateBuffer */
 #define CL_MEM_USE_GRALLOC_PTR_IMG                  (1 << 28)
 /* To be used by clGetEventInfo: */
 #define CL_COMMAND_ACQUIRE_GRALLOC_OBJECTS_IMG      0x40D2
 #define CL_COMMAND_RELEASE_GRALLOC_OBJECTS_IMG      0x40D3
 /* Error code from clEnqueueReleaseGrallocObjectsIMG */
 #define CL_GRALLOC_RESOURCE_NOT_ACQUIRED_IMG        0x40D4
 extern CL_API_ENTRY cl_int CL_API_CALL
 clEnqueueAcquireGrallocObjectsIMG(cl_command_queue      command_queue,
                                  cl_uint               num_objects,
                                  const cl_mem *        mem_objects,
                                  cl_uint               num_events_in_wait_list,
                                  const cl_event *      event_wait_list,
                                  cl_event *            event) CL_EXT_SUFFIX__VERSION_1_2;
 extern CL_API_ENTRY cl_int CL_API_CALL
 clEnqueueReleaseGrallocObjectsIMG(cl_command_queue      command_queue,
                                  cl_uint               num_objects,
                                  const cl_mem *        mem_objects,
                                  cl_uint               num_events_in_wait_list,
                                  const cl_event *      event_wait_list,
                                  cl_event *            event) CL_EXT_SUFFIX__VERSION_1_2;
 /*********************************
 * cl_khr_subgroups extension
 *********************************/
 #define cl_khr_subgroups 1
 #if !defined(CL_VERSION_2_1)
 /* For OpenCL 2.1 and newer, cl_kernel_sub_group_info is declared in CL.h.
   In hindsight, there should have been a khr suffix on this type for
   the extension, but keeping it un-suffixed to maintain backwards
   compatibility. */
 typedef cl_uint             cl_kernel_sub_group_info;
 #endif
 /* cl_kernel_sub_group_info */
 #define CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE_KHR    0x2033
 #define CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE_KHR       0x2034
 extern CL_API_ENTRY cl_int CL_API_CALL
 clGetKernelSubGroupInfoKHR(cl_kernel    in_kernel,
                           cl_device_id in_device,
                           cl_kernel_sub_group_info param_name,
                           size_t       input_value_size,
                           const void * input_value,
                           size_t       param_value_size,
                           void *       param_value,
                           size_t *     param_value_size_ret) CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED;
 typedef CL_API_ENTRY cl_int
 (CL_API_CALL * clGetKernelSubGroupInfoKHR_fn)(cl_kernel    in_kernel,
                                              cl_device_id in_device,
                                              cl_kernel_sub_group_info param_name,
                                              size_t       input_value_size,
                                              const void * input_value,
                                              size_t       param_value_size,
                                              void *       param_value,
                                              size_t *     param_value_size_ret) CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED;
 /*********************************
 * cl_khr_mipmap_image extension
 *********************************/
 /* cl_sampler_properties */
 #define CL_SAMPLER_MIP_FILTER_MODE_KHR              0x1155
 #define CL_SAMPLER_LOD_MIN_KHR                      0x1156
 #define CL_SAMPLER_LOD_MAX_KHR                      0x1157
 /*********************************
 * cl_khr_priority_hints extension
 *********************************/
 /* This extension define is for backwards compatibility.
   It shouldn't be required since this extension has no new functions. */
 #define cl_khr_priority_hints 1
 typedef cl_uint  cl_queue_priority_khr;
 /* cl_command_queue_properties */
 #define CL_QUEUE_PRIORITY_KHR 0x1096
 /* cl_queue_priority_khr */
 #define CL_QUEUE_PRIORITY_HIGH_KHR (1<<0)
 #define CL_QUEUE_PRIORITY_MED_KHR (1<<1)
 #define CL_QUEUE_PRIORITY_LOW_KHR (1<<2)
 /*********************************
 * cl_khr_throttle_hints extension
 *********************************/
 /* This extension define is for backwards compatibility.
   It shouldn't be required since this extension has no new functions. */
 #define cl_khr_throttle_hints 1
 typedef cl_uint  cl_queue_throttle_khr;
 /* cl_command_queue_properties */
 #define CL_QUEUE_THROTTLE_KHR 0x1097
 /* cl_queue_throttle_khr */
 #define CL_QUEUE_THROTTLE_HIGH_KHR (1<<0)
 #define CL_QUEUE_THROTTLE_MED_KHR (1<<1)
 #define CL_QUEUE_THROTTLE_LOW_KHR (1<<2)
 /*********************************
 * cl_khr_subgroup_named_barrier
 *********************************/
 /* This extension define is for backwards compatibility.
   It shouldn't be required since this extension has no new functions. */
 #define cl_khr_subgroup_named_barrier 1
 /* cl_device_info */
 #define CL_DEVICE_MAX_NAMED_BARRIER_COUNT_KHR       0x2035
 /*********************************
 * cl_khr_extended_versioning
 *********************************/
 #define cl_khr_extended_versioning 1
 #define CL_VERSION_MAJOR_BITS_KHR (10)
 #define CL_VERSION_MINOR_BITS_KHR (10)
 #define CL_VERSION_PATCH_BITS_KHR (12)
 #define CL_VERSION_MAJOR_MASK_KHR ((1 << CL_VERSION_MAJOR_BITS_KHR) - 1)
 #define CL_VERSION_MINOR_MASK_KHR ((1 << CL_VERSION_MINOR_BITS_KHR) - 1)
 #define CL_VERSION_PATCH_MASK_KHR ((1 << CL_VERSION_PATCH_BITS_KHR) - 1)
 #define CL_VERSION_MAJOR_KHR(version) ((version) >> (CL_VERSION_MINOR_BITS_KHR + CL_VERSION_PATCH_BITS_KHR))
 #define CL_VERSION_MINOR_KHR(version) (((version) >> CL_VERSION_PATCH_BITS_KHR) & CL_VERSION_MINOR_MASK_KHR)
 #define CL_VERSION_PATCH_KHR(version) ((version) & CL_VERSION_PATCH_MASK_KHR)
 #define CL_MAKE_VERSION_KHR(major, minor, patch) \
    ((((major) & CL_VERSION_MAJOR_MASK_KHR) << (CL_VERSION_MINOR_BITS_KHR + CL_VERSION_PATCH_BITS_KHR)) | \
    (((minor) &  CL_VERSION_MINOR_MASK_KHR) << CL_VERSION_PATCH_BITS_KHR) | \
    ((patch) & CL_VERSION_PATCH_MASK_KHR))
 typedef cl_uint cl_version_khr;
 #define CL_NAME_VERSION_MAX_NAME_SIZE_KHR 64
 typedef struct _cl_name_version_khr
 {
    cl_version_khr version;
    char name[CL_NAME_VERSION_MAX_NAME_SIZE_KHR];
 } cl_name_version_khr;
 /* cl_platform_info */
 #define CL_PLATFORM_NUMERIC_VERSION_KHR                  0x0906
 #define CL_PLATFORM_EXTENSIONS_WITH_VERSION_KHR          0x0907
 /* cl_device_info */
 #define CL_DEVICE_NUMERIC_VERSION_KHR                    0x105E
 #define CL_DEVICE_OPENCL_C_NUMERIC_VERSION_KHR           0x105F
 #define CL_DEVICE_EXTENSIONS_WITH_VERSION_KHR            0x1060
 #define CL_DEVICE_ILS_WITH_VERSION_KHR                   0x1061
 #define CL_DEVICE_BUILT_IN_KERNELS_WITH_VERSION_KHR      0x1062
 /*********************************
 * cl_khr_device_uuid extension
 *********************************/
 #define cl_khr_device_uuid 1
 #define CL_UUID_SIZE_KHR 16
 #define CL_LUID_SIZE_KHR 8
 #define CL_DEVICE_UUID_KHR          0x106A
 #define CL_DRIVER_UUID_KHR          0x106B
 #define CL_DEVICE_LUID_VALID_KHR    0x106C
 #define CL_DEVICE_LUID_KHR          0x106D
 #define CL_DEVICE_NODE_MASK_KHR     0x106E
 /**********************************
 * cl_arm_import_memory extension *
 **********************************/
 #define cl_arm_import_memory 1
 typedef intptr_t cl_import_properties_arm;
 /* Default and valid proporties name for cl_arm_import_memory */
 #define CL_IMPORT_TYPE_ARM                        0x40B2
 /* Host process memory type default value for CL_IMPORT_TYPE_ARM property */
 #define CL_IMPORT_TYPE_HOST_ARM                   0x40B3
 /* DMA BUF memory type value for CL_IMPORT_TYPE_ARM property */
 #define CL_IMPORT_TYPE_DMA_BUF_ARM                0x40B4
 /* Protected memory property */
 #define CL_IMPORT_TYPE_PROTECTED_ARM              0x40B5
 /* Android hardware buffer type value for CL_IMPORT_TYPE_ARM property */
 #define CL_IMPORT_TYPE_ANDROID_HARDWARE_BUFFER_ARM 0x41E2
 /* Data consistency with host property */
 #define CL_IMPORT_DMA_BUF_DATA_CONSISTENCY_WITH_HOST_ARM 0x41E3
 /* Import memory size value to indicate a size for the whole buffer */
 #define CL_IMPORT_MEMORY_WHOLE_ALLOCATION_ARM SIZE_MAX
 /* This extension adds a new function that allows for direct memory import into
 * OpenCL via the clImportMemoryARM function.
 *
 * Memory imported through this interface will be mapped into the device's page
 * tables directly, providing zero copy access. It will never fall back to copy
 * operations and aliased buffers.
 *
 * Types of memory supported for import are specified as additional extension
 * strings.
 *
 * This extension produces cl_mem allocations which are compatible with all other
 * users of cl_mem in the standard API.
 *
 * This extension maps pages with the same properties as the normal buffer creation
 * function clCreateBuffer.
 */
 extern CL_API_ENTRY cl_mem CL_API_CALL
 clImportMemoryARM( cl_context context,
                   cl_mem_flags flags,
                   const cl_import_properties_arm *properties,
                   void *memory,
                   size_t size,
                   cl_int *errcode_ret) CL_EXT_SUFFIX__VERSION_1_0;
 /******************************************
 * cl_arm_shared_virtual_memory extension *
 ******************************************/
 #define cl_arm_shared_virtual_memory 1
 /* Used by clGetDeviceInfo */
 #define CL_DEVICE_SVM_CAPABILITIES_ARM                  0x40B6
 /* Used by clGetMemObjectInfo */
 #define CL_MEM_USES_SVM_POINTER_ARM                     0x40B7
 /* Used by clSetKernelExecInfoARM: */
 #define CL_KERNEL_EXEC_INFO_SVM_PTRS_ARM                0x40B8
 #define CL_KERNEL_EXEC_INFO_SVM_FINE_GRAIN_SYSTEM_ARM   0x40B9
 /* To be used by clGetEventInfo: */
 #define CL_COMMAND_SVM_FREE_ARM                         0x40BA
 #define CL_COMMAND_SVM_MEMCPY_ARM                       0x40BB
 #define CL_COMMAND_SVM_MEMFILL_ARM                      0x40BC
 #define CL_COMMAND_SVM_MAP_ARM                          0x40BD
 #define CL_COMMAND_SVM_UNMAP_ARM                        0x40BE
 /* Flag values returned by clGetDeviceInfo with CL_DEVICE_SVM_CAPABILITIES_ARM as the param_name. */
 #define CL_DEVICE_SVM_COARSE_GRAIN_BUFFER_ARM           (1 << 0)
 #define CL_DEVICE_SVM_FINE_GRAIN_BUFFER_ARM             (1 << 1)
 #define CL_DEVICE_SVM_FINE_GRAIN_SYSTEM_ARM             (1 << 2)
 #define CL_DEVICE_SVM_ATOMICS_ARM                       (1 << 3)
 /* Flag values used by clSVMAllocARM: */
 #define CL_MEM_SVM_FINE_GRAIN_BUFFER_ARM                (1 << 10)
 #define CL_MEM_SVM_ATOMICS_ARM                          (1 << 11)
 typedef cl_bitfield cl_svm_mem_flags_arm;
 typedef cl_uint     cl_kernel_exec_info_arm;
 typedef cl_bitfield cl_device_svm_capabilities_arm;
 extern CL_API_ENTRY void * CL_API_CALL
 clSVMAllocARM(cl_context       context,
              cl_svm_mem_flags_arm flags,
              size_t           size,
              cl_uint          alignment) CL_EXT_SUFFIX__VERSION_1_2;
 extern CL_API_ENTRY void CL_API_CALL
 clSVMFreeARM(cl_context        context,
             void *            svm_pointer) CL_EXT_SUFFIX__VERSION_1_2;
 extern CL_API_ENTRY cl_int CL_API_CALL
 clEnqueueSVMFreeARM(cl_command_queue  command_queue,
                    cl_uint           num_svm_pointers,
                    void *            svm_pointers[],
                    void (CL_CALLBACK * pfn_free_func)(cl_command_queue queue,
                                                       cl_uint          num_svm_pointers,
                                                       void *           svm_pointers[],
                                                       void *           user_data),
                    void *            user_data,
                    cl_uint           num_events_in_wait_list,
                    const cl_event *  event_wait_list,
                    cl_event *        event) CL_EXT_SUFFIX__VERSION_1_2;
 extern CL_API_ENTRY cl_int CL_API_CALL
 clEnqueueSVMMemcpyARM(cl_command_queue  command_queue,
                      cl_bool           blocking_copy,
                      void *            dst_ptr,
                      const void *      src_ptr,
                      size_t            size,
                      cl_uint           num_events_in_wait_list,
                      const cl_event *  event_wait_list,
                      cl_event *        event) CL_EXT_SUFFIX__VERSION_1_2;
 extern CL_API_ENTRY cl_int CL_API_CALL
 clEnqueueSVMMemFillARM(cl_command_queue  command_queue,
                       void *            svm_ptr,
                       const void *      pattern,
                       size_t            pattern_size,
                       size_t            size,
                       cl_uint           num_events_in_wait_list,
                       const cl_event *  event_wait_list,
                       cl_event *        event) CL_EXT_SUFFIX__VERSION_1_2;
 extern CL_API_ENTRY cl_int CL_API_CALL
 clEnqueueSVMMapARM(cl_command_queue  command_queue,
                   cl_bool           blocking_map,
                   cl_map_flags      flags,
                   void *            svm_ptr,
                   size_t            size,
                   cl_uint           num_events_in_wait_list,
                   const cl_event *  event_wait_list,
                   cl_event *        event) CL_EXT_SUFFIX__VERSION_1_2;
 extern CL_API_ENTRY cl_int CL_API_CALL
 clEnqueueSVMUnmapARM(cl_command_queue  command_queue,
                     void *            svm_ptr,
                     cl_uint           num_events_in_wait_list,
                     const cl_event *  event_wait_list,
                     cl_event *        event) CL_EXT_SUFFIX__VERSION_1_2;
 extern CL_API_ENTRY cl_int CL_API_CALL
 clSetKernelArgSVMPointerARM(cl_kernel    kernel,
                            cl_uint      arg_index,
                            const void * arg_value) CL_EXT_SUFFIX__VERSION_1_2;
 extern CL_API_ENTRY cl_int CL_API_CALL
 clSetKernelExecInfoARM(cl_kernel            kernel,
                       cl_kernel_exec_info_arm  param_name,
                       size_t               param_value_size,
                       const void *         param_value) CL_EXT_SUFFIX__VERSION_1_2;
 /********************************
 * cl_arm_get_core_id extension *
 ********************************/
 #ifdef CL_VERSION_1_2
 #define cl_arm_get_core_id 1
 /* Device info property for bitfield of cores present */
 #define CL_DEVICE_COMPUTE_UNITS_BITFIELD_ARM      0x40BF
 #endif  /* CL_VERSION_1_2 */
 /*********************************
 * cl_arm_job_slot_selection
 *********************************/
 #define cl_arm_job_slot_selection 1
 /* cl_device_info */
 #define CL_DEVICE_JOB_SLOTS_ARM                   0x41E0
 /* cl_command_queue_properties */
 #define CL_QUEUE_JOB_SLOT_ARM                     0x41E1
 #ifdef __cplusplus
 }
 #endif
 #endif /* __CL_EXT_H */
--- a/CL/cl_ext_intel.h
+++ b/CL/cl_ext_intel.h
@ -0,0 +1,682 @@
 /*******************************************************************************
 * Copyright (c) 2008-2020 The Khronos Group Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 ******************************************************************************/
 /*****************************************************************************\
 Copyright (c) 2013-2020 Intel Corporation All Rights Reserved.
 THESE MATERIALS ARE PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
 CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THESE
 MATERIALS, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 File Name: cl_ext_intel.h
 Abstract:
 Notes:
 \*****************************************************************************/
 #ifndef __CL_EXT_INTEL_H
 #define __CL_EXT_INTEL_H
 #include <CL/cl.h>
 #include <CL/cl_platform.h>
 #ifdef __cplusplus
 extern "C" {
 #endif
 /***************************************
 * cl_intel_thread_local_exec extension *
 ****************************************/
 #define cl_intel_thread_local_exec 1
 #define CL_QUEUE_THREAD_LOCAL_EXEC_ENABLE_INTEL      (((cl_bitfield)1) << 31)
 /***********************************************
 * cl_intel_device_partition_by_names extension *
 ************************************************/
 #define cl_intel_device_partition_by_names 1
 #define CL_DEVICE_PARTITION_BY_NAMES_INTEL          0x4052
 #define CL_PARTITION_BY_NAMES_LIST_END_INTEL        -1
 /************************************************
 * cl_intel_accelerator extension                *
 * cl_intel_motion_estimation extension          *
 * cl_intel_advanced_motion_estimation extension *
 *************************************************/
 #define cl_intel_accelerator 1
 #define cl_intel_motion_estimation 1
 #define cl_intel_advanced_motion_estimation 1
 typedef struct _cl_accelerator_intel* cl_accelerator_intel;
 typedef cl_uint cl_accelerator_type_intel;
 typedef cl_uint cl_accelerator_info_intel;
 typedef struct _cl_motion_estimation_desc_intel {
    cl_uint mb_block_type;
    cl_uint subpixel_mode;
    cl_uint sad_adjust_mode;
    cl_uint search_path_type;
 } cl_motion_estimation_desc_intel;
 /* error codes */
 #define CL_INVALID_ACCELERATOR_INTEL                              -1094
 #define CL_INVALID_ACCELERATOR_TYPE_INTEL                         -1095
 #define CL_INVALID_ACCELERATOR_DESCRIPTOR_INTEL                   -1096
 #define CL_ACCELERATOR_TYPE_NOT_SUPPORTED_INTEL                   -1097
 /* cl_accelerator_type_intel */
 #define CL_ACCELERATOR_TYPE_MOTION_ESTIMATION_INTEL               0x0
 /* cl_accelerator_info_intel */
 #define CL_ACCELERATOR_DESCRIPTOR_INTEL                           0x4090
 #define CL_ACCELERATOR_REFERENCE_COUNT_INTEL                      0x4091
 #define CL_ACCELERATOR_CONTEXT_INTEL                              0x4092
 #define CL_ACCELERATOR_TYPE_INTEL                                 0x4093
 /* cl_motion_detect_desc_intel flags */
 #define CL_ME_MB_TYPE_16x16_INTEL                                 0x0
 #define CL_ME_MB_TYPE_8x8_INTEL                                   0x1
 #define CL_ME_MB_TYPE_4x4_INTEL                                   0x2
 #define CL_ME_SUBPIXEL_MODE_INTEGER_INTEL                         0x0
 #define CL_ME_SUBPIXEL_MODE_HPEL_INTEL                            0x1
 #define CL_ME_SUBPIXEL_MODE_QPEL_INTEL                            0x2
 #define CL_ME_SAD_ADJUST_MODE_NONE_INTEL                          0x0
 #define CL_ME_SAD_ADJUST_MODE_HAAR_INTEL                          0x1
 #define CL_ME_SEARCH_PATH_RADIUS_2_2_INTEL                        0x0
 #define CL_ME_SEARCH_PATH_RADIUS_4_4_INTEL                        0x1
 #define CL_ME_SEARCH_PATH_RADIUS_16_12_INTEL                      0x5
 #define CL_ME_SKIP_BLOCK_TYPE_16x16_INTEL                         0x0
 #define CL_ME_CHROMA_INTRA_PREDICT_ENABLED_INTEL                  0x1
 #define CL_ME_LUMA_INTRA_PREDICT_ENABLED_INTEL                    0x2
 #define CL_ME_SKIP_BLOCK_TYPE_8x8_INTEL                           0x4
 #define CL_ME_FORWARD_INPUT_MODE_INTEL                            0x1
 #define CL_ME_BACKWARD_INPUT_MODE_INTEL                           0x2
 #define CL_ME_BIDIRECTION_INPUT_MODE_INTEL                        0x3
 #define CL_ME_BIDIR_WEIGHT_QUARTER_INTEL                          16
 #define CL_ME_BIDIR_WEIGHT_THIRD_INTEL                            21
 #define CL_ME_BIDIR_WEIGHT_HALF_INTEL                             32
 #define CL_ME_BIDIR_WEIGHT_TWO_THIRD_INTEL                        43
 #define CL_ME_BIDIR_WEIGHT_THREE_QUARTER_INTEL                    48
 #define CL_ME_COST_PENALTY_NONE_INTEL                             0x0
 #define CL_ME_COST_PENALTY_LOW_INTEL                              0x1
 #define CL_ME_COST_PENALTY_NORMAL_INTEL                           0x2
 #define CL_ME_COST_PENALTY_HIGH_INTEL                             0x3
 #define CL_ME_COST_PRECISION_QPEL_INTEL                           0x0
 #define CL_ME_COST_PRECISION_HPEL_INTEL                           0x1
 #define CL_ME_COST_PRECISION_PEL_INTEL                            0x2
 #define CL_ME_COST_PRECISION_DPEL_INTEL                           0x3
 #define CL_ME_LUMA_PREDICTOR_MODE_VERTICAL_INTEL                  0x0
 #define CL_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_INTEL                0x1
 #define CL_ME_LUMA_PREDICTOR_MODE_DC_INTEL                        0x2
 #define CL_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_LEFT_INTEL        0x3
 #define CL_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_RIGHT_INTEL       0x4
 #define CL_ME_LUMA_PREDICTOR_MODE_PLANE_INTEL                     0x4
 #define CL_ME_LUMA_PREDICTOR_MODE_VERTICAL_RIGHT_INTEL            0x5
 #define CL_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_DOWN_INTEL           0x6
 #define CL_ME_LUMA_PREDICTOR_MODE_VERTICAL_LEFT_INTEL             0x7
 #define CL_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_UP_INTEL             0x8
 #define CL_ME_CHROMA_PREDICTOR_MODE_DC_INTEL                      0x0
 #define CL_ME_CHROMA_PREDICTOR_MODE_HORIZONTAL_INTEL              0x1
 #define CL_ME_CHROMA_PREDICTOR_MODE_VERTICAL_INTEL                0x2
 #define CL_ME_CHROMA_PREDICTOR_MODE_PLANE_INTEL                   0x3
 /* cl_device_info */
 #define CL_DEVICE_ME_VERSION_INTEL                                0x407E
 #define CL_ME_VERSION_LEGACY_INTEL                                0x0
 #define CL_ME_VERSION_ADVANCED_VER_1_INTEL                        0x1
 #define CL_ME_VERSION_ADVANCED_VER_2_INTEL                        0x2
 extern CL_API_ENTRY cl_accelerator_intel CL_API_CALL
 clCreateAcceleratorINTEL(
    cl_context                   context,
    cl_accelerator_type_intel    accelerator_type,
    size_t                       descriptor_size,
    const void*                  descriptor,
    cl_int*                      errcode_ret) CL_EXT_SUFFIX__VERSION_1_2;
 typedef CL_API_ENTRY cl_accelerator_intel (CL_API_CALL *clCreateAcceleratorINTEL_fn)(
    cl_context                   context,
    cl_accelerator_type_intel    accelerator_type,
    size_t                       descriptor_size,
    const void*                  descriptor,
    cl_int*                      errcode_ret) CL_EXT_SUFFIX__VERSION_1_2;
 extern CL_API_ENTRY cl_int CL_API_CALL
 clGetAcceleratorInfoINTEL(
    cl_accelerator_intel         accelerator,
    cl_accelerator_info_intel    param_name,
    size_t                       param_value_size,
    void*                        param_value,
    size_t*                      param_value_size_ret) CL_EXT_SUFFIX__VERSION_1_2;
 typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetAcceleratorInfoINTEL_fn)(
    cl_accelerator_intel         accelerator,
    cl_accelerator_info_intel    param_name,
    size_t                       param_value_size,
    void*                        param_value,
    size_t*                      param_value_size_ret) CL_EXT_SUFFIX__VERSION_1_2;
 extern CL_API_ENTRY cl_int CL_API_CALL
 clRetainAcceleratorINTEL(
    cl_accelerator_intel         accelerator) CL_EXT_SUFFIX__VERSION_1_2;
 typedef CL_API_ENTRY cl_int (CL_API_CALL *clRetainAcceleratorINTEL_fn)(
    cl_accelerator_intel         accelerator) CL_EXT_SUFFIX__VERSION_1_2;
 extern CL_API_ENTRY cl_int CL_API_CALL
 clReleaseAcceleratorINTEL(
    cl_accelerator_intel         accelerator) CL_EXT_SUFFIX__VERSION_1_2;
 typedef CL_API_ENTRY cl_int (CL_API_CALL *clReleaseAcceleratorINTEL_fn)(
    cl_accelerator_intel         accelerator) CL_EXT_SUFFIX__VERSION_1_2;
 /******************************************
 * cl_intel_simultaneous_sharing extension *
 *******************************************/
 #define cl_intel_simultaneous_sharing 1
 #define CL_DEVICE_SIMULTANEOUS_INTEROPS_INTEL            0x4104
 #define CL_DEVICE_NUM_SIMULTANEOUS_INTEROPS_INTEL        0x4105
 /***********************************
 * cl_intel_egl_image_yuv extension *
 ************************************/
 #define cl_intel_egl_image_yuv 1
 #define CL_EGL_YUV_PLANE_INTEL                           0x4107
 /********************************
 * cl_intel_packed_yuv extension *
 *********************************/
 #define cl_intel_packed_yuv 1
 #define CL_YUYV_INTEL                                    0x4076
 #define CL_UYVY_INTEL                                    0x4077
 #define CL_YVYU_INTEL                                    0x4078
 #define CL_VYUY_INTEL                                    0x4079
 /********************************************
 * cl_intel_required_subgroup_size extension *
 *********************************************/
 #define cl_intel_required_subgroup_size 1
 #define CL_DEVICE_SUB_GROUP_SIZES_INTEL                  0x4108
 #define CL_KERNEL_SPILL_MEM_SIZE_INTEL                   0x4109
 #define CL_KERNEL_COMPILE_SUB_GROUP_SIZE_INTEL           0x410A
 /****************************************
 * cl_intel_driver_diagnostics extension *
 *****************************************/
 #define cl_intel_driver_diagnostics 1
 typedef cl_uint cl_diagnostics_verbose_level;
 #define CL_CONTEXT_SHOW_DIAGNOSTICS_INTEL                0x4106
 #define CL_CONTEXT_DIAGNOSTICS_LEVEL_ALL_INTEL           ( 0xff )
 #define CL_CONTEXT_DIAGNOSTICS_LEVEL_GOOD_INTEL          ( 1 )
 #define CL_CONTEXT_DIAGNOSTICS_LEVEL_BAD_INTEL           ( 1 << 1 )
 #define CL_CONTEXT_DIAGNOSTICS_LEVEL_NEUTRAL_INTEL       ( 1 << 2 )
 /********************************
 * cl_intel_planar_yuv extension *
 *********************************/
 #define CL_NV12_INTEL                                       0x410E
 #define CL_MEM_NO_ACCESS_INTEL                              ( 1 << 24 )
 #define CL_MEM_ACCESS_FLAGS_UNRESTRICTED_INTEL              ( 1 << 25 )
 #define CL_DEVICE_PLANAR_YUV_MAX_WIDTH_INTEL                0x417E
 #define CL_DEVICE_PLANAR_YUV_MAX_HEIGHT_INTEL               0x417F
 /*******************************************************
 * cl_intel_device_side_avc_motion_estimation extension *
 ********************************************************/
 #define CL_DEVICE_AVC_ME_VERSION_INTEL                      0x410B
 #define CL_DEVICE_AVC_ME_SUPPORTS_TEXTURE_SAMPLER_USE_INTEL 0x410C
 #define CL_DEVICE_AVC_ME_SUPPORTS_PREEMPTION_INTEL          0x410D
 #define CL_AVC_ME_VERSION_0_INTEL                           0x0   /* No support. */
 #define CL_AVC_ME_VERSION_1_INTEL                           0x1   /* First supported version. */
 #define CL_AVC_ME_MAJOR_16x16_INTEL                         0x0
 #define CL_AVC_ME_MAJOR_16x8_INTEL                          0x1
 #define CL_AVC_ME_MAJOR_8x16_INTEL                          0x2
 #define CL_AVC_ME_MAJOR_8x8_INTEL                           0x3
 #define CL_AVC_ME_MINOR_8x8_INTEL                           0x0
 #define CL_AVC_ME_MINOR_8x4_INTEL                           0x1
 #define CL_AVC_ME_MINOR_4x8_INTEL                           0x2
 #define CL_AVC_ME_MINOR_4x4_INTEL                           0x3
 #define CL_AVC_ME_MAJOR_FORWARD_INTEL                       0x0
 #define CL_AVC_ME_MAJOR_BACKWARD_INTEL                      0x1
 #define CL_AVC_ME_MAJOR_BIDIRECTIONAL_INTEL                 0x2
 #define CL_AVC_ME_PARTITION_MASK_ALL_INTEL                  0x0
 #define CL_AVC_ME_PARTITION_MASK_16x16_INTEL                0x7E
 #define CL_AVC_ME_PARTITION_MASK_16x8_INTEL                 0x7D
 #define CL_AVC_ME_PARTITION_MASK_8x16_INTEL                 0x7B
 #define CL_AVC_ME_PARTITION_MASK_8x8_INTEL                  0x77
 #define CL_AVC_ME_PARTITION_MASK_8x4_INTEL                  0x6F
 #define CL_AVC_ME_PARTITION_MASK_4x8_INTEL                  0x5F
 #define CL_AVC_ME_PARTITION_MASK_4x4_INTEL                  0x3F
 #define CL_AVC_ME_SEARCH_WINDOW_EXHAUSTIVE_INTEL            0x0
 #define CL_AVC_ME_SEARCH_WINDOW_SMALL_INTEL                 0x1
 #define CL_AVC_ME_SEARCH_WINDOW_TINY_INTEL                  0x2
 #define CL_AVC_ME_SEARCH_WINDOW_EXTRA_TINY_INTEL            0x3
 #define CL_AVC_ME_SEARCH_WINDOW_DIAMOND_INTEL               0x4
 #define CL_AVC_ME_SEARCH_WINDOW_LARGE_DIAMOND_INTEL         0x5
 #define CL_AVC_ME_SEARCH_WINDOW_RESERVED0_INTEL             0x6
 #define CL_AVC_ME_SEARCH_WINDOW_RESERVED1_INTEL             0x7
 #define CL_AVC_ME_SEARCH_WINDOW_CUSTOM_INTEL                0x8
 #define CL_AVC_ME_SEARCH_WINDOW_16x12_RADIUS_INTEL          0x9
 #define CL_AVC_ME_SEARCH_WINDOW_4x4_RADIUS_INTEL            0x2
 #define CL_AVC_ME_SEARCH_WINDOW_2x2_RADIUS_INTEL            0xa
 #define CL_AVC_ME_SAD_ADJUST_MODE_NONE_INTEL                0x0
 #define CL_AVC_ME_SAD_ADJUST_MODE_HAAR_INTEL                0x2
 #define CL_AVC_ME_SUBPIXEL_MODE_INTEGER_INTEL               0x0
 #define CL_AVC_ME_SUBPIXEL_MODE_HPEL_INTEL                  0x1
 #define CL_AVC_ME_SUBPIXEL_MODE_QPEL_INTEL                  0x3
 #define CL_AVC_ME_COST_PRECISION_QPEL_INTEL                 0x0
 #define CL_AVC_ME_COST_PRECISION_HPEL_INTEL                 0x1
 #define CL_AVC_ME_COST_PRECISION_PEL_INTEL                  0x2
 #define CL_AVC_ME_COST_PRECISION_DPEL_INTEL                 0x3
 #define CL_AVC_ME_BIDIR_WEIGHT_QUARTER_INTEL                0x10
 #define CL_AVC_ME_BIDIR_WEIGHT_THIRD_INTEL                  0x15
 #define CL_AVC_ME_BIDIR_WEIGHT_HALF_INTEL                   0x20
 #define CL_AVC_ME_BIDIR_WEIGHT_TWO_THIRD_INTEL              0x2B
 #define CL_AVC_ME_BIDIR_WEIGHT_THREE_QUARTER_INTEL          0x30
 #define CL_AVC_ME_BORDER_REACHED_LEFT_INTEL                 0x0
 #define CL_AVC_ME_BORDER_REACHED_RIGHT_INTEL                0x2
 #define CL_AVC_ME_BORDER_REACHED_TOP_INTEL                  0x4
 #define CL_AVC_ME_BORDER_REACHED_BOTTOM_INTEL               0x8
 #define CL_AVC_ME_SKIP_BLOCK_PARTITION_16x16_INTEL          0x0
 #define CL_AVC_ME_SKIP_BLOCK_PARTITION_8x8_INTEL            0x4000
 #define CL_AVC_ME_SKIP_BLOCK_16x16_FORWARD_ENABLE_INTEL     ( 0x1 << 24 )
 #define CL_AVC_ME_SKIP_BLOCK_16x16_BACKWARD_ENABLE_INTEL    ( 0x2 << 24 )
 #define CL_AVC_ME_SKIP_BLOCK_16x16_DUAL_ENABLE_INTEL        ( 0x3 << 24 )
 #define CL_AVC_ME_SKIP_BLOCK_8x8_FORWARD_ENABLE_INTEL       ( 0x55 << 24 )
 #define CL_AVC_ME_SKIP_BLOCK_8x8_BACKWARD_ENABLE_INTEL      ( 0xAA << 24 )
 #define CL_AVC_ME_SKIP_BLOCK_8x8_DUAL_ENABLE_INTEL          ( 0xFF << 24 )
 #define CL_AVC_ME_SKIP_BLOCK_8x8_0_FORWARD_ENABLE_INTEL     ( 0x1 << 24 )
 #define CL_AVC_ME_SKIP_BLOCK_8x8_0_BACKWARD_ENABLE_INTEL    ( 0x2 << 24 )
 #define CL_AVC_ME_SKIP_BLOCK_8x8_1_FORWARD_ENABLE_INTEL     ( 0x1 << 26 )
 #define CL_AVC_ME_SKIP_BLOCK_8x8_1_BACKWARD_ENABLE_INTEL    ( 0x2 << 26 )
 #define CL_AVC_ME_SKIP_BLOCK_8x8_2_FORWARD_ENABLE_INTEL     ( 0x1 << 28 )
 #define CL_AVC_ME_SKIP_BLOCK_8x8_2_BACKWARD_ENABLE_INTEL    ( 0x2 << 28 )
 #define CL_AVC_ME_SKIP_BLOCK_8x8_3_FORWARD_ENABLE_INTEL     ( 0x1 << 30 )
 #define CL_AVC_ME_SKIP_BLOCK_8x8_3_BACKWARD_ENABLE_INTEL    ( 0x2 << 30 )
 #define CL_AVC_ME_BLOCK_BASED_SKIP_4x4_INTEL                0x00
 #define CL_AVC_ME_BLOCK_BASED_SKIP_8x8_INTEL                0x80
 #define CL_AVC_ME_INTRA_16x16_INTEL                         0x0
 #define CL_AVC_ME_INTRA_8x8_INTEL                           0x1
 #define CL_AVC_ME_INTRA_4x4_INTEL                           0x2
 #define CL_AVC_ME_INTRA_LUMA_PARTITION_MASK_16x16_INTEL     0x6
 #define CL_AVC_ME_INTRA_LUMA_PARTITION_MASK_8x8_INTEL       0x5
 #define CL_AVC_ME_INTRA_LUMA_PARTITION_MASK_4x4_INTEL       0x3
 #define CL_AVC_ME_INTRA_NEIGHBOR_LEFT_MASK_ENABLE_INTEL         0x60
 #define CL_AVC_ME_INTRA_NEIGHBOR_UPPER_MASK_ENABLE_INTEL        0x10
 #define CL_AVC_ME_INTRA_NEIGHBOR_UPPER_RIGHT_MASK_ENABLE_INTEL  0x8
 #define CL_AVC_ME_INTRA_NEIGHBOR_UPPER_LEFT_MASK_ENABLE_INTEL   0x4
 #define CL_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_INTEL            0x0
 #define CL_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_INTEL          0x1
 #define CL_AVC_ME_LUMA_PREDICTOR_MODE_DC_INTEL                  0x2
 #define CL_AVC_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_LEFT_INTEL  0x3
 #define CL_AVC_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_RIGHT_INTEL 0x4
 #define CL_AVC_ME_LUMA_PREDICTOR_MODE_PLANE_INTEL               0x4
 #define CL_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_RIGHT_INTEL      0x5
 #define CL_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_DOWN_INTEL     0x6
 #define CL_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_LEFT_INTEL       0x7
 #define CL_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_UP_INTEL       0x8
 #define CL_AVC_ME_CHROMA_PREDICTOR_MODE_DC_INTEL                0x0
 #define CL_AVC_ME_CHROMA_PREDICTOR_MODE_HORIZONTAL_INTEL        0x1
 #define CL_AVC_ME_CHROMA_PREDICTOR_MODE_VERTICAL_INTEL          0x2
 #define CL_AVC_ME_CHROMA_PREDICTOR_MODE_PLANE_INTEL             0x3
 #define CL_AVC_ME_FRAME_FORWARD_INTEL                       0x1
 #define CL_AVC_ME_FRAME_BACKWARD_INTEL                      0x2
 #define CL_AVC_ME_FRAME_DUAL_INTEL                          0x3
 #define CL_AVC_ME_SLICE_TYPE_PRED_INTEL                     0x0
 #define CL_AVC_ME_SLICE_TYPE_BPRED_INTEL                    0x1
 #define CL_AVC_ME_SLICE_TYPE_INTRA_INTEL                    0x2
 #define CL_AVC_ME_INTERLACED_SCAN_TOP_FIELD_INTEL           0x0
 #define CL_AVC_ME_INTERLACED_SCAN_BOTTOM_FIELD_INTEL        0x1
 /*******************************************
 * cl_intel_unified_shared_memory extension *
 ********************************************/
 /* These APIs are in sync with Revision O of the cl_intel_unified_shared_memory spec! */
 #define cl_intel_unified_shared_memory 1
 /* cl_device_info */
 #define CL_DEVICE_HOST_MEM_CAPABILITIES_INTEL                   0x4190
 #define CL_DEVICE_DEVICE_MEM_CAPABILITIES_INTEL                 0x4191
 #define CL_DEVICE_SINGLE_DEVICE_SHARED_MEM_CAPABILITIES_INTEL   0x4192
 #define CL_DEVICE_CROSS_DEVICE_SHARED_MEM_CAPABILITIES_INTEL    0x4193
 #define CL_DEVICE_SHARED_SYSTEM_MEM_CAPABILITIES_INTEL          0x4194
 typedef cl_bitfield cl_device_unified_shared_memory_capabilities_intel;
 /* cl_device_unified_shared_memory_capabilities_intel - bitfield */
 #define CL_UNIFIED_SHARED_MEMORY_ACCESS_INTEL                   (1 << 0)
 #define CL_UNIFIED_SHARED_MEMORY_ATOMIC_ACCESS_INTEL            (1 << 1)
 #define CL_UNIFIED_SHARED_MEMORY_CONCURRENT_ACCESS_INTEL        (1 << 2)
 #define CL_UNIFIED_SHARED_MEMORY_CONCURRENT_ATOMIC_ACCESS_INTEL (1 << 3)
 typedef cl_bitfield cl_mem_properties_intel;
 /* cl_mem_properties_intel */
 #define CL_MEM_ALLOC_FLAGS_INTEL        0x4195
 typedef cl_bitfield cl_mem_alloc_flags_intel;
 /* cl_mem_alloc_flags_intel - bitfield */
 #define CL_MEM_ALLOC_WRITE_COMBINED_INTEL               (1 << 0)
 typedef cl_uint cl_mem_info_intel;
 /* cl_mem_alloc_info_intel */
 #define CL_MEM_ALLOC_TYPE_INTEL         0x419A
 #define CL_MEM_ALLOC_BASE_PTR_INTEL     0x419B
 #define CL_MEM_ALLOC_SIZE_INTEL         0x419C
 #define CL_MEM_ALLOC_DEVICE_INTEL       0x419D
 /* Enum values 0x419E-0x419F are reserved for future queries. */
 typedef cl_uint cl_unified_shared_memory_type_intel;
 /* cl_unified_shared_memory_type_intel */
 #define CL_MEM_TYPE_UNKNOWN_INTEL       0x4196
 #define CL_MEM_TYPE_HOST_INTEL          0x4197
 #define CL_MEM_TYPE_DEVICE_INTEL        0x4198
 #define CL_MEM_TYPE_SHARED_INTEL        0x4199
 typedef cl_uint cl_mem_advice_intel;
 /* cl_mem_advice_intel */
 /* Enum values 0x4208-0x420F are reserved for future memory advices. */
 /* cl_kernel_exec_info */
 #define CL_KERNEL_EXEC_INFO_INDIRECT_HOST_ACCESS_INTEL      0x4200
 #define CL_KERNEL_EXEC_INFO_INDIRECT_DEVICE_ACCESS_INTEL    0x4201
 #define CL_KERNEL_EXEC_INFO_INDIRECT_SHARED_ACCESS_INTEL    0x4202
 #define CL_KERNEL_EXEC_INFO_USM_PTRS_INTEL                  0x4203
 /* cl_command_type */
 #define CL_COMMAND_MEMFILL_INTEL        0x4204
 #define CL_COMMAND_MEMCPY_INTEL         0x4205
 #define CL_COMMAND_MIGRATEMEM_INTEL     0x4206
 #define CL_COMMAND_MEMADVISE_INTEL      0x4207
 extern CL_API_ENTRY void* CL_API_CALL
 clHostMemAllocINTEL(
            cl_context context,
            const cl_mem_properties_intel* properties,
            size_t size,
            cl_uint alignment,
            cl_int* errcode_ret);
 typedef CL_API_ENTRY void* (CL_API_CALL *
 clHostMemAllocINTEL_fn)(
            cl_context context,
            const cl_mem_properties_intel* properties,
            size_t size,
            cl_uint alignment,
            cl_int* errcode_ret);
 extern CL_API_ENTRY void* CL_API_CALL
 clDeviceMemAllocINTEL(
            cl_context context,
            cl_device_id device,
            const cl_mem_properties_intel* properties,
            size_t size,
            cl_uint alignment,
            cl_int* errcode_ret);
 typedef CL_API_ENTRY void* (CL_API_CALL *
 clDeviceMemAllocINTEL_fn)(
            cl_context context,
            cl_device_id device,
            const cl_mem_properties_intel* properties,
            size_t size,
            cl_uint alignment,
            cl_int* errcode_ret);
 extern CL_API_ENTRY void* CL_API_CALL
 clSharedMemAllocINTEL(
            cl_context context,
            cl_device_id device,
            const cl_mem_properties_intel* properties,
            size_t size,
            cl_uint alignment,
            cl_int* errcode_ret);
 typedef CL_API_ENTRY void* (CL_API_CALL *
 clSharedMemAllocINTEL_fn)(
            cl_context context,
            cl_device_id device,
            const cl_mem_properties_intel* properties,
            size_t size,
            cl_uint alignment,
            cl_int* errcode_ret);
 extern CL_API_ENTRY cl_int CL_API_CALL
 clMemFreeINTEL(
            cl_context context,
            void* ptr);
 typedef CL_API_ENTRY cl_int (CL_API_CALL *
 clMemFreeINTEL_fn)(
            cl_context context,
            void* ptr);
 extern CL_API_ENTRY cl_int CL_API_CALL
 clGetMemAllocInfoINTEL(
            cl_context context,
            const void* ptr,
            cl_mem_info_intel param_name,
            size_t param_value_size,
            void* param_value,
            size_t* param_value_size_ret);
 typedef CL_API_ENTRY cl_int (CL_API_CALL *
 clGetMemAllocInfoINTEL_fn)(
            cl_context context,
            const void* ptr,
            cl_mem_info_intel param_name,
            size_t param_value_size,
            void* param_value,
            size_t* param_value_size_ret);
 extern CL_API_ENTRY cl_int CL_API_CALL
 clSetKernelArgMemPointerINTEL(
            cl_kernel kernel,
            cl_uint arg_index,
            const void* arg_value);
 typedef CL_API_ENTRY cl_int (CL_API_CALL *
 clSetKernelArgMemPointerINTEL_fn)(
            cl_kernel kernel,
            cl_uint arg_index,
            const void* arg_value);
 extern CL_API_ENTRY cl_int CL_API_CALL
 clEnqueueMemsetINTEL(       /* Deprecated */
            cl_command_queue command_queue,
            void* dst_ptr,
            cl_int value,
            size_t size,
            cl_uint num_events_in_wait_list,
            const cl_event* event_wait_list,
            cl_event* event);
 typedef CL_API_ENTRY cl_int (CL_API_CALL *
 clEnqueueMemsetINTEL_fn)(   /* Deprecated */
            cl_command_queue command_queue,
            void* dst_ptr,
            cl_int value,
            size_t size,
            cl_uint num_events_in_wait_list,
            const cl_event* event_wait_list,
            cl_event* event);
 extern CL_API_ENTRY cl_int CL_API_CALL
 clEnqueueMemFillINTEL(
            cl_command_queue command_queue,
            void* dst_ptr,
            const void* pattern,
            size_t pattern_size,
            size_t size,
            cl_uint num_events_in_wait_list,
            const cl_event* event_wait_list,
            cl_event* event);
 typedef CL_API_ENTRY cl_int (CL_API_CALL *
 clEnqueueMemFillINTEL_fn)(
            cl_command_queue command_queue,
            void* dst_ptr,
            const void* pattern,
            size_t pattern_size,
            size_t size,
            cl_uint num_events_in_wait_list,
            const cl_event* event_wait_list,
            cl_event* event);
 extern CL_API_ENTRY cl_int CL_API_CALL
 clEnqueueMemcpyINTEL(
            cl_command_queue command_queue,
            cl_bool blocking,
            void* dst_ptr,
            const void* src_ptr,
            size_t size,
            cl_uint num_events_in_wait_list,
            const cl_event* event_wait_list,
            cl_event* event);
 typedef CL_API_ENTRY cl_int (CL_API_CALL *
 clEnqueueMemcpyINTEL_fn)(
            cl_command_queue command_queue,
            cl_bool blocking,
            void* dst_ptr,
            const void* src_ptr,
            size_t size,
            cl_uint num_events_in_wait_list,
            const cl_event* event_wait_list,
            cl_event* event);
 #ifdef CL_VERSION_1_2
 /* Because these APIs use cl_mem_migration_flags, they require
   OpenCL 1.2: */
 extern CL_API_ENTRY cl_int CL_API_CALL
 clEnqueueMigrateMemINTEL(
            cl_command_queue command_queue,
            const void* ptr,
            size_t size,
            cl_mem_migration_flags flags,
            cl_uint num_events_in_wait_list,
            const cl_event* event_wait_list,
            cl_event* event);
 typedef CL_API_ENTRY cl_int (CL_API_CALL *
 clEnqueueMigrateMemINTEL_fn)(
            cl_command_queue command_queue,
            const void* ptr,
            size_t size,
            cl_mem_migration_flags flags,
            cl_uint num_events_in_wait_list,
            const cl_event* event_wait_list,
            cl_event* event);
 #endif
 extern CL_API_ENTRY cl_int CL_API_CALL
 clEnqueueMemAdviseINTEL(
            cl_command_queue command_queue,
            const void* ptr,
            size_t size,
            cl_mem_advice_intel advice,
            cl_uint num_events_in_wait_list,
            const cl_event* event_wait_list,
            cl_event* event);
 typedef CL_API_ENTRY cl_int (CL_API_CALL *
 clEnqueueMemAdviseINTEL_fn)(
            cl_command_queue command_queue,
            const void* ptr,
            size_t size,
            cl_mem_advice_intel advice,
            cl_uint num_events_in_wait_list,
            const cl_event* event_wait_list,
            cl_event* event);
 #ifdef __cplusplus
 }
 #endif
 #endif /* __CL_EXT_INTEL_H */
--- a/CL/cl_gl.h
+++ b/CL/cl_gl.h
@ -0,0 +1,159 @@
 /*******************************************************************************
 * Copyright (c) 2008-2020 The Khronos Group Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
 #ifndef __OPENCL_CL_GL_H
 #define __OPENCL_CL_GL_H
 #include <CL/cl.h>
 #ifdef __cplusplus
 extern "C" {
 #endif
 typedef cl_uint     cl_gl_object_type;
 typedef cl_uint     cl_gl_texture_info;
 typedef cl_uint     cl_gl_platform_info;
 typedef struct __GLsync *cl_GLsync;
 /* cl_gl_object_type = 0x2000 - 0x200F enum values are currently taken           */
 #define CL_GL_OBJECT_BUFFER                     0x2000
 #define CL_GL_OBJECT_TEXTURE2D                  0x2001
 #define CL_GL_OBJECT_TEXTURE3D                  0x2002
 #define CL_GL_OBJECT_RENDERBUFFER               0x2003
 #ifdef CL_VERSION_1_2
 #define CL_GL_OBJECT_TEXTURE2D_ARRAY            0x200E
 #define CL_GL_OBJECT_TEXTURE1D                  0x200F
 #define CL_GL_OBJECT_TEXTURE1D_ARRAY            0x2010
 #define CL_GL_OBJECT_TEXTURE_BUFFER             0x2011
 #endif
 /* cl_gl_texture_info           */
 #define CL_GL_TEXTURE_TARGET                    0x2004
 #define CL_GL_MIPMAP_LEVEL                      0x2005
 #ifdef CL_VERSION_1_2
 #define CL_GL_NUM_SAMPLES                       0x2012
 #endif
 extern CL_API_ENTRY cl_mem CL_API_CALL
 clCreateFromGLBuffer(cl_context     context,
                     cl_mem_flags   flags,
                     cl_GLuint      bufobj,
                     cl_int *       errcode_ret) CL_API_SUFFIX__VERSION_1_0;
 #ifdef CL_VERSION_1_2
 extern CL_API_ENTRY cl_mem CL_API_CALL
 clCreateFromGLTexture(cl_context      context,
                      cl_mem_flags    flags,
                      cl_GLenum       target,
                      cl_GLint        miplevel,
                      cl_GLuint       texture,
                      cl_int *        errcode_ret) CL_API_SUFFIX__VERSION_1_2;
 #endif
 extern CL_API_ENTRY cl_mem CL_API_CALL
 clCreateFromGLRenderbuffer(cl_context   context,
                           cl_mem_flags flags,
                           cl_GLuint    renderbuffer,
                           cl_int *     errcode_ret) CL_API_SUFFIX__VERSION_1_0;
 extern CL_API_ENTRY cl_int CL_API_CALL
 clGetGLObjectInfo(cl_mem                memobj,
                  cl_gl_object_type *   gl_object_type,
                  cl_GLuint *           gl_object_name) CL_API_SUFFIX__VERSION_1_0;
 extern CL_API_ENTRY cl_int CL_API_CALL
 clGetGLTextureInfo(cl_mem               memobj,
                   cl_gl_texture_info   param_name,
                   size_t               param_value_size,
                   void *               param_value,
                   size_t *             param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
 extern CL_API_ENTRY cl_int CL_API_CALL
 clEnqueueAcquireGLObjects(cl_command_queue      command_queue,
                          cl_uint               num_objects,
                          const cl_mem *        mem_objects,
                          cl_uint               num_events_in_wait_list,
                          const cl_event *      event_wait_list,
                          cl_event *            event) CL_API_SUFFIX__VERSION_1_0;
 extern CL_API_ENTRY cl_int CL_API_CALL
 clEnqueueReleaseGLObjects(cl_command_queue      command_queue,
                          cl_uint               num_objects,
                          const cl_mem *        mem_objects,
                          cl_uint               num_events_in_wait_list,
                          const cl_event *      event_wait_list,
                          cl_event *            event) CL_API_SUFFIX__VERSION_1_0;
 /* Deprecated OpenCL 1.1 APIs */
 extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL
 clCreateFromGLTexture2D(cl_context      context,
                        cl_mem_flags    flags,
                        cl_GLenum       target,
                        cl_GLint        miplevel,
                        cl_GLuint       texture,
                        cl_int *        errcode_ret) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
 extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL
 clCreateFromGLTexture3D(cl_context      context,
                        cl_mem_flags    flags,
                        cl_GLenum       target,
                        cl_GLint        miplevel,
                        cl_GLuint       texture,
                        cl_int *        errcode_ret) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
 /* cl_khr_gl_sharing extension  */
 #define cl_khr_gl_sharing 1
 typedef cl_uint     cl_gl_context_info;
 /* Additional Error Codes  */
 #define CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR  -1000
 /* cl_gl_context_info  */
 #define CL_CURRENT_DEVICE_FOR_GL_CONTEXT_KHR    0x2006
 #define CL_DEVICES_FOR_GL_CONTEXT_KHR           0x2007
 /* Additional cl_context_properties  */
 #define CL_GL_CONTEXT_KHR                       0x2008
 #define CL_EGL_DISPLAY_KHR                      0x2009
 #define CL_GLX_DISPLAY_KHR                      0x200A
 #define CL_WGL_HDC_KHR                          0x200B
 #define CL_CGL_SHAREGROUP_KHR                   0x200C
 extern CL_API_ENTRY cl_int CL_API_CALL
 clGetGLContextInfoKHR(const cl_context_properties * properties,
                      cl_gl_context_info            param_name,
                      size_t                        param_value_size,
                      void *                        param_value,
                      size_t *                      param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
 typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetGLContextInfoKHR_fn)(
    const cl_context_properties * properties,
    cl_gl_context_info            param_name,
    size_t                        param_value_size,
    void *                        param_value,
    size_t *                      param_value_size_ret);
 #ifdef __cplusplus
 }
 #endif
 #endif  /* __OPENCL_CL_GL_H */
--- a/CL/cl_gl_ext.h
+++ b/CL/cl_gl_ext.h
@ -0,0 +1,40 @@
 /*******************************************************************************
 * Copyright (c) 2008-2020 The Khronos Group Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
 #ifndef __OPENCL_CL_GL_EXT_H
 #define __OPENCL_CL_GL_EXT_H
 #ifdef __cplusplus
 extern "C" {
 #endif
 #include <CL/cl_gl.h>
 /* 
 *  cl_khr_gl_event extension
 */
 #define CL_COMMAND_GL_FENCE_SYNC_OBJECT_KHR     0x200D
 extern CL_API_ENTRY cl_event CL_API_CALL
 clCreateEventFromGLsyncKHR(cl_context context,
                           cl_GLsync  cl_GLsync,
                           cl_int *   errcode_ret) CL_EXT_SUFFIX__VERSION_1_1;
 #ifdef __cplusplus
 }
 #endif
 #endif	/* __OPENCL_CL_GL_EXT_H  */
--- a/CL/cl_half.h
+++ b/CL/cl_half.h
@ -0,0 +1,440 @@
 /*******************************************************************************
 * Copyright (c) 2019-2020 The Khronos Group Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
 /**
 * This is a header-only utility library that provides OpenCL host code with
 * routines for converting to/from cl_half values.
 *
 * Example usage:
 *
 *    #include <CL/cl_half.h>
 *    ...
 *    cl_half h = cl_half_from_float(0.5f, CL_HALF_RTE);
 *    cl_float f = cl_half_to_float(h);
 */
 #ifndef OPENCL_CL_HALF_H
 #define OPENCL_CL_HALF_H
 #include <CL/cl_platform.h>
 #include <stdint.h>
 #ifdef __cplusplus
 extern "C" {
 #endif
 /**
 * Rounding mode used when converting to cl_half.
 */
 typedef enum
 {
  CL_HALF_RTE, // round to nearest even
  CL_HALF_RTZ, // round towards zero
  CL_HALF_RTP, // round towards positive infinity
  CL_HALF_RTN, // round towards negative infinity
 } cl_half_rounding_mode;
 /* Private utility macros. */
 #define CL_HALF_EXP_MASK 0x7C00
 #define CL_HALF_MAX_FINITE_MAG 0x7BFF
 /*
 * Utility to deal with values that overflow when converting to half precision.
 */
 static inline cl_half cl_half_handle_overflow(cl_half_rounding_mode rounding_mode,
                                              uint16_t sign)
 {
  if (rounding_mode == CL_HALF_RTZ)
  {
    // Round overflow towards zero -> largest finite number (preserving sign)
    return (sign << 15) | CL_HALF_MAX_FINITE_MAG;
  }
  else if (rounding_mode == CL_HALF_RTP && sign)
  {
    // Round negative overflow towards positive infinity -> most negative finite number
    return (1 << 15) | CL_HALF_MAX_FINITE_MAG;
  }
  else if (rounding_mode == CL_HALF_RTN && !sign)
  {
    // Round positive overflow towards negative infinity -> largest finite number
    return CL_HALF_MAX_FINITE_MAG;
  }
  // Overflow to infinity
  return (sign << 15) | CL_HALF_EXP_MASK;
 }
 /*
 * Utility to deal with values that underflow when converting to half precision.
 */
 static inline cl_half cl_half_handle_underflow(cl_half_rounding_mode rounding_mode,
                                               uint16_t sign)
 {
  if (rounding_mode == CL_HALF_RTP && !sign)
  {
    // Round underflow towards positive infinity -> smallest positive value
    return (sign << 15) | 1;
  }
  else if (rounding_mode == CL_HALF_RTN && sign)
  {
    // Round underflow towards negative infinity -> largest negative value
    return (sign << 15) | 1;
  }
  // Flush to zero
  return (sign << 15);
 }
 /**
 * Convert a cl_float to a cl_half.
 */
 static inline cl_half cl_half_from_float(cl_float f, cl_half_rounding_mode rounding_mode)
 {
  // Type-punning to get direct access to underlying bits
  union
  {
    cl_float f;
    uint32_t i;
  } f32;
  f32.f = f;
  // Extract sign bit
  uint16_t sign = f32.i >> 31;
  // Extract FP32 exponent and mantissa
  uint32_t f_exp = (f32.i >> (CL_FLT_MANT_DIG - 1)) & 0xFF;
  uint32_t f_mant = f32.i & ((1 << (CL_FLT_MANT_DIG - 1)) - 1);
  // Remove FP32 exponent bias
  int32_t exp = f_exp - CL_FLT_MAX_EXP + 1;
  // Add FP16 exponent bias
  uint16_t h_exp = exp + CL_HALF_MAX_EXP - 1;
  // Position of the bit that will become the FP16 mantissa LSB
  uint32_t lsb_pos = CL_FLT_MANT_DIG - CL_HALF_MANT_DIG;
  // Check for NaN / infinity
  if (f_exp == 0xFF)
  {
    if (f_mant)
    {
      // NaN -> propagate mantissa and silence it
      uint16_t h_mant = f_mant >> lsb_pos;
      h_mant |= 0x200;
      return (sign << 15) | CL_HALF_EXP_MASK | h_mant;
    }
    else
    {
      // Infinity -> zero mantissa
      return (sign << 15) | CL_HALF_EXP_MASK;
    }
  }
  // Check for zero
  if (!f_exp && !f_mant)
  {
    return (sign << 15);
  }
  // Check for overflow
  if (exp >= CL_HALF_MAX_EXP)
  {
    return cl_half_handle_overflow(rounding_mode, sign);
  }
  // Check for underflow
  if (exp < (CL_HALF_MIN_EXP - CL_HALF_MANT_DIG - 1))
  {
    return cl_half_handle_underflow(rounding_mode, sign);
  }
  // Check for value that will become denormal
  if (exp < -14)
  {
    // Denormal -> include the implicit 1 from the FP32 mantissa
    h_exp = 0;
    f_mant |= 1 << (CL_FLT_MANT_DIG - 1);
    // Mantissa shift amount depends on exponent
    lsb_pos = -exp + (CL_FLT_MANT_DIG - 25);
  }
  // Generate FP16 mantissa by shifting FP32 mantissa
  uint16_t h_mant = f_mant >> lsb_pos;
  // Check whether we need to round
  uint32_t halfway = 1 << (lsb_pos - 1);
  uint32_t mask = (halfway << 1) - 1;
  switch (rounding_mode)
  {
    case CL_HALF_RTE:
      if ((f_mant & mask) > halfway)
      {
        // More than halfway -> round up
        h_mant += 1;
      }
      else if ((f_mant & mask) == halfway)
      {
        // Exactly halfway -> round to nearest even
        if (h_mant & 0x1)
          h_mant += 1;
      }
      break;
    case CL_HALF_RTZ:
      // Mantissa has already been truncated -> do nothing
      break;
    case CL_HALF_RTP:
      if ((f_mant & mask) && !sign)
      {
        // Round positive numbers up
        h_mant += 1;
      }
      break;
    case CL_HALF_RTN:
      if ((f_mant & mask) && sign)
      {
        // Round negative numbers down
        h_mant += 1;
      }
      break;
  }
  // Check for mantissa overflow
  if (h_mant & 0x400)
  {
    h_exp += 1;
    h_mant = 0;
  }
  return (sign << 15) | (h_exp << 10) | h_mant;
 }
 /**
 * Convert a cl_double to a cl_half.
 */
 static inline cl_half cl_half_from_double(cl_double d, cl_half_rounding_mode rounding_mode)
 {
  // Type-punning to get direct access to underlying bits
  union
  {
    cl_double d;
    uint64_t i;
  } f64;
  f64.d = d;
  // Extract sign bit
  uint16_t sign = f64.i >> 63;
  // Extract FP64 exponent and mantissa
  uint64_t d_exp = (f64.i >> (CL_DBL_MANT_DIG - 1)) & 0x7FF;
  uint64_t d_mant = f64.i & (((uint64_t)1 << (CL_DBL_MANT_DIG - 1)) - 1);
  // Remove FP64 exponent bias
  int64_t exp = d_exp - CL_DBL_MAX_EXP + 1;
  // Add FP16 exponent bias
  uint16_t h_exp = (uint16_t)(exp + CL_HALF_MAX_EXP - 1);
  // Position of the bit that will become the FP16 mantissa LSB
  uint32_t lsb_pos = CL_DBL_MANT_DIG - CL_HALF_MANT_DIG;
  // Check for NaN / infinity
  if (d_exp == 0x7FF)
  {
    if (d_mant)
    {
      // NaN -> propagate mantissa and silence it
      uint16_t h_mant = (uint16_t)(d_mant >> lsb_pos);
      h_mant |= 0x200;
      return (sign << 15) | CL_HALF_EXP_MASK | h_mant;
    }
    else
    {
      // Infinity -> zero mantissa
      return (sign << 15) | CL_HALF_EXP_MASK;
    }
  }
  // Check for zero
  if (!d_exp && !d_mant)
  {
    return (sign << 15);
  }
  // Check for overflow
  if (exp >= CL_HALF_MAX_EXP)
  {
    return cl_half_handle_overflow(rounding_mode, sign);
  }
  // Check for underflow
  if (exp < (CL_HALF_MIN_EXP - CL_HALF_MANT_DIG - 1))
  {
    return cl_half_handle_underflow(rounding_mode, sign);
  }
  // Check for value that will become denormal
  if (exp < -14)
  {
    // Include the implicit 1 from the FP64 mantissa
    h_exp = 0;
    d_mant |= (uint64_t)1 << (CL_DBL_MANT_DIG - 1);
    // Mantissa shift amount depends on exponent
    lsb_pos = (uint32_t)(-exp + (CL_DBL_MANT_DIG - 25));
  }
  // Generate FP16 mantissa by shifting FP64 mantissa
  uint16_t h_mant = (uint16_t)(d_mant >> lsb_pos);
  // Check whether we need to round
  uint64_t halfway = (uint64_t)1 << (lsb_pos - 1);
  uint64_t mask = (halfway << 1) - 1;
  switch (rounding_mode)
  {
    case CL_HALF_RTE:
      if ((d_mant & mask) > halfway)
      {
        // More than halfway -> round up
        h_mant += 1;
      }
      else if ((d_mant & mask) == halfway)
      {
        // Exactly halfway -> round to nearest even
        if (h_mant & 0x1)
          h_mant += 1;
      }
      break;
    case CL_HALF_RTZ:
      // Mantissa has already been truncated -> do nothing
      break;
    case CL_HALF_RTP:
      if ((d_mant & mask) && !sign)
      {
        // Round positive numbers up
        h_mant += 1;
      }
      break;
    case CL_HALF_RTN:
      if ((d_mant & mask) && sign)
      {
        // Round negative numbers down
        h_mant += 1;
      }
      break;
  }
  // Check for mantissa overflow
  if (h_mant & 0x400)
  {
    h_exp += 1;
    h_mant = 0;
  }
  return (sign << 15) | (h_exp << 10) | h_mant;
 }
 /**
 * Convert a cl_half to a cl_float.
 */
 static inline cl_float cl_half_to_float(cl_half h)
 {
  // Type-punning to get direct access to underlying bits
  union
  {
    cl_float f;
    uint32_t i;
  } f32;
  // Extract sign bit
  uint16_t sign = h >> 15;
  // Extract FP16 exponent and mantissa
  uint16_t h_exp = (h >> (CL_HALF_MANT_DIG - 1)) & 0x1F;
  uint16_t h_mant = h & 0x3FF;
  // Remove FP16 exponent bias
  int32_t exp = h_exp - CL_HALF_MAX_EXP + 1;
  // Add FP32 exponent bias
  uint32_t f_exp = exp + CL_FLT_MAX_EXP - 1;
  // Check for NaN / infinity
  if (h_exp == 0x1F)
  {
    if (h_mant)
    {
      // NaN -> propagate mantissa and silence it
      uint32_t f_mant = h_mant << (CL_FLT_MANT_DIG - CL_HALF_MANT_DIG);
      f_mant |= 0x400000;
      f32.i = (sign << 31) | 0x7F800000 | f_mant;
      return f32.f;
    }
    else
    {
      // Infinity -> zero mantissa
      f32.i = (sign << 31) | 0x7F800000;
      return f32.f;
    }
  }
  // Check for zero / denormal
  if (h_exp == 0)
  {
    if (h_mant == 0)
    {
      // Zero -> zero exponent
      f_exp = 0;
    }
    else
    {
      // Denormal -> normalize it
      // - Shift mantissa to make most-significant 1 implicit
      // - Adjust exponent accordingly
      uint32_t shift = 0;
      while ((h_mant & 0x400) == 0)
      {
        h_mant <<= 1;
        shift++;
      }
      h_mant &= 0x3FF;
      f_exp -= shift - 1;
    }
  }
  f32.i = (sign << 31) | (f_exp << 23) | (h_mant << 13);
  return f32.f;
 }
 #undef CL_HALF_EXP_MASK
 #undef CL_HALF_MAX_FINITE_MAG
 #ifdef __cplusplus
 }
 #endif
 #endif  /* OPENCL_CL_HALF_H */
--- a/CL/cl_icd.h
+++ b/CL/cl_icd.h
--- a/CL/cl_platform.h
+++ b/CL/cl_platform.h
--- a/CL/cl_va_api_media_sharing_intel.h
+++ b/CL/cl_va_api_media_sharing_intel.h
@ -0,0 +1,160 @@
 /*******************************************************************************
 * Copyright (c) 2008-2020 The Khronos Group Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
 /*****************************************************************************\
 Copyright (c) 2013-2019 Intel Corporation All Rights Reserved.
 THESE MATERIALS ARE PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
 CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THESE
 MATERIALS, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 File Name: cl_va_api_media_sharing_intel.h
 Abstract:
 Notes:
 \*****************************************************************************/
 #ifndef __OPENCL_CL_VA_API_MEDIA_SHARING_INTEL_H
 #define __OPENCL_CL_VA_API_MEDIA_SHARING_INTEL_H
 #include <CL/cl.h>
 #include <CL/cl_platform.h>
 #include <va/va.h>
 #ifdef __cplusplus
 extern "C" {
 #endif
 /******************************************
 * cl_intel_va_api_media_sharing extension *
 *******************************************/
 #define cl_intel_va_api_media_sharing 1
 /* error codes */
 #define CL_INVALID_VA_API_MEDIA_ADAPTER_INTEL               -1098
 #define CL_INVALID_VA_API_MEDIA_SURFACE_INTEL               -1099
 #define CL_VA_API_MEDIA_SURFACE_ALREADY_ACQUIRED_INTEL      -1100
 #define CL_VA_API_MEDIA_SURFACE_NOT_ACQUIRED_INTEL          -1101
 /* cl_va_api_device_source_intel */
 #define CL_VA_API_DISPLAY_INTEL                             0x4094
 /* cl_va_api_device_set_intel */
 #define CL_PREFERRED_DEVICES_FOR_VA_API_INTEL               0x4095
 #define CL_ALL_DEVICES_FOR_VA_API_INTEL                     0x4096
 /* cl_context_info */
 #define CL_CONTEXT_VA_API_DISPLAY_INTEL                     0x4097
 /* cl_mem_info */
 #define CL_MEM_VA_API_MEDIA_SURFACE_INTEL                   0x4098
 /* cl_image_info */
 #define CL_IMAGE_VA_API_PLANE_INTEL                         0x4099
 /* cl_command_type */
 #define CL_COMMAND_ACQUIRE_VA_API_MEDIA_SURFACES_INTEL      0x409A
 #define CL_COMMAND_RELEASE_VA_API_MEDIA_SURFACES_INTEL      0x409B
 typedef cl_uint cl_va_api_device_source_intel;
 typedef cl_uint cl_va_api_device_set_intel;
 extern CL_API_ENTRY cl_int CL_API_CALL
 clGetDeviceIDsFromVA_APIMediaAdapterINTEL(
    cl_platform_id                platform,
    cl_va_api_device_source_intel media_adapter_type,
    void*                         media_adapter,
    cl_va_api_device_set_intel    media_adapter_set,
    cl_uint                       num_entries,
    cl_device_id*                 devices,
    cl_uint*                      num_devices) CL_EXT_SUFFIX__VERSION_1_2;
 typedef CL_API_ENTRY cl_int (CL_API_CALL * clGetDeviceIDsFromVA_APIMediaAdapterINTEL_fn)(
    cl_platform_id                platform,
    cl_va_api_device_source_intel media_adapter_type,
    void*                         media_adapter,
    cl_va_api_device_set_intel    media_adapter_set,
    cl_uint                       num_entries,
    cl_device_id*                 devices,
    cl_uint*                      num_devices) CL_EXT_SUFFIX__VERSION_1_2;
 extern CL_API_ENTRY cl_mem CL_API_CALL
 clCreateFromVA_APIMediaSurfaceINTEL(
    cl_context                    context,
    cl_mem_flags                  flags,
    VASurfaceID*                  surface,
    cl_uint                       plane,
    cl_int*                       errcode_ret) CL_EXT_SUFFIX__VERSION_1_2;
 typedef CL_API_ENTRY cl_mem (CL_API_CALL * clCreateFromVA_APIMediaSurfaceINTEL_fn)(
    cl_context                    context,
    cl_mem_flags                  flags,
    VASurfaceID*                  surface,
    cl_uint                       plane,
    cl_int*                       errcode_ret) CL_EXT_SUFFIX__VERSION_1_2;
 extern CL_API_ENTRY cl_int CL_API_CALL
 clEnqueueAcquireVA_APIMediaSurfacesINTEL(
    cl_command_queue              command_queue,
    cl_uint                       num_objects,
    const cl_mem*                 mem_objects,
    cl_uint                       num_events_in_wait_list,
    const cl_event*               event_wait_list,
    cl_event*                     event) CL_EXT_SUFFIX__VERSION_1_2;
 typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireVA_APIMediaSurfacesINTEL_fn)(
    cl_command_queue              command_queue,
    cl_uint                       num_objects,
    const cl_mem*                 mem_objects,
    cl_uint                       num_events_in_wait_list,
    const cl_event*               event_wait_list,
    cl_event*                     event) CL_EXT_SUFFIX__VERSION_1_2;
 extern CL_API_ENTRY cl_int CL_API_CALL
 clEnqueueReleaseVA_APIMediaSurfacesINTEL(
    cl_command_queue              command_queue,
    cl_uint                       num_objects,
    const cl_mem*                 mem_objects,
    cl_uint                       num_events_in_wait_list,
    const cl_event*               event_wait_list,
    cl_event*                     event) CL_EXT_SUFFIX__VERSION_1_2;
 typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseVA_APIMediaSurfacesINTEL_fn)(
    cl_command_queue              command_queue,
    cl_uint                       num_objects,
    const cl_mem*                 mem_objects,
    cl_uint                       num_events_in_wait_list,
    const cl_event*               event_wait_list,
    cl_event*                     event) CL_EXT_SUFFIX__VERSION_1_2;
 #ifdef __cplusplus
 }
 #endif
 #endif  /* __OPENCL_CL_VA_API_MEDIA_SHARING_INTEL_H */
--- a/CL/cl_version.h
+++ b/CL/cl_version.h
@ -0,0 +1,81 @@
 /*******************************************************************************
 * Copyright (c) 2018-2020 The Khronos Group Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
 #ifndef __CL_VERSION_H
 #define __CL_VERSION_H
 /* Detect which version to target */
 #if !defined(CL_TARGET_OPENCL_VERSION)
 #pragma message("cl_version.h: CL_TARGET_OPENCL_VERSION is not defined. Defaulting to 220 (OpenCL 2.2)")
 #define CL_TARGET_OPENCL_VERSION 220
 #endif
 #if CL_TARGET_OPENCL_VERSION != 100 && \
    CL_TARGET_OPENCL_VERSION != 110 && \
    CL_TARGET_OPENCL_VERSION != 120 && \
    CL_TARGET_OPENCL_VERSION != 200 && \
    CL_TARGET_OPENCL_VERSION != 210 && \
    CL_TARGET_OPENCL_VERSION != 220 && \
    CL_TARGET_OPENCL_VERSION != 300
 #pragma message("cl_version: CL_TARGET_OPENCL_VERSION is not a valid value (100, 110, 120, 200, 210, 220, 300). Defaulting to 220 (OpenCL 2.2)")
 #undef CL_TARGET_OPENCL_VERSION
 #define CL_TARGET_OPENCL_VERSION 220
 #endif
 /* OpenCL Version */
 #if CL_TARGET_OPENCL_VERSION >= 300 && !defined(CL_VERSION_3_0)
 #define CL_VERSION_3_0  1
 #endif
 #if CL_TARGET_OPENCL_VERSION >= 220 && !defined(CL_VERSION_2_2)
 #define CL_VERSION_2_2  1
 #endif
 #if CL_TARGET_OPENCL_VERSION >= 210 && !defined(CL_VERSION_2_1)
 #define CL_VERSION_2_1  1
 #endif
 #if CL_TARGET_OPENCL_VERSION >= 200 && !defined(CL_VERSION_2_0)
 #define CL_VERSION_2_0  1
 #endif
 #if CL_TARGET_OPENCL_VERSION >= 120 && !defined(CL_VERSION_1_2)
 #define CL_VERSION_1_2  1
 #endif
 #if CL_TARGET_OPENCL_VERSION >= 110 && !defined(CL_VERSION_1_1)
 #define CL_VERSION_1_1  1
 #endif
 #if CL_TARGET_OPENCL_VERSION >= 100 && !defined(CL_VERSION_1_0)
 #define CL_VERSION_1_0  1
 #endif
 /* Allow deprecated APIs for older OpenCL versions. */
 #if CL_TARGET_OPENCL_VERSION <= 220 && !defined(CL_USE_DEPRECATED_OPENCL_2_2_APIS)
 #define CL_USE_DEPRECATED_OPENCL_2_2_APIS
 #endif
 #if CL_TARGET_OPENCL_VERSION <= 210 && !defined(CL_USE_DEPRECATED_OPENCL_2_1_APIS)
 #define CL_USE_DEPRECATED_OPENCL_2_1_APIS
 #endif
 #if CL_TARGET_OPENCL_VERSION <= 200 && !defined(CL_USE_DEPRECATED_OPENCL_2_0_APIS)
 #define CL_USE_DEPRECATED_OPENCL_2_0_APIS
 #endif
 #if CL_TARGET_OPENCL_VERSION <= 120 && !defined(CL_USE_DEPRECATED_OPENCL_1_2_APIS)
 #define CL_USE_DEPRECATED_OPENCL_1_2_APIS
 #endif
 #if CL_TARGET_OPENCL_VERSION <= 110 && !defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
 #define CL_USE_DEPRECATED_OPENCL_1_1_APIS
 #endif
 #if CL_TARGET_OPENCL_VERSION <= 100 && !defined(CL_USE_DEPRECATED_OPENCL_1_0_APIS)
 #define CL_USE_DEPRECATED_OPENCL_1_0_APIS
 #endif
 #endif  /* __CL_VERSION_H */
--- a/CL/opencl.h
+++ b/CL/opencl.h
@ -0,0 +1,33 @@
 /*******************************************************************************
 * Copyright (c) 2008-2020 The Khronos Group Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
 #ifndef __OPENCL_H
 #define __OPENCL_H
 #ifdef __cplusplus
 extern "C" {
 #endif
 #include <CL/cl.h>
 #include <CL/cl_gl.h>
 #include <CL/cl_gl_ext.h>
 #include <CL/cl_ext.h>
 #ifdef __cplusplus
 }
 #endif
 #endif  /* __OPENCL_H   */
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -0,0 +1,190 @@
 cmake_minimum_required(VERSION 3.13 FATAL_ERROR)
 project(BabelStream VERSION 3.5 LANGUAGES CXX)
 #set(CMAKE_VERBOSE_MAKEFILE ON)
 # some nicer defaults for standard C++
 set(CMAKE_CXX_EXTENSIONS OFF)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 #set(MODEL SYCL)
 #set(SYCL_COMPILER COMPUTECPP)
 #set(SYCL_COMPILER_DIR /home/tom/Desktop/computecpp_archive/ComputeCpp-CE-2.3.0-x86_64-linux-gnu)
 #set(MODEL RAJA)
 #set(RAJA_IN_TREE /home/tom/Downloads/RAJA-v0.13.0/)
 #set(ENABLE_CUDA ON)
 #set(TARGET NVIDIA)
 #set(CUDA_TOOLKIT_ROOT_DIR /opt/cuda-11.2)
 #set(CUDA_ARCH sm_70)
 #set(BLT_DIR /home/tom/Downloads/blt-0.3.6/)
 #set(MODEL STD)
 #set(ARCH  cc70)
 #set(CXX_EXTRA_FLAGS -v)
 #set(MODEL CUDA)
 #set(ARCH  sm_70)
 #set(CMAKE_CUDA_COMPILER /opt/cuda-11.2/bin/nvcc)
 #set(MODEL OCL)
 #set(OpenCL_LIBRARY /opt/rocm-4.0.0/opencl/lib/libOpenCL.so)
 #set(OpenCL_INCLUDE_DIR /opt/rocm-4.0.0/opencl/lib)
 #set(RELEASE_FLAGS -Ofast)
 #set(CXX_EXTRA_FLAGS -O2)
 #set(CMAKE_CXX_COMPILER /usr/lib/aomp/bin/clang++)
 #set(MODEL OMP)
 ##set(OFFLOAD "AMD:gfx803")
 #set(OFFLOAD "NVIDIA:sm_35")
 #set(CXX_EXTRA_FLAGS --cuda-path=/opt/cuda-10.2/)
 #set(OFFLOAD "AMD:_70")
 #set(CXX_EXTRA_FLAGS --cuda-path=/opt/cuda-10.2/ --gcc-toolchain=/home/tom/spack/opt/spack/linux-fedora33-zen2/gcc-10.2.1/gcc-8.3.0-latmjo2hl2yv53255xkwko7k3y7bx2vv)
 #set(CXX_EXTRA_LINKER_FLAGS  )
 #set(MODEL HIP)
 #set(MODEL KOKKOS)
 #set(KOKKOS_IN_TREE /home/tom/Downloads/kokkos-3.3.00/)
 # the final executable name
 set(EXE_NAME babelstream)
 # select default build type
 set(CMAKE_BUILD_TYPE "Release")
 # for chrono and some basic CXX features, models can overwrite this if required
 set(CMAKE_CXX_STANDARD 11)
 if (NOT CMAKE_BUILD_TYPE)
    message("No CMAKE_BUILD_TYPE specified, defaulting to 'Release'")
    set(CMAKE_BUILD_TYPE "Release")
 endif ()
 string(TOUPPER ${CMAKE_BUILD_TYPE} BUILD_TYPE)
 if ((NOT BUILD_TYPE STREQUAL RELEASE) AND (NOT BUILD_TYPE STREQUAL DEBUG))
    message(FATAL_ERROR "Only Release or Debug is supported, got `${CMAKE_BUILD_TYPE}`")
 endif ()
 # setup some defaults flags for everything
 set(DEFAULT_DEBUG_FLAGS -O2 -fno-omit-frame-pointer)
 set(DEFAULT_RELEASE_FLAGS -O3 -march=native)
 macro(hint_flag FLAG DESCRIPTION)
    if (NOT DEFINED ${FLAG})
        message(STATUS "${FLAG}: ${DESCRIPTION}")
    else ()
        # i.e. `-DFOO="-a -b"` becomes CMake's semicolon separated list `FOO=`-a;-b`
        separate_arguments(${FLAG})
    endif ()
 endmacro()
 # hint common extra flag options for all models if they are not set
 hint_flag(CXX_EXTRA_FLAGS "
        Appends to common compile flags. These will be appended at link phase as well.
        To use separate flags at link phase, set `CXX_EXTRA_LINK_FLAGS`")
 hint_flag(CXX_EXTRA_LINK_FLAGS "
        Appends to link flags which appear *before* the objects.
        Do not use this for linking libraries, as the link line is order-dependent")
 hint_flag(CXX_EXTRA_LIBRARIES "
        Append to link flags which appear *after* the objects.
        Use this for linking extra libraries (e.g `-lmylib`, or simply `mylib`)")
 hint_flag(CXX_EXTRA_LINKER_FLAGS "
        Append to linker flags (i.e GCC's `-Wl` or equivalent)")
 # copy CXX_EXTRA_FLAGS <- CXX_EXTRA_LINK_FLAGS
 if ((DEFINED CXX_EXTRA_FLAGS) AND (NOT DEFINED CXX_EXTRA_LINK_FLAGS))
    set(CXX_EXTRA_LINK_FLAGS ${CXX_EXTRA_FLAGS})
 endif ()
 # include our macros
 include(register_models.cmake)
 # register out models <model_name> <preprocessor_def_name> <source files...>
 register_model(OMP OMP OMPStream.cpp)
 register_model(OCL OCL OCLStream.cpp)
 register_model(STD STD STDStream.cpp)
 register_model(STD20 STD20 STD20Stream.cpp)
 register_model(HIP HIP HIPStream.cpp)
 register_model(CUDA CUDA CUDAStream.cu)
 register_model(KOKKOS KOKKOS KokkosStream.cpp)
 register_model(SYCL SYCL SYCLStream.cpp)
 register_model(ACC ACC ACCStream.cpp)
 # defining RAJA collides with the RAJA namespace so USE_RAJA
 register_model(RAJA USE_RAJA RAJAStream.cpp)
 set(USAGE ON CACHE BOOL "Whether to print all custom flags for the selected model")
 message(STATUS "Available models:  ${REGISTERED_MODELS}")
 if (NOT DEFINED MODEL)
    message(FATAL_ERROR "MODEL is unspecified, pick one from the available models")
 else ()
    message(STATUS "Selected model  :  ${MODEL}")
 endif ()
 # load the $MODEL.cmake file and setup the correct IMPL_* based on $MODEL
 load_model(${MODEL})
 if (USAGE)
    # print the usage of the registered flag options
    registered_flags_action(print RESULT)
    message(STATUS "${RESULT}")
 endif ()
 # check required/set default for all registered flag options
 registered_flags_action(check RESULT)
 message(STATUS "${RESULT}")
 # run model specific setup, i.e append build flags, etc
 setup()
 # CMake insists that -O2 (or equivalent) is the universally accepted optimisation level
 # we remove that here and use our own <BUILD_TYPE>_FLAGS
 wipe_gcc_style_optimisation_flags(CMAKE_CXX_FLAGS_${BUILD_TYPE})
 message(STATUS "Default ${CMAKE_BUILD_TYPE} flags are `${DEFAULT_${BUILD_TYPE}_FLAGS}`, set ${BUILD_TYPE}_FLAGS to override (CXX_EXTRA_* flags are not affected)")
 # setup common build flag defaults if there are no overrides
 if (NOT DEFINED ${BUILD_TYPE}_FLAGS)
    set(ACTUAL_${BUILD_TYPE}_FLAGS ${DEFAULT_${BUILD_TYPE}_FLAGS})
    elseif()
    set(ACTUAL_${BUILD_TYPE}_FLAGS ${${BUILD_TYPE}_FLAGS})
 endif ()
 message(STATUS "CXX vendor  : ${CMAKE_CXX_COMPILER_ID} (${CMAKE_CXX_COMPILER})")
 message(STATUS "Platform    : ${CMAKE_SYSTEM_PROCESSOR}")
 message(STATUS "Sources     : ${IMPL_SOURCES}")
 message(STATUS "Libraries   : ${LINK_LIBRARIES}")
 message(STATUS "CXX Flags   : ${CMAKE_CXX_FLAGS_${BUILD_TYPE}} ${ACTUAL_${BUILD_TYPE}_FLAGS} ${CXX_EXTRA_FLAGS}
    CXX flags derived from (CMake + (Override ? Override : Default) + Extras), where:
        CMake                    = `${CMAKE_CXX_FLAGS_${BUILD_TYPE}}`
        Default                  = `${DEFAULT_${BUILD_TYPE}_FLAGS}`
        Override (RELEASE_FLAGS) = `${${BUILD_TYPE}_FLAGS}`
        Extras (CXX_EXTRA_FLAGS) = `${CXX_EXTRA_FLAGS}`")
 message(STATUS "Link Flags  : ${LINK_FLAGS} ${CXX_EXTRA_LINK_FLAGS}")
 message(STATUS "Linker Flags: ${CMAKE_EXE_LINKER_FLAGS} ${CXX_EXTRA_LINKER_FLAGS} ")
 message(STATUS "Defs        : ${IMPL_DEFINITIONS}")
 message(STATUS "Executable  : ${EXE_NAME}")
 # below we have all the usual CMake target setup steps
 add_executable(${EXE_NAME} ${IMPL_SOURCES} main.cpp)
 target_link_libraries(${EXE_NAME} PUBLIC ${LINK_LIBRARIES})
 target_compile_definitions(${EXE_NAME} PUBLIC ${IMPL_DEFINITIONS})
 if (CXX_EXTRA_LIBRARIES)
    target_link_libraries(${EXE_NAME} PUBLIC ${CXX_EXTRA_LIBRARIES})
 endif ()
 target_compile_options(${EXE_NAME} PUBLIC "$<$<CONFIG:Release>:${ACTUAL_RELEASE_FLAGS};${CXX_EXTRA_FLAGS}>")
 target_compile_options(${EXE_NAME} PUBLIC "$<$<CONFIG:Debug>:${ACTUAL_DEBUG_FLAGS};${CXX_EXTRA_FLAGS}>")
 target_link_options(${EXE_NAME} PUBLIC LINKER:${CXX_EXTRA_LINKER_FLAGS})
 target_link_options(${EXE_NAME} PUBLIC ${LINK_FLAGS} ${CXX_EXTRA_LINK_FLAGS})
 # some models require the target to be already specified so they can finish their setup here
 # this only happens if the MODEL.cmake definition contains the `setup_target` macro
 if (COMMAND setup_target)
    setup_target(${EXE_NAME})
 endif ()
--- a/CUDA.cmake
+++ b/CUDA.cmake
@ -0,0 +1,43 @@
 register_flag_optional(CMAKE_CXX_COMPILER
        "Any CXX compiler that is supported by CMake detection, this is used for host compilation"
        "c++")
 register_flag_optional(MEM "Device memory mode:
        DEFAULT   - allocate host and device memory pointers.
        MANAGED   - use CUDA Managed Memory.
        PAGEFAULT - shared memory, only host pointers allocated."
        "DEFAULT")
 register_flag_required(CMAKE_CUDA_COMPILER
        "Path to the CUDA nvcc compiler")
 # XXX we may want to drop this eventually and use CMAKE_CUDA_ARCHITECTURES directly
 register_flag_required(CUDA_ARCH
        "Nvidia architecture, will be passed in via `-arch=` (e.g `sm_70`) for nvcc")
 register_flag_optional(CUDA_EXTRA_FLAGS
        "Additional CUDA flags passed to nvcc, this is appended after `CUDA_ARCH`"
        "")
 macro(setup)
    # XXX CMake 3.18 supports CMAKE_CUDA_ARCHITECTURES/CUDA_ARCHITECTURES but we support older CMakes
    if(POLICY CMP0104)
        cmake_policy(SET CMP0104 OLD)
    endif()
    enable_language(CUDA)
    register_definitions(MEM=${MEM})
    # add -forward-unknown-to-host-compiler for compatibility reasons
    set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "-forward-unknown-to-host-compiler -arch=${CUDA_ARCH}" ${CUDA_EXTRA_FLAGS})
    # CMake defaults to -O2 for CUDA at Release, let's wipe that and use the global RELEASE_FLAG
    # appended later
    wipe_gcc_style_optimisation_flags(CMAKE_CUDA_FLAGS_${BUILD_TYPE})
    message(STATUS "NVCC flags: ${CMAKE_CUDA_FLAGS} ${CMAKE_CUDA_FLAGS_${BUILD_TYPE}}")
 endmacro()
--- a/HIP.cmake
+++ b/HIP.cmake
@ -0,0 +1,7 @@
 register_flag_required(CMAKE_CXX_COMPILER
        "Absolute path to the AMD HIP C++ compiler")
 macro(setup)
    # nothing to do here as hipcc does everything correctly, what a surprise!
 endmacro()
--- a/KOKKOS.cmake
+++ b/KOKKOS.cmake
@ -0,0 +1,40 @@
 register_flag_optional(CMAKE_CXX_COMPILER
        "Any CXX compiler that is supported by CMake detection and RAJA.
         See https://github.com/kokkos/kokkos#primary-tested-compilers-on-x86-are"
        "c++")
 register_flag_required(KOKKOS_IN_TREE
        "Absolute path to the *source* distribution directory of Kokkos.
         Remember to append Kokkos specific flags as well, for example:
             -DKOKKOS_IN_TREE=... -DKokkos_ENABLE_OPENMP=ON -DKokkos_ARCH_ZEN=ON ...
         See https://github.com/kokkos/kokkos/blob/master/BUILD.md for all available options")
 # compiler vendor and arch specific flags
 set(KOKKOS_FLAGS_CPU_INTEL -qopt-streaming-stores=always)
 macro(setup)
    set(CMAKE_CXX_STANDARD 14)
    cmake_policy(SET CMP0074 NEW) #see https://github.com/kokkos/kokkos/blob/master/BUILD.md
    message(STATUS "Building using in-tree Kokkos source at `${KOKKOS_IN_TREE}`")
    if (EXISTS "${KOKKOS_IN_TREE}")
        add_subdirectory(${KOKKOS_IN_TREE} ${CMAKE_BINARY_DIR}/kokkos)
        register_link_library(Kokkos::kokkos)
    else ()
        message(FATAL_ERROR "`${KOKKOS_IN_TREE}` does not exist")
    endif ()
    register_append_compiler_and_arch_specific_cxx_flags(
            KOKKOS_FLAGS_CPU
            ${CMAKE_CXX_COMPILER_ID}
            ${CMAKE_SYSTEM_PROCESSOR}
    )
 endmacro()
--- a/OCL.cmake
+++ b/OCL.cmake
@ -0,0 +1,17 @@
 register_flag_optional(CMAKE_CXX_COMPILER
        "Any CXX compiler that is supported by CMake detection"
        "c++")
 register_flag_optional(OpenCL_LIBRARY
        "Path to OpenCL library, usually called libOpenCL.so"
        "${OpenCL_LIBRARY}")
 macro(setup)
    # don't point to the CL dir as the imports already have the CL prefix
    set(OpenCL_INCLUDE_DIR "${CMAKE_SOURCE_DIR}")
    find_package(OpenCL REQUIRED)
    register_link_library(OpenCL::OpenCL)
 endmacro()
--- a/OMP.cmake
+++ b/OMP.cmake
@ -0,0 +1,174 @@
 # Compiler ID for reference (as of CMake 3.13)
 #    Absoft = Absoft Fortran (absoft.com)
 #    ADSP = Analog VisualDSP++ (analog.com)
 #    AppleClang = Apple Clang (apple.com)
 #    ARMCC = ARM Compiler (arm.com)
 #    Bruce = Bruce C Compiler
 #    CCur = Concurrent Fortran (ccur.com)
 #    Clang = LLVM Clang (clang.llvm.org)
 #    Cray = Cray Compiler (cray.com)
 #    Embarcadero, Borland = Embarcadero (embarcadero.com)
 #    G95 = G95 Fortran (g95.org)
 #    GNU = GNU Compiler Collection (gcc.gnu.org)
 #    HP = Hewlett-Packard Compiler (hp.com)
 #    IAR = IAR Systems (iar.com)
 #    Intel = Intel Compiler (intel.com)
 #    MIPSpro = SGI MIPSpro (sgi.com)
 #    MSVC = Microsoft Visual Studio (microsoft.com)
 #    NVIDIA = NVIDIA CUDA Compiler (nvidia.com)
 #    OpenWatcom = Open Watcom (openwatcom.org)
 #    PGI = The Portland Group (pgroup.com)
 #    Flang = Flang Fortran Compiler
 #    PathScale = PathScale (pathscale.com)
 #    SDCC = Small Device C Compiler (sdcc.sourceforge.net)
 #    SunPro = Oracle Solaris Studio (oracle.com)
 #    TI = Texas Instruments (ti.com)
 #    TinyCC = Tiny C Compiler (tinycc.org)
 #    XL, VisualAge, zOS = IBM XL (ibm.com)
 # These are only added in CMake 3.15:
 #    ARMClang = ARM Compiler based on Clang (arm.com)
 # These are only added in CMake 3.20:
 #    NVHPC = NVIDIA HPC SDK Compiler (nvidia.com)
 # CMAKE_SYSTEM_PROCESSOR is set via `uname -p`, we have:
 # Power9 = ppc64le
 # x64    = x86_64
 # arm64  = aarch64
 #
 #predefined offload flags based on compiler id
 set(OMP_FLAGS_OFFLOAD_INTEL
        -qnextgen -fiopenmp -fopenmp-targets=spir64)
 set(OMP_FLAGS_OFFLOAD_GNU_NVIDIA
        -foffload=nvptx-none)
 set(OMP_FLAGS_OFFLOAD_GNU_AMD
        -foffload=amdgcn-amdhsa)
 set(OMP_FLAGS_OFFLOAD_CLANG_NVIDIA
        -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda -Xopenmp-target=nvptx64-nvidia-cuda)
 set(OMP_FLAGS_OFFLOAD_CLANG_AMD
        -fopenmp=libomp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa)
 set(OMP_FLAGS_OFFLOAD_CLANG_ARCH_FLAG
        -march=) # prefix only, arch appended by the vendor:arch tuple
 set(OMP_FLAGS_CPU_INTEL
        -qopt-streaming-stores=always)
 set(OMP_FLAGS_CPU_GNU_PPC64LE
        -mcpu=native)
 set(OMP_FLAGS_CPU_XL
        -O5 -qarch=auto -qtune=auto)
 # NEC
 set(OMP_FLAGS_CPU_NEC -O4 -finline)
 register_flag_optional(CMAKE_CXX_COMPILER
        "Any CXX compiler that supports OpenMP as per CMake detection (and offloading if enabled with `OFFLOAD`)"
        "c++")
 register_flag_optional(ARCH
        "This overrides CMake's CMAKE_SYSTEM_PROCESSOR detection which uses (uname -p), this is mainly for use with
         specialised accelerators only and not to be confused with offload which is is mutually exclusive with this.
         Supported values are:
          - NEC"
        "")
 register_flag_optional(OFFLOAD
        "Whether to use OpenMP offload, the format is <VENDOR:ARCH?>|ON|OFF.
        We support a small set of known offload flags for clang, gcc, and icpx.
        However, as offload support is rapidly evolving, we recommend you directly supply them via OFFLOAD_FLAGS.
        For example:
          * OFFLOAD=NVIDIA:sm_60
          * OFFLOAD=AMD:gfx906
          * OFFLOAD=INTEL
          * OFFLOAD=ON OFFLOAD_FLAGS=..."
        OFF)
 register_flag_optional(OFFLOAD_FLAGS
        "If OFFLOAD is enabled, this *overrides* the default offload flags"
        "")
 register_flag_optional(OFFLOAD_APPEND_LINK_FLAG
        "If enabled, this appends all resolved offload flags (OFFLOAD=<vendor:arch> or directly from OFFLOAD_FLAGS) to the link flags.
        This is required for most offload implementations so that offload libraries can linked correctly."
        ON)
 macro(setup)
    find_package(OpenMP REQUIRED)
    register_link_library(OpenMP::OpenMP_CXX)
    string(TOUPPER ${CMAKE_CXX_COMPILER_ID} COMPILER)
    if(NOT ARCH)
        string(TOUPPER ${CMAKE_SYSTEM_PROCESSOR} ARCH)
    else()
        message(STATUS "Using custom arch: ${ARCH}")
    endif()
    if (("${OFFLOAD}" STREQUAL OFF) OR (NOT DEFINED OFFLOAD))
        # no offload
        # resolve the CPU specific flags
        # starting with ${COMPILER_VENDOR}_${PLATFORM_ARCH}, then try ${COMPILER_VENDOR}, and then give up
        register_append_compiler_and_arch_specific_cxx_flags(
                OMP_FLAGS_CPU
                ${COMPILER}
                ${ARCH}
        )
    elseif ("${OFFLOAD}" STREQUAL ON)
        #  offload but with custom flags
        register_definitions(OMP_TARGET_GPU)
        separate_arguments(OFFLOAD_FLAGS)
        set(OMP_FLAGS ${OFFLOAD_FLAGS})
    elseif ((DEFINED OFFLOAD) AND OFFLOAD_FLAGS)
        # offload but OFFLOAD_FLAGS overrides
        register_definitions(OMP_TARGET_GPU)
        separate_arguments(OFFLOAD_FLAGS)
        list(OMP_FLAGS APPEND ${OFFLOAD_FLAGS})
    else ()
        # handle the vendor:arch value
        string(REPLACE ":" ";" OFFLOAD_TUPLE "${OFFLOAD}")
        list(LENGTH OFFLOAD_TUPLE LEN)
        if (LEN EQUAL 1)
            #  offload with <vendor> tuple
            list(GET OFFLOAD_TUPLE 0 OFFLOAD_VENDOR)
            # append OMP_FLAGS_OFFLOAD_<vendor> if  exists
            list(APPEND OMP_FLAGS ${OMP_FLAGS_OFFLOAD_${OFFLOAD_VENDOR}})
        elseif (LEN EQUAL 2)
            #  offload with <vendor:arch> tuple
            list(GET OFFLOAD_TUPLE 0 OFFLOAD_VENDOR)
            list(GET OFFLOAD_TUPLE 1 OFFLOAD_ARCH)
            # append OMP_FLAGS_OFFLOAD_<compiler>_<vendor> if exists
            list(APPEND OMP_FLAGS ${OMP_FLAGS_OFFLOAD_${COMPILER}_${OFFLOAD_VENDOR}})
            # append offload arch if OMP_FLAGS_OFFLOAD_<compiler>_ARCH_FLAG if exists
            if (DEFINED OMP_FLAGS_OFFLOAD_${COMPILER}_ARCH_FLAG)
                list(APPEND OMP_FLAGS
                        "${OMP_FLAGS_OFFLOAD_${COMPILER}_ARCH_FLAG}${OFFLOAD_ARCH}")
            endif ()
        else ()
            message(FATAL_ERROR "Unrecognised OFFLOAD format: `${OFFLOAD}`, consider directly using OFFLOAD_FLAGS")
        endif ()
    endif ()
    message(STATUS "OMP CXX  flags : ${OMP_FLAGS}")
    message(STATUS "OMP Link flags : ${OMP_LINK_FLAGS}")
    # propagate flags to linker so that it links with the offload stuff as well
    register_append_cxx_flags(ANY ${OMP_FLAGS})
    if (OFFLOAD_APPEND_LINK_FLAG)
        register_append_link_flags(${OMP_FLAGS})
    endif ()
 endmacro()
--- a/RAJA.cmake
+++ b/RAJA.cmake
@ -0,0 +1,98 @@
 register_flag_optional(CMAKE_CXX_COMPILER
        "Any CXX compiler that is supported by CMake detection and RAJA.
         See https://raja.readthedocs.io/en/main/getting_started.html#build-and-install"
        "c++")
 register_flag_required(RAJA_IN_TREE
        "Absolute path to the *source* distribution directory of RAJA.
         Make sure to use the release version of RAJA or clone RAJA recursively with submodules.
         Remember to append RAJA specific flags as well, for example:
             -DRAJA_IN_TREE=... -DENABLE_OPENMP=ON -DENABLE_CUDA=ON ...
         See https://github.com/LLNL/RAJA/blob/08cbbafd2d21589ebf341f7275c229412d0fe903/CMakeLists.txt#L44 for all available options
 ")
 register_flag_optional(TARGET
        "Target offload device, implemented values are CPU, NVIDIA"
        CPU)
 register_flag_optional(CUDA_TOOLKIT_ROOT_DIR
        "[TARGET==NVIDIA only] Path to the CUDA toolkit directory (e.g `/opt/cuda-11.2`) if the ENABLE_CUDA flag is specified for RAJA" "")
 # XXX CMake 3.18 supports CMAKE_CUDA_ARCHITECTURES/CUDA_ARCHITECTURES but we support older CMakes
 register_flag_optional(CUDA_ARCH
        "[TARGET==NVIDIA only] Nvidia architecture, will be passed in via `-arch=` (e.g `sm_70`) for nvcc"
        "")
 register_flag_optional(CUDA_EXTRA_FLAGS
        "[TARGET==NVIDIA only] Additional CUDA flags passed to nvcc, this is appended after `CUDA_ARCH`"
        "")
 # compiler vendor and arch specific flags
 set(RAJA_FLAGS_CPU_INTEL -qopt-streaming-stores=always)
 macro(setup)
    if (${TARGET} STREQUAL "CPU")
        register_definitions(RAJA_TARGET_CPU)
    else ()
        register_definitions(RAJA_TARGET_GPU)
    endif ()
    if (EXISTS "${RAJA_IN_TREE}")
        message(STATUS "Building using in-tree RAJA source at `${RAJA_IN_TREE}`")
        set(CMAKE_CXX_STANDARD 14)
        # don't build anything that isn't the RAJA library itself, by default their cmake def builds everything, whyyy?
        set(ENABLE_TESTS OFF CACHE BOOL "")
        set(ENABLE_EXAMPLES OFF CACHE BOOL "")
        set(ENABLE_REPRODUCERS OFF CACHE BOOL "")
        set(ENABLE_EXERCISES OFF CACHE BOOL "")
        set(ENABLE_DOCUMENTATION OFF CACHE BOOL "")
        set(ENABLE_BENCHMARKS OFF CACHE BOOL "")
        set(ENABLE_CUDA ${ENABLE_CUDA} CACHE BOOL "" FORCE)
        if (ENABLE_CUDA)
            # XXX CMake 3.18 supports CMAKE_CUDA_ARCHITECTURES/CUDA_ARCHITECTURES but we support older CMakes
            if(POLICY CMP0104)
                cmake_policy(SET CMP0104 OLD)
            endif()
            # RAJA needs all the cuda stuff setup before including!
            set(CMAKE_CUDA_COMPILER ${CUDA_TOOLKIT_ROOT_DIR}/bin/nvcc)
            set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "-forward-unknown-to-host-compiler -extended-lambda -arch=${CUDA_ARCH}" ${CUDA_EXTRA_FLAGS})
            list(APPEND CMAKE_CUDA_FLAGS)
            message(STATUS "NVCC flags: ${CMAKE_CUDA_FLAGS}")
        endif ()
        add_subdirectory(${RAJA_IN_TREE} ${CMAKE_BINARY_DIR}/raja)
        register_link_library(RAJA)
        # RAJA's cmake screws with where the binary will end up, resetting it here:
        set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
    else ()
        message(FATAL_ERROR "`${RAJA_IN_TREE}` does not exist")
    endif ()
    if (ENABLE_CUDA)
        # RAJA needs the codebase to be compiled with nvcc, so we tell cmake to treat sources as *.cu
        enable_language(CUDA)
        set_source_files_properties(RAJAStream.cpp PROPERTIES LANGUAGE CUDA)
        set_source_files_properties(main.cpp PROPERTIES LANGUAGE CUDA)
    endif ()
    register_append_compiler_and_arch_specific_cxx_flags(
            RAJA_FLAGS_CPU
            ${CMAKE_CXX_COMPILER_ID}
            ${CMAKE_SYSTEM_PROCESSOR}
    )
 endmacro()
--- a/RAJAStream.cpp
+++ b/RAJAStream.cpp
@ -5,10 +5,10 @@
 // For full license terms please see the LICENSE file distributed with this
 // source code
 #include <stdexcept>
 #include "RAJAStream.hpp"
 using RAJA::forall;
 using RAJA::RangeSegment;
 #ifndef ALIGNMENT
 #define ALIGNMENT (2*1024*1024) // 2MB
@ -16,10 +16,8 @@ using RAJA::RangeSegment;
 template <class T>
 RAJAStream<T>::RAJAStream(const int ARRAY_SIZE, const int device_index)
-    : array_size(ARRAY_SIZE)
+    : array_size(ARRAY_SIZE), range(0, ARRAY_SIZE)
 {
  RangeSegment seg(0, ARRAY_SIZE);
  index_set.push_back(seg);
 #ifdef RAJA_TARGET_CPU
  d_a = (T*)aligned_alloc(ALIGNMENT, sizeof(T)*array_size);
@ -53,7 +51,7 @@ void RAJAStream<T>::init_arrays(T initA, T initB, T initC)
  T* RAJA_RESTRICT a = d_a;
  T* RAJA_RESTRICT b = d_b;
  T* RAJA_RESTRICT c = d_c;
-  forall<policy>(index_set, [=] RAJA_DEVICE (RAJA::Index_type index)
+  forall<policy>(range, [=] RAJA_DEVICE (RAJA::Index_type index)
  {
    a[index] = initA;
    b[index] = initB;
@ -75,7 +73,7 @@ void RAJAStream<T>::copy()
 {
  T* RAJA_RESTRICT a = d_a;
  T* RAJA_RESTRICT c = d_c;
-  forall<policy>(index_set, [=] RAJA_DEVICE (RAJA::Index_type index)
+  forall<policy>(range, [=] RAJA_DEVICE (RAJA::Index_type index)
  {
    c[index] = a[index];
  });
@ -87,7 +85,7 @@ void RAJAStream<T>::mul()
  T* RAJA_RESTRICT b = d_b;
  T* RAJA_RESTRICT c = d_c;
  const T scalar = startScalar;
-  forall<policy>(index_set, [=] RAJA_DEVICE (RAJA::Index_type index)
+  forall<policy>(range, [=] RAJA_DEVICE (RAJA::Index_type index)
  {
    b[index] = scalar*c[index];
  });
@ -99,7 +97,7 @@ void RAJAStream<T>::add()
  T* RAJA_RESTRICT a = d_a;
  T* RAJA_RESTRICT b = d_b;
  T* RAJA_RESTRICT c = d_c;
-  forall<policy>(index_set, [=] RAJA_DEVICE (RAJA::Index_type index)
+  forall<policy>(range, [=] RAJA_DEVICE (RAJA::Index_type index)
  {
    c[index] = a[index] + b[index];
  });
@ -112,12 +110,20 @@ void RAJAStream<T>::triad()
  T* RAJA_RESTRICT b = d_b;
  T* RAJA_RESTRICT c = d_c;
  const T scalar = startScalar;
-  forall<policy>(index_set, [=] RAJA_DEVICE (RAJA::Index_type index)
+  forall<policy>(range, [=] RAJA_DEVICE (RAJA::Index_type index)
  {
    a[index] = b[index] + scalar*c[index];
  });
 }
 template <class T>
 void RAJAStream<T>::nstream()
 {
  // TODO implement me!
  std::cerr << "Not implemented yet!" << std::endl;
  throw std::runtime_error("Not implemented yet!");
 }
 template <class T>
 T RAJAStream<T>::dot()
 {
@ -126,7 +132,7 @@ T RAJAStream<T>::dot()
  RAJA::ReduceSum<reduce_policy, T> sum(0.0);
-  forall<policy>(index_set, [=] RAJA_DEVICE (RAJA::Index_type index)
+  forall<policy>(range, [=] RAJA_DEVICE (RAJA::Index_type index)
  {
    sum += a[index] * b[index];
  });
--- a/RAJAStream.hpp
+++ b/RAJAStream.hpp
@ -8,34 +8,41 @@
 #include <iostream>
 #include <stdexcept>
-#include "RAJA/RAJA.hxx"
+#include "RAJA/RAJA.hpp"
 #include "Stream.h"
 #define IMPLEMENTATION_STRING "RAJA"
 #ifdef RAJA_TARGET_CPU
-typedef RAJA::IndexSet::ExecPolicy<
+// TODO verify old and new templates are semantically equal
-        RAJA::seq_segit,
+//typedef RAJA::ExecPolicy<
-        RAJA::omp_parallel_for_exec> policy;
+//        RAJA::seq_segit,
 //        RAJA::omp_parallel_for_exec> policy;
 typedef RAJA::omp_parallel_for_exec policy;
 typedef RAJA::omp_reduce reduce_policy;
 #else
 const size_t block_size = 128;
-typedef RAJA::IndexSet::ExecPolicy<
+// TODO verify old and new templates are semantically equal
-        RAJA::seq_segit,
+//typedef RAJA::IndexSet::ExecPolicy<
-        RAJA::cuda_exec<block_size>> policy;
+//        RAJA::seq_segit,
-typedef RAJA::cuda_reduce<block_size> reduce_policy;
+//        RAJA::cuda_exec<block_size>> policy;
 //typedef RAJA::cuda_reduce<block_size> reduce_policy;
 typedef RAJA::cuda_exec<block_size> policy;
 typedef RAJA::cuda_reduce reduce_policy;
 #endif
 using RAJA::RangeSegment;
 template <class T>
 class RAJAStream : public Stream<T>
 {
  protected:
    // Size of arrays
-    int array_size;
+	const int array_size;
-
+	const RangeSegment range;
    // Contains iteration space
    RAJA::IndexSet index_set;
    // Device side pointers to arrays
    T* d_a;
@ -51,6 +58,7 @@ class RAJAStream : public Stream<T>
    virtual void add() override;
    virtual void mul() override;
    virtual void triad() override;
 	virtual void nstream() override;
 	virtual T dot() override;
    virtual void init_arrays(T initA, T initB, T initC) override;
--- a/README.md
+++ b/README.md
@ -57,6 +57,53 @@ Usage
 Drivers, compiler and software applicable to whichever implementation you would like to build against is required.
 ### CMake
 The project supports building with CMake >= 3.13.0, it can be installed without root via the [official script](https://cmake.org/download/).
 As with any CMake project, first configure the project:
 ```shell
 > cd babelstream
 > cmake -Bbuild -H. -DMODEL=<model> <model specific flags prefixed with -D...> # configure the build, build type defaults to Release 
 > cmake --build build # compile it 
 > ./build/babelstream # executable available at ./build/
 ```
 By default, we have defined a set of optimal flags for known HPC compilers.
 There are assigned those to `RELEASE_FLAGS`, and you can override them if required.
 To find out what flag each model supports or requires, simply configure while only specifying the model.
 For example:
 ```shell
 > cd babelstream
 > cmake -Bbuild -H. -DMODEL=OCL 
 ...
 - Common Release flags are `-O3`, set RELEASE_FLAGS to override
 -- CXX_EXTRA_FLAGS: 
        Appends to common compile flags. These will be used at link phase at well.
        To use separate flags at link time, set `CXX_EXTRA_LINKER_FLAGS`
 -- CXX_EXTRA_LINK_FLAGS: 
        Appends to link flags which appear *before* the objects.
        Do not use this for linking libraries, as the link line is order-dependent
 -- CXX_EXTRA_LIBRARIES: 
        Append to link flags which appears *after* the objects.
        Use this for linking extra libraries (e.g `-lmylib`, or simply `mylib`) 
 -- CXX_EXTRA_LINKER_FLAGS: 
        Append to linker flags (i.e GCC's `-Wl` or equivalent)
 -- Available models:  OMP;OCL;STD;STD20;HIP;CUDA;KOKKOS;SYCL;ACC;RAJA
 -- Selected model  :  OCL
 -- Supported flags:
   CMAKE_CXX_COMPILER (optional, default=c++): Any CXX compiler that is supported by CMake detection
   OpenCL_LIBRARY (optional, default=): Path to OpenCL library, usually called libOpenCL.so
 ...
 ```
 Alternatively, refer to the [CI script](./ci-test-compile.sh), which test-compiles most of the models, and see which flags are used there.
 *It is recommended that you delete the `build` directory when you change any of the build flags.* 
 ### GNU Make
 We have supplied a series of Makefiles, one for each programming model, to assist with building.
 The Makefiles contain common build options, and should be simple to customise for your needs too.
@ -68,8 +115,7 @@ Pass in extra flags via the `EXTRA_FLAGS` option.
 The binaries are named in the form `<model>-stream`.
-Building Kokkos
+#### Building Kokkos for Make
 ---------------
 Kokkos version >= 3 requires setting the `KOKKOS_PATH` flag to the *source* directory of a distribution. 
 For example:
@ -83,8 +129,7 @@ make -f Kokkos.make KOKKOS_PATH=~/kokkos-3.1.01
 ```
 See make output for more information on supported flags.
-Building RAJA
+#### Building RAJA for Make
 -------------
 We use the following command to build RAJA using the Intel Compiler.
 ```
--- a/STD.cmake
+++ b/STD.cmake
@ -0,0 +1,33 @@
 register_flag_optional(CMAKE_CXX_COMPILER
        "Any CXX compiler that is supported by CMake detection"
        "c++")
 register_flag_optional(NVHPC_OFFLOAD
        "Enable offloading support (via the non-standard `-stdpar`) for the new NVHPC SDK.
         The values are Nvidia architectures in ccXY format will be passed in via `-gpu=` (e.g `cc70`)
         Possible values are:
           cc35  - Compile for compute capability 3.5
           cc50  - Compile for compute capability 5.0
           cc60  - Compile for compute capability 6.0
           cc62  - Compile for compute capability 6.2
           cc70  - Compile for compute capability 7.0
           cc72  - Compile for compute capability 7.2
           cc75  - Compile for compute capability 7.5
           cc80  - Compile for compute capability 8.0
           ccall - Compile for all supported compute capabilities"
        "")
 macro(setup)
    set(CMAKE_CXX_STANDARD 17)
    if (NVHPC_OFFLOAD)
        set(NVHPC_FLAGS -stdpar -gpu=${NVHPC_OFFLOAD})
        # propagate flags to linker so that it links with the gpu stuff as well
        register_append_cxx_flags(ANY ${NVHPC_FLAGS})
        register_append_link_flags(${NVHPC_FLAGS})
    endif ()
 endmacro()
--- a/STD20.cmake
+++ b/STD20.cmake
@ -0,0 +1,16 @@
 register_flag_optional(CMAKE_CXX_COMPILER
        "Any CXX compiler that is supported by CMake detection and supports C++20 Ranges"
        "c++")
 macro(setup)
    # TODO this needs to eventually be removed when CMake adds proper C++20 support or at least update the flag used here
    # C++ 2a is too new, disable CMake's std flags completely:
    set(CMAKE_CXX_EXTENSIONS OFF)
    set(CMAKE_CXX_STANDARD_REQUIRED OFF)
    unset(CMAKE_CXX_STANDARD) # drop any existing standard we have set by default
    # and append our own:
    register_append_cxx_flags(ANY -std=c++2a)
 endmacro()
--- a/STDStream.h
+++ b/STDStream.h
@ -24,7 +24,7 @@ class STDStream : public Stream<T>
    T *c;
  public:
-    STDStream(const int, int);
+    STDStream(const int, int) noexcept;
    ~STDStream();
    virtual void copy() override;
--- a/SYCL.cmake
+++ b/SYCL.cmake
@ -0,0 +1,87 @@
 register_flag_optional(CMAKE_CXX_COMPILER
        "Any CXX compiler that is supported by CMake detection, this is used for host compilation when required by the SYCL compiler"
        "c++")
 register_flag_required(SYCL_COMPILER
        "Compile using the specified SYCL compiler implementation
        Supported values are
           ONEAPI-DPCPP - dpc++ that is part of an oneAPI Base Toolkit distribution (https://software.intel.com/content/www/us/en/develop/tools/oneapi/base-toolkit.html)
           DPCPP        - dpc++ as a standalone compiler (https://github.com/intel/llvm)
           HIPSYCL      - hipSYCL compiler (https://github.com/illuhad/hipSYCL)
           COMPUTECPP   - ComputeCpp compiler (https://developer.codeplay.com/products/computecpp/ce/home)")
 register_flag_optional(SYCL_COMPILER_DIR
        "Absolute path to the selected SYCL compiler directory, most are packaged differently so set the path according to `SYCL_COMPILER`:
           ONEAPI-DPCPP             - not required but `dpcpp` must be on PATH, load oneAPI as per documentation (i.e `source /opt/intel/oneapi/setvars.sh` first)
           HIPSYCL|DPCPP|COMPUTECPP - set to the root of the binary distribution that contains at least `bin/`, `include/`, and `lib/`"
        "")
 register_flag_optional(OpenCL_LIBRARY
        "[ComputeCpp only] Path to OpenCL library, usually called libOpenCL.so"
        "${OpenCL_LIBRARY}")
 macro(setup)
    set(CMAKE_CXX_STANDARD 17)
    if (${SYCL_COMPILER} STREQUAL "HIPSYCL")
        set(hipSYCL_DIR ${SYCL_COMPILER_DIR}/lib/cmake/hipSYCL)
        if (NOT EXISTS "${hipSYCL_DIR}")
            message(WARNING "Falling back to hipSYCL < 0.9.0 CMake structure")
            set(hipSYCL_DIR ${SYCL_COMPILER_DIR}/lib/cmake)
        endif ()
        if (NOT EXISTS "${hipSYCL_DIR}")
            message(FATAL_ERROR "Can't find the appropriate CMake definitions for hipSYCL")
        endif ()
        # register_definitions(_GLIBCXX_USE_CXX11_ABI=0)
        find_package(hipSYCL CONFIG REQUIRED)
        message(STATUS "ok")
    elseif (${SYCL_COMPILER} STREQUAL "COMPUTECPP")
        list(APPEND CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/cmake/Modules)
        set(ComputeCpp_DIR ${SYCL_COMPILER_DIR})
        # don't point to the CL dir as the imports already have the CL prefix
        set(OpenCL_INCLUDE_DIR "${CMAKE_SOURCE_DIR}/CL")
        register_definitions(CL_TARGET_OPENCL_VERSION=220 _GLIBCXX_USE_CXX11_ABI=0)
        # ComputeCpp needs OpenCL
        find_package(ComputeCpp REQUIRED)
        # this must come after FindComputeCpp (!)
        set(COMPUTECPP_USER_FLAGS -O3 -no-serial-memop)
    elseif (${SYCL_COMPILER} STREQUAL "DPCPP")
        set(CMAKE_CXX_COMPILER ${SYCL_COMPILER_DIR}/bin/clang++)
        include_directories(${SYCL_COMPILER_DIR}/include/sycl)
        register_definitions(CL_TARGET_OPENCL_VERSION=220)
        register_append_cxx_flags(ANY -fsycl)
        register_append_link_flags(-fsycl)
    elseif (${SYCL_COMPILER} STREQUAL "ONEAPI-DPCPP")
        set(CMAKE_CXX_COMPILER dpcpp)
        register_definitions(CL_TARGET_OPENCL_VERSION=220)
    else ()
        message(FATAL_ERROR "SYCL_COMPILER=${SYCL_COMPILER} is unsupported")
    endif ()
 endmacro()
 macro(setup_target NAME)
    if (
    (${SYCL_COMPILER} STREQUAL "COMPUTECPP") OR
    (${SYCL_COMPILER} STREQUAL "HIPSYCL"))
        # so ComputeCpp and hipSYCL has this weird (and bad) CMake usage where they append their
        # own custom integration header flags AFTER the target has been specified
        # hence this macro here
        add_sycl_to_target(
                TARGET ${NAME}
                SOURCES ${IMPL_SOURCES})
    endif ()
 endmacro()
--- a/ci-prepare-bionic.sh
+++ b/ci-prepare-bionic.sh
@ -0,0 +1,375 @@
 #!/usr/bin/env bash
 set -eu
 WORK_DIR="${1:-.}"
 MODE="${2:-SETUP}"
 PARALLEL="${3:-false}"
 FORCE_DOWNLOAD=false
 if [ "$_" = "$0" ]; then
  echo "This script must be sourced for the exports to work!"
  exit 1
 fi
 case "$MODE" in
 SETUP)
  SETUP=true
  echo "Preparing env with setup..."
  ;;
 VARS)
  SETUP=false
  echo "Preparing env without setup..."
  ;;
 *)
  echo "Bad option"
  echo "$0 <work_dir:dir> <VARS|SETUP> <parallel:bool>"
  exit 1
  ;;
 esac
 mkdir -p "$WORK_DIR"
 PREV_DIR="$PWD"
 cd "$WORK_DIR" || exit 1
 export_var() {
  export "$1"="$2"
  # see
  # https://docs.github.com/en/actions/reference/workflow-commands-for-github-actions#setting-an-environment-variable
  if [ "${GITHUB_ACTIONS:-false}" = true ]; then
    echo "$1=$2" >>"$GITHUB_ENV"
  fi
 }
 check_size() {
  if [ "$SETUP" = true ]; then
    echo "Used space for $PWD"
    du -sh .
    df -h .
  fi
 }
 get_and_install_deb() {
  local name="$1"
  local install_dir="$2"
  local pkg_url="$3"
  shift
  local wget_args=("$@")
  local pkg_name="$name.deb"
  if [ "$SETUP" = true ]; then
    if [ ! -f "$pkg_name" ] || [ "$FORCE_DOWNLOAD" = true ]; then
      echo "$pkg_name not found, downloading"
      rm -f "$pkg_name"
      # shellcheck disable=SC2086
      wget -q --show-progress --progress=bar:force:noscroll "${wget_args[@]}" "$pkg_url" -O "$pkg_name"
    fi
    #    rm -rf "$install_dir"
    echo "Preparing to install $pkg_name locally to $install_dir ..."
    dpkg-deb -x "$pkg_name" "$install_dir"
    echo "$pkg_name installed, deleting $pkg_name ..."
    rm -f "$pkg_name" # delete for space
  fi
 }
 get() {
  local name="$1"
  local pkg_url="$2"
  if [ "$SETUP" = true ]; then
    if [ ! -f "$name" ] || [ "$FORCE_DOWNLOAD" = true ]; then
      echo "$name not found, downloading..."
      wget -q --show-progress --progress=bar:force:noscroll "$pkg_url" -O "$name"
    fi
  fi
 }
 get_and_untar() {
  local name="$1"
  local pkg_url="$2"
  if [ "$SETUP" = true ]; then
    if [ ! -f "$name" ] || [ "$FORCE_DOWNLOAD" = true ]; then
      echo "$name not found, downloading..."
      wget -q --show-progress --progress=bar:force:noscroll "$pkg_url" -O "$name"
    fi
    echo "Preparing to extract $name ..."
    tar -xf "$name"
    echo "$name extracted, deleting archive ..."
    rm -f "$name" # delete for space
  fi
 }
 verify_bin_exists() {
  if [ ! -f "$1" ]; then
    echo "[FAIL] $1 does not exist or is not a file!"
    exit 1
  else echo "[OK! ] $1"; fi
 }
 verify_dir_exists() {
  if [ ! -d "$1" ]; then
    echo "[FAIL] $1 does not exist or is not a directory!"
    exit 1
  else echo "[OK! ] $1"; fi
 }
 setup_aocc() {
  echo "Preparing AOCC"
  local aocc_ver="2.3.0"
  local tarball="aocc-$aocc_ver.tar.xz"
  # XXX it's actually XZ compressed, so it should be tar.xz
  local AOCC_URL="http://developer.amd.com/wordpress/media/files/aocc-compiler-2.3.0.tar"
  # local AOCC_URL="http://localhost:8000/aocc-compiler-2.3.0.tar"
  get_and_untar "$tarball" "$AOCC_URL"
  export_var AOCC_CXX "$PWD/aocc-compiler-$aocc_ver/bin/clang++"
  verify_bin_exists "$AOCC_CXX"
  "$AOCC_CXX" --version
  check_size
 }
 setup_nvhpc() {
  echo "Preparing Nvidia HPC SDK"
  local tarball="nvhpc.tar.gz"
 #  local url="http://localhost:8000/nvhpc_2021_212_Linux_x86_64_cuda_11.2.tar.gz"
  local url="https://developer.download.nvidia.com/hpc-sdk/21.2/nvhpc_2021_212_Linux_x86_64_cuda_11.2.tar.gz"
  get_and_untar "$tarball" "$url"
  local sdk_dir="$PWD/nvhpc_2021_212_Linux_x86_64_cuda_11.2/install_components/Linux_x86_64/21.2"
  local bin_dir="$sdk_dir/compilers/bin"
  "$bin_dir/makelocalrc" "$bin_dir" -x
  export_var NVHPC_NVCXX "$bin_dir/nvc++"
  export_var NVHPC_NVCC "$sdk_dir/cuda/11.2/bin/nvcc"
  export_var NVHPC_CUDA_DIR "$sdk_dir/cuda/11.2"
  echo "Installed CUDA versions:"
  ls "$sdk_dir/cuda"
  verify_bin_exists "$NVHPC_NVCXX"
  verify_bin_exists "$NVHPC_NVCC"
  "$NVHPC_NVCXX" --version
  "$NVHPC_NVCC" --version
  check_size
 }
 setup_aomp() {
  echo "Preparing AOMP"
  local AOMP_URL="https://github.com/ROCm-Developer-Tools/aomp/releases/download/rel_11.12-0/aomp_Ubuntu1804_11.12-0_amd64.deb"
  # local AOMP_URL="http://0.0.0.0:8000/aomp_Ubuntu1804_11.12-0_amd64.deb"
  get_and_install_deb "aomp" "aomp" "$AOMP_URL"
  export_var AOMP_CXX "$PWD/aomp/usr/lib/aomp/bin/clang++"
  verify_bin_exists "$AOMP_CXX"
  "$AOMP_CXX" --version
  check_size
 }
 setup_oclcpu() {
  echo "Preparing Intel CPU OpenCL runtime"
  local tarball="oclcpuexp.tar.gz"
  local url="https://github.com/intel/llvm/releases/download/2020-12/oclcpuexp-2020.11.11.0.04_rel.tar.gz"
  # local url="http://localhost:8000/oclcpuexp-2020.11.11.0.04_rel.tar.gz"
  get_and_untar "$tarball" "$url"
  export_var OCL_LIB "$PWD/x64/libOpenCL.so"
  verify_bin_exists "$OCL_LIB"
  check_size
 }
 setup_kokkos() {
  echo "Preparing Kokkos"
  local kokkos_ver="3.3.01"
  local tarball="kokkos-$kokkos_ver.tar.gz"
  local url="https://github.com/kokkos/kokkos/archive/$kokkos_ver.tar.gz"
  # local url="http://localhost:8000/$kokkos_ver.tar.gz"
  get_and_untar "$tarball" "$url"
  export_var KOKKOS_SRC "$PWD/kokkos-$kokkos_ver"
  verify_dir_exists "$KOKKOS_SRC"
  check_size
 }
 setup_raja() {
  echo "Preparing RAJA"
  local raja_ver="0.13.0"
  local tarball="raja-$raja_ver.tar.gz"
  local url="https://github.com/LLNL/RAJA/releases/download/v0.13.0/RAJA-v$raja_ver.tar.gz"
  # local url="http://localhost:8000/RAJA-v$raja_ver.tar.gz"
  get_and_untar "$tarball" "$url"
  export_var RAJA_SRC "$PWD/RAJA-v$raja_ver"
  verify_dir_exists "$RAJA_SRC"
  check_size
 }
 setup_clang_gcc() {
  echo "deb http://archive.ubuntu.com/ubuntu focal main universe" | sudo tee -a /etc/apt/sources.list
  sudo apt-get update -qq
  sudo apt-get install -y -qq gcc-10-offload-nvptx gcc-10-offload-amdgcn libtbb2 libtbb-dev g++-10
  export_var GCC_CXX "$(which g++-10)"
  verify_bin_exists "$GCC_CXX"
  "$GCC_CXX" --version
  export_var GCC_STD_PAR_LIB "tbb"
  export_var GCC_OMP_OFFLOAD_AMD true
  export_var GCC_OMP_OFFLOAD_NVIDIA true
  clang++ --version
  export_var CLANG_CXX "$(which clang++)"
  verify_bin_exists "$CLANG_CXX"
  "$CLANG_CXX" --version
  export_var CLANG_STD_PAR_LIB "tbb"
  export_var CLANG_OMP_OFFLOAD_AMD false
  export_var CLANG_OMP_OFFLOAD_NVIDIA false
  check_size
 }
 setup_rocm() {
  wget -q -O - "https://repo.radeon.com/rocm/rocm.gpg.key" | sudo apt-key add -
  echo 'deb [arch=amd64] https://repo.radeon.com/rocm/apt/debian/ xenial main' | sudo tee /etc/apt/sources.list.d/rocm.list
  sudo apt-get update -qq
  sudo apt-get install -y -qq rocm-dev
  # AMD needs this rocm_path thing exported...
  export_var ROCM_PATH "/opt/rocm-4.1.0"
  export_var HIP_CXX "$ROCM_PATH/bin/hipcc"
  verify_bin_exists "$HIP_CXX"
  "$HIP_CXX" --version
  check_size
 }
 setup_dpcpp() {
  local nightly="20210106"
  local tarball="dpcpp-$nightly.tar.gz"
  local url="https://github.com/intel/llvm/releases/download/sycl-nightly/$nightly/dpcpp-compiler.tar.gz"
  # local url="http://localhost:8000/dpcpp-compiler.tar.gz"
  get_and_untar "$tarball" "$url"
  export_var DPCPP_DIR "$PWD/dpcpp_compiler/"
  verify_dir_exists "$DPCPP_DIR"
  "$DPCPP_DIR/bin/clang++" --version
  check_size
 }
 setup_hipsycl() {
  sudo apt-get install -y -qq libboost-fiber-dev libboost-context-dev
  local hipsycl_ver="0.9.0"
  local tarball="v$hipsycl_ver.tar.gz"
  local install_dir="$PWD/hipsycl_dist_$hipsycl_ver"
  local url="https://github.com/illuhad/hipSYCL/archive/v$hipsycl_ver.tar.gz"
  # local url="http://localhost:8000/hipSYCL-$hipsycl_ver.tar.gz"
  get_and_untar "$tarball" "$url"
  if [ "$SETUP" = true ]; then
    local src="$PWD/hipSYCL-$hipsycl_ver"
    rm -rf "$src/build"
    rm -rf "$install_dir"
    cmake "-B$src/build" "-H$src" \
      -DCMAKE_C_COMPILER="$(which gcc-10)" \
      -DCMAKE_CXX_COMPILER="$(which g++-10)" \
      -DCMAKE_INSTALL_PREFIX="$install_dir" \
      -DWITH_ROCM_BACKEND=OFF \
      -DWITH_CUDA_BACKEND=OFF \
      -DWITH_CPU_BACKEND=ON
    cmake --build "$src/build" --target install -j "$(nproc)"
  fi
  export_var HIPSYCL_DIR "$install_dir"
  verify_dir_exists "$HIPSYCL_DIR"
  # note: this will forward --version to the default compiler so it won't say anything about hipsycl
  "$HIPSYCL_DIR/bin/syclcc-clang" --version
  check_size
 }
 setup_computecpp() {
  echo "TODO ComputeCpp requires registration+login to download"
 }
 if [ "${GITHUB_ACTIONS:-false}" = true ]; then
  echo "Running in GitHub Actions, defaulting to special export"
  TERM=xterm
  export TERM=xterm
  if [ "$SETUP" = true ]; then
    echo "Deleting extra packages for space in 5 seconds..."
    sleep 5
    echo "Starting apt-get remove:"
    sudo apt-get remove -y azure-cli google-cloud-sdk hhvm google-chrome-stable firefox powershell mono-devel
    sudo apt-get autoremove -y
    check_size
  fi
 else
  echo "Running locally, defaulting to standard export"
 fi
 setup_cmake() {
  echo "Preparing CMake"
  local cmake_release="https://github.com/Kitware/CMake/releases/download"
  get "cmake-3.13.sh" "$cmake_release/v3.13.4/cmake-3.13.4-Linux-x86_64.sh"
  chmod +x "./cmake-3.13.sh" && sh "./cmake-3.13.sh" --skip-license --include-subdir
  export_var CMAKE_3_13_BIN "$PWD/cmake-3.13.4-Linux-x86_64/bin/cmake"
  verify_bin_exists "$CMAKE_3_13_BIN"
  "$CMAKE_3_13_BIN" --version
  get "cmake-3.15.sh" "$cmake_release/v3.15.7/cmake-3.15.7-Linux-x86_64.sh"
  chmod +x "./cmake-3.15.sh" && "./cmake-3.15.sh" --skip-license --include-subdir
  export_var CMAKE_3_15_BIN "$PWD/cmake-3.15.7-Linux-x86_64/bin/cmake"
  verify_bin_exists "$CMAKE_3_15_BIN"
  "$CMAKE_3_15_BIN" --version
  get "cmake-3.18.sh" "$cmake_release/v3.18.6/cmake-3.18.6-Linux-x86_64.sh"
  chmod +x "./cmake-3.18.sh" && "./cmake-3.18.sh" --skip-license --include-subdir
  export_var CMAKE_3_18_BIN "$PWD/cmake-3.18.6-Linux-x86_64/bin/cmake"
  verify_bin_exists "$CMAKE_3_18_BIN"
  "$CMAKE_3_18_BIN" --version
  check_size
 }
 if [ "$PARALLEL" = true ]; then
  (setup_clang_gcc && setup_rocm && setup_hipsycl) &   # these need apt so run sequentially
  setup_cmake &
  setup_oclcpu &
  setup_aocc &
  setup_nvhpc &
  setup_aomp &
  setup_dpcpp &
  setup_kokkos &
  setup_raja &
  wait
 else
  setup_cmake
  setup_aocc
  setup_oclcpu
  setup_nvhpc
  setup_aomp
  setup_dpcpp
  setup_kokkos
  setup_raja
  # these need apt
  setup_clang_gcc
  setup_rocm
  setup_hipsycl
 fi
 echo "Done!"
 cd "$PREV_DIR" || exit 1
--- a/ci-test-compile.sh
+++ b/ci-test-compile.sh
@ -0,0 +1,282 @@
 #!/usr/bin/env bash
 set -eu
 # prevent ccache from caching anything for system compilers
 export CCACHE_DISABLE=1
 BUILD_DIR=${1:-build}
 COMPILER=${2:-all}
 MODEL=${3:-all}
 CMAKE_BIN=${4}
 LOG_DIR="$BUILD_DIR"
 mkdir -p "$LOG_DIR"
 if [ "${GITHUB_ACTIONS:-false}" = true ]; then
  echo "Running in GitHub Actions, setting TERM..."
  TERM=xterm
  export TERM=xterm
 fi
 function_exists() {
  declare -f -F "$1" >/dev/null
  return $?
 }
 run_build() {
  local key="$1"
  local grep_kw="$2"
  local model="$3"
  local flags="$4"
  if [ "$MODEL" != "all" ] && [ "$MODEL" != "$model" ]; then
    echo "Skipping -DMODEL=$model $flags"
    return 0
  fi
  local log="$LOG_DIR/${model}_${key}.log"
  rm -f "$log"
  touch "$log"
  local build="$BUILD_DIR/${model}_${key}"
  rm -rf "$build"
  set +e
  # shellcheck disable=SC2086
  "$CMAKE_BIN" -B"$build" -H. \
    -DCMAKE_BUILD_TYPE=Release \
    -DCMAKE_VERBOSE_MAKEFILE=ON \
    -DMODEL="$model" $flags &>>"$log"
  local model_lower=$(echo "$model" | awk '{print tolower($0)}')
  local cmake_code=$?
  "$CMAKE_BIN" --build "$build" -j "$(nproc)" &>>"$log"
  local cmake_code=$?
  set -e
  local bin="./$build/$model_lower-stream"
  echo "Checking for final executable: $bin"
  if [[ -f "$bin" ]]; then
    echo "$(tput setaf 2)[PASS!]($model->$build)$(tput sgr0): -DMODEL=$model $flags"
    # shellcheck disable=SC2002
    cat "$log" | sed '/^--/d' | grep -i "/bin/nvcc" | sed 's/^/    /'
    cat "$log" | sed '/^--/d' | grep -i "$grep_kw" | sed 's/^/    /'
    cat "$log" | sed '/^--/d' | grep -i "warning" | sed "s/.*/    $(tput setaf 3)&$(tput sgr0)/"
  else
    echo "$(tput setaf 1)[FAIL!]($model->$build)$(tput sgr0): -DMODEL=$model $flags"
    echo "      $(tput setaf 1)CMake exited with code $cmake_code, see full build log at $log, reproduced below:$(tput sgr0)"
    cat "$log"
    exit 1
  fi
  echo "    $(tput setaf 4)$(file "$bin")$(tput sgr0)"
 }
 ###
 #KOKKOS_SRC="/home/tom/Downloads/kokkos-3.3.00"
 #RAJA_SRC="/home/tom/Downloads/RAJA-v0.13.0"
 #
 #GCC_CXX="/usr/bin/g++"
 #CLANG_CXX="/usr/bin/clang++"
 #
 #NVSDK="/home/tom/Downloads/nvhpc_2021_212_Linux_x86_64_cuda_11.2/install_components/Linux_x86_64/21.2/"
 #NVHPC_NVCXX="$NVSDK/compilers/bin/nvc++"
 #NVHPC_NVCC="$NVSDK/cuda/11.2/bin/nvcc"
 #NVHPC_CUDA_DIR="$NVSDK/cuda/11.2"
 #"$NVSDK/compilers/bin/makelocalrc" "$NVSDK/compilers/bin/" -x
 #
 #AOCC_CXX="/opt/AMD/aocc-compiler-2.3.0/bin/clang++"
 #AOMP_CXX="/usr/lib/aomp/bin/clang++"
 #OCL_LIB="/home/tom/Downloads/oclcpuexp-2020.11.11.0.04_rel/x64/libOpenCL.so"
 #
 ## AMD needs this rocm_path thing exported...
 #export ROCM_PATH="/opt/rocm-4.0.0"
 #HIP_CXX="/opt/rocm-4.0.0/bin/hipcc"
 #COMPUTECPP_DIR="/home/tom/Desktop/computecpp_archive/ComputeCpp-CE-2.3.0-x86_64-linux-gnu"
 #DPCPP_DIR="/home/tom/Downloads/dpcpp_compiler"
 #HIPSYCL_DIR="/opt/hipsycl/cff515c/"
 #
 #ICPX_CXX="/opt/intel/oneapi/compiler/2021.1.2/linux/bin/icpx"
 #ICPC_CXX="/opt/intel/oneapi/compiler/2021.1.2/linux/bin/intel64/icpc"
 #
 #GCC_STD_PAR_LIB="tbb"
 #CLANG_STD_PAR_LIB="tbb"
 #GCC_OMP_OFFLOAD_AMD=false
 #GCC_OMP_OFFLOAD_NVIDIA=true
 #CLANG_OMP_OFFLOAD_AMD=false
 #CLANG_OMP_OFFLOAD_NVIDIA=false
 ###
 AMD_ARCH="gfx_903"
 NV_ARCH="sm_70"
 NV_ARCH_CCXY="cuda11.2,cc80"
 build_gcc() {
  local name="gcc_build"
  local cxx="-DCMAKE_CXX_COMPILER=${GCC_CXX:?}"
  run_build $name "${GCC_CXX:?}" OMP "$cxx"
  if [ "$MODEL" = "all" ] || [ "$MODEL" = "OMP" ]; then
    # sanity check that it at least runs
    echo "Sanity checking GCC OMP build..."
    "./$BUILD_DIR/OMP_$name/omp-stream" -s 1048576 -n 10
  fi
  # some distributions like Ubuntu bionic implements std par with TBB, so conditionally link it here
  run_build $name "${GCC_CXX:?}" STD "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}"
  run_build $name "${GCC_CXX:?}" STD20 "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}"
  if [ "${GCC_OMP_OFFLOAD_AMD:-false}" != "false" ]; then
    run_build "amd_$name" "${GCC_CXX:?}" ACC "$cxx -DCXX_EXTRA_FLAGS=-foffload=amdgcn-amdhsa"
    run_build "amd_$name" "${GCC_CXX:?}" OMP "$cxx -DOFFLOAD=AMD:$AMD_ARCH"
  fi
  if [ "${GCC_OMP_OFFLOAD_NVIDIA:-false}" != "false" ]; then
    run_build "nvidia_$name" "${GCC_CXX:?}" ACC "$cxx -DCXX_EXTRA_FLAGS=-foffload=nvptx-none"
    run_build "nvidia_$name" "${GCC_CXX:?}" OMP "$cxx -DOFFLOAD=NVIDIA:$NV_ARCH"
  fi
  run_build $name "${GCC_CXX:?}" CUDA "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH"
  run_build $name "${GCC_CXX:?}" CUDA "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DMEM=MANAGED"
  run_build $name "${GCC_CXX:?}" CUDA "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DMEM=PAGEFAULT"
  #  run_build $name "${CC_CXX:?}" KOKKOS "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_CUDA=ON"
  run_build "cuda_$name" "${GCC_CXX:?}" KOKKOS "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON"
  run_build $name "${GCC_CXX:?}" OCL "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}"
  run_build $name "${GCC_CXX:?}" RAJA "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?}"
  run_build "cuda_$name" "${GCC_CXX:?}" RAJA "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?} \
  -DENABLE_CUDA=ON \
  -DTARGET=NVIDIA \
  -DCUDA_TOOLKIT_ROOT_DIR=${NVHPC_CUDA_DIR:?} \
  -DCUDA_ARCH=$NV_ARCH"
 }
 build_clang() {
  local name="clang_build"
  local cxx="-DCMAKE_CXX_COMPILER=${CLANG_CXX:?}"
  run_build $name "${CLANG_CXX:?}" OMP "$cxx"
  if [ "${CLANG_OMP_OFFLOAD_AMD:-false}" != "false" ]; then
    run_build "amd_$name" "${GCC_CXX:?}" OMP "$cxx -DOFFLOAD=AMD:$AMD_ARCH"
  fi
  if [ "${CLANG_OMP_OFFLOAD_NVIDIA:-false}" != "false" ]; then
    run_build "nvidia_$name" "${GCC_CXX:?}" OMP "$cxx -DOFFLOAD=NVIDIA:$NV_ARCH"
  fi
  run_build $name "${CLANG_CXX:?}" CUDA "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH"
  run_build $name "${CLANG_CXX:?}" CUDA "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DMEM=MANAGED"
  run_build $name "${CLANG_CXX:?}" CUDA "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DMEM=PAGEFAULT"
  run_build $name "${CLANG_CXX:?}" KOKKOS "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON"
  run_build $name "${CLANG_CXX:?}" OCL "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}"
  run_build $name "${CLANG_CXX:?}" STD "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}"
  # run_build $name "${LANG_CXX:?}" STD20 "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" # not yet supported
  run_build $name "${CLANG_CXX:?}" RAJA "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?}"
  # no clang /w RAJA+cuda because it needs nvcc which needs gcc
 }
 build_nvhpc() {
  local name="nvhpc_build"
  local cxx="-DCMAKE_CXX_COMPILER=${NVHPC_NVCXX:?}"
  run_build $name "${NVHPC_NVCXX:?}" STD "$cxx -DNVHPC_OFFLOAD=$NV_ARCH_CCXY"
  run_build $name "${NVHPC_NVCXX:?}" ACC "$cxx -DTARGET_DEVICE=gpu -DTARGET_PROCESSOR=px -DCUDA_ARCH=$NV_ARCH_CCXY"
  run_build $name "${NVHPC_NVCXX:?}" ACC "$cxx -DTARGET_DEVICE=multicore -DTARGET_PROCESSOR=zen"
 }
 build_aocc() {
  run_build aocc_build "${AOCC_CXX:?}" OMP "-DCMAKE_CXX_COMPILER=${AOCC_CXX:?}"
 }
 build_aomp() {
  run_build aomp_amd_build "${AOMP_CXX:?}" OMP "-DCMAKE_CXX_COMPILER=${AOMP_CXX:?} -DOFFLOAD=AMD:gfx906"
  #run_build aomp_nvidia_build "-DCMAKE_CXX_COMPILER=${AOMP_CXX:?} -DOFFLOAD=NVIDIA:$NV_ARCH"
 }
 build_hip() {
  run_build hip_build "${HIP_CXX:?}" HIP "-DCMAKE_CXX_COMPILER=${HIP_CXX:?}"
 }
 build_icpx() {
  # clang derived
  set +u
  source /opt/intel/oneapi/setvars.sh -force || true
  set -u
  run_build intel_build "${ICPX_CXX:?}" OMP "-DCMAKE_CXX_COMPILER=${ICPX_CXX:?} -DOFFLOAD=INTEL"
 }
 build_icpc() {
  # icc/icpc
  set +u
  source /opt/intel/oneapi/setvars.sh -force || true
  set -u
  local name="intel_build"
  local cxx="-DCMAKE_CXX_COMPILER=${ICPC_CXX:?}"
  run_build $name "${ICPC_CXX:?}" OMP "$cxx"
  run_build $name "${ICPC_CXX:?}" OCL "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}"
  run_build $name "${ICPC_CXX:?}" RAJA "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?}"
  run_build $name "${ICPC_CXX:?}" KOKKOS "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON"
 }
 build_computecpp() {
  run_build computecpp_build "compute++" SYCL "-DCMAKE_CXX_COMPILER=${GCC_CXX:?} \
  -DSYCL_COMPILER=COMPUTECPP \
  -DSYCL_COMPILER_DIR=${COMPUTECPP_DIR:?} \
  -DOpenCL_LIBRARY=${OCL_LIB:?}"
 }
 build_dpcpp() {
  run_build intel_build "${DPCPP_DIR:?}" SYCL "-DCMAKE_CXX_COMPILER=${GCC_CXX:?} \
  -DSYCL_COMPILER=DPCPP \
  -DSYCL_COMPILER_DIR=${DPCPP_DIR:?}"
  #  for oneAPI BaseKit:
  #  source /opt/intel/oneapi/setvars.sh -force
  #  run_build intel_build "dpcpp" SYCL "-DCMAKE_CXX_COMPILER=${GCC_CXX:?} -DSYCL_COMPILER=ONEAPI-DPCPP"
 }
 build_hipsycl() {
  run_build hipsycl_build "syclcc" SYCL "
  -DSYCL_COMPILER=HIPSYCL \
  -DSYCL_COMPILER_DIR=${HIPSYCL_DIR:?}"
 }
 echo "Test compiling with ${COMPILER} CXX for ${MODEL} model"
 "$CMAKE_BIN" --version
 case "$COMPILER" in
 gcc) build_gcc ;;
 clang) build_clang ;;
 nvhpc) build_nvhpc ;;
 aocc) build_aocc ;;
 aomp) build_aomp ;;
 hip) build_hip ;;
 dpcpp) build_dpcpp ;;
 hipsycl) build_hipsycl ;;
 # XXX below are local only; licence or very large download required, candidate for local runner
 computecpp) build_computecpp ;;
 icpx) build_icpx ;;
 icpc) build_icpc ;;
 all)
  build_gcc
  build_clang
  build_nvhpc
  build_aocc
  build_aomp
  build_hip
  build_dpcpp
  build_hipsycl
  build_computecpp
  build_icpx
  build_icpc
  ;;
 *)
  echo "Unknown $COMPILER, use ALL to compile with all supported compilers"
  ;;
 esac
--- a/cmake/Modules/ComputeCppCompilerChecks.cmake
+++ b/cmake/Modules/ComputeCppCompilerChecks.cmake
@ -0,0 +1,65 @@
 cmake_minimum_required(VERSION 3.4.3)
 if(CMAKE_COMPILER_IS_GNUCXX)
  if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.8)
    message(FATAL_ERROR "host compiler - gcc version must be > 4.8")
  endif()
 elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
  if (${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 3.6)
    message(FATAL_ERROR "host compiler - clang version must be > 3.6")
  endif()
 endif()
 if(MSVC)
  set(ComputeCpp_STL_CHECK_SRC __STL_check)
  file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/${ComputeCpp_STL_CHECK_SRC}.cpp
    "#include <CL/sycl.hpp>  \n"
    "int main() { return 0; }\n")
  set(_stl_test_command ${ComputeCpp_DEVICE_COMPILER_EXECUTABLE}
                        -sycl
                        ${COMPUTECPP_DEVICE_COMPILER_FLAGS}
                        -isystem ${ComputeCpp_INCLUDE_DIRS}
                        -isystem ${OpenCL_INCLUDE_DIRS}
                        -o ${ComputeCpp_STL_CHECK_SRC}.sycl
                        -c ${ComputeCpp_STL_CHECK_SRC}.cpp)
  execute_process(
    COMMAND ${_stl_test_command}
    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
    RESULT_VARIABLE ComputeCpp_STL_CHECK_RESULT
    ERROR_QUIET
    OUTPUT_QUIET)
  if(NOT ${ComputeCpp_STL_CHECK_RESULT} EQUAL 0)
    # Try disabling compiler version checks
    execute_process(
      COMMAND ${_stl_test_command}
              -D_ALLOW_COMPILER_AND_STL_VERSION_MISMATCH
      WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
      RESULT_VARIABLE ComputeCpp_STL_CHECK_RESULT
      ERROR_QUIET
      OUTPUT_QUIET)
    if(NOT ${ComputeCpp_STL_CHECK_RESULT} EQUAL 0)
      # Try again with __CUDACC__ and _HAS_CONDITIONAL_EXPLICIT=0. This relaxes the restritions in the MSVC headers
      execute_process(
        COMMAND ${_stl_test_command}
                -D_ALLOW_COMPILER_AND_STL_VERSION_MISMATCH
                -D_HAS_CONDITIONAL_EXPLICIT=0
                -D__CUDACC__
        WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
        RESULT_VARIABLE ComputeCpp_STL_CHECK_RESULT
        ERROR_QUIET
        OUTPUT_QUIET)
        if(NOT ${ComputeCpp_STL_CHECK_RESULT} EQUAL 0)
          message(FATAL_ERROR "compute++ cannot consume hosted STL headers. This means that compute++ can't \
                               compile a simple program in this platform and will fail when used in this system.")
        else()
          list(APPEND COMPUTECPP_DEVICE_COMPILER_FLAGS -D_ALLOW_COMPILER_AND_STL_VERSION_MISMATCH
                                                       -D_HAS_CONDITIONAL_EXPLICIT=0
                                                       -D__CUDACC__)
        endif()
    else()
      list(APPEND COMPUTECPP_DEVICE_COMPILER_FLAGS -D_ALLOW_COMPILER_AND_STL_VERSION_MISMATCH)
    endif()
  endif()
  file(REMOVE ${CMAKE_CURRENT_BINARY_DIR}/${ComputeCpp_STL_CHECK_SRC}.cpp
              ${CMAKE_CURRENT_BINARY_DIR}/${ComputeCpp_STL_CHECK_SRC}.cpp.sycl)
 endif(MSVC)
--- a/cmake/Modules/ComputeCppIRMap.cmake
+++ b/cmake/Modules/ComputeCppIRMap.cmake
@ -0,0 +1,18 @@
 cmake_minimum_required(VERSION 3.4.3)
 # These should match the types of IR output by compute++
 set(IR_MAP_spir bc)
 set(IR_MAP_spir64 bc)
 set(IR_MAP_spir32 bc)
 set(IR_MAP_spirv spv)
 set(IR_MAP_spirv64 spv)
 set(IR_MAP_spirv32 spv)
 set(IR_MAP_aorta-x86_64 o)
 set(IR_MAP_aorta-aarch64 o)
 set(IR_MAP_aorta-rcar-cve o)
 set(IR_MAP_custom-spir64 bc)
 set(IR_MAP_custom-spir32 bc)
 set(IR_MAP_custom-spirv64 spv)
 set(IR_MAP_custom-spirv32 spv)
 set(IR_MAP_ptx64 s)
 set(IR_MAP_amdgcn s)
--- a/cmake/Modules/FindComputeCpp.cmake
+++ b/cmake/Modules/FindComputeCpp.cmake
@ -0,0 +1,454 @@
 #.rst:
 # FindComputeCpp
 #---------------
 #
 #   Copyright 2016-2018 Codeplay Software Ltd.
 #
 #   Licensed under the Apache License, Version 2.0 (the "License");
 #   you may not use these files except in compliance with the License.
 #   You may obtain a copy of the License at
 #
 #       http://www.apache.org/licenses/LICENSE-2.0
 #
 #
 #   Unless required by applicable law or agreed to in writing, software
 #   distributed under the License is distributed on an "AS IS" BASIS,
 #   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 #   See the License for the specific language governing permissions and
 #   limitations under the License.
 #########################
 #  FindComputeCpp.cmake
 #########################
 #
 #  Tools for finding and building with ComputeCpp.
 #
 #  User must define ComputeCpp_DIR pointing to the ComputeCpp
 #  installation.
 #
 #  Latest version of this file can be found at:
 #    https://github.com/codeplaysoftware/computecpp-sdk
 cmake_minimum_required(VERSION 3.4.3)
 include(FindPackageHandleStandardArgs)
 include(ComputeCppIRMap)
 set(COMPUTECPP_USER_FLAGS "" CACHE STRING "User flags for compute++")
 separate_arguments(COMPUTECPP_USER_FLAGS)
 mark_as_advanced(COMPUTECPP_USER_FLAGS)
 set(COMPUTECPP_BITCODE "spir64" CACHE STRING
  "Bitcode type to use as SYCL target in compute++")
 mark_as_advanced(COMPUTECPP_BITCODE)
 find_package(OpenCL REQUIRED)
 # Find ComputeCpp package
 if(DEFINED ComputeCpp_DIR)
  set(computecpp_find_hint ${ComputeCpp_DIR})
 elseif(DEFINED ENV{COMPUTECPP_DIR})
  set(computecpp_find_hint $ENV{COMPUTECPP_DIR})
 endif()
 # Used for running executables on the host
 set(computecpp_host_find_hint ${computecpp_find_hint})
 if(CMAKE_CROSSCOMPILING)
  # ComputeCpp_HOST_DIR is used to find executables that are run on the host
  if(DEFINED ComputeCpp_HOST_DIR)
    set(computecpp_host_find_hint ${ComputeCpp_HOST_DIR})
  elseif(DEFINED ENV{COMPUTECPP_HOST_DIR})
    set(computecpp_host_find_hint $ENV{COMPUTECPP_HOST_DIR})
  endif()
 endif()
 find_program(ComputeCpp_DEVICE_COMPILER_EXECUTABLE compute++
  HINTS ${computecpp_host_find_hint}
  PATH_SUFFIXES bin
  NO_SYSTEM_ENVIRONMENT_PATH)
 find_program(ComputeCpp_INFO_EXECUTABLE computecpp_info
  HINTS ${computecpp_host_find_hint}
  PATH_SUFFIXES bin
  NO_SYSTEM_ENVIRONMENT_PATH)
 find_library(COMPUTECPP_RUNTIME_LIBRARY
  NAMES ComputeCpp ComputeCpp_vs2015
  HINTS ${computecpp_find_hint}
  PATH_SUFFIXES lib
  DOC "ComputeCpp Runtime Library")
 find_library(COMPUTECPP_RUNTIME_LIBRARY_DEBUG
  NAMES ComputeCpp_d ComputeCpp ComputeCpp_vs2015_d
  HINTS ${computecpp_find_hint}
  PATH_SUFFIXES lib
  DOC "ComputeCpp Debug Runtime Library")
 find_path(ComputeCpp_INCLUDE_DIRS
  NAMES "CL/sycl.hpp"
  HINTS ${computecpp_find_hint}/include
  DOC "The ComputeCpp include directory")
 get_filename_component(ComputeCpp_INCLUDE_DIRS ${ComputeCpp_INCLUDE_DIRS} ABSOLUTE)
 get_filename_component(computecpp_canonical_root_dir "${ComputeCpp_INCLUDE_DIRS}/.." ABSOLUTE)
 set(ComputeCpp_ROOT_DIR "${computecpp_canonical_root_dir}" CACHE PATH
    "The root of the ComputeCpp install")
 if(NOT ComputeCpp_INFO_EXECUTABLE)
  message(WARNING "Can't find computecpp_info - check ComputeCpp_DIR")
 else()
  execute_process(COMMAND ${ComputeCpp_INFO_EXECUTABLE} "--dump-version"
    OUTPUT_VARIABLE ComputeCpp_VERSION
    RESULT_VARIABLE ComputeCpp_INFO_EXECUTABLE_RESULT OUTPUT_STRIP_TRAILING_WHITESPACE)
  if(NOT ComputeCpp_INFO_EXECUTABLE_RESULT EQUAL "0")
    message(WARNING "Package version - Error obtaining version!")
  endif()
  execute_process(COMMAND ${ComputeCpp_INFO_EXECUTABLE} "--dump-is-supported"
    OUTPUT_VARIABLE COMPUTECPP_PLATFORM_IS_SUPPORTED
    RESULT_VARIABLE ComputeCpp_INFO_EXECUTABLE_RESULT OUTPUT_STRIP_TRAILING_WHITESPACE)
  if(NOT ComputeCpp_INFO_EXECUTABLE_RESULT EQUAL "0")
    message(WARNING "platform - Error checking platform support!")
  else()
    mark_as_advanced(COMPUTECPP_PLATFORM_IS_SUPPORTED)
    if (COMPUTECPP_PLATFORM_IS_SUPPORTED)
      message(STATUS "platform - your system can support ComputeCpp")
    else()
      message(STATUS "platform - your system is not officially supported")
    endif()
  endif()
 endif()
 find_package_handle_standard_args(ComputeCpp
  REQUIRED_VARS ComputeCpp_ROOT_DIR
                ComputeCpp_DEVICE_COMPILER_EXECUTABLE
                ComputeCpp_INFO_EXECUTABLE
                COMPUTECPP_RUNTIME_LIBRARY
                COMPUTECPP_RUNTIME_LIBRARY_DEBUG
                ComputeCpp_INCLUDE_DIRS
  VERSION_VAR ComputeCpp_VERSION)
 mark_as_advanced(ComputeCpp_ROOT_DIR
                 ComputeCpp_DEVICE_COMPILER_EXECUTABLE
                 ComputeCpp_INFO_EXECUTABLE
                 COMPUTECPP_RUNTIME_LIBRARY
                 COMPUTECPP_RUNTIME_LIBRARY_DEBUG
                 ComputeCpp_INCLUDE_DIRS
                 ComputeCpp_VERSION)
 if(NOT ComputeCpp_FOUND)
  return()
 endif()
 list(APPEND COMPUTECPP_DEVICE_COMPILER_FLAGS -O2 -mllvm -inline-threshold=1000 -intelspirmetadata)
 mark_as_advanced(COMPUTECPP_DEVICE_COMPILER_FLAGS)
 if(CMAKE_CROSSCOMPILING)
  if(NOT COMPUTECPP_DONT_USE_TOOLCHAIN)
    list(APPEND COMPUTECPP_DEVICE_COMPILER_FLAGS --gcc-toolchain=${COMPUTECPP_TOOLCHAIN_DIR})
  endif()
  list(APPEND COMPUTECPP_DEVICE_COMPILER_FLAGS --sysroot=${COMPUTECPP_SYSROOT_DIR})
  list(APPEND COMPUTECPP_DEVICE_COMPILER_FLAGS -target ${COMPUTECPP_TARGET_TRIPLE})
 endif()
 list(APPEND COMPUTECPP_DEVICE_COMPILER_FLAGS -sycl-target ${COMPUTECPP_BITCODE})
 message(STATUS "compute++ flags - ${COMPUTECPP_DEVICE_COMPILER_FLAGS}")
 include(ComputeCppCompilerChecks)
 if(NOT TARGET OpenCL::OpenCL)
  add_library(OpenCL::OpenCL UNKNOWN IMPORTED)
  set_target_properties(OpenCL::OpenCL PROPERTIES
    IMPORTED_LOCATION             "${OpenCL_LIBRARIES}"
    INTERFACE_INCLUDE_DIRECTORIES "${OpenCL_INCLUDE_DIRS}"
  )
 endif()
 if(NOT TARGET ComputeCpp::ComputeCpp)
  add_library(ComputeCpp::ComputeCpp UNKNOWN IMPORTED)
  set_target_properties(ComputeCpp::ComputeCpp PROPERTIES
    IMPORTED_LOCATION_DEBUG          "${COMPUTECPP_RUNTIME_LIBRARY_DEBUG}"
    IMPORTED_LOCATION_RELWITHDEBINFO "${COMPUTECPP_RUNTIME_LIBRARY}"
    IMPORTED_LOCATION                "${COMPUTECPP_RUNTIME_LIBRARY}"
    INTERFACE_INCLUDE_DIRECTORIES    "${ComputeCpp_INCLUDE_DIRS}"
    INTERFACE_LINK_LIBRARIES         "OpenCL::OpenCL"
  )
 endif()
 # This property allows targets to specify that their sources should be
 # compiled with the integration header included after the user's
 # sources, not before (e.g. when an enum is used in a kernel name, this
 # is not technically valid SYCL code but can work with ComputeCpp)
 define_property(
  TARGET PROPERTY COMPUTECPP_INCLUDE_AFTER
  BRIEF_DOCS "Include integration header after user source"
  FULL_DOCS "Changes compiler arguments such that the source file is
  actually the integration header, and the .cpp file is included on
  the command line so that it is seen by the compiler first. Enables
  non-standards-conformant SYCL code to compile with ComputeCpp."
 )
 define_property(
  TARGET PROPERTY INTERFACE_COMPUTECPP_FLAGS
  BRIEF_DOCS "Interface compile flags to provide compute++"
  FULL_DOCS  "Set additional compile flags to pass to compute++ when compiling
  any target which links to this one."
 )
 define_property(
  SOURCE PROPERTY COMPUTECPP_SOURCE_FLAGS
  BRIEF_DOCS "Source file compile flags for compute++"
  FULL_DOCS  "Set additional compile flags for compiling the SYCL integration
  header for the given source file."
 )
 ####################
 #   __build_ir
 ####################
 #
 #  Adds a custom target for running compute++ and adding a dependency for the
 #  resulting integration header and kernel binary.
 #
 #  TARGET : Name of the target.
 #  SOURCE : Source file to be compiled.
 #  COUNTER : Counter included in name of custom target. Different counter
 #       values prevent duplicated names of custom target when source files with
 #       the same name, but located in different directories, are used for the
 #       same target.
 #
 function(__build_ir)
  set(options)
  set(one_value_args
    TARGET
    SOURCE
    COUNTER
  )
  set(multi_value_args)
  cmake_parse_arguments(SDK_BUILD_IR
    "${options}"
    "${one_value_args}"
    "${multi_value_args}"
    ${ARGN}
  )
  get_filename_component(sourceFileName ${SDK_BUILD_IR_SOURCE} NAME)
  # Set the path to the integration header.
  # The .sycl filename must depend on the target so that different targets
  # using the same source file will be generated with a different rule.
  set(baseSyclName ${CMAKE_CURRENT_BINARY_DIR}/${SDK_BUILD_IR_TARGET}_${sourceFileName})
  set(outputSyclFile ${baseSyclName}.sycl)
  set(outputDeviceFile ${baseSyclName}.${IR_MAP_${COMPUTECPP_BITCODE}})
  set(depFileName ${baseSyclName}.sycl.d)
  set(include_directories "$<TARGET_PROPERTY:${SDK_BUILD_IR_TARGET},INCLUDE_DIRECTORIES>")
  set(compile_definitions "$<TARGET_PROPERTY:${SDK_BUILD_IR_TARGET},COMPILE_DEFINITIONS>")
  set(generated_include_directories
    $<$<BOOL:${include_directories}>:-I\"$<JOIN:${include_directories},\"\t-I\">\">)
  set(generated_compile_definitions
    $<$<BOOL:${compile_definitions}>:-D$<JOIN:${compile_definitions},\t-D>>)
  # Obtain language standard of the file
  set(device_compiler_cxx_standard)
  get_target_property(targetCxxStandard ${SDK_BUILD_IR_TARGET} CXX_STANDARD)
  if (targetCxxStandard MATCHES 17)
    set(device_compiler_cxx_standard "-std=c++1z")
  elseif (targetCxxStandard MATCHES 14)
    set(device_compiler_cxx_standard "-std=c++14")
  elseif (targetCxxStandard MATCHES 11)
    set(device_compiler_cxx_standard "-std=c++11")
  elseif (targetCxxStandard MATCHES 98)
    message(FATAL_ERROR "SYCL applications cannot be compiled using C++98")
  else ()
    set(device_compiler_cxx_standard "")
  endif()
  get_property(source_compile_flags
    SOURCE ${SDK_BUILD_IR_SOURCE}
    PROPERTY COMPUTECPP_SOURCE_FLAGS
  )
  separate_arguments(source_compile_flags)
  if(source_compile_flags)
    list(APPEND computecpp_source_flags ${source_compile_flags})
  endif()
  list(APPEND COMPUTECPP_DEVICE_COMPILER_FLAGS
    ${device_compiler_cxx_standard}
    ${COMPUTECPP_USER_FLAGS}
    ${computecpp_source_flags}
  )
  set(ir_dependencies ${SDK_BUILD_IR_SOURCE})
  get_target_property(target_libraries ${SDK_BUILD_IR_TARGET} LINK_LIBRARIES)
  if(target_libraries)
    foreach(library ${target_libraries})
      if(TARGET ${library})
        list(APPEND ir_dependencies ${library})
      endif()
    endforeach()
  endif()
  # Depfile support was only added in CMake 3.7
  # CMake throws an error if it is unsupported by the generator (i. e. not ninja)
  if((NOT CMAKE_VERSION VERSION_LESS 3.7.0) AND
          CMAKE_GENERATOR MATCHES "Ninja")
    file(RELATIVE_PATH relOutputFile ${CMAKE_BINARY_DIR} ${outputDeviceFile})
    set(generate_depfile -MMD -MF ${depFileName} -MT ${relOutputFile})
    set(enable_depfile DEPFILE ${depFileName})
  endif()
  # Add custom command for running compute++
  add_custom_command(
    OUTPUT ${outputDeviceFile} ${outputSyclFile}
    COMMAND ${ComputeCpp_DEVICE_COMPILER_EXECUTABLE}
            ${COMPUTECPP_DEVICE_COMPILER_FLAGS}
            ${generated_include_directories}
            ${generated_compile_definitions}
            -sycl-ih ${outputSyclFile}
            -o ${outputDeviceFile}
            -c ${SDK_BUILD_IR_SOURCE}
            ${generate_depfile}
    DEPENDS ${ir_dependencies}
    IMPLICIT_DEPENDS CXX ${SDK_BUILD_IR_SOURCE}
    ${enable_depfile}
    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
    COMMENT "Building ComputeCpp integration header file ${outputSyclFile}")
  # Name: (user-defined name)_(source file)_(counter)_ih
  set(headerTargetName
    ${SDK_BUILD_IR_TARGET}_${sourceFileName}_${SDK_BUILD_IR_COUNTER}_ih)
  if(NOT MSVC)
    # Add a custom target for the generated integration header
    add_custom_target(${headerTargetName} DEPENDS ${outputDeviceFile} ${outputSyclFile})
    add_dependencies(${SDK_BUILD_IR_TARGET} ${headerTargetName})
  endif()
  # This property can be set on a per-target basis to indicate that the
  # integration header should appear after the main source listing
  get_target_property(includeAfter ${SDK_ADD_SYCL_TARGET} COMPUTECPP_INCLUDE_AFTER)
  if(includeAfter)
    # Change the source file to the integration header - e.g.
    # g++ -c source_file_name.cpp.sycl
    get_target_property(current_sources ${SDK_BUILD_IR_TARGET} SOURCES)
    # Remove absolute path to source file
    list(REMOVE_ITEM current_sources ${SDK_BUILD_IR_SOURCE})
    # Remove relative path to source file
    string(REPLACE "${CMAKE_CURRENT_SOURCE_DIR}/" ""
      rel_source_file ${SDK_BUILD_IR_SOURCE}
    )
    list(REMOVE_ITEM current_sources ${rel_source_file})
    # Add SYCL header to source list
    list(APPEND current_sources ${outputSyclFile})
    set_property(TARGET ${SDK_BUILD_IR_TARGET}
      PROPERTY SOURCES ${current_sources})
    # CMake/gcc don't know what language a .sycl file is, so tell them
    set_property(SOURCE ${outputSyclFile} PROPERTY LANGUAGE CXX)
    set(includedFile ${SDK_BUILD_IR_SOURCE})
    set(cppFile ${outputSyclFile})
  else()
    set_property(SOURCE ${outputSyclFile} PROPERTY HEADER_FILE_ONLY ON)
    set(includedFile ${outputSyclFile})
    set(cppFile ${SDK_BUILD_IR_SOURCE})
  endif()
  # Force inclusion of the integration header for the host compiler
  if(MSVC)
    # Group SYCL files inside Visual Studio
    source_group("SYCL" FILES ${outputSyclFile})
    if(includeAfter)
      # Allow the source file to be edited using Visual Studio.
      # It will be added as a header file so it won't be compiled.
      set_property(SOURCE ${SDK_BUILD_IR_SOURCE} PROPERTY HEADER_FILE_ONLY true)
    endif()
    # Add both source and the sycl files to the VS solution.
    target_sources(${SDK_BUILD_IR_TARGET} PUBLIC ${SDK_BUILD_IR_SOURCE} ${outputSyclFile})
    set(forceIncludeFlags "/FI${includedFile} /TP")
  else()
    set(forceIncludeFlags "-include ${includedFile} -x c++")
  endif()
  set_property(
    SOURCE ${cppFile}
    APPEND_STRING PROPERTY COMPILE_FLAGS "${forceIncludeFlags}"
  )
 endfunction(__build_ir)
 #######################
 #  add_sycl_to_target
 #######################
 #
 #  Adds a SYCL compilation custom command associated with an existing
 #  target and sets a dependancy on that new command.
 #
 #  TARGET : Name of the target to add SYCL to.
 #  SOURCES : Source files to be compiled for SYCL.
 #
 function(add_sycl_to_target)
  set(options)
  set(one_value_args
    TARGET
  )
  set(multi_value_args
    SOURCES
  )
  cmake_parse_arguments(SDK_ADD_SYCL
    "${options}"
    "${one_value_args}"
    "${multi_value_args}"
    ${ARGN}
  )
  set_target_properties(${SDK_ADD_SYCL_TARGET} PROPERTIES LINKER_LANGUAGE CXX)
  # If the CXX compiler is set to compute++ enable the driver.
  get_filename_component(cmakeCxxCompilerFileName "${CMAKE_CXX_COMPILER}" NAME)
  if("${cmakeCxxCompilerFileName}" STREQUAL "compute++")
    if(MSVC)
      message(FATAL_ERROR "The compiler driver is not supported by this system,
                           revert the CXX compiler to your default host compiler.")
    endif()
    get_target_property(includeAfter ${SDK_ADD_SYCL_TARGET} COMPUTECPP_INCLUDE_AFTER)
    if(includeAfter)
      list(APPEND COMPUTECPP_USER_FLAGS -fsycl-ih-last)
    endif()
    list(INSERT COMPUTECPP_DEVICE_COMPILER_FLAGS 0 -sycl-driver)
    # Prepend COMPUTECPP_DEVICE_COMPILER_FLAGS and append COMPUTECPP_USER_FLAGS
    foreach(prop COMPILE_OPTIONS INTERFACE_COMPILE_OPTIONS)
      get_target_property(target_compile_options ${SDK_ADD_SYCL_TARGET} ${prop})
      if(NOT target_compile_options)
        set(target_compile_options "")
      endif()
      set_property(
        TARGET ${SDK_ADD_SYCL_TARGET}
        PROPERTY ${prop}
        ${COMPUTECPP_DEVICE_COMPILER_FLAGS}
        ${target_compile_options}
        ${COMPUTECPP_USER_FLAGS}
      )
    endforeach()
  else()
    set(fileCounter 0)
    list(INSERT COMPUTECPP_DEVICE_COMPILER_FLAGS 0 -sycl)
    # Add custom target to run compute++ and generate the integration header
    foreach(sourceFile ${SDK_ADD_SYCL_SOURCES})
      if(NOT IS_ABSOLUTE ${sourceFile})
        set(sourceFile "${CMAKE_CURRENT_SOURCE_DIR}/${sourceFile}")
      endif()
      __build_ir(
        TARGET     ${SDK_ADD_SYCL_TARGET}
        SOURCE     ${sourceFile}
        COUNTER    ${fileCounter}
      )
      MATH(EXPR fileCounter "${fileCounter} + 1")
    endforeach()
  endif()
  set_property(TARGET ${SDK_ADD_SYCL_TARGET}
    APPEND PROPERTY LINK_LIBRARIES ComputeCpp::ComputeCpp)
  set_property(TARGET ${SDK_ADD_SYCL_TARGET}
    APPEND PROPERTY INTERFACE_LINK_LIBRARIES ComputeCpp::ComputeCpp)
 endfunction(add_sycl_to_target)
--- a/cmake/toolchains/arm-gcc-poky.cmake
+++ b/cmake/toolchains/arm-gcc-poky.cmake
@ -0,0 +1,34 @@
 set(CMAKE_SYSTEM_NAME Linux)
 set(CMAKE_SYSTEM_PROCESSOR ARM64)
 set(SDK_POKY_ROOT $ENV{SDK_POKY_ROOT})
 if(NOT SDK_POKY_ROOT)
  message(FATAL_ERROR
    "Please set SDK_POKY_ROOT in the environment when crosscompiling.")
 endif()
 set(COMPUTECPP_TARGET_TRIPLE aarch64-poky-linux)
 set(COMPUTECPP_TOOLCHAIN_DIR ${SDK_POKY_ROOT}/x86_64-pokysdk-linux)
 set(COMPUTECPP_SYSROOT_DIR ${SDK_POKY_ROOT}/aarch64-poky-linux)
 # Adding this as the GCC toolchain makes compute++ not find headers
 set(COMPUTECPP_DONT_USE_TOOLCHAIN ON)
 set(CMAKE_C_COMPILER "${COMPUTECPP_TOOLCHAIN_DIR}/usr/bin/${COMPUTECPP_TARGET_TRIPLE}/${COMPUTECPP_TARGET_TRIPLE}-gcc" CACHE PATH "gcc")
 set(CMAKE_CXX_COMPILER "${COMPUTECPP_TOOLCHAIN_DIR}/usr/bin/${COMPUTECPP_TARGET_TRIPLE}/${COMPUTECPP_TARGET_TRIPLE}-g++" CACHE PATH "g++")
 set(CMAKE_AR "${COMPUTECPP_TOOLCHAIN_DIR}/usr/bin/${COMPUTECPP_TARGET_TRIPLE}/${COMPUTECPP_TARGET_TRIPLE}-ar" CACHE PATH "archive")
 set(CMAKE_LINKER "${COMPUTECPP_TOOLCHAIN_DIR}/usr/bin/${COMPUTECPP_TARGET_TRIPLE}/${COMPUTECPP_TARGET_TRIPLE}-ld" CACHE PATH "linker")
 set(CMAKE_NM "${COMPUTECPP_TOOLCHAIN_DIR}/usr/bin/${COMPUTECPP_TARGET_TRIPLE}/${COMPUTECPP_TARGET_TRIPLE}-nm" CACHE PATH "nm")
 set(CMAKE_OBJCOPY "${COMPUTECPP_TOOLCHAIN_DIR}/usr/bin/${COMPUTECPP_TARGET_TRIPLE}/${COMPUTECPP_TARGET_TRIPLE}-objcopy" CACHE PATH "objcopy")
 set(CMAKE_OBJDUMP "${COMPUTECPP_TOOLCHAIN_DIR}/usr/bin/${COMPUTECPP_TARGET_TRIPLE}/${COMPUTECPP_TARGET_TRIPLE}-objdump" CACHE PATH "objdump")
 set(CMAKE_STRIP "${COMPUTECPP_TOOLCHAIN_DIR}/usr/bin/${COMPUTECPP_TARGET_TRIPLE}/${COMPUTECPP_TARGET_TRIPLE}-strip" CACHE PATH "strip")
 set(CMAKE_RANLIB "${COMPUTECPP_TOOLCHAIN_DIR}/usr/bin/${COMPUTECPP_TARGET_TRIPLE}/${COMPUTECPP_TARGET_TRIPLE}-ranlib" CACHE PATH "ranlib")
 set(CMAKE_FIND_ROOT_PATH ${COMPUTECPP_SYSROOT_DIR})
 set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
 set(CMAKE_SYSROOT "${COMPUTECPP_SYSROOT_DIR}")
 set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D__aarch64__ --sysroot=${COMPUTECPP_SYSROOT_DIR}" CACHE INTERNAL "")
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__aarch64__ --sysroot=${COMPUTECPP_SYSROOT_DIR}" CACHE INTERNAL "")
 set(CMAKE_CXX_LINK_EXECUTABLE "<CMAKE_CXX_COMPILER> <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS> -o <TARGET> <OBJECTS> <LINK_LIBRARIES>" CACHE INTERNAL "")
--- a/cmake/toolchains/gcc-generic.cmake
+++ b/cmake/toolchains/gcc-generic.cmake
@ -0,0 +1,18 @@
 set(CMAKE_SYSTEM_NAME Linux)
 set(COMPUTECPP_SYSROOT_DIR $ENV{COMPUTECPP_SYSROOT_DIR})
 set(COMPUTECPP_TOOLCHAIN_DIR $ENV{COMPUTECPP_TOOLCHAIN_DIR})
 set(COMPUTECPP_TARGET_TRIPLE $ENV{COMPUTECPP_TARGET_TRIPLE})
 if(NOT COMPUTECPP_SYSROOT_DIR OR
  NOT COMPUTECPP_TOOLCHAIN_DIR OR
  NOT COMPUTECPP_TARGET_TRIPLE
 )
  message(FATAL_ERROR
    "Please set all of COMPUTECPP_TARGET_TRIPLE, COMPUTECPP_SYSROOT_DIR and "
    "COMPUTECPP_TOOLCHAIN_DIR in the environment when crosscompiling.")
 endif()
 set(CMAKE_SYSROOT ${COMPUTECPP_SYSROOT_DIR})
 set(CMAKE_C_COMPILER ${COMPUTECPP_TOOLCHAIN_DIR}/bin/${COMPUTECPP_TARGET_TRIPLE}-gcc)
 set(CMAKE_CXX_COMPILER ${COMPUTECPP_TOOLCHAIN_DIR}/bin/${COMPUTECPP_TARGET_TRIPLE}-g++)
 set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY NEVER)
--- a/legacy/HC.make
+++ b/legacy/HC.make
@ -13,7 +13,7 @@ CXXFLAGS+=-DNTILES=$(TBSIZE)
 endif
-hc-stream: main.cpp HCStream.cpp
+hc-stream: ../main.cpp HCStream.cpp
 	$(HCC) $(CXXFLAGS) -DHC  $^  $(LDFLAGS) $(EXTRA_FLAGS) -o $@
 .PHONY: clean
--- a/legacy/HCStream.cpp
+++ b/legacy/HCStream.cpp
--- a/legacy/HCStream.h
+++ b/legacy/HCStream.h
--- a/register_models.cmake
+++ b/register_models.cmake
@ -0,0 +1,146 @@
 #function(switch_expr INPUT OUTPUT)
 #    list(LENGTH ARGN N)
 #    math(EXPR EVEN "${N} % 2")
 #    if (NOT EVEN EQUAL 0)
 #        message(FATAL_ERROR "Expr must be a list of string pairs, ${EVEN}")
 #    endif ()
 #    math(EXPR N_ "${N}-1")
 #    foreach (idx RANGE 0 ${N_} 2)
 #        math(EXPR KEY_IDX "${idx} + 0")
 #        math(EXPR VALUE_IDX "${idx} + 1")
 #        list(GET ARGN ${KEY_IDX} KEY)
 #        list(GET ARGN ${VALUE_IDX} VALUE)
 #        if (${KEY} STREQUAL ${INPUT})
 #            set(${OUTPUT} ${VALUE} PARENT_SCOPE)
 #            break()
 #        endif ()
 #    endforeach ()
 #endfunction()
 #
 macro(wipe_gcc_style_optimisation_flags VAR)
    string(REGEX REPLACE "([\\/\\-]O.)" "" ${VAR} ${${VAR}})
 endmacro()
 macro(register_link_library)
    list(APPEND LINK_LIBRARIES ${ARGN})
 endmacro()
 macro(register_append_cxx_flags CONFIG)
    if ("${CONFIG}" STREQUAL "RELEASE" OR "${CONFIG}" STREQUAL "ANY")
        list(APPEND DEFAULT_RELEASE_FLAGS ${ARGN})
    elseif ("${CONFIG}" STREQUAL "DEBUG" OR "${CONFIG}" STREQUAL "ANY")
        list(APPEND DEFAULT_DEBUG_FLAGS ${ARGN})
    else ()
        message(FATAL_ERROR "register_flags supports only RELEASE, DEBUG, or ANY for all configs, got `${CONFIG}`")
    endif ()
 endmacro()
 macro(register_append_link_flags)
    list(APPEND LINK_FLAGS ${ARGN})
 endmacro()
 macro(register_append_compiler_and_arch_specific_cxx_flags PREFIX CXX ARCH)
    string(TOUPPER ${CXX} _CXX)
    string(TOUPPER ${ARCH} _ARCH)
    set(_CXX_ARCH_SPECIFIC_FLAGS "${${PREFIX}_${_CXX}_${_ARCH}}")
    if (_CXX_ARCH_SPECIFIC_FLAGS)
        register_append_cxx_flags(ANY ${_CXX_ARCH_SPECIFIC_FLAGS})
    endif ()
    set(_CXX_ARCH_SPECIFIC_FLAGS "${${PREFIX}_${_CXX}}")
    if (_CXX_ARCH_SPECIFIC_FLAGS)
        register_append_cxx_flags(ANY ${_CXX_ARCH_SPECIFIC_FLAGS})
    endif ()
 endmacro()
 macro(register_definitions)
    list(APPEND IMPL_DEFINITIONS ${ARGN})
 endmacro()
 macro(register_flag_required NAME DESCRIPTION)
    list(APPEND CUSTOM_FLAGS_TRIPLE "${NAME}" "${DESCRIPTION}" ON "")
 endmacro()
 macro(register_flag_optional NAME DESCRIPTION DEFAULT)
    list(APPEND CUSTOM_FLAGS_TRIPLE "${NAME}" "${DESCRIPTION}" OFF "${DEFAULT}")
 endmacro()
 function(registered_flags_action ACTION OUT)
    list(LENGTH CUSTOM_FLAGS_TRIPLE NFLAGS)
    if (NOT NFLAGS EQUAL "0")
        if (${ACTION} STREQUAL "print")
            set(LINE "Supported flags:\n\n")
        elseif (${ACTION} STREQUAL "check")
            set(LINE "Model-specific flags for this build:\n\n")
        endif ()
        math(EXPR NFLAGS "${NFLAGS}-1")
        foreach (idx RANGE 0 ${NFLAGS} 4)
            math(EXPR NAME_IDX "${idx} + 0")
            math(EXPR DESCRIPTION_IDX "${idx} + 1")
            math(EXPR REQUIRED_IDX "${idx} + 2")
            math(EXPR DEFAULT_VALUE_IDX "${idx} + 3")
            list(GET CUSTOM_FLAGS_TRIPLE ${NAME_IDX} NAME)
            list(GET CUSTOM_FLAGS_TRIPLE ${DESCRIPTION_IDX} DESCRIPTION)
            list(GET CUSTOM_FLAGS_TRIPLE ${REQUIRED_IDX} REQUIRED)
            list(GET CUSTOM_FLAGS_TRIPLE ${DEFAULT_VALUE_IDX} DEFAULT_VALUE)
            if (${ACTION} STREQUAL "print")
                if (${REQUIRED})
                    set(DEFAULT_VALUE "(required)")
                else ()
                    set(DEFAULT_VALUE "(optional, default=${DEFAULT_VALUE})")
                endif ()
                set(LINE "${LINE}   ${NAME} ${DEFAULT_VALUE}: ${DESCRIPTION}\n")
            elseif (${ACTION} STREQUAL "check")
                if (${REQUIRED})
                    # required flag
                    if (NOT DEFINED ${NAME})
                        message(FATAL_ERROR "`${NAME}` is not set! (${DESCRIPTION})")
                    endif ()
                else ()
                    # optional flag with default
                    if (NOT DEFINED ${NAME})
                        set(${NAME} "${DEFAULT_VALUE}" PARENT_SCOPE) # setting PARENT_SCOPE does not affect local scope
                        set(${NAME} "${DEFAULT_VALUE}")
                    endif ()
                endif ()
                set(LINE "${LINE}   ${NAME} = `${${NAME}}`\n")
            else ()
                message(FATAL_ERROR "action `${ACTION}` not supported")
            endif ()
        endforeach ()
    endif ()
    set(${OUT} "${LINE}" PARENT_SCOPE)
 endfunction()
 macro(register_model NAME PREPROCESSOR_NAME)
    string(TOUPPER ${NAME} MODEL_UPPER)
    list(APPEND REGISTERED_MODELS "${NAME}")
    list(APPEND IMPL_${MODEL_UPPER}_SOURCES "${ARGN}")
    list(APPEND IMPL_${MODEL_UPPER}_DEFINITIONS "${PREPROCESSOR_NAME}")
 endmacro()
 macro(load_model MODEL)
    string(TOUPPER "${MODEL}" MODEL_UPPER)
    if ("${MODEL_UPPER}" IN_LIST REGISTERED_MODELS)
        set(MODEL_FILE ${CMAKE_CURRENT_SOURCE_DIR}/${MODEL_UPPER}.cmake)
        if (NOT EXISTS ${MODEL_FILE})
            message(FATAL_ERROR "${MODEL_FILE} not found, perhaps it needs to be implemented?")
        endif ()
        include(${MODEL_FILE})
        list(APPEND IMPL_SOURCES ${IMPL_${MODEL_UPPER}_SOURCES})
        list(APPEND IMPL_DEFINITIONS ${IMPL_${MODEL_UPPER}_DEFINITIONS})
        string(TOLOWER ${MODEL} MODEL_LOWER)
        set(EXE_NAME ${MODEL_LOWER}-stream)
    else ()
        message(FATAL_ERROR "Unsupported model: ${MODEL}")
    endif ()
 endmacro()