diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml index 20e1034..08eed2d 100644 --- a/.github/workflows/main.yaml +++ b/.github/workflows/main.yaml @@ -1,98 +1,170 @@ name: CI -on: [push, pull_request] - +on: + push: + pull_request: + workflow_dispatch: + inputs: + debug_enabled: + description: 'Run the build with tmate debugging enabled (https://github.com/marketplace/actions/debugging-with-tmate)' + required: false + default: false jobs: - test: + + test-rust: + runs-on: ubuntu-18.04 + defaults: + run: + working-directory: ./src/rust/rust-stream + steps: + - uses: actions/checkout@v2 + - name: Setup project + run: rustup install nightly + - name: Compile project + run: cargo +nightly build --release + - name: Test project + run: cargo +nightly test --release + - name: Test run project + run: ./target/release/rust-stream --arraysize 2048 + + test-java: + runs-on: ubuntu-18.04 + defaults: + run: + working-directory: ./src/java/java-stream + steps: + - uses: actions/checkout@v2 + - name: Test build project + run: ./mvnw clean package + - name: Test run + if: ${{ ! cancelled() }} + run: java -jar target/java-stream.jar --arraysize 2048 + + test-julia: + runs-on: ubuntu-18.04 + defaults: + run: + working-directory: ./src/julia/JuliaStream.jl + steps: + - uses: actions/checkout@v2 + - name: Setup project + run: julia --project -e 'import Pkg; Pkg.instantiate()' + - name: Test run PlainStream.jl + if: ${{ ! cancelled() }} + run: julia --project src/PlainStream.jl --arraysize 2048 + - name: Test run ThreadedStream.jl + if: ${{ ! cancelled() }} + run: julia --threads 2 --project src/ThreadedStream.jl --arraysize 2048 + - name: Test run DistributedStream.jl (no flag) + if: ${{ ! cancelled() }} + run: julia --project src/DistributedStream.jl --arraysize 2048 + - name: Test run DistributedStream.jl (-p 2) + if: ${{ ! cancelled() }} + run: julia -p 2 --project src/DistributedStream.jl --arraysize 2048 + - name: Test run CUDAStream.jl + if: ${{ ! 
cancelled() }} + run: julia --project src/CUDAStream.jl --list + - name: Test run AMDGPUStream.jl + if: ${{ ! cancelled() }} + run: julia --project src/AMDGPUStream.jl --list + + + test-cpp: runs-on: ubuntu-18.04 steps: - uses: actions/checkout@v2 - name: Cache compiler + if: ${{ !env.ACT }} id: prepare-compilers uses: actions/cache@v2 with: - path: compilers - key: ${{ runner.os }}-${{ hashFiles('ci-prepare-bionic.sh') }} + path: ./compilers + key: ${{ runner.os }}-${{ hashFiles('./src/ci-prepare-bionic.sh') }} - name: Prepare compilers if: steps.prepare-compilers.outputs.cache-hit != 'true' - run: source ./ci-prepare-bionic.sh ./compilers SETUP true || true + run: source ./src/ci-prepare-bionic.sh ./compilers SETUP true || true - name: Setup test environment - run: source ./ci-prepare-bionic.sh ./compilers VARS false || true + run: source ./src/ci-prepare-bionic.sh ./compilers VARS false || true + + # Enable tmate debugging of manually-triggered workflows if the input option was provided + - name: Setup tmate session + uses: mxschmitt/action-tmate@v3 + if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.debug_enabled }} - name: Test compile gcc @ CMake 3.13 if: ${{ ! cancelled() }} - run: ./ci-test-compile.sh ./build gcc all ${{ env.CMAKE_3_13_BIN }} + run: ./src/ci-test-compile.sh ./build gcc all ${{ env.CMAKE_3_13_BIN }} - name: Test compile clang @ CMake 3.13 if: ${{ ! cancelled() }} - run: ./ci-test-compile.sh ./build clang all ${{ env.CMAKE_3_13_BIN }} + run: ./src/ci-test-compile.sh ./build clang all ${{ env.CMAKE_3_13_BIN }} - name: Test compile nvhpc @ CMake 3.13 if: ${{ ! cancelled() }} - run: ./ci-test-compile.sh ./build nvhpc all ${{ env.CMAKE_3_13_BIN }} + run: ./src/ci-test-compile.sh ./build nvhpc all ${{ env.CMAKE_3_13_BIN }} - name: Test compile aocc @ CMake 3.13 if: ${{ ! 
cancelled() }} - run: ./ci-test-compile.sh ./build aocc all ${{ env.CMAKE_3_13_BIN }} + run: ./src/ci-test-compile.sh ./build aocc all ${{ env.CMAKE_3_13_BIN }} - name: Test compile aomp @ CMake 3.13 if: ${{ ! cancelled() }} - run: ./ci-test-compile.sh ./build aomp all ${{ env.CMAKE_3_13_BIN }} + run: ./src/ci-test-compile.sh ./build aomp all ${{ env.CMAKE_3_13_BIN }} - name: Test compile hip @ CMake 3.13 if: ${{ ! cancelled() }} - run: ./ci-test-compile.sh ./build hip all ${{ env.CMAKE_3_13_BIN }} + run: ./src/ci-test-compile.sh ./build hip all ${{ env.CMAKE_3_13_BIN }} - name: Test compile dpcpp @ CMake 3.13 if: ${{ ! cancelled() }} - run: ./ci-test-compile.sh ./build dpcpp all ${{ env.CMAKE_3_13_BIN }} + run: ./src/ci-test-compile.sh ./build dpcpp all ${{ env.CMAKE_3_13_BIN }} - name: Test compile hipsycl @ CMake 3.13 if: ${{ ! cancelled() }} - run: ./ci-test-compile.sh ./build hipsycl all ${{ env.CMAKE_3_13_BIN }} + run: ./src/ci-test-compile.sh ./build hipsycl all ${{ env.CMAKE_3_13_BIN }} - name: Test compile gcc @ CMake 3.15 if: ${{ ! cancelled() }} - run: ./ci-test-compile.sh ./build gcc all ${{ env.CMAKE_3_15_BIN }} + run: ./src/ci-test-compile.sh ./build gcc all ${{ env.CMAKE_3_15_BIN }} - name: Test compile clang @ CMake 3.15 if: ${{ ! cancelled() }} - run: ./ci-test-compile.sh ./build clang all ${{ env.CMAKE_3_15_BIN }} + run: ./src/ci-test-compile.sh ./build clang all ${{ env.CMAKE_3_15_BIN }} - name: Test compile nvhpc @ CMake 3.15 if: ${{ ! cancelled() }} - run: ./ci-test-compile.sh ./build nvhpc all ${{ env.CMAKE_3_15_BIN }} + run: ./src/ci-test-compile.sh ./build nvhpc all ${{ env.CMAKE_3_15_BIN }} - name: Test compile aocc @ CMake 3.15 if: ${{ ! cancelled() }} - run: ./ci-test-compile.sh ./build aocc all ${{ env.CMAKE_3_15_BIN }} + run: ./src/ci-test-compile.sh ./build aocc all ${{ env.CMAKE_3_15_BIN }} - name: Test compile aomp @ CMake 3.15 if: ${{ ! 
cancelled() }} - run: ./ci-test-compile.sh ./build aomp all ${{ env.CMAKE_3_15_BIN }} + run: ./src/ci-test-compile.sh ./build aomp all ${{ env.CMAKE_3_15_BIN }} - name: Test compile hip @ CMake 3.15 if: ${{ ! cancelled() }} - run: ./ci-test-compile.sh ./build hip all ${{ env.CMAKE_3_15_BIN }} + run: ./src/ci-test-compile.sh ./build hip all ${{ env.CMAKE_3_15_BIN }} - name: Test compile dpcpp @ CMake 3.15 if: ${{ ! cancelled() }} - run: ./ci-test-compile.sh ./build dpcpp all ${{ env.CMAKE_3_15_BIN }} + run: ./src/ci-test-compile.sh ./build dpcpp all ${{ env.CMAKE_3_15_BIN }} - name: Test compile hipsycl @ CMake 3.15 if: ${{ ! cancelled() }} - run: ./ci-test-compile.sh ./build hipsycl all ${{ env.CMAKE_3_15_BIN }} + run: ./src/ci-test-compile.sh ./build hipsycl all ${{ env.CMAKE_3_15_BIN }} - name: Test compile gcc @ CMake 3.18 if: ${{ ! cancelled() }} - run: ./ci-test-compile.sh ./build gcc all ${{ env.CMAKE_3_18_BIN }} + run: ./src/ci-test-compile.sh ./build gcc all ${{ env.CMAKE_3_18_BIN }} - name: Test compile clang @ CMake 3.18 if: ${{ ! cancelled() }} - run: ./ci-test-compile.sh ./build clang all ${{ env.CMAKE_3_18_BIN }} + run: ./src/ci-test-compile.sh ./build clang all ${{ env.CMAKE_3_18_BIN }} - name: Test compile nvhpc @ CMake 3.18 if: ${{ ! cancelled() }} - run: ./ci-test-compile.sh ./build nvhpc all ${{ env.CMAKE_3_18_BIN }} + run: ./src/ci-test-compile.sh ./build nvhpc all ${{ env.CMAKE_3_18_BIN }} - name: Test compile aocc @ CMake 3.18 if: ${{ ! cancelled() }} - run: ./ci-test-compile.sh ./build aocc all ${{ env.CMAKE_3_18_BIN }} + run: ./src/ci-test-compile.sh ./build aocc all ${{ env.CMAKE_3_18_BIN }} - name: Test compile aomp @ CMake 3.18 if: ${{ ! cancelled() }} - run: ./ci-test-compile.sh ./build aomp all ${{ env.CMAKE_3_18_BIN }} + run: ./src/ci-test-compile.sh ./build aomp all ${{ env.CMAKE_3_18_BIN }} - name: Test compile hip @ CMake 3.18 if: ${{ ! 
cancelled() }} - run: ./ci-test-compile.sh ./build hip all ${{ env.CMAKE_3_18_BIN }} + run: ./src/ci-test-compile.sh ./build hip all ${{ env.CMAKE_3_18_BIN }} - name: Test compile dpcpp @ CMake 3.18 if: ${{ ! cancelled() }} - run: ./ci-test-compile.sh ./build dpcpp all ${{ env.CMAKE_3_18_BIN }} + run: ./src/ci-test-compile.sh ./build dpcpp all ${{ env.CMAKE_3_18_BIN }} - name: Test compile hipsycl @ CMake 3.18 if: ${{ ! cancelled() }} - run: ./ci-test-compile.sh ./build hipsycl all ${{ env.CMAKE_3_18_BIN }} \ No newline at end of file + run: ./src/ci-test-compile.sh ./build hipsycl all ${{ env.CMAKE_3_18_BIN }} \ No newline at end of file diff --git a/.gitignore b/.gitignore index 31af301..012d0e8 100644 --- a/.gitignore +++ b/.gitignore @@ -28,4 +28,4 @@ cmake-build-*/ CMakeFiles/ .idea/ .vscode/ -.directory \ No newline at end of file +.directory diff --git a/CHANGELOG.md b/CHANGELOG.md index 976964a..3e1040d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,7 +14,7 @@ All notable changes to this project will be documented in this file. - Added nstream kernel from PRK with associate command line option. - CMake build system added for all models. - SYCL device check for FP64 support. -- New implementation using TBB. +- New implementations: TBB, Thrust, Julia, Scala, Java. - Compiler options for Fujitsu added to OpenMP. ### Changed @@ -33,12 +33,14 @@ All notable changes to this project will be documented in this file. - Normalise sum result by expected value to help false negative errors. - HC version deprecated and moved to a legacy directory. - Update RAJA to v0.13.0 (w/ code changes as this is a source incompatible update). +- Update SYCL version to SYCL 2020. ### Removed - Pre-building of kernels in SYCL version to ensure compatibility with SYCL 1.2.1. Pre-building kernels is also not required, and shows no overhead as the first iteration is not timed. - OpenACC Cray compiler flags. - Build support for Kokkos 2.x (No code changes made). 
+- All Makefiles; build system will now use CMake exclusively. ## [v3.4] - 2019-04-10 diff --git a/CL/cl.h b/CL/cl.h deleted file mode 100644 index f33f999..0000000 --- a/CL/cl.h +++ /dev/null @@ -1,1902 +0,0 @@ -/******************************************************************************* - * Copyright (c) 2008-2020 The Khronos Group Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - ******************************************************************************/ - -#ifndef __OPENCL_CL_H -#define __OPENCL_CL_H - -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -/******************************************************************************/ - -typedef struct _cl_platform_id * cl_platform_id; -typedef struct _cl_device_id * cl_device_id; -typedef struct _cl_context * cl_context; -typedef struct _cl_command_queue * cl_command_queue; -typedef struct _cl_mem * cl_mem; -typedef struct _cl_program * cl_program; -typedef struct _cl_kernel * cl_kernel; -typedef struct _cl_event * cl_event; -typedef struct _cl_sampler * cl_sampler; - -typedef cl_uint cl_bool; /* WARNING! Unlike cl_ types in cl_platform.h, cl_bool is not guaranteed to be the same size as the bool in kernels. 
*/ -typedef cl_ulong cl_bitfield; -typedef cl_bitfield cl_device_type; -typedef cl_uint cl_platform_info; -typedef cl_uint cl_device_info; -typedef cl_bitfield cl_device_fp_config; -typedef cl_uint cl_device_mem_cache_type; -typedef cl_uint cl_device_local_mem_type; -typedef cl_bitfield cl_device_exec_capabilities; -#ifdef CL_VERSION_2_0 -typedef cl_bitfield cl_device_svm_capabilities; -#endif -typedef cl_bitfield cl_command_queue_properties; -#ifdef CL_VERSION_1_2 -typedef intptr_t cl_device_partition_property; -typedef cl_bitfield cl_device_affinity_domain; -#endif - -typedef intptr_t cl_context_properties; -typedef cl_uint cl_context_info; -#ifdef CL_VERSION_2_0 -typedef cl_bitfield cl_queue_properties; -#endif -typedef cl_uint cl_command_queue_info; -typedef cl_uint cl_channel_order; -typedef cl_uint cl_channel_type; -typedef cl_bitfield cl_mem_flags; -#ifdef CL_VERSION_2_0 -typedef cl_bitfield cl_svm_mem_flags; -#endif -typedef cl_uint cl_mem_object_type; -typedef cl_uint cl_mem_info; -#ifdef CL_VERSION_1_2 -typedef cl_bitfield cl_mem_migration_flags; -#endif -typedef cl_uint cl_image_info; -#ifdef CL_VERSION_1_1 -typedef cl_uint cl_buffer_create_type; -#endif -typedef cl_uint cl_addressing_mode; -typedef cl_uint cl_filter_mode; -typedef cl_uint cl_sampler_info; -typedef cl_bitfield cl_map_flags; -#ifdef CL_VERSION_2_0 -typedef intptr_t cl_pipe_properties; -typedef cl_uint cl_pipe_info; -#endif -typedef cl_uint cl_program_info; -typedef cl_uint cl_program_build_info; -#ifdef CL_VERSION_1_2 -typedef cl_uint cl_program_binary_type; -#endif -typedef cl_int cl_build_status; -typedef cl_uint cl_kernel_info; -#ifdef CL_VERSION_1_2 -typedef cl_uint cl_kernel_arg_info; -typedef cl_uint cl_kernel_arg_address_qualifier; -typedef cl_uint cl_kernel_arg_access_qualifier; -typedef cl_bitfield cl_kernel_arg_type_qualifier; -#endif -typedef cl_uint cl_kernel_work_group_info; -#ifdef CL_VERSION_2_1 -typedef cl_uint cl_kernel_sub_group_info; -#endif -typedef cl_uint 
cl_event_info; -typedef cl_uint cl_command_type; -typedef cl_uint cl_profiling_info; -#ifdef CL_VERSION_2_0 -typedef cl_bitfield cl_sampler_properties; -typedef cl_uint cl_kernel_exec_info; -#endif -#ifdef CL_VERSION_3_0 -typedef cl_bitfield cl_device_atomic_capabilities; -typedef cl_uint cl_khronos_vendor_id; -typedef cl_bitfield cl_mem_properties; -typedef cl_uint cl_version; -#endif - -typedef struct _cl_image_format { - cl_channel_order image_channel_order; - cl_channel_type image_channel_data_type; -} cl_image_format; - -#ifdef CL_VERSION_1_2 - -typedef struct _cl_image_desc { - cl_mem_object_type image_type; - size_t image_width; - size_t image_height; - size_t image_depth; - size_t image_array_size; - size_t image_row_pitch; - size_t image_slice_pitch; - cl_uint num_mip_levels; - cl_uint num_samples; -#ifdef CL_VERSION_2_0 -#ifdef __GNUC__ - __extension__ /* Prevents warnings about anonymous union in -pedantic builds */ -#endif -#ifdef _MSC_VER -#pragma warning( push ) -#pragma warning( disable : 4201 ) /* Prevents warning about nameless struct/union in /W4 /Za builds */ -#endif - union { -#endif - cl_mem buffer; -#ifdef CL_VERSION_2_0 - cl_mem mem_object; - }; -#ifdef _MSC_VER -#pragma warning( pop ) -#endif -#endif -} cl_image_desc; - -#endif - -#ifdef CL_VERSION_1_1 - -typedef struct _cl_buffer_region { - size_t origin; - size_t size; -} cl_buffer_region; - -#endif - -#ifdef CL_VERSION_3_0 - -#define CL_NAME_VERSION_MAX_NAME_SIZE 64 - -typedef struct _cl_name_version { - cl_version version; - char name[CL_NAME_VERSION_MAX_NAME_SIZE]; -} cl_name_version; - -#endif - -/******************************************************************************/ - -/* Error Codes */ -#define CL_SUCCESS 0 -#define CL_DEVICE_NOT_FOUND -1 -#define CL_DEVICE_NOT_AVAILABLE -2 -#define CL_COMPILER_NOT_AVAILABLE -3 -#define CL_MEM_OBJECT_ALLOCATION_FAILURE -4 -#define CL_OUT_OF_RESOURCES -5 -#define CL_OUT_OF_HOST_MEMORY -6 -#define CL_PROFILING_INFO_NOT_AVAILABLE -7 -#define 
CL_MEM_COPY_OVERLAP -8 -#define CL_IMAGE_FORMAT_MISMATCH -9 -#define CL_IMAGE_FORMAT_NOT_SUPPORTED -10 -#define CL_BUILD_PROGRAM_FAILURE -11 -#define CL_MAP_FAILURE -12 -#ifdef CL_VERSION_1_1 -#define CL_MISALIGNED_SUB_BUFFER_OFFSET -13 -#define CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST -14 -#endif -#ifdef CL_VERSION_1_2 -#define CL_COMPILE_PROGRAM_FAILURE -15 -#define CL_LINKER_NOT_AVAILABLE -16 -#define CL_LINK_PROGRAM_FAILURE -17 -#define CL_DEVICE_PARTITION_FAILED -18 -#define CL_KERNEL_ARG_INFO_NOT_AVAILABLE -19 -#endif - -#define CL_INVALID_VALUE -30 -#define CL_INVALID_DEVICE_TYPE -31 -#define CL_INVALID_PLATFORM -32 -#define CL_INVALID_DEVICE -33 -#define CL_INVALID_CONTEXT -34 -#define CL_INVALID_QUEUE_PROPERTIES -35 -#define CL_INVALID_COMMAND_QUEUE -36 -#define CL_INVALID_HOST_PTR -37 -#define CL_INVALID_MEM_OBJECT -38 -#define CL_INVALID_IMAGE_FORMAT_DESCRIPTOR -39 -#define CL_INVALID_IMAGE_SIZE -40 -#define CL_INVALID_SAMPLER -41 -#define CL_INVALID_BINARY -42 -#define CL_INVALID_BUILD_OPTIONS -43 -#define CL_INVALID_PROGRAM -44 -#define CL_INVALID_PROGRAM_EXECUTABLE -45 -#define CL_INVALID_KERNEL_NAME -46 -#define CL_INVALID_KERNEL_DEFINITION -47 -#define CL_INVALID_KERNEL -48 -#define CL_INVALID_ARG_INDEX -49 -#define CL_INVALID_ARG_VALUE -50 -#define CL_INVALID_ARG_SIZE -51 -#define CL_INVALID_KERNEL_ARGS -52 -#define CL_INVALID_WORK_DIMENSION -53 -#define CL_INVALID_WORK_GROUP_SIZE -54 -#define CL_INVALID_WORK_ITEM_SIZE -55 -#define CL_INVALID_GLOBAL_OFFSET -56 -#define CL_INVALID_EVENT_WAIT_LIST -57 -#define CL_INVALID_EVENT -58 -#define CL_INVALID_OPERATION -59 -#define CL_INVALID_GL_OBJECT -60 -#define CL_INVALID_BUFFER_SIZE -61 -#define CL_INVALID_MIP_LEVEL -62 -#define CL_INVALID_GLOBAL_WORK_SIZE -63 -#ifdef CL_VERSION_1_1 -#define CL_INVALID_PROPERTY -64 -#endif -#ifdef CL_VERSION_1_2 -#define CL_INVALID_IMAGE_DESCRIPTOR -65 -#define CL_INVALID_COMPILER_OPTIONS -66 -#define CL_INVALID_LINKER_OPTIONS -67 -#define 
CL_INVALID_DEVICE_PARTITION_COUNT -68 -#endif -#ifdef CL_VERSION_2_0 -#define CL_INVALID_PIPE_SIZE -69 -#define CL_INVALID_DEVICE_QUEUE -70 -#endif -#ifdef CL_VERSION_2_2 -#define CL_INVALID_SPEC_ID -71 -#define CL_MAX_SIZE_RESTRICTION_EXCEEDED -72 -#endif - - -/* cl_bool */ -#define CL_FALSE 0 -#define CL_TRUE 1 -#ifdef CL_VERSION_1_2 -#define CL_BLOCKING CL_TRUE -#define CL_NON_BLOCKING CL_FALSE -#endif - -/* cl_platform_info */ -#define CL_PLATFORM_PROFILE 0x0900 -#define CL_PLATFORM_VERSION 0x0901 -#define CL_PLATFORM_NAME 0x0902 -#define CL_PLATFORM_VENDOR 0x0903 -#define CL_PLATFORM_EXTENSIONS 0x0904 -#ifdef CL_VERSION_2_1 -#define CL_PLATFORM_HOST_TIMER_RESOLUTION 0x0905 -#endif -#ifdef CL_VERSION_3_0 -#define CL_PLATFORM_NUMERIC_VERSION 0x0906 -#define CL_PLATFORM_EXTENSIONS_WITH_VERSION 0x0907 -#endif - -/* cl_device_type - bitfield */ -#define CL_DEVICE_TYPE_DEFAULT (1 << 0) -#define CL_DEVICE_TYPE_CPU (1 << 1) -#define CL_DEVICE_TYPE_GPU (1 << 2) -#define CL_DEVICE_TYPE_ACCELERATOR (1 << 3) -#ifdef CL_VERSION_1_2 -#define CL_DEVICE_TYPE_CUSTOM (1 << 4) -#endif -#define CL_DEVICE_TYPE_ALL 0xFFFFFFFF - -/* cl_device_info */ -#define CL_DEVICE_TYPE 0x1000 -#define CL_DEVICE_VENDOR_ID 0x1001 -#define CL_DEVICE_MAX_COMPUTE_UNITS 0x1002 -#define CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS 0x1003 -#define CL_DEVICE_MAX_WORK_GROUP_SIZE 0x1004 -#define CL_DEVICE_MAX_WORK_ITEM_SIZES 0x1005 -#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR 0x1006 -#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT 0x1007 -#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT 0x1008 -#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG 0x1009 -#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT 0x100A -#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE 0x100B -#define CL_DEVICE_MAX_CLOCK_FREQUENCY 0x100C -#define CL_DEVICE_ADDRESS_BITS 0x100D -#define CL_DEVICE_MAX_READ_IMAGE_ARGS 0x100E -#define CL_DEVICE_MAX_WRITE_IMAGE_ARGS 0x100F -#define CL_DEVICE_MAX_MEM_ALLOC_SIZE 0x1010 -#define CL_DEVICE_IMAGE2D_MAX_WIDTH 0x1011 
-#define CL_DEVICE_IMAGE2D_MAX_HEIGHT 0x1012 -#define CL_DEVICE_IMAGE3D_MAX_WIDTH 0x1013 -#define CL_DEVICE_IMAGE3D_MAX_HEIGHT 0x1014 -#define CL_DEVICE_IMAGE3D_MAX_DEPTH 0x1015 -#define CL_DEVICE_IMAGE_SUPPORT 0x1016 -#define CL_DEVICE_MAX_PARAMETER_SIZE 0x1017 -#define CL_DEVICE_MAX_SAMPLERS 0x1018 -#define CL_DEVICE_MEM_BASE_ADDR_ALIGN 0x1019 -#define CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE 0x101A -#define CL_DEVICE_SINGLE_FP_CONFIG 0x101B -#define CL_DEVICE_GLOBAL_MEM_CACHE_TYPE 0x101C -#define CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE 0x101D -#define CL_DEVICE_GLOBAL_MEM_CACHE_SIZE 0x101E -#define CL_DEVICE_GLOBAL_MEM_SIZE 0x101F -#define CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE 0x1020 -#define CL_DEVICE_MAX_CONSTANT_ARGS 0x1021 -#define CL_DEVICE_LOCAL_MEM_TYPE 0x1022 -#define CL_DEVICE_LOCAL_MEM_SIZE 0x1023 -#define CL_DEVICE_ERROR_CORRECTION_SUPPORT 0x1024 -#define CL_DEVICE_PROFILING_TIMER_RESOLUTION 0x1025 -#define CL_DEVICE_ENDIAN_LITTLE 0x1026 -#define CL_DEVICE_AVAILABLE 0x1027 -#define CL_DEVICE_COMPILER_AVAILABLE 0x1028 -#define CL_DEVICE_EXECUTION_CAPABILITIES 0x1029 -#define CL_DEVICE_QUEUE_PROPERTIES 0x102A /* deprecated */ -#ifdef CL_VERSION_2_0 -#define CL_DEVICE_QUEUE_ON_HOST_PROPERTIES 0x102A -#endif -#define CL_DEVICE_NAME 0x102B -#define CL_DEVICE_VENDOR 0x102C -#define CL_DRIVER_VERSION 0x102D -#define CL_DEVICE_PROFILE 0x102E -#define CL_DEVICE_VERSION 0x102F -#define CL_DEVICE_EXTENSIONS 0x1030 -#define CL_DEVICE_PLATFORM 0x1031 -#ifdef CL_VERSION_1_2 -#define CL_DEVICE_DOUBLE_FP_CONFIG 0x1032 -#endif -/* 0x1033 reserved for CL_DEVICE_HALF_FP_CONFIG which is already defined in "cl_ext.h" */ -#ifdef CL_VERSION_1_1 -#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF 0x1034 -#define CL_DEVICE_HOST_UNIFIED_MEMORY 0x1035 /* deprecated */ -#define CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR 0x1036 -#define CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT 0x1037 -#define CL_DEVICE_NATIVE_VECTOR_WIDTH_INT 0x1038 -#define CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG 0x1039 -#define 
CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT 0x103A -#define CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE 0x103B -#define CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF 0x103C -#define CL_DEVICE_OPENCL_C_VERSION 0x103D -#endif -#ifdef CL_VERSION_1_2 -#define CL_DEVICE_LINKER_AVAILABLE 0x103E -#define CL_DEVICE_BUILT_IN_KERNELS 0x103F -#define CL_DEVICE_IMAGE_MAX_BUFFER_SIZE 0x1040 -#define CL_DEVICE_IMAGE_MAX_ARRAY_SIZE 0x1041 -#define CL_DEVICE_PARENT_DEVICE 0x1042 -#define CL_DEVICE_PARTITION_MAX_SUB_DEVICES 0x1043 -#define CL_DEVICE_PARTITION_PROPERTIES 0x1044 -#define CL_DEVICE_PARTITION_AFFINITY_DOMAIN 0x1045 -#define CL_DEVICE_PARTITION_TYPE 0x1046 -#define CL_DEVICE_REFERENCE_COUNT 0x1047 -#define CL_DEVICE_PREFERRED_INTEROP_USER_SYNC 0x1048 -#define CL_DEVICE_PRINTF_BUFFER_SIZE 0x1049 -#endif -#ifdef CL_VERSION_2_0 -#define CL_DEVICE_IMAGE_PITCH_ALIGNMENT 0x104A -#define CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT 0x104B -#define CL_DEVICE_MAX_READ_WRITE_IMAGE_ARGS 0x104C -#define CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE 0x104D -#define CL_DEVICE_QUEUE_ON_DEVICE_PROPERTIES 0x104E -#define CL_DEVICE_QUEUE_ON_DEVICE_PREFERRED_SIZE 0x104F -#define CL_DEVICE_QUEUE_ON_DEVICE_MAX_SIZE 0x1050 -#define CL_DEVICE_MAX_ON_DEVICE_QUEUES 0x1051 -#define CL_DEVICE_MAX_ON_DEVICE_EVENTS 0x1052 -#define CL_DEVICE_SVM_CAPABILITIES 0x1053 -#define CL_DEVICE_GLOBAL_VARIABLE_PREFERRED_TOTAL_SIZE 0x1054 -#define CL_DEVICE_MAX_PIPE_ARGS 0x1055 -#define CL_DEVICE_PIPE_MAX_ACTIVE_RESERVATIONS 0x1056 -#define CL_DEVICE_PIPE_MAX_PACKET_SIZE 0x1057 -#define CL_DEVICE_PREFERRED_PLATFORM_ATOMIC_ALIGNMENT 0x1058 -#define CL_DEVICE_PREFERRED_GLOBAL_ATOMIC_ALIGNMENT 0x1059 -#define CL_DEVICE_PREFERRED_LOCAL_ATOMIC_ALIGNMENT 0x105A -#endif -#ifdef CL_VERSION_2_1 -#define CL_DEVICE_IL_VERSION 0x105B -#define CL_DEVICE_MAX_NUM_SUB_GROUPS 0x105C -#define CL_DEVICE_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS 0x105D -#endif -#ifdef CL_VERSION_3_0 -#define CL_DEVICE_NUMERIC_VERSION 0x105E -#define CL_DEVICE_EXTENSIONS_WITH_VERSION 0x1060 
-#define CL_DEVICE_ILS_WITH_VERSION 0x1061 -#define CL_DEVICE_BUILT_IN_KERNELS_WITH_VERSION 0x1062 -#define CL_DEVICE_ATOMIC_MEMORY_CAPABILITIES 0x1063 -#define CL_DEVICE_ATOMIC_FENCE_CAPABILITIES 0x1064 -#define CL_DEVICE_NON_UNIFORM_WORK_GROUP_SUPPORT 0x1065 -#define CL_DEVICE_OPENCL_C_ALL_VERSIONS 0x1066 -#define CL_DEVICE_PREFERRED_WORK_GROUP_SIZE_MULTIPLE 0x1067 -#define CL_DEVICE_WORK_GROUP_COLLECTIVE_FUNCTIONS_SUPPORT 0x1068 -#define CL_DEVICE_GENERIC_ADDRESS_SPACE_SUPPORT 0x1069 -/* 0x106A to 0x106E - Reserved for upcoming KHR extension */ -#define CL_DEVICE_OPENCL_C_FEATURES 0x106F -#define CL_DEVICE_DEVICE_ENQUEUE_SUPPORT 0x1070 -#define CL_DEVICE_PIPE_SUPPORT 0x1071 -#endif - -/* cl_device_fp_config - bitfield */ -#define CL_FP_DENORM (1 << 0) -#define CL_FP_INF_NAN (1 << 1) -#define CL_FP_ROUND_TO_NEAREST (1 << 2) -#define CL_FP_ROUND_TO_ZERO (1 << 3) -#define CL_FP_ROUND_TO_INF (1 << 4) -#define CL_FP_FMA (1 << 5) -#ifdef CL_VERSION_1_1 -#define CL_FP_SOFT_FLOAT (1 << 6) -#endif -#ifdef CL_VERSION_1_2 -#define CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT (1 << 7) -#endif - -/* cl_device_mem_cache_type */ -#define CL_NONE 0x0 -#define CL_READ_ONLY_CACHE 0x1 -#define CL_READ_WRITE_CACHE 0x2 - -/* cl_device_local_mem_type */ -#define CL_LOCAL 0x1 -#define CL_GLOBAL 0x2 - -/* cl_device_exec_capabilities - bitfield */ -#define CL_EXEC_KERNEL (1 << 0) -#define CL_EXEC_NATIVE_KERNEL (1 << 1) - -/* cl_command_queue_properties - bitfield */ -#define CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE (1 << 0) -#define CL_QUEUE_PROFILING_ENABLE (1 << 1) -#ifdef CL_VERSION_2_0 -#define CL_QUEUE_ON_DEVICE (1 << 2) -#define CL_QUEUE_ON_DEVICE_DEFAULT (1 << 3) -#endif - -/* cl_context_info */ -#define CL_CONTEXT_REFERENCE_COUNT 0x1080 -#define CL_CONTEXT_DEVICES 0x1081 -#define CL_CONTEXT_PROPERTIES 0x1082 -#ifdef CL_VERSION_1_1 -#define CL_CONTEXT_NUM_DEVICES 0x1083 -#endif - -/* cl_context_properties */ -#define CL_CONTEXT_PLATFORM 0x1084 -#ifdef CL_VERSION_1_2 -#define 
CL_CONTEXT_INTEROP_USER_SYNC 0x1085 -#endif - -#ifdef CL_VERSION_1_2 - -/* cl_device_partition_property */ -#define CL_DEVICE_PARTITION_EQUALLY 0x1086 -#define CL_DEVICE_PARTITION_BY_COUNTS 0x1087 -#define CL_DEVICE_PARTITION_BY_COUNTS_LIST_END 0x0 -#define CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN 0x1088 - -#endif - -#ifdef CL_VERSION_1_2 - -/* cl_device_affinity_domain */ -#define CL_DEVICE_AFFINITY_DOMAIN_NUMA (1 << 0) -#define CL_DEVICE_AFFINITY_DOMAIN_L4_CACHE (1 << 1) -#define CL_DEVICE_AFFINITY_DOMAIN_L3_CACHE (1 << 2) -#define CL_DEVICE_AFFINITY_DOMAIN_L2_CACHE (1 << 3) -#define CL_DEVICE_AFFINITY_DOMAIN_L1_CACHE (1 << 4) -#define CL_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE (1 << 5) - -#endif - -#ifdef CL_VERSION_2_0 - -/* cl_device_svm_capabilities */ -#define CL_DEVICE_SVM_COARSE_GRAIN_BUFFER (1 << 0) -#define CL_DEVICE_SVM_FINE_GRAIN_BUFFER (1 << 1) -#define CL_DEVICE_SVM_FINE_GRAIN_SYSTEM (1 << 2) -#define CL_DEVICE_SVM_ATOMICS (1 << 3) - -#endif - -/* cl_command_queue_info */ -#define CL_QUEUE_CONTEXT 0x1090 -#define CL_QUEUE_DEVICE 0x1091 -#define CL_QUEUE_REFERENCE_COUNT 0x1092 -#define CL_QUEUE_PROPERTIES 0x1093 -#ifdef CL_VERSION_2_0 -#define CL_QUEUE_SIZE 0x1094 -#endif -#ifdef CL_VERSION_2_1 -#define CL_QUEUE_DEVICE_DEFAULT 0x1095 -#endif -#ifdef CL_VERSION_3_0 -#define CL_QUEUE_PROPERTIES_ARRAY 0x1098 -#endif - -/* cl_mem_flags and cl_svm_mem_flags - bitfield */ -#define CL_MEM_READ_WRITE (1 << 0) -#define CL_MEM_WRITE_ONLY (1 << 1) -#define CL_MEM_READ_ONLY (1 << 2) -#define CL_MEM_USE_HOST_PTR (1 << 3) -#define CL_MEM_ALLOC_HOST_PTR (1 << 4) -#define CL_MEM_COPY_HOST_PTR (1 << 5) -/* reserved (1 << 6) */ -#ifdef CL_VERSION_1_2 -#define CL_MEM_HOST_WRITE_ONLY (1 << 7) -#define CL_MEM_HOST_READ_ONLY (1 << 8) -#define CL_MEM_HOST_NO_ACCESS (1 << 9) -#endif -#ifdef CL_VERSION_2_0 -#define CL_MEM_SVM_FINE_GRAIN_BUFFER (1 << 10) /* used by cl_svm_mem_flags only */ -#define CL_MEM_SVM_ATOMICS (1 << 11) /* used by cl_svm_mem_flags only */ -#define 
CL_MEM_KERNEL_READ_AND_WRITE (1 << 12) -#endif - -#ifdef CL_VERSION_1_2 - -/* cl_mem_migration_flags - bitfield */ -#define CL_MIGRATE_MEM_OBJECT_HOST (1 << 0) -#define CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED (1 << 1) - -#endif - -/* cl_channel_order */ -#define CL_R 0x10B0 -#define CL_A 0x10B1 -#define CL_RG 0x10B2 -#define CL_RA 0x10B3 -#define CL_RGB 0x10B4 -#define CL_RGBA 0x10B5 -#define CL_BGRA 0x10B6 -#define CL_ARGB 0x10B7 -#define CL_INTENSITY 0x10B8 -#define CL_LUMINANCE 0x10B9 -#ifdef CL_VERSION_1_1 -#define CL_Rx 0x10BA -#define CL_RGx 0x10BB -#define CL_RGBx 0x10BC -#endif -#ifdef CL_VERSION_1_2 -#define CL_DEPTH 0x10BD -#define CL_DEPTH_STENCIL 0x10BE -#endif -#ifdef CL_VERSION_2_0 -#define CL_sRGB 0x10BF -#define CL_sRGBx 0x10C0 -#define CL_sRGBA 0x10C1 -#define CL_sBGRA 0x10C2 -#define CL_ABGR 0x10C3 -#endif - -/* cl_channel_type */ -#define CL_SNORM_INT8 0x10D0 -#define CL_SNORM_INT16 0x10D1 -#define CL_UNORM_INT8 0x10D2 -#define CL_UNORM_INT16 0x10D3 -#define CL_UNORM_SHORT_565 0x10D4 -#define CL_UNORM_SHORT_555 0x10D5 -#define CL_UNORM_INT_101010 0x10D6 -#define CL_SIGNED_INT8 0x10D7 -#define CL_SIGNED_INT16 0x10D8 -#define CL_SIGNED_INT32 0x10D9 -#define CL_UNSIGNED_INT8 0x10DA -#define CL_UNSIGNED_INT16 0x10DB -#define CL_UNSIGNED_INT32 0x10DC -#define CL_HALF_FLOAT 0x10DD -#define CL_FLOAT 0x10DE -#ifdef CL_VERSION_1_2 -#define CL_UNORM_INT24 0x10DF -#endif -#ifdef CL_VERSION_2_1 -#define CL_UNORM_INT_101010_2 0x10E0 -#endif - -/* cl_mem_object_type */ -#define CL_MEM_OBJECT_BUFFER 0x10F0 -#define CL_MEM_OBJECT_IMAGE2D 0x10F1 -#define CL_MEM_OBJECT_IMAGE3D 0x10F2 -#ifdef CL_VERSION_1_2 -#define CL_MEM_OBJECT_IMAGE2D_ARRAY 0x10F3 -#define CL_MEM_OBJECT_IMAGE1D 0x10F4 -#define CL_MEM_OBJECT_IMAGE1D_ARRAY 0x10F5 -#define CL_MEM_OBJECT_IMAGE1D_BUFFER 0x10F6 -#endif -#ifdef CL_VERSION_2_0 -#define CL_MEM_OBJECT_PIPE 0x10F7 -#endif - -/* cl_mem_info */ -#define CL_MEM_TYPE 0x1100 -#define CL_MEM_FLAGS 0x1101 -#define CL_MEM_SIZE 0x1102 -#define 
CL_MEM_HOST_PTR 0x1103 -#define CL_MEM_MAP_COUNT 0x1104 -#define CL_MEM_REFERENCE_COUNT 0x1105 -#define CL_MEM_CONTEXT 0x1106 -#ifdef CL_VERSION_1_1 -#define CL_MEM_ASSOCIATED_MEMOBJECT 0x1107 -#define CL_MEM_OFFSET 0x1108 -#endif -#ifdef CL_VERSION_2_0 -#define CL_MEM_USES_SVM_POINTER 0x1109 -#endif -#ifdef CL_VERSION_3_0 -#define CL_MEM_PROPERTIES 0x110A -#endif - -/* cl_image_info */ -#define CL_IMAGE_FORMAT 0x1110 -#define CL_IMAGE_ELEMENT_SIZE 0x1111 -#define CL_IMAGE_ROW_PITCH 0x1112 -#define CL_IMAGE_SLICE_PITCH 0x1113 -#define CL_IMAGE_WIDTH 0x1114 -#define CL_IMAGE_HEIGHT 0x1115 -#define CL_IMAGE_DEPTH 0x1116 -#ifdef CL_VERSION_1_2 -#define CL_IMAGE_ARRAY_SIZE 0x1117 -#define CL_IMAGE_BUFFER 0x1118 -#define CL_IMAGE_NUM_MIP_LEVELS 0x1119 -#define CL_IMAGE_NUM_SAMPLES 0x111A -#endif - - -/* cl_pipe_info */ -#ifdef CL_VERSION_2_0 -#define CL_PIPE_PACKET_SIZE 0x1120 -#define CL_PIPE_MAX_PACKETS 0x1121 -#endif -#ifdef CL_VERSION_3_0 -#define CL_PIPE_PROPERTIES 0x1122 -#endif - -/* cl_addressing_mode */ -#define CL_ADDRESS_NONE 0x1130 -#define CL_ADDRESS_CLAMP_TO_EDGE 0x1131 -#define CL_ADDRESS_CLAMP 0x1132 -#define CL_ADDRESS_REPEAT 0x1133 -#ifdef CL_VERSION_1_1 -#define CL_ADDRESS_MIRRORED_REPEAT 0x1134 -#endif - -/* cl_filter_mode */ -#define CL_FILTER_NEAREST 0x1140 -#define CL_FILTER_LINEAR 0x1141 - -/* cl_sampler_info */ -#define CL_SAMPLER_REFERENCE_COUNT 0x1150 -#define CL_SAMPLER_CONTEXT 0x1151 -#define CL_SAMPLER_NORMALIZED_COORDS 0x1152 -#define CL_SAMPLER_ADDRESSING_MODE 0x1153 -#define CL_SAMPLER_FILTER_MODE 0x1154 -#ifdef CL_VERSION_2_0 -/* These enumerants are for the cl_khr_mipmap_image extension. - They have since been added to cl_ext.h with an appropriate - KHR suffix, but are left here for backwards compatibility. 
*/ -#define CL_SAMPLER_MIP_FILTER_MODE 0x1155 -#define CL_SAMPLER_LOD_MIN 0x1156 -#define CL_SAMPLER_LOD_MAX 0x1157 -#endif -#ifdef CL_VERSION_3_0 -#define CL_SAMPLER_PROPERTIES 0x1158 -#endif - -/* cl_map_flags - bitfield */ -#define CL_MAP_READ (1 << 0) -#define CL_MAP_WRITE (1 << 1) -#ifdef CL_VERSION_1_2 -#define CL_MAP_WRITE_INVALIDATE_REGION (1 << 2) -#endif - -/* cl_program_info */ -#define CL_PROGRAM_REFERENCE_COUNT 0x1160 -#define CL_PROGRAM_CONTEXT 0x1161 -#define CL_PROGRAM_NUM_DEVICES 0x1162 -#define CL_PROGRAM_DEVICES 0x1163 -#define CL_PROGRAM_SOURCE 0x1164 -#define CL_PROGRAM_BINARY_SIZES 0x1165 -#define CL_PROGRAM_BINARIES 0x1166 -#ifdef CL_VERSION_1_2 -#define CL_PROGRAM_NUM_KERNELS 0x1167 -#define CL_PROGRAM_KERNEL_NAMES 0x1168 -#endif -#ifdef CL_VERSION_2_1 -#define CL_PROGRAM_IL 0x1169 -#endif -#ifdef CL_VERSION_2_2 -#define CL_PROGRAM_SCOPE_GLOBAL_CTORS_PRESENT 0x116A -#define CL_PROGRAM_SCOPE_GLOBAL_DTORS_PRESENT 0x116B -#endif - -/* cl_program_build_info */ -#define CL_PROGRAM_BUILD_STATUS 0x1181 -#define CL_PROGRAM_BUILD_OPTIONS 0x1182 -#define CL_PROGRAM_BUILD_LOG 0x1183 -#ifdef CL_VERSION_1_2 -#define CL_PROGRAM_BINARY_TYPE 0x1184 -#endif -#ifdef CL_VERSION_2_0 -#define CL_PROGRAM_BUILD_GLOBAL_VARIABLE_TOTAL_SIZE 0x1185 -#endif - -#ifdef CL_VERSION_1_2 - -/* cl_program_binary_type */ -#define CL_PROGRAM_BINARY_TYPE_NONE 0x0 -#define CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT 0x1 -#define CL_PROGRAM_BINARY_TYPE_LIBRARY 0x2 -#define CL_PROGRAM_BINARY_TYPE_EXECUTABLE 0x4 - -#endif - -/* cl_build_status */ -#define CL_BUILD_SUCCESS 0 -#define CL_BUILD_NONE -1 -#define CL_BUILD_ERROR -2 -#define CL_BUILD_IN_PROGRESS -3 - -/* cl_kernel_info */ -#define CL_KERNEL_FUNCTION_NAME 0x1190 -#define CL_KERNEL_NUM_ARGS 0x1191 -#define CL_KERNEL_REFERENCE_COUNT 0x1192 -#define CL_KERNEL_CONTEXT 0x1193 -#define CL_KERNEL_PROGRAM 0x1194 -#ifdef CL_VERSION_1_2 -#define CL_KERNEL_ATTRIBUTES 0x1195 -#endif - -#ifdef CL_VERSION_1_2 - -/* cl_kernel_arg_info */ 
-#define CL_KERNEL_ARG_ADDRESS_QUALIFIER 0x1196 -#define CL_KERNEL_ARG_ACCESS_QUALIFIER 0x1197 -#define CL_KERNEL_ARG_TYPE_NAME 0x1198 -#define CL_KERNEL_ARG_TYPE_QUALIFIER 0x1199 -#define CL_KERNEL_ARG_NAME 0x119A - -#endif - -#ifdef CL_VERSION_1_2 - -/* cl_kernel_arg_address_qualifier */ -#define CL_KERNEL_ARG_ADDRESS_GLOBAL 0x119B -#define CL_KERNEL_ARG_ADDRESS_LOCAL 0x119C -#define CL_KERNEL_ARG_ADDRESS_CONSTANT 0x119D -#define CL_KERNEL_ARG_ADDRESS_PRIVATE 0x119E - -#endif - -#ifdef CL_VERSION_1_2 - -/* cl_kernel_arg_access_qualifier */ -#define CL_KERNEL_ARG_ACCESS_READ_ONLY 0x11A0 -#define CL_KERNEL_ARG_ACCESS_WRITE_ONLY 0x11A1 -#define CL_KERNEL_ARG_ACCESS_READ_WRITE 0x11A2 -#define CL_KERNEL_ARG_ACCESS_NONE 0x11A3 - -#endif - -#ifdef CL_VERSION_1_2 - -/* cl_kernel_arg_type_qualifier */ -#define CL_KERNEL_ARG_TYPE_NONE 0 -#define CL_KERNEL_ARG_TYPE_CONST (1 << 0) -#define CL_KERNEL_ARG_TYPE_RESTRICT (1 << 1) -#define CL_KERNEL_ARG_TYPE_VOLATILE (1 << 2) -#ifdef CL_VERSION_2_0 -#define CL_KERNEL_ARG_TYPE_PIPE (1 << 3) -#endif - -#endif - -/* cl_kernel_work_group_info */ -#define CL_KERNEL_WORK_GROUP_SIZE 0x11B0 -#define CL_KERNEL_COMPILE_WORK_GROUP_SIZE 0x11B1 -#define CL_KERNEL_LOCAL_MEM_SIZE 0x11B2 -#define CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE 0x11B3 -#define CL_KERNEL_PRIVATE_MEM_SIZE 0x11B4 -#ifdef CL_VERSION_1_2 -#define CL_KERNEL_GLOBAL_WORK_SIZE 0x11B5 -#endif - -#ifdef CL_VERSION_2_1 - -/* cl_kernel_sub_group_info */ -#define CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE 0x2033 -#define CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE 0x2034 -#define CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT 0x11B8 -#define CL_KERNEL_MAX_NUM_SUB_GROUPS 0x11B9 -#define CL_KERNEL_COMPILE_NUM_SUB_GROUPS 0x11BA - -#endif - -#ifdef CL_VERSION_2_0 - -/* cl_kernel_exec_info */ -#define CL_KERNEL_EXEC_INFO_SVM_PTRS 0x11B6 -#define CL_KERNEL_EXEC_INFO_SVM_FINE_GRAIN_SYSTEM 0x11B7 - -#endif - -/* cl_event_info */ -#define CL_EVENT_COMMAND_QUEUE 0x11D0 -#define CL_EVENT_COMMAND_TYPE 
0x11D1 -#define CL_EVENT_REFERENCE_COUNT 0x11D2 -#define CL_EVENT_COMMAND_EXECUTION_STATUS 0x11D3 -#ifdef CL_VERSION_1_1 -#define CL_EVENT_CONTEXT 0x11D4 -#endif - -/* cl_command_type */ -#define CL_COMMAND_NDRANGE_KERNEL 0x11F0 -#define CL_COMMAND_TASK 0x11F1 -#define CL_COMMAND_NATIVE_KERNEL 0x11F2 -#define CL_COMMAND_READ_BUFFER 0x11F3 -#define CL_COMMAND_WRITE_BUFFER 0x11F4 -#define CL_COMMAND_COPY_BUFFER 0x11F5 -#define CL_COMMAND_READ_IMAGE 0x11F6 -#define CL_COMMAND_WRITE_IMAGE 0x11F7 -#define CL_COMMAND_COPY_IMAGE 0x11F8 -#define CL_COMMAND_COPY_IMAGE_TO_BUFFER 0x11F9 -#define CL_COMMAND_COPY_BUFFER_TO_IMAGE 0x11FA -#define CL_COMMAND_MAP_BUFFER 0x11FB -#define CL_COMMAND_MAP_IMAGE 0x11FC -#define CL_COMMAND_UNMAP_MEM_OBJECT 0x11FD -#define CL_COMMAND_MARKER 0x11FE -#define CL_COMMAND_ACQUIRE_GL_OBJECTS 0x11FF -#define CL_COMMAND_RELEASE_GL_OBJECTS 0x1200 -#ifdef CL_VERSION_1_1 -#define CL_COMMAND_READ_BUFFER_RECT 0x1201 -#define CL_COMMAND_WRITE_BUFFER_RECT 0x1202 -#define CL_COMMAND_COPY_BUFFER_RECT 0x1203 -#define CL_COMMAND_USER 0x1204 -#endif -#ifdef CL_VERSION_1_2 -#define CL_COMMAND_BARRIER 0x1205 -#define CL_COMMAND_MIGRATE_MEM_OBJECTS 0x1206 -#define CL_COMMAND_FILL_BUFFER 0x1207 -#define CL_COMMAND_FILL_IMAGE 0x1208 -#endif -#ifdef CL_VERSION_2_0 -#define CL_COMMAND_SVM_FREE 0x1209 -#define CL_COMMAND_SVM_MEMCPY 0x120A -#define CL_COMMAND_SVM_MEMFILL 0x120B -#define CL_COMMAND_SVM_MAP 0x120C -#define CL_COMMAND_SVM_UNMAP 0x120D -#endif -#ifdef CL_VERSION_3_0 -#define CL_COMMAND_SVM_MIGRATE_MEM 0x120E -#endif - -/* command execution status */ -#define CL_COMPLETE 0x0 -#define CL_RUNNING 0x1 -#define CL_SUBMITTED 0x2 -#define CL_QUEUED 0x3 - -/* cl_buffer_create_type */ -#ifdef CL_VERSION_1_1 -#define CL_BUFFER_CREATE_TYPE_REGION 0x1220 -#endif - -/* cl_profiling_info */ -#define CL_PROFILING_COMMAND_QUEUED 0x1280 -#define CL_PROFILING_COMMAND_SUBMIT 0x1281 -#define CL_PROFILING_COMMAND_START 0x1282 -#define CL_PROFILING_COMMAND_END 0x1283 -#ifdef 
CL_VERSION_2_0 -#define CL_PROFILING_COMMAND_COMPLETE 0x1284 -#endif - -/* cl_device_atomic_capabilities - bitfield */ -#ifdef CL_VERSION_3_0 -#define CL_DEVICE_ATOMIC_ORDER_RELAXED (1 << 0) -#define CL_DEVICE_ATOMIC_ORDER_ACQ_REL (1 << 1) -#define CL_DEVICE_ATOMIC_ORDER_SEQ_CST (1 << 2) -#define CL_DEVICE_ATOMIC_SCOPE_WORK_ITEM (1 << 3) -#define CL_DEVICE_ATOMIC_SCOPE_WORK_GROUP (1 << 4) -#define CL_DEVICE_ATOMIC_SCOPE_DEVICE (1 << 5) -#define CL_DEVICE_ATOMIC_SCOPE_ALL_DEVICES (1 << 6) -#endif - -/* cl_khronos_vendor_id */ -#define CL_KHRONOS_VENDOR_ID_CODEPLAY 0x10004 - -#ifdef CL_VERSION_3_0 - -/* cl_version */ -#define CL_VERSION_MAJOR_BITS (10) -#define CL_VERSION_MINOR_BITS (10) -#define CL_VERSION_PATCH_BITS (12) - -#define CL_VERSION_MAJOR_MASK ((1 << CL_VERSION_MAJOR_BITS) - 1) -#define CL_VERSION_MINOR_MASK ((1 << CL_VERSION_MINOR_BITS) - 1) -#define CL_VERSION_PATCH_MASK ((1 << CL_VERSION_PATCH_BITS) - 1) - -#define CL_VERSION_MAJOR(version) \ - ((version) >> (CL_VERSION_MINOR_BITS + CL_VERSION_PATCH_BITS)) - -#define CL_VERSION_MINOR(version) \ - (((version) >> CL_VERSION_PATCH_BITS) & CL_VERSION_MINOR_MASK) - -#define CL_VERSION_PATCH(version) ((version) & CL_VERSION_PATCH_MASK) - -#define CL_MAKE_VERSION(major, minor, patch) \ - ((((major) & CL_VERSION_MAJOR_MASK) \ - << (CL_VERSION_MINOR_BITS + CL_VERSION_PATCH_BITS)) | \ - (((minor) & CL_VERSION_MINOR_MASK) << CL_VERSION_PATCH_BITS) | \ - ((patch) & CL_VERSION_PATCH_MASK)) - -#endif - -/********************************************************************************************************/ - -/* Platform API */ -extern CL_API_ENTRY cl_int CL_API_CALL -clGetPlatformIDs(cl_uint num_entries, - cl_platform_id * platforms, - cl_uint * num_platforms) CL_API_SUFFIX__VERSION_1_0; - -extern CL_API_ENTRY cl_int CL_API_CALL -clGetPlatformInfo(cl_platform_id platform, - cl_platform_info param_name, - size_t param_value_size, - void * param_value, - size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; - 
-/* Device APIs */ -extern CL_API_ENTRY cl_int CL_API_CALL -clGetDeviceIDs(cl_platform_id platform, - cl_device_type device_type, - cl_uint num_entries, - cl_device_id * devices, - cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_0; - -extern CL_API_ENTRY cl_int CL_API_CALL -clGetDeviceInfo(cl_device_id device, - cl_device_info param_name, - size_t param_value_size, - void * param_value, - size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; - -#ifdef CL_VERSION_1_2 - -extern CL_API_ENTRY cl_int CL_API_CALL -clCreateSubDevices(cl_device_id in_device, - const cl_device_partition_property * properties, - cl_uint num_devices, - cl_device_id * out_devices, - cl_uint * num_devices_ret) CL_API_SUFFIX__VERSION_1_2; - -extern CL_API_ENTRY cl_int CL_API_CALL -clRetainDevice(cl_device_id device) CL_API_SUFFIX__VERSION_1_2; - -extern CL_API_ENTRY cl_int CL_API_CALL -clReleaseDevice(cl_device_id device) CL_API_SUFFIX__VERSION_1_2; - -#endif - -#ifdef CL_VERSION_2_1 - -extern CL_API_ENTRY cl_int CL_API_CALL -clSetDefaultDeviceCommandQueue(cl_context context, - cl_device_id device, - cl_command_queue command_queue) CL_API_SUFFIX__VERSION_2_1; - -extern CL_API_ENTRY cl_int CL_API_CALL -clGetDeviceAndHostTimer(cl_device_id device, - cl_ulong* device_timestamp, - cl_ulong* host_timestamp) CL_API_SUFFIX__VERSION_2_1; - -extern CL_API_ENTRY cl_int CL_API_CALL -clGetHostTimer(cl_device_id device, - cl_ulong * host_timestamp) CL_API_SUFFIX__VERSION_2_1; - -#endif - -/* Context APIs */ -extern CL_API_ENTRY cl_context CL_API_CALL -clCreateContext(const cl_context_properties * properties, - cl_uint num_devices, - const cl_device_id * devices, - void (CL_CALLBACK * pfn_notify)(const char * errinfo, - const void * private_info, - size_t cb, - void * user_data), - void * user_data, - cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; - -extern CL_API_ENTRY cl_context CL_API_CALL -clCreateContextFromType(const cl_context_properties * properties, - cl_device_type device_type, - void 
(CL_CALLBACK * pfn_notify)(const char * errinfo, - const void * private_info, - size_t cb, - void * user_data), - void * user_data, - cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; - -extern CL_API_ENTRY cl_int CL_API_CALL -clRetainContext(cl_context context) CL_API_SUFFIX__VERSION_1_0; - -extern CL_API_ENTRY cl_int CL_API_CALL -clReleaseContext(cl_context context) CL_API_SUFFIX__VERSION_1_0; - -extern CL_API_ENTRY cl_int CL_API_CALL -clGetContextInfo(cl_context context, - cl_context_info param_name, - size_t param_value_size, - void * param_value, - size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; - -/* Command Queue APIs */ - -#ifdef CL_VERSION_2_0 - -extern CL_API_ENTRY cl_command_queue CL_API_CALL -clCreateCommandQueueWithProperties(cl_context context, - cl_device_id device, - const cl_queue_properties * properties, - cl_int * errcode_ret) CL_API_SUFFIX__VERSION_2_0; - -#endif - -extern CL_API_ENTRY cl_int CL_API_CALL -clRetainCommandQueue(cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0; - -extern CL_API_ENTRY cl_int CL_API_CALL -clReleaseCommandQueue(cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0; - -extern CL_API_ENTRY cl_int CL_API_CALL -clGetCommandQueueInfo(cl_command_queue command_queue, - cl_command_queue_info param_name, - size_t param_value_size, - void * param_value, - size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; - -/* Memory Object APIs */ -extern CL_API_ENTRY cl_mem CL_API_CALL -clCreateBuffer(cl_context context, - cl_mem_flags flags, - size_t size, - void * host_ptr, - cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; - -#ifdef CL_VERSION_1_1 - -extern CL_API_ENTRY cl_mem CL_API_CALL -clCreateSubBuffer(cl_mem buffer, - cl_mem_flags flags, - cl_buffer_create_type buffer_create_type, - const void * buffer_create_info, - cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_1; - -#endif - -#ifdef CL_VERSION_1_2 - -extern CL_API_ENTRY cl_mem CL_API_CALL -clCreateImage(cl_context context, - cl_mem_flags flags, 
- const cl_image_format * image_format, - const cl_image_desc * image_desc, - void * host_ptr, - cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2; - -#endif - -#ifdef CL_VERSION_2_0 - -extern CL_API_ENTRY cl_mem CL_API_CALL -clCreatePipe(cl_context context, - cl_mem_flags flags, - cl_uint pipe_packet_size, - cl_uint pipe_max_packets, - const cl_pipe_properties * properties, - cl_int * errcode_ret) CL_API_SUFFIX__VERSION_2_0; - -#endif - -#ifdef CL_VERSION_3_0 - -extern CL_API_ENTRY cl_mem CL_API_CALL -clCreateBufferWithProperties(cl_context context, - const cl_mem_properties * properties, - cl_mem_flags flags, - size_t size, - void * host_ptr, - cl_int * errcode_ret) CL_API_SUFFIX__VERSION_3_0; - -extern CL_API_ENTRY cl_mem CL_API_CALL -clCreateImageWithProperties(cl_context context, - const cl_mem_properties * properties, - cl_mem_flags flags, - const cl_image_format * image_format, - const cl_image_desc * image_desc, - void * host_ptr, - cl_int * errcode_ret) CL_API_SUFFIX__VERSION_3_0; - -#endif - -extern CL_API_ENTRY cl_int CL_API_CALL -clRetainMemObject(cl_mem memobj) CL_API_SUFFIX__VERSION_1_0; - -extern CL_API_ENTRY cl_int CL_API_CALL -clReleaseMemObject(cl_mem memobj) CL_API_SUFFIX__VERSION_1_0; - -extern CL_API_ENTRY cl_int CL_API_CALL -clGetSupportedImageFormats(cl_context context, - cl_mem_flags flags, - cl_mem_object_type image_type, - cl_uint num_entries, - cl_image_format * image_formats, - cl_uint * num_image_formats) CL_API_SUFFIX__VERSION_1_0; - -extern CL_API_ENTRY cl_int CL_API_CALL -clGetMemObjectInfo(cl_mem memobj, - cl_mem_info param_name, - size_t param_value_size, - void * param_value, - size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; - -extern CL_API_ENTRY cl_int CL_API_CALL -clGetImageInfo(cl_mem image, - cl_image_info param_name, - size_t param_value_size, - void * param_value, - size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; - -#ifdef CL_VERSION_2_0 - -extern CL_API_ENTRY cl_int CL_API_CALL -clGetPipeInfo(cl_mem 
pipe, - cl_pipe_info param_name, - size_t param_value_size, - void * param_value, - size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_2_0; - -#endif - -#ifdef CL_VERSION_1_1 - -extern CL_API_ENTRY cl_int CL_API_CALL -clSetMemObjectDestructorCallback(cl_mem memobj, - void (CL_CALLBACK * pfn_notify)(cl_mem memobj, - void * user_data), - void * user_data) CL_API_SUFFIX__VERSION_1_1; - -#endif - -/* SVM Allocation APIs */ - -#ifdef CL_VERSION_2_0 - -extern CL_API_ENTRY void * CL_API_CALL -clSVMAlloc(cl_context context, - cl_svm_mem_flags flags, - size_t size, - cl_uint alignment) CL_API_SUFFIX__VERSION_2_0; - -extern CL_API_ENTRY void CL_API_CALL -clSVMFree(cl_context context, - void * svm_pointer) CL_API_SUFFIX__VERSION_2_0; - -#endif - -/* Sampler APIs */ - -#ifdef CL_VERSION_2_0 - -extern CL_API_ENTRY cl_sampler CL_API_CALL -clCreateSamplerWithProperties(cl_context context, - const cl_sampler_properties * sampler_properties, - cl_int * errcode_ret) CL_API_SUFFIX__VERSION_2_0; - -#endif - -extern CL_API_ENTRY cl_int CL_API_CALL -clRetainSampler(cl_sampler sampler) CL_API_SUFFIX__VERSION_1_0; - -extern CL_API_ENTRY cl_int CL_API_CALL -clReleaseSampler(cl_sampler sampler) CL_API_SUFFIX__VERSION_1_0; - -extern CL_API_ENTRY cl_int CL_API_CALL -clGetSamplerInfo(cl_sampler sampler, - cl_sampler_info param_name, - size_t param_value_size, - void * param_value, - size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; - -/* Program Object APIs */ -extern CL_API_ENTRY cl_program CL_API_CALL -clCreateProgramWithSource(cl_context context, - cl_uint count, - const char ** strings, - const size_t * lengths, - cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; - -extern CL_API_ENTRY cl_program CL_API_CALL -clCreateProgramWithBinary(cl_context context, - cl_uint num_devices, - const cl_device_id * device_list, - const size_t * lengths, - const unsigned char ** binaries, - cl_int * binary_status, - cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; - -#ifdef CL_VERSION_1_2 - 
-extern CL_API_ENTRY cl_program CL_API_CALL -clCreateProgramWithBuiltInKernels(cl_context context, - cl_uint num_devices, - const cl_device_id * device_list, - const char * kernel_names, - cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2; - -#endif - -#ifdef CL_VERSION_2_1 - -extern CL_API_ENTRY cl_program CL_API_CALL -clCreateProgramWithIL(cl_context context, - const void* il, - size_t length, - cl_int* errcode_ret) CL_API_SUFFIX__VERSION_2_1; - -#endif - -extern CL_API_ENTRY cl_int CL_API_CALL -clRetainProgram(cl_program program) CL_API_SUFFIX__VERSION_1_0; - -extern CL_API_ENTRY cl_int CL_API_CALL -clReleaseProgram(cl_program program) CL_API_SUFFIX__VERSION_1_0; - -extern CL_API_ENTRY cl_int CL_API_CALL -clBuildProgram(cl_program program, - cl_uint num_devices, - const cl_device_id * device_list, - const char * options, - void (CL_CALLBACK * pfn_notify)(cl_program program, - void * user_data), - void * user_data) CL_API_SUFFIX__VERSION_1_0; - -#ifdef CL_VERSION_1_2 - -extern CL_API_ENTRY cl_int CL_API_CALL -clCompileProgram(cl_program program, - cl_uint num_devices, - const cl_device_id * device_list, - const char * options, - cl_uint num_input_headers, - const cl_program * input_headers, - const char ** header_include_names, - void (CL_CALLBACK * pfn_notify)(cl_program program, - void * user_data), - void * user_data) CL_API_SUFFIX__VERSION_1_2; - -extern CL_API_ENTRY cl_program CL_API_CALL -clLinkProgram(cl_context context, - cl_uint num_devices, - const cl_device_id * device_list, - const char * options, - cl_uint num_input_programs, - const cl_program * input_programs, - void (CL_CALLBACK * pfn_notify)(cl_program program, - void * user_data), - void * user_data, - cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2; - -#endif - -#ifdef CL_VERSION_2_2 - -extern CL_API_ENTRY cl_int CL_API_CALL -clSetProgramReleaseCallback(cl_program program, - void (CL_CALLBACK * pfn_notify)(cl_program program, - void * user_data), - void * user_data) CL_API_SUFFIX__VERSION_2_2; 
- -extern CL_API_ENTRY cl_int CL_API_CALL -clSetProgramSpecializationConstant(cl_program program, - cl_uint spec_id, - size_t spec_size, - const void* spec_value) CL_API_SUFFIX__VERSION_2_2; - -#endif - -#ifdef CL_VERSION_1_2 - -extern CL_API_ENTRY cl_int CL_API_CALL -clUnloadPlatformCompiler(cl_platform_id platform) CL_API_SUFFIX__VERSION_1_2; - -#endif - -extern CL_API_ENTRY cl_int CL_API_CALL -clGetProgramInfo(cl_program program, - cl_program_info param_name, - size_t param_value_size, - void * param_value, - size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; - -extern CL_API_ENTRY cl_int CL_API_CALL -clGetProgramBuildInfo(cl_program program, - cl_device_id device, - cl_program_build_info param_name, - size_t param_value_size, - void * param_value, - size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; - -/* Kernel Object APIs */ -extern CL_API_ENTRY cl_kernel CL_API_CALL -clCreateKernel(cl_program program, - const char * kernel_name, - cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; - -extern CL_API_ENTRY cl_int CL_API_CALL -clCreateKernelsInProgram(cl_program program, - cl_uint num_kernels, - cl_kernel * kernels, - cl_uint * num_kernels_ret) CL_API_SUFFIX__VERSION_1_0; - -#ifdef CL_VERSION_2_1 - -extern CL_API_ENTRY cl_kernel CL_API_CALL -clCloneKernel(cl_kernel source_kernel, - cl_int* errcode_ret) CL_API_SUFFIX__VERSION_2_1; - -#endif - -extern CL_API_ENTRY cl_int CL_API_CALL -clRetainKernel(cl_kernel kernel) CL_API_SUFFIX__VERSION_1_0; - -extern CL_API_ENTRY cl_int CL_API_CALL -clReleaseKernel(cl_kernel kernel) CL_API_SUFFIX__VERSION_1_0; - -extern CL_API_ENTRY cl_int CL_API_CALL -clSetKernelArg(cl_kernel kernel, - cl_uint arg_index, - size_t arg_size, - const void * arg_value) CL_API_SUFFIX__VERSION_1_0; - -#ifdef CL_VERSION_2_0 - -extern CL_API_ENTRY cl_int CL_API_CALL -clSetKernelArgSVMPointer(cl_kernel kernel, - cl_uint arg_index, - const void * arg_value) CL_API_SUFFIX__VERSION_2_0; - -extern CL_API_ENTRY cl_int CL_API_CALL 
-clSetKernelExecInfo(cl_kernel kernel, - cl_kernel_exec_info param_name, - size_t param_value_size, - const void * param_value) CL_API_SUFFIX__VERSION_2_0; - -#endif - -extern CL_API_ENTRY cl_int CL_API_CALL -clGetKernelInfo(cl_kernel kernel, - cl_kernel_info param_name, - size_t param_value_size, - void * param_value, - size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; - -#ifdef CL_VERSION_1_2 - -extern CL_API_ENTRY cl_int CL_API_CALL -clGetKernelArgInfo(cl_kernel kernel, - cl_uint arg_indx, - cl_kernel_arg_info param_name, - size_t param_value_size, - void * param_value, - size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_2; - -#endif - -extern CL_API_ENTRY cl_int CL_API_CALL -clGetKernelWorkGroupInfo(cl_kernel kernel, - cl_device_id device, - cl_kernel_work_group_info param_name, - size_t param_value_size, - void * param_value, - size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; - -#ifdef CL_VERSION_2_1 - -extern CL_API_ENTRY cl_int CL_API_CALL -clGetKernelSubGroupInfo(cl_kernel kernel, - cl_device_id device, - cl_kernel_sub_group_info param_name, - size_t input_value_size, - const void* input_value, - size_t param_value_size, - void* param_value, - size_t* param_value_size_ret) CL_API_SUFFIX__VERSION_2_1; - -#endif - -/* Event Object APIs */ -extern CL_API_ENTRY cl_int CL_API_CALL -clWaitForEvents(cl_uint num_events, - const cl_event * event_list) CL_API_SUFFIX__VERSION_1_0; - -extern CL_API_ENTRY cl_int CL_API_CALL -clGetEventInfo(cl_event event, - cl_event_info param_name, - size_t param_value_size, - void * param_value, - size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; - -#ifdef CL_VERSION_1_1 - -extern CL_API_ENTRY cl_event CL_API_CALL -clCreateUserEvent(cl_context context, - cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_1; - -#endif - -extern CL_API_ENTRY cl_int CL_API_CALL -clRetainEvent(cl_event event) CL_API_SUFFIX__VERSION_1_0; - -extern CL_API_ENTRY cl_int CL_API_CALL -clReleaseEvent(cl_event event) 
CL_API_SUFFIX__VERSION_1_0; - -#ifdef CL_VERSION_1_1 - -extern CL_API_ENTRY cl_int CL_API_CALL -clSetUserEventStatus(cl_event event, - cl_int execution_status) CL_API_SUFFIX__VERSION_1_1; - -extern CL_API_ENTRY cl_int CL_API_CALL -clSetEventCallback(cl_event event, - cl_int command_exec_callback_type, - void (CL_CALLBACK * pfn_notify)(cl_event event, - cl_int event_command_status, - void * user_data), - void * user_data) CL_API_SUFFIX__VERSION_1_1; - -#endif - -/* Profiling APIs */ -extern CL_API_ENTRY cl_int CL_API_CALL -clGetEventProfilingInfo(cl_event event, - cl_profiling_info param_name, - size_t param_value_size, - void * param_value, - size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; - -/* Flush and Finish APIs */ -extern CL_API_ENTRY cl_int CL_API_CALL -clFlush(cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0; - -extern CL_API_ENTRY cl_int CL_API_CALL -clFinish(cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0; - -/* Enqueued Commands APIs */ -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueReadBuffer(cl_command_queue command_queue, - cl_mem buffer, - cl_bool blocking_read, - size_t offset, - size_t size, - void * ptr, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_1_0; - -#ifdef CL_VERSION_1_1 - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueReadBufferRect(cl_command_queue command_queue, - cl_mem buffer, - cl_bool blocking_read, - const size_t * buffer_offset, - const size_t * host_offset, - const size_t * region, - size_t buffer_row_pitch, - size_t buffer_slice_pitch, - size_t host_row_pitch, - size_t host_slice_pitch, - void * ptr, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_1_1; - -#endif - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueWriteBuffer(cl_command_queue command_queue, - cl_mem buffer, - cl_bool blocking_write, - size_t offset, - size_t size, - const void * ptr, - cl_uint 
num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_1_0; - -#ifdef CL_VERSION_1_1 - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueWriteBufferRect(cl_command_queue command_queue, - cl_mem buffer, - cl_bool blocking_write, - const size_t * buffer_offset, - const size_t * host_offset, - const size_t * region, - size_t buffer_row_pitch, - size_t buffer_slice_pitch, - size_t host_row_pitch, - size_t host_slice_pitch, - const void * ptr, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_1_1; - -#endif - -#ifdef CL_VERSION_1_2 - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueFillBuffer(cl_command_queue command_queue, - cl_mem buffer, - const void * pattern, - size_t pattern_size, - size_t offset, - size_t size, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_1_2; - -#endif - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueCopyBuffer(cl_command_queue command_queue, - cl_mem src_buffer, - cl_mem dst_buffer, - size_t src_offset, - size_t dst_offset, - size_t size, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_1_0; - -#ifdef CL_VERSION_1_1 - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueCopyBufferRect(cl_command_queue command_queue, - cl_mem src_buffer, - cl_mem dst_buffer, - const size_t * src_origin, - const size_t * dst_origin, - const size_t * region, - size_t src_row_pitch, - size_t src_slice_pitch, - size_t dst_row_pitch, - size_t dst_slice_pitch, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_1_1; - -#endif - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueReadImage(cl_command_queue command_queue, - cl_mem image, - cl_bool blocking_read, - const size_t * origin, - const size_t * region, - size_t row_pitch, - size_t slice_pitch, - void * ptr, - 
cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_1_0; - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueWriteImage(cl_command_queue command_queue, - cl_mem image, - cl_bool blocking_write, - const size_t * origin, - const size_t * region, - size_t input_row_pitch, - size_t input_slice_pitch, - const void * ptr, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_1_0; - -#ifdef CL_VERSION_1_2 - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueFillImage(cl_command_queue command_queue, - cl_mem image, - const void * fill_color, - const size_t * origin, - const size_t * region, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_1_2; - -#endif - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueCopyImage(cl_command_queue command_queue, - cl_mem src_image, - cl_mem dst_image, - const size_t * src_origin, - const size_t * dst_origin, - const size_t * region, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_1_0; - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueCopyImageToBuffer(cl_command_queue command_queue, - cl_mem src_image, - cl_mem dst_buffer, - const size_t * src_origin, - const size_t * region, - size_t dst_offset, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_1_0; - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueCopyBufferToImage(cl_command_queue command_queue, - cl_mem src_buffer, - cl_mem dst_image, - size_t src_offset, - const size_t * dst_origin, - const size_t * region, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_1_0; - -extern CL_API_ENTRY void * CL_API_CALL -clEnqueueMapBuffer(cl_command_queue command_queue, - cl_mem buffer, - cl_bool blocking_map, - cl_map_flags 
map_flags, - size_t offset, - size_t size, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event, - cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; - -extern CL_API_ENTRY void * CL_API_CALL -clEnqueueMapImage(cl_command_queue command_queue, - cl_mem image, - cl_bool blocking_map, - cl_map_flags map_flags, - const size_t * origin, - const size_t * region, - size_t * image_row_pitch, - size_t * image_slice_pitch, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event, - cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueUnmapMemObject(cl_command_queue command_queue, - cl_mem memobj, - void * mapped_ptr, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_1_0; - -#ifdef CL_VERSION_1_2 - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueMigrateMemObjects(cl_command_queue command_queue, - cl_uint num_mem_objects, - const cl_mem * mem_objects, - cl_mem_migration_flags flags, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_1_2; - -#endif - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueNDRangeKernel(cl_command_queue command_queue, - cl_kernel kernel, - cl_uint work_dim, - const size_t * global_work_offset, - const size_t * global_work_size, - const size_t * local_work_size, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_1_0; - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueNativeKernel(cl_command_queue command_queue, - void (CL_CALLBACK * user_func)(void *), - void * args, - size_t cb_args, - cl_uint num_mem_objects, - const cl_mem * mem_list, - const void ** args_mem_loc, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_1_0; - -#ifdef CL_VERSION_1_2 - -extern CL_API_ENTRY cl_int 
CL_API_CALL -clEnqueueMarkerWithWaitList(cl_command_queue command_queue, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_1_2; - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueBarrierWithWaitList(cl_command_queue command_queue, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_1_2; - -#endif - -#ifdef CL_VERSION_2_0 - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueSVMFree(cl_command_queue command_queue, - cl_uint num_svm_pointers, - void * svm_pointers[], - void (CL_CALLBACK * pfn_free_func)(cl_command_queue queue, - cl_uint num_svm_pointers, - void * svm_pointers[], - void * user_data), - void * user_data, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_2_0; - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueSVMMemcpy(cl_command_queue command_queue, - cl_bool blocking_copy, - void * dst_ptr, - const void * src_ptr, - size_t size, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_2_0; - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueSVMMemFill(cl_command_queue command_queue, - void * svm_ptr, - const void * pattern, - size_t pattern_size, - size_t size, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_2_0; - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueSVMMap(cl_command_queue command_queue, - cl_bool blocking_map, - cl_map_flags flags, - void * svm_ptr, - size_t size, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_2_0; - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueSVMUnmap(cl_command_queue command_queue, - void * svm_ptr, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_2_0; - -#endif - -#ifdef 
CL_VERSION_2_1 - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueSVMMigrateMem(cl_command_queue command_queue, - cl_uint num_svm_pointers, - const void ** svm_pointers, - const size_t * sizes, - cl_mem_migration_flags flags, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_2_1; - -#endif - -#ifdef CL_VERSION_1_2 - -/* Extension function access - * - * Returns the extension function address for the given function name, - * or NULL if a valid function can not be found. The client must - * check to make sure the address is not NULL, before using or - * calling the returned function address. - */ -extern CL_API_ENTRY void * CL_API_CALL -clGetExtensionFunctionAddressForPlatform(cl_platform_id platform, - const char * func_name) CL_API_SUFFIX__VERSION_1_2; - -#endif - -#ifdef CL_USE_DEPRECATED_OPENCL_1_0_APIS - /* - * WARNING: - * This API introduces mutable state into the OpenCL implementation. It has been REMOVED - * to better facilitate thread safety. The 1.0 API is not thread safe. It is not tested by the - * OpenCL 1.1 conformance test, and consequently may not work or may not work dependably. - * It is likely to be non-performant. Use of this API is not advised. Use at your own risk. - * - * Software developers previously relying on this API are instructed to set the command queue - * properties when creating the queue, instead. 
- */ - extern CL_API_ENTRY cl_int CL_API_CALL - clSetCommandQueueProperty(cl_command_queue command_queue, - cl_command_queue_properties properties, - cl_bool enable, - cl_command_queue_properties * old_properties) CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED; -#endif /* CL_USE_DEPRECATED_OPENCL_1_0_APIS */ - -/* Deprecated OpenCL 1.1 APIs */ -extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL -clCreateImage2D(cl_context context, - cl_mem_flags flags, - const cl_image_format * image_format, - size_t image_width, - size_t image_height, - size_t image_row_pitch, - void * host_ptr, - cl_int * errcode_ret) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; - -extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL -clCreateImage3D(cl_context context, - cl_mem_flags flags, - const cl_image_format * image_format, - size_t image_width, - size_t image_height, - size_t image_depth, - size_t image_row_pitch, - size_t image_slice_pitch, - void * host_ptr, - cl_int * errcode_ret) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; - -extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL -clEnqueueMarker(cl_command_queue command_queue, - cl_event * event) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; - -extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL -clEnqueueWaitForEvents(cl_command_queue command_queue, - cl_uint num_events, - const cl_event * event_list) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; - -extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL -clEnqueueBarrier(cl_command_queue command_queue) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; - -extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL -clUnloadCompiler(void) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; - -extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED void * CL_API_CALL -clGetExtensionFunctionAddress(const char * func_name) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; - -/* Deprecated OpenCL 2.0 APIs */ -extern 
CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_2_DEPRECATED cl_command_queue CL_API_CALL -clCreateCommandQueue(cl_context context, - cl_device_id device, - cl_command_queue_properties properties, - cl_int * errcode_ret) CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED; - -extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_2_DEPRECATED cl_sampler CL_API_CALL -clCreateSampler(cl_context context, - cl_bool normalized_coords, - cl_addressing_mode addressing_mode, - cl_filter_mode filter_mode, - cl_int * errcode_ret) CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED; - -extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_2_DEPRECATED cl_int CL_API_CALL -clEnqueueTask(cl_command_queue command_queue, - cl_kernel kernel, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED; - -#ifdef __cplusplus -} -#endif - -#endif /* __OPENCL_CL_H */ diff --git a/CL/cl_d3d10.h b/CL/cl_d3d10.h deleted file mode 100644 index cda5469..0000000 --- a/CL/cl_d3d10.h +++ /dev/null @@ -1,117 +0,0 @@ -/******************************************************************************* - * Copyright (c) 2008-2020 The Khronos Group Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- ******************************************************************************/ - -#ifndef __OPENCL_CL_D3D10_H -#define __OPENCL_CL_D3D10_H - -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -/****************************************************************************** - * cl_khr_d3d10_sharing */ -#define cl_khr_d3d10_sharing 1 - -typedef cl_uint cl_d3d10_device_source_khr; -typedef cl_uint cl_d3d10_device_set_khr; - -/******************************************************************************/ - -/* Error Codes */ -#define CL_INVALID_D3D10_DEVICE_KHR -1002 -#define CL_INVALID_D3D10_RESOURCE_KHR -1003 -#define CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR -1004 -#define CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR -1005 - -/* cl_d3d10_device_source_nv */ -#define CL_D3D10_DEVICE_KHR 0x4010 -#define CL_D3D10_DXGI_ADAPTER_KHR 0x4011 - -/* cl_d3d10_device_set_nv */ -#define CL_PREFERRED_DEVICES_FOR_D3D10_KHR 0x4012 -#define CL_ALL_DEVICES_FOR_D3D10_KHR 0x4013 - -/* cl_context_info */ -#define CL_CONTEXT_D3D10_DEVICE_KHR 0x4014 -#define CL_CONTEXT_D3D10_PREFER_SHARED_RESOURCES_KHR 0x402C - -/* cl_mem_info */ -#define CL_MEM_D3D10_RESOURCE_KHR 0x4015 - -/* cl_image_info */ -#define CL_IMAGE_D3D10_SUBRESOURCE_KHR 0x4016 - -/* cl_command_type */ -#define CL_COMMAND_ACQUIRE_D3D10_OBJECTS_KHR 0x4017 -#define CL_COMMAND_RELEASE_D3D10_OBJECTS_KHR 0x4018 - -/******************************************************************************/ - -typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromD3D10KHR_fn)( - cl_platform_id platform, - cl_d3d10_device_source_khr d3d_device_source, - void * d3d_object, - cl_d3d10_device_set_khr d3d_device_set, - cl_uint num_entries, - cl_device_id * devices, - cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10BufferKHR_fn)( - cl_context context, - cl_mem_flags flags, - ID3D10Buffer * resource, - cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; - -typedef 
CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10Texture2DKHR_fn)( - cl_context context, - cl_mem_flags flags, - ID3D10Texture2D * resource, - UINT subresource, - cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10Texture3DKHR_fn)( - cl_context context, - cl_mem_flags flags, - ID3D10Texture3D * resource, - UINT subresource, - cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D10ObjectsKHR_fn)( - cl_command_queue command_queue, - cl_uint num_objects, - const cl_mem * mem_objects, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D10ObjectsKHR_fn)( - cl_command_queue command_queue, - cl_uint num_objects, - const cl_mem * mem_objects, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_1_0; - -#ifdef __cplusplus -} -#endif - -#endif /* __OPENCL_CL_D3D10_H */ - diff --git a/CL/cl_d3d11.h b/CL/cl_d3d11.h deleted file mode 100644 index 6b7e2e9..0000000 --- a/CL/cl_d3d11.h +++ /dev/null @@ -1,117 +0,0 @@ -/******************************************************************************* - * Copyright (c) 2008-2020 The Khronos Group Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- ******************************************************************************/ - -#ifndef __OPENCL_CL_D3D11_H -#define __OPENCL_CL_D3D11_H - -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -/****************************************************************************** - * cl_khr_d3d11_sharing */ -#define cl_khr_d3d11_sharing 1 - -typedef cl_uint cl_d3d11_device_source_khr; -typedef cl_uint cl_d3d11_device_set_khr; - -/******************************************************************************/ - -/* Error Codes */ -#define CL_INVALID_D3D11_DEVICE_KHR -1006 -#define CL_INVALID_D3D11_RESOURCE_KHR -1007 -#define CL_D3D11_RESOURCE_ALREADY_ACQUIRED_KHR -1008 -#define CL_D3D11_RESOURCE_NOT_ACQUIRED_KHR -1009 - -/* cl_d3d11_device_source */ -#define CL_D3D11_DEVICE_KHR 0x4019 -#define CL_D3D11_DXGI_ADAPTER_KHR 0x401A - -/* cl_d3d11_device_set */ -#define CL_PREFERRED_DEVICES_FOR_D3D11_KHR 0x401B -#define CL_ALL_DEVICES_FOR_D3D11_KHR 0x401C - -/* cl_context_info */ -#define CL_CONTEXT_D3D11_DEVICE_KHR 0x401D -#define CL_CONTEXT_D3D11_PREFER_SHARED_RESOURCES_KHR 0x402D - -/* cl_mem_info */ -#define CL_MEM_D3D11_RESOURCE_KHR 0x401E - -/* cl_image_info */ -#define CL_IMAGE_D3D11_SUBRESOURCE_KHR 0x401F - -/* cl_command_type */ -#define CL_COMMAND_ACQUIRE_D3D11_OBJECTS_KHR 0x4020 -#define CL_COMMAND_RELEASE_D3D11_OBJECTS_KHR 0x4021 - -/******************************************************************************/ - -typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromD3D11KHR_fn)( - cl_platform_id platform, - cl_d3d11_device_source_khr d3d_device_source, - void * d3d_object, - cl_d3d11_device_set_khr d3d_device_set, - cl_uint num_entries, - cl_device_id * devices, - cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_2; - -typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11BufferKHR_fn)( - cl_context context, - cl_mem_flags flags, - ID3D11Buffer * resource, - cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2; - -typedef 
CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11Texture2DKHR_fn)( - cl_context context, - cl_mem_flags flags, - ID3D11Texture2D * resource, - UINT subresource, - cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2; - -typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11Texture3DKHR_fn)( - cl_context context, - cl_mem_flags flags, - ID3D11Texture3D * resource, - UINT subresource, - cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2; - -typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D11ObjectsKHR_fn)( - cl_command_queue command_queue, - cl_uint num_objects, - const cl_mem * mem_objects, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_1_2; - -typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D11ObjectsKHR_fn)( - cl_command_queue command_queue, - cl_uint num_objects, - const cl_mem * mem_objects, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_1_2; - -#ifdef __cplusplus -} -#endif - -#endif /* __OPENCL_CL_D3D11_H */ - diff --git a/CL/cl_dx9_media_sharing.h b/CL/cl_dx9_media_sharing.h deleted file mode 100644 index 0489370..0000000 --- a/CL/cl_dx9_media_sharing.h +++ /dev/null @@ -1,118 +0,0 @@ -/******************************************************************************* - * Copyright (c) 2008-2020 The Khronos Group Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- ******************************************************************************/ - -#ifndef __OPENCL_CL_DX9_MEDIA_SHARING_H -#define __OPENCL_CL_DX9_MEDIA_SHARING_H - -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -/******************************************************************************/ -/* cl_khr_dx9_media_sharing */ -#define cl_khr_dx9_media_sharing 1 - -typedef cl_uint cl_dx9_media_adapter_type_khr; -typedef cl_uint cl_dx9_media_adapter_set_khr; - -#if defined(_WIN32) -#include -typedef struct _cl_dx9_surface_info_khr -{ - IDirect3DSurface9 *resource; - HANDLE shared_handle; -} cl_dx9_surface_info_khr; -#endif - - -/******************************************************************************/ - -/* Error Codes */ -#define CL_INVALID_DX9_MEDIA_ADAPTER_KHR -1010 -#define CL_INVALID_DX9_MEDIA_SURFACE_KHR -1011 -#define CL_DX9_MEDIA_SURFACE_ALREADY_ACQUIRED_KHR -1012 -#define CL_DX9_MEDIA_SURFACE_NOT_ACQUIRED_KHR -1013 - -/* cl_media_adapter_type_khr */ -#define CL_ADAPTER_D3D9_KHR 0x2020 -#define CL_ADAPTER_D3D9EX_KHR 0x2021 -#define CL_ADAPTER_DXVA_KHR 0x2022 - -/* cl_media_adapter_set_khr */ -#define CL_PREFERRED_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR 0x2023 -#define CL_ALL_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR 0x2024 - -/* cl_context_info */ -#define CL_CONTEXT_ADAPTER_D3D9_KHR 0x2025 -#define CL_CONTEXT_ADAPTER_D3D9EX_KHR 0x2026 -#define CL_CONTEXT_ADAPTER_DXVA_KHR 0x2027 - -/* cl_mem_info */ -#define CL_MEM_DX9_MEDIA_ADAPTER_TYPE_KHR 0x2028 -#define CL_MEM_DX9_MEDIA_SURFACE_INFO_KHR 0x2029 - -/* cl_image_info */ -#define CL_IMAGE_DX9_MEDIA_PLANE_KHR 0x202A - -/* cl_command_type */ -#define CL_COMMAND_ACQUIRE_DX9_MEDIA_SURFACES_KHR 0x202B -#define CL_COMMAND_RELEASE_DX9_MEDIA_SURFACES_KHR 0x202C - -/******************************************************************************/ - -typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromDX9MediaAdapterKHR_fn)( - cl_platform_id platform, - cl_uint num_media_adapters, - 
cl_dx9_media_adapter_type_khr * media_adapter_type, - void * media_adapters, - cl_dx9_media_adapter_set_khr media_adapter_set, - cl_uint num_entries, - cl_device_id * devices, - cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_2; - -typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromDX9MediaSurfaceKHR_fn)( - cl_context context, - cl_mem_flags flags, - cl_dx9_media_adapter_type_khr adapter_type, - void * surface_info, - cl_uint plane, - cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2; - -typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireDX9MediaSurfacesKHR_fn)( - cl_command_queue command_queue, - cl_uint num_objects, - const cl_mem * mem_objects, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_1_2; - -typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseDX9MediaSurfacesKHR_fn)( - cl_command_queue command_queue, - cl_uint num_objects, - const cl_mem * mem_objects, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_1_2; - -#ifdef __cplusplus -} -#endif - -#endif /* __OPENCL_CL_DX9_MEDIA_SHARING_H */ - diff --git a/CL/cl_dx9_media_sharing_intel.h b/CL/cl_dx9_media_sharing_intel.h deleted file mode 100644 index 4525a17..0000000 --- a/CL/cl_dx9_media_sharing_intel.h +++ /dev/null @@ -1,170 +0,0 @@ -/******************************************************************************* - * Copyright (c) 2008-2020 The Khronos Group Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - ******************************************************************************/ -/*****************************************************************************\ - -Copyright (c) 2013-2019 Intel Corporation All Rights Reserved. - -THESE MATERIALS ARE PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS -CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING -NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THESE -MATERIALS, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -File Name: cl_dx9_media_sharing_intel.h - -Abstract: - -Notes: - -\*****************************************************************************/ - -#ifndef __OPENCL_CL_DX9_MEDIA_SHARING_INTEL_H -#define __OPENCL_CL_DX9_MEDIA_SHARING_INTEL_H - -#include -#include -#include -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -/*************************************** -* cl_intel_dx9_media_sharing extension * -****************************************/ - -#define cl_intel_dx9_media_sharing 1 - -typedef cl_uint cl_dx9_device_source_intel; -typedef cl_uint cl_dx9_device_set_intel; - -/* error codes */ -#define CL_INVALID_DX9_DEVICE_INTEL -1010 -#define CL_INVALID_DX9_RESOURCE_INTEL -1011 -#define CL_DX9_RESOURCE_ALREADY_ACQUIRED_INTEL -1012 -#define CL_DX9_RESOURCE_NOT_ACQUIRED_INTEL -1013 - -/* cl_dx9_device_source_intel */ -#define CL_D3D9_DEVICE_INTEL 0x4022 -#define CL_D3D9EX_DEVICE_INTEL 0x4070 -#define CL_DXVA_DEVICE_INTEL 0x4071 - -/* cl_dx9_device_set_intel */ -#define CL_PREFERRED_DEVICES_FOR_DX9_INTEL 0x4024 -#define CL_ALL_DEVICES_FOR_DX9_INTEL 0x4025 - -/* cl_context_info */ -#define CL_CONTEXT_D3D9_DEVICE_INTEL 0x4026 -#define CL_CONTEXT_D3D9EX_DEVICE_INTEL 0x4072 -#define CL_CONTEXT_DXVA_DEVICE_INTEL 0x4073 - -/* cl_mem_info */ -#define CL_MEM_DX9_RESOURCE_INTEL 0x4027 -#define CL_MEM_DX9_SHARED_HANDLE_INTEL 0x4074 - -/* cl_image_info */ -#define CL_IMAGE_DX9_PLANE_INTEL 0x4075 - -/* cl_command_type */ -#define CL_COMMAND_ACQUIRE_DX9_OBJECTS_INTEL 0x402A -#define CL_COMMAND_RELEASE_DX9_OBJECTS_INTEL 0x402B -/******************************************************************************/ - -extern CL_API_ENTRY cl_int CL_API_CALL -clGetDeviceIDsFromDX9INTEL( - cl_platform_id platform, - cl_dx9_device_source_intel dx9_device_source, - void* dx9_object, - cl_dx9_device_set_intel dx9_device_set, - cl_uint num_entries, - cl_device_id* devices, - cl_uint* num_devices) CL_EXT_SUFFIX__VERSION_1_1; - -typedef CL_API_ENTRY cl_int 
(CL_API_CALL* clGetDeviceIDsFromDX9INTEL_fn)( - cl_platform_id platform, - cl_dx9_device_source_intel dx9_device_source, - void* dx9_object, - cl_dx9_device_set_intel dx9_device_set, - cl_uint num_entries, - cl_device_id* devices, - cl_uint* num_devices) CL_EXT_SUFFIX__VERSION_1_1; - -extern CL_API_ENTRY cl_mem CL_API_CALL -clCreateFromDX9MediaSurfaceINTEL( - cl_context context, - cl_mem_flags flags, - IDirect3DSurface9* resource, - HANDLE sharedHandle, - UINT plane, - cl_int* errcode_ret) CL_EXT_SUFFIX__VERSION_1_1; - -typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromDX9MediaSurfaceINTEL_fn)( - cl_context context, - cl_mem_flags flags, - IDirect3DSurface9* resource, - HANDLE sharedHandle, - UINT plane, - cl_int* errcode_ret) CL_EXT_SUFFIX__VERSION_1_1; - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueAcquireDX9ObjectsINTEL( - cl_command_queue command_queue, - cl_uint num_objects, - const cl_mem* mem_objects, - cl_uint num_events_in_wait_list, - const cl_event* event_wait_list, - cl_event* event) CL_EXT_SUFFIX__VERSION_1_1; - -typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireDX9ObjectsINTEL_fn)( - cl_command_queue command_queue, - cl_uint num_objects, - const cl_mem* mem_objects, - cl_uint num_events_in_wait_list, - const cl_event* event_wait_list, - cl_event* event) CL_EXT_SUFFIX__VERSION_1_1; - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueReleaseDX9ObjectsINTEL( - cl_command_queue command_queue, - cl_uint num_objects, - cl_mem* mem_objects, - cl_uint num_events_in_wait_list, - const cl_event* event_wait_list, - cl_event* event) CL_EXT_SUFFIX__VERSION_1_1; - -typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseDX9ObjectsINTEL_fn)( - cl_command_queue command_queue, - cl_uint num_objects, - cl_mem* mem_objects, - cl_uint num_events_in_wait_list, - const cl_event* event_wait_list, - cl_event* event) CL_EXT_SUFFIX__VERSION_1_1; - -#ifdef __cplusplus -} -#endif - -#endif /* __OPENCL_CL_DX9_MEDIA_SHARING_INTEL_H */ - diff --git a/CL/cl_egl.h 
b/CL/cl_egl.h deleted file mode 100644 index c8bde80..0000000 --- a/CL/cl_egl.h +++ /dev/null @@ -1,120 +0,0 @@ -/******************************************************************************* - * Copyright (c) 2008-2020 The Khronos Group Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - ******************************************************************************/ - -#ifndef __OPENCL_CL_EGL_H -#define __OPENCL_CL_EGL_H - -#include - -#ifdef __cplusplus -extern "C" { -#endif - - -/* Command type for events created with clEnqueueAcquireEGLObjectsKHR */ -#define CL_COMMAND_EGL_FENCE_SYNC_OBJECT_KHR 0x202F -#define CL_COMMAND_ACQUIRE_EGL_OBJECTS_KHR 0x202D -#define CL_COMMAND_RELEASE_EGL_OBJECTS_KHR 0x202E - -/* Error type for clCreateFromEGLImageKHR */ -#define CL_INVALID_EGL_OBJECT_KHR -1093 -#define CL_EGL_RESOURCE_NOT_ACQUIRED_KHR -1092 - -/* CLeglImageKHR is an opaque handle to an EGLImage */ -typedef void* CLeglImageKHR; - -/* CLeglDisplayKHR is an opaque handle to an EGLDisplay */ -typedef void* CLeglDisplayKHR; - -/* CLeglSyncKHR is an opaque handle to an EGLSync object */ -typedef void* CLeglSyncKHR; - -/* properties passed to clCreateFromEGLImageKHR */ -typedef intptr_t cl_egl_image_properties_khr; - - -#define cl_khr_egl_image 1 - -extern CL_API_ENTRY cl_mem CL_API_CALL -clCreateFromEGLImageKHR(cl_context context, - CLeglDisplayKHR egldisplay, - CLeglImageKHR eglimage, - cl_mem_flags flags, - const cl_egl_image_properties_khr * properties, - 
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromEGLImageKHR_fn)( - cl_context context, - CLeglDisplayKHR egldisplay, - CLeglImageKHR eglimage, - cl_mem_flags flags, - const cl_egl_image_properties_khr * properties, - cl_int * errcode_ret); - - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueAcquireEGLObjectsKHR(cl_command_queue command_queue, - cl_uint num_objects, - const cl_mem * mem_objects, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireEGLObjectsKHR_fn)( - cl_command_queue command_queue, - cl_uint num_objects, - const cl_mem * mem_objects, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event); - - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueReleaseEGLObjectsKHR(cl_command_queue command_queue, - cl_uint num_objects, - const cl_mem * mem_objects, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseEGLObjectsKHR_fn)( - cl_command_queue command_queue, - cl_uint num_objects, - const cl_mem * mem_objects, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event); - - -#define cl_khr_egl_event 1 - -extern CL_API_ENTRY cl_event CL_API_CALL -clCreateEventFromEGLSyncKHR(cl_context context, - CLeglSyncKHR sync, - CLeglDisplayKHR display, - cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_event (CL_API_CALL *clCreateEventFromEGLSyncKHR_fn)( - cl_context context, - CLeglSyncKHR sync, - CLeglDisplayKHR display, - cl_int * errcode_ret); - -#ifdef __cplusplus -} -#endif - -#endif /* __OPENCL_CL_EGL_H */ diff --git a/CL/cl_ext.h b/CL/cl_ext.h deleted file mode 100644 index cd86843..0000000 --- a/CL/cl_ext.h +++ /dev/null @@ -1,841 +0,0 @@ 
-/******************************************************************************* - * Copyright (c) 2008-2020 The Khronos Group Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - ******************************************************************************/ - -/* cl_ext.h contains OpenCL extensions which don't have external */ -/* (OpenGL, D3D) dependencies. */ - -#ifndef __CL_EXT_H -#define __CL_EXT_H - -#ifdef __cplusplus -extern "C" { -#endif - -#include - -/* cl_khr_fp64 extension - no extension #define since it has no functions */ -/* CL_DEVICE_DOUBLE_FP_CONFIG is defined in CL.h for OpenCL >= 120 */ - -#if CL_TARGET_OPENCL_VERSION <= 110 -#define CL_DEVICE_DOUBLE_FP_CONFIG 0x1032 -#endif - -/* cl_khr_fp16 extension - no extension #define since it has no functions */ -#define CL_DEVICE_HALF_FP_CONFIG 0x1033 - -/* Memory object destruction - * - * Apple extension for use to manage externally allocated buffers used with cl_mem objects with CL_MEM_USE_HOST_PTR - * - * Registers a user callback function that will be called when the memory object is deleted and its resources - * freed. Each call to clSetMemObjectCallbackFn registers the specified user callback function on a callback - * stack associated with memobj. The registered user callback functions are called in the reverse order in - * which they were registered. The user callback functions are called and then the memory object is deleted - * and its resources freed. 
This provides a mechanism for the application (and libraries) using memobj to be - * notified when the memory referenced by host_ptr, specified when the memory object is created and used as - * the storage bits for the memory object, can be reused or freed. - * - * The application may not call CL api's with the cl_mem object passed to the pfn_notify. - * - * Please check for the "cl_APPLE_SetMemObjectDestructor" extension using clGetDeviceInfo(CL_DEVICE_EXTENSIONS) - * before using. - */ -#define cl_APPLE_SetMemObjectDestructor 1 -cl_int CL_API_ENTRY clSetMemObjectDestructorAPPLE( cl_mem memobj, - void (* pfn_notify)(cl_mem memobj, void * user_data), - void * user_data) CL_EXT_SUFFIX__VERSION_1_0; - - -/* Context Logging Functions - * - * The next three convenience functions are intended to be used as the pfn_notify parameter to clCreateContext(). - * Please check for the "cl_APPLE_ContextLoggingFunctions" extension using clGetDeviceInfo(CL_DEVICE_EXTENSIONS) - * before using. - * - * clLogMessagesToSystemLog forwards on all log messages to the Apple System Logger - */ -#define cl_APPLE_ContextLoggingFunctions 1 -extern void CL_API_ENTRY clLogMessagesToSystemLogAPPLE( const char * errstr, - const void * private_info, - size_t cb, - void * user_data) CL_EXT_SUFFIX__VERSION_1_0; - -/* clLogMessagesToStdout sends all log messages to the file descriptor stdout */ -extern void CL_API_ENTRY clLogMessagesToStdoutAPPLE( const char * errstr, - const void * private_info, - size_t cb, - void * user_data) CL_EXT_SUFFIX__VERSION_1_0; - -/* clLogMessagesToStderr sends all log messages to the file descriptor stderr */ -extern void CL_API_ENTRY clLogMessagesToStderrAPPLE( const char * errstr, - const void * private_info, - size_t cb, - void * user_data) CL_EXT_SUFFIX__VERSION_1_0; - - -/************************ -* cl_khr_icd extension * -************************/ -#define cl_khr_icd 1 - -/* cl_platform_info */ -#define CL_PLATFORM_ICD_SUFFIX_KHR 0x0920 - -/* Additional Error Codes 
*/ -#define CL_PLATFORM_NOT_FOUND_KHR -1001 - -extern CL_API_ENTRY cl_int CL_API_CALL -clIcdGetPlatformIDsKHR(cl_uint num_entries, - cl_platform_id * platforms, - cl_uint * num_platforms); - -typedef CL_API_ENTRY cl_int -(CL_API_CALL *clIcdGetPlatformIDsKHR_fn)(cl_uint num_entries, - cl_platform_id * platforms, - cl_uint * num_platforms); - - -/******************************* - * cl_khr_il_program extension * - *******************************/ -#define cl_khr_il_program 1 - -/* New property to clGetDeviceInfo for retrieving supported intermediate - * languages - */ -#define CL_DEVICE_IL_VERSION_KHR 0x105B - -/* New property to clGetProgramInfo for retrieving for retrieving the IL of a - * program - */ -#define CL_PROGRAM_IL_KHR 0x1169 - -extern CL_API_ENTRY cl_program CL_API_CALL -clCreateProgramWithILKHR(cl_context context, - const void * il, - size_t length, - cl_int * errcode_ret); - -typedef CL_API_ENTRY cl_program -(CL_API_CALL *clCreateProgramWithILKHR_fn)(cl_context context, - const void * il, - size_t length, - cl_int * errcode_ret) CL_EXT_SUFFIX__VERSION_1_2; - -/* Extension: cl_khr_image2d_from_buffer - * - * This extension allows a 2D image to be created from a cl_mem buffer without - * a copy. The type associated with a 2D image created from a buffer in an - * OpenCL program is image2d_t. Both the sampler and sampler-less read_image - * built-in functions are supported for 2D images and 2D images created from - * a buffer. Similarly, the write_image built-ins are also supported for 2D - * images created from a buffer. - * - * When the 2D image from buffer is created, the client must specify the - * width, height, image format (i.e. channel order and channel data type) - * and optionally the row pitch. - * - * The pitch specified must be a multiple of - * CL_DEVICE_IMAGE_PITCH_ALIGNMENT_KHR pixels. - * The base address of the buffer must be aligned to - * CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT_KHR pixels. 
- */ - -#define CL_DEVICE_IMAGE_PITCH_ALIGNMENT_KHR 0x104A -#define CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT_KHR 0x104B - - -/************************************** - * cl_khr_initialize_memory extension * - **************************************/ - -#define CL_CONTEXT_MEMORY_INITIALIZE_KHR 0x2030 - - -/************************************** - * cl_khr_terminate_context extension * - **************************************/ - -#define CL_CONTEXT_TERMINATED_KHR -1121 - -#define CL_DEVICE_TERMINATE_CAPABILITY_KHR 0x2031 -#define CL_CONTEXT_TERMINATE_KHR 0x2032 - -#define cl_khr_terminate_context 1 -extern CL_API_ENTRY cl_int CL_API_CALL -clTerminateContextKHR(cl_context context) CL_EXT_SUFFIX__VERSION_1_2; - -typedef CL_API_ENTRY cl_int -(CL_API_CALL *clTerminateContextKHR_fn)(cl_context context) CL_EXT_SUFFIX__VERSION_1_2; - - -/* - * Extension: cl_khr_spir - * - * This extension adds support to create an OpenCL program object from a - * Standard Portable Intermediate Representation (SPIR) instance - */ - -#define CL_DEVICE_SPIR_VERSIONS 0x40E0 -#define CL_PROGRAM_BINARY_TYPE_INTERMEDIATE 0x40E1 - - -/***************************************** - * cl_khr_create_command_queue extension * - *****************************************/ -#define cl_khr_create_command_queue 1 - -typedef cl_bitfield cl_queue_properties_khr; - -extern CL_API_ENTRY cl_command_queue CL_API_CALL -clCreateCommandQueueWithPropertiesKHR(cl_context context, - cl_device_id device, - const cl_queue_properties_khr* properties, - cl_int* errcode_ret) CL_EXT_SUFFIX__VERSION_1_2; - -typedef CL_API_ENTRY cl_command_queue -(CL_API_CALL *clCreateCommandQueueWithPropertiesKHR_fn)(cl_context context, - cl_device_id device, - const cl_queue_properties_khr* properties, - cl_int* errcode_ret) CL_EXT_SUFFIX__VERSION_1_2; - - -/****************************************** -* cl_nv_device_attribute_query extension * -******************************************/ - -/* cl_nv_device_attribute_query extension - no extension 
#define since it has no functions */ -#define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV 0x4000 -#define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV 0x4001 -#define CL_DEVICE_REGISTERS_PER_BLOCK_NV 0x4002 -#define CL_DEVICE_WARP_SIZE_NV 0x4003 -#define CL_DEVICE_GPU_OVERLAP_NV 0x4004 -#define CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV 0x4005 -#define CL_DEVICE_INTEGRATED_MEMORY_NV 0x4006 - - -/********************************* -* cl_amd_device_attribute_query * -*********************************/ - -#define CL_DEVICE_PROFILING_TIMER_OFFSET_AMD 0x4036 -#define CL_DEVICE_TOPOLOGY_AMD 0x4037 -#define CL_DEVICE_BOARD_NAME_AMD 0x4038 -#define CL_DEVICE_GLOBAL_FREE_MEMORY_AMD 0x4039 -#define CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD 0x4040 -#define CL_DEVICE_SIMD_WIDTH_AMD 0x4041 -#define CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD 0x4042 -#define CL_DEVICE_WAVEFRONT_WIDTH_AMD 0x4043 -#define CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD 0x4044 -#define CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD 0x4045 -#define CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD 0x4046 -#define CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD 0x4047 -#define CL_DEVICE_LOCAL_MEM_BANKS_AMD 0x4048 -#define CL_DEVICE_THREAD_TRACE_SUPPORTED_AMD 0x4049 -#define CL_DEVICE_GFXIP_MAJOR_AMD 0x404A -#define CL_DEVICE_GFXIP_MINOR_AMD 0x404B -#define CL_DEVICE_AVAILABLE_ASYNC_QUEUES_AMD 0x404C -#define CL_DEVICE_PREFERRED_WORK_GROUP_SIZE_AMD 0x4030 -#define CL_DEVICE_MAX_WORK_GROUP_SIZE_AMD 0x4031 -#define CL_DEVICE_PREFERRED_CONSTANT_BUFFER_SIZE_AMD 0x4033 -#define CL_DEVICE_PCIE_ID_AMD 0x4034 - - -/********************************* -* cl_arm_printf extension -*********************************/ - -#define CL_PRINTF_CALLBACK_ARM 0x40B0 -#define CL_PRINTF_BUFFERSIZE_ARM 0x40B1 - - -/*********************************** -* cl_ext_device_fission extension -***********************************/ -#define cl_ext_device_fission 1 - -extern CL_API_ENTRY cl_int CL_API_CALL -clReleaseDeviceEXT(cl_device_id device) CL_EXT_SUFFIX__VERSION_1_1; - -typedef CL_API_ENTRY cl_int 
-(CL_API_CALL *clReleaseDeviceEXT_fn)(cl_device_id device) CL_EXT_SUFFIX__VERSION_1_1; - -extern CL_API_ENTRY cl_int CL_API_CALL -clRetainDeviceEXT(cl_device_id device) CL_EXT_SUFFIX__VERSION_1_1; - -typedef CL_API_ENTRY cl_int -(CL_API_CALL *clRetainDeviceEXT_fn)(cl_device_id device) CL_EXT_SUFFIX__VERSION_1_1; - -typedef cl_ulong cl_device_partition_property_ext; -extern CL_API_ENTRY cl_int CL_API_CALL -clCreateSubDevicesEXT(cl_device_id in_device, - const cl_device_partition_property_ext * properties, - cl_uint num_entries, - cl_device_id * out_devices, - cl_uint * num_devices) CL_EXT_SUFFIX__VERSION_1_1; - -typedef CL_API_ENTRY cl_int -(CL_API_CALL * clCreateSubDevicesEXT_fn)(cl_device_id in_device, - const cl_device_partition_property_ext * properties, - cl_uint num_entries, - cl_device_id * out_devices, - cl_uint * num_devices) CL_EXT_SUFFIX__VERSION_1_1; - -/* cl_device_partition_property_ext */ -#define CL_DEVICE_PARTITION_EQUALLY_EXT 0x4050 -#define CL_DEVICE_PARTITION_BY_COUNTS_EXT 0x4051 -#define CL_DEVICE_PARTITION_BY_NAMES_EXT 0x4052 -#define CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN_EXT 0x4053 - -/* clDeviceGetInfo selectors */ -#define CL_DEVICE_PARENT_DEVICE_EXT 0x4054 -#define CL_DEVICE_PARTITION_TYPES_EXT 0x4055 -#define CL_DEVICE_AFFINITY_DOMAINS_EXT 0x4056 -#define CL_DEVICE_REFERENCE_COUNT_EXT 0x4057 -#define CL_DEVICE_PARTITION_STYLE_EXT 0x4058 - -/* error codes */ -#define CL_DEVICE_PARTITION_FAILED_EXT -1057 -#define CL_INVALID_PARTITION_COUNT_EXT -1058 -#define CL_INVALID_PARTITION_NAME_EXT -1059 - -/* CL_AFFINITY_DOMAINs */ -#define CL_AFFINITY_DOMAIN_L1_CACHE_EXT 0x1 -#define CL_AFFINITY_DOMAIN_L2_CACHE_EXT 0x2 -#define CL_AFFINITY_DOMAIN_L3_CACHE_EXT 0x3 -#define CL_AFFINITY_DOMAIN_L4_CACHE_EXT 0x4 -#define CL_AFFINITY_DOMAIN_NUMA_EXT 0x10 -#define CL_AFFINITY_DOMAIN_NEXT_FISSIONABLE_EXT 0x100 - -/* cl_device_partition_property_ext list terminators */ -#define CL_PROPERTIES_LIST_END_EXT ((cl_device_partition_property_ext) 0) -#define 
CL_PARTITION_BY_COUNTS_LIST_END_EXT ((cl_device_partition_property_ext) 0) -#define CL_PARTITION_BY_NAMES_LIST_END_EXT ((cl_device_partition_property_ext) 0 - 1) - - -/*********************************** - * cl_ext_migrate_memobject extension definitions - ***********************************/ -#define cl_ext_migrate_memobject 1 - -typedef cl_bitfield cl_mem_migration_flags_ext; - -#define CL_MIGRATE_MEM_OBJECT_HOST_EXT 0x1 - -#define CL_COMMAND_MIGRATE_MEM_OBJECT_EXT 0x4040 - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueMigrateMemObjectEXT(cl_command_queue command_queue, - cl_uint num_mem_objects, - const cl_mem * mem_objects, - cl_mem_migration_flags_ext flags, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event); - -typedef CL_API_ENTRY cl_int -(CL_API_CALL *clEnqueueMigrateMemObjectEXT_fn)(cl_command_queue command_queue, - cl_uint num_mem_objects, - const cl_mem * mem_objects, - cl_mem_migration_flags_ext flags, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event); - - -/********************************* -* cl_qcom_ext_host_ptr extension -*********************************/ -#define cl_qcom_ext_host_ptr 1 - -#define CL_MEM_EXT_HOST_PTR_QCOM (1 << 29) - -#define CL_DEVICE_EXT_MEM_PADDING_IN_BYTES_QCOM 0x40A0 -#define CL_DEVICE_PAGE_SIZE_QCOM 0x40A1 -#define CL_IMAGE_ROW_ALIGNMENT_QCOM 0x40A2 -#define CL_IMAGE_SLICE_ALIGNMENT_QCOM 0x40A3 -#define CL_MEM_HOST_UNCACHED_QCOM 0x40A4 -#define CL_MEM_HOST_WRITEBACK_QCOM 0x40A5 -#define CL_MEM_HOST_WRITETHROUGH_QCOM 0x40A6 -#define CL_MEM_HOST_WRITE_COMBINING_QCOM 0x40A7 - -typedef cl_uint cl_image_pitch_info_qcom; - -extern CL_API_ENTRY cl_int CL_API_CALL -clGetDeviceImageInfoQCOM(cl_device_id device, - size_t image_width, - size_t image_height, - const cl_image_format *image_format, - cl_image_pitch_info_qcom param_name, - size_t param_value_size, - void *param_value, - size_t *param_value_size_ret); - -typedef struct _cl_mem_ext_host_ptr 
-{ - /* Type of external memory allocation. */ - /* Legal values will be defined in layered extensions. */ - cl_uint allocation_type; - - /* Host cache policy for this external memory allocation. */ - cl_uint host_cache_policy; - -} cl_mem_ext_host_ptr; - - -/******************************************* -* cl_qcom_ext_host_ptr_iocoherent extension -********************************************/ - -/* Cache policy specifying io-coherence */ -#define CL_MEM_HOST_IOCOHERENT_QCOM 0x40A9 - - -/********************************* -* cl_qcom_ion_host_ptr extension -*********************************/ - -#define CL_MEM_ION_HOST_PTR_QCOM 0x40A8 - -typedef struct _cl_mem_ion_host_ptr -{ - /* Type of external memory allocation. */ - /* Must be CL_MEM_ION_HOST_PTR_QCOM for ION allocations. */ - cl_mem_ext_host_ptr ext_host_ptr; - - /* ION file descriptor */ - int ion_filedesc; - - /* Host pointer to the ION allocated memory */ - void* ion_hostptr; - -} cl_mem_ion_host_ptr; - - -/********************************* -* cl_qcom_android_native_buffer_host_ptr extension -*********************************/ - -#define CL_MEM_ANDROID_NATIVE_BUFFER_HOST_PTR_QCOM 0x40C6 - -typedef struct _cl_mem_android_native_buffer_host_ptr -{ - /* Type of external memory allocation. */ - /* Must be CL_MEM_ANDROID_NATIVE_BUFFER_HOST_PTR_QCOM for Android native buffers. 
*/ - cl_mem_ext_host_ptr ext_host_ptr; - - /* Virtual pointer to the android native buffer */ - void* anb_ptr; - -} cl_mem_android_native_buffer_host_ptr; - - -/****************************************** - * cl_img_yuv_image extension * - ******************************************/ - -/* Image formats used in clCreateImage */ -#define CL_NV21_IMG 0x40D0 -#define CL_YV12_IMG 0x40D1 - - -/****************************************** - * cl_img_cached_allocations extension * - ******************************************/ - -/* Flag values used by clCreateBuffer */ -#define CL_MEM_USE_UNCACHED_CPU_MEMORY_IMG (1 << 26) -#define CL_MEM_USE_CACHED_CPU_MEMORY_IMG (1 << 27) - - -/****************************************** - * cl_img_use_gralloc_ptr extension * - ******************************************/ -#define cl_img_use_gralloc_ptr 1 - -/* Flag values used by clCreateBuffer */ -#define CL_MEM_USE_GRALLOC_PTR_IMG (1 << 28) - -/* To be used by clGetEventInfo: */ -#define CL_COMMAND_ACQUIRE_GRALLOC_OBJECTS_IMG 0x40D2 -#define CL_COMMAND_RELEASE_GRALLOC_OBJECTS_IMG 0x40D3 - -/* Error code from clEnqueueReleaseGrallocObjectsIMG */ -#define CL_GRALLOC_RESOURCE_NOT_ACQUIRED_IMG 0x40D4 - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueAcquireGrallocObjectsIMG(cl_command_queue command_queue, - cl_uint num_objects, - const cl_mem * mem_objects, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_EXT_SUFFIX__VERSION_1_2; - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueReleaseGrallocObjectsIMG(cl_command_queue command_queue, - cl_uint num_objects, - const cl_mem * mem_objects, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_EXT_SUFFIX__VERSION_1_2; - - -/********************************* -* cl_khr_subgroups extension -*********************************/ -#define cl_khr_subgroups 1 - -#if !defined(CL_VERSION_2_1) -/* For OpenCL 2.1 and newer, cl_kernel_sub_group_info is declared in CL.h. 
- In hindsight, there should have been a khr suffix on this type for - the extension, but keeping it un-suffixed to maintain backwards - compatibility. */ -typedef cl_uint cl_kernel_sub_group_info; -#endif - -/* cl_kernel_sub_group_info */ -#define CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE_KHR 0x2033 -#define CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE_KHR 0x2034 - -extern CL_API_ENTRY cl_int CL_API_CALL -clGetKernelSubGroupInfoKHR(cl_kernel in_kernel, - cl_device_id in_device, - cl_kernel_sub_group_info param_name, - size_t input_value_size, - const void * input_value, - size_t param_value_size, - void * param_value, - size_t * param_value_size_ret) CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED; - -typedef CL_API_ENTRY cl_int -(CL_API_CALL * clGetKernelSubGroupInfoKHR_fn)(cl_kernel in_kernel, - cl_device_id in_device, - cl_kernel_sub_group_info param_name, - size_t input_value_size, - const void * input_value, - size_t param_value_size, - void * param_value, - size_t * param_value_size_ret) CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED; - - -/********************************* -* cl_khr_mipmap_image extension -*********************************/ - -/* cl_sampler_properties */ -#define CL_SAMPLER_MIP_FILTER_MODE_KHR 0x1155 -#define CL_SAMPLER_LOD_MIN_KHR 0x1156 -#define CL_SAMPLER_LOD_MAX_KHR 0x1157 - - -/********************************* -* cl_khr_priority_hints extension -*********************************/ -/* This extension define is for backwards compatibility. - It shouldn't be required since this extension has no new functions. 
*/ -#define cl_khr_priority_hints 1 - -typedef cl_uint cl_queue_priority_khr; - -/* cl_command_queue_properties */ -#define CL_QUEUE_PRIORITY_KHR 0x1096 - -/* cl_queue_priority_khr */ -#define CL_QUEUE_PRIORITY_HIGH_KHR (1<<0) -#define CL_QUEUE_PRIORITY_MED_KHR (1<<1) -#define CL_QUEUE_PRIORITY_LOW_KHR (1<<2) - - -/********************************* -* cl_khr_throttle_hints extension -*********************************/ -/* This extension define is for backwards compatibility. - It shouldn't be required since this extension has no new functions. */ -#define cl_khr_throttle_hints 1 - -typedef cl_uint cl_queue_throttle_khr; - -/* cl_command_queue_properties */ -#define CL_QUEUE_THROTTLE_KHR 0x1097 - -/* cl_queue_throttle_khr */ -#define CL_QUEUE_THROTTLE_HIGH_KHR (1<<0) -#define CL_QUEUE_THROTTLE_MED_KHR (1<<1) -#define CL_QUEUE_THROTTLE_LOW_KHR (1<<2) - - -/********************************* -* cl_khr_subgroup_named_barrier -*********************************/ -/* This extension define is for backwards compatibility. - It shouldn't be required since this extension has no new functions. 
*/ -#define cl_khr_subgroup_named_barrier 1 - -/* cl_device_info */ -#define CL_DEVICE_MAX_NAMED_BARRIER_COUNT_KHR 0x2035 - - -/********************************* -* cl_khr_extended_versioning -*********************************/ - -#define cl_khr_extended_versioning 1 - -#define CL_VERSION_MAJOR_BITS_KHR (10) -#define CL_VERSION_MINOR_BITS_KHR (10) -#define CL_VERSION_PATCH_BITS_KHR (12) - -#define CL_VERSION_MAJOR_MASK_KHR ((1 << CL_VERSION_MAJOR_BITS_KHR) - 1) -#define CL_VERSION_MINOR_MASK_KHR ((1 << CL_VERSION_MINOR_BITS_KHR) - 1) -#define CL_VERSION_PATCH_MASK_KHR ((1 << CL_VERSION_PATCH_BITS_KHR) - 1) - -#define CL_VERSION_MAJOR_KHR(version) ((version) >> (CL_VERSION_MINOR_BITS_KHR + CL_VERSION_PATCH_BITS_KHR)) -#define CL_VERSION_MINOR_KHR(version) (((version) >> CL_VERSION_PATCH_BITS_KHR) & CL_VERSION_MINOR_MASK_KHR) -#define CL_VERSION_PATCH_KHR(version) ((version) & CL_VERSION_PATCH_MASK_KHR) - -#define CL_MAKE_VERSION_KHR(major, minor, patch) \ - ((((major) & CL_VERSION_MAJOR_MASK_KHR) << (CL_VERSION_MINOR_BITS_KHR + CL_VERSION_PATCH_BITS_KHR)) | \ - (((minor) & CL_VERSION_MINOR_MASK_KHR) << CL_VERSION_PATCH_BITS_KHR) | \ - ((patch) & CL_VERSION_PATCH_MASK_KHR)) - -typedef cl_uint cl_version_khr; - -#define CL_NAME_VERSION_MAX_NAME_SIZE_KHR 64 - -typedef struct _cl_name_version_khr -{ - cl_version_khr version; - char name[CL_NAME_VERSION_MAX_NAME_SIZE_KHR]; -} cl_name_version_khr; - -/* cl_platform_info */ -#define CL_PLATFORM_NUMERIC_VERSION_KHR 0x0906 -#define CL_PLATFORM_EXTENSIONS_WITH_VERSION_KHR 0x0907 - -/* cl_device_info */ -#define CL_DEVICE_NUMERIC_VERSION_KHR 0x105E -#define CL_DEVICE_OPENCL_C_NUMERIC_VERSION_KHR 0x105F -#define CL_DEVICE_EXTENSIONS_WITH_VERSION_KHR 0x1060 -#define CL_DEVICE_ILS_WITH_VERSION_KHR 0x1061 -#define CL_DEVICE_BUILT_IN_KERNELS_WITH_VERSION_KHR 0x1062 - - -/********************************* -* cl_khr_device_uuid extension -*********************************/ -#define cl_khr_device_uuid 1 - -#define CL_UUID_SIZE_KHR 16 
-#define CL_LUID_SIZE_KHR 8 - -#define CL_DEVICE_UUID_KHR 0x106A -#define CL_DRIVER_UUID_KHR 0x106B -#define CL_DEVICE_LUID_VALID_KHR 0x106C -#define CL_DEVICE_LUID_KHR 0x106D -#define CL_DEVICE_NODE_MASK_KHR 0x106E - - -/********************************** - * cl_arm_import_memory extension * - **********************************/ -#define cl_arm_import_memory 1 - -typedef intptr_t cl_import_properties_arm; - -/* Default and valid proporties name for cl_arm_import_memory */ -#define CL_IMPORT_TYPE_ARM 0x40B2 - -/* Host process memory type default value for CL_IMPORT_TYPE_ARM property */ -#define CL_IMPORT_TYPE_HOST_ARM 0x40B3 - -/* DMA BUF memory type value for CL_IMPORT_TYPE_ARM property */ -#define CL_IMPORT_TYPE_DMA_BUF_ARM 0x40B4 - -/* Protected memory property */ -#define CL_IMPORT_TYPE_PROTECTED_ARM 0x40B5 - -/* Android hardware buffer type value for CL_IMPORT_TYPE_ARM property */ -#define CL_IMPORT_TYPE_ANDROID_HARDWARE_BUFFER_ARM 0x41E2 - -/* Data consistency with host property */ -#define CL_IMPORT_DMA_BUF_DATA_CONSISTENCY_WITH_HOST_ARM 0x41E3 - -/* Import memory size value to indicate a size for the whole buffer */ -#define CL_IMPORT_MEMORY_WHOLE_ALLOCATION_ARM SIZE_MAX - -/* This extension adds a new function that allows for direct memory import into - * OpenCL via the clImportMemoryARM function. - * - * Memory imported through this interface will be mapped into the device's page - * tables directly, providing zero copy access. It will never fall back to copy - * operations and aliased buffers. - * - * Types of memory supported for import are specified as additional extension - * strings. - * - * This extension produces cl_mem allocations which are compatible with all other - * users of cl_mem in the standard API. - * - * This extension maps pages with the same properties as the normal buffer creation - * function clCreateBuffer. 
- */ -extern CL_API_ENTRY cl_mem CL_API_CALL -clImportMemoryARM( cl_context context, - cl_mem_flags flags, - const cl_import_properties_arm *properties, - void *memory, - size_t size, - cl_int *errcode_ret) CL_EXT_SUFFIX__VERSION_1_0; - - -/****************************************** - * cl_arm_shared_virtual_memory extension * - ******************************************/ -#define cl_arm_shared_virtual_memory 1 - -/* Used by clGetDeviceInfo */ -#define CL_DEVICE_SVM_CAPABILITIES_ARM 0x40B6 - -/* Used by clGetMemObjectInfo */ -#define CL_MEM_USES_SVM_POINTER_ARM 0x40B7 - -/* Used by clSetKernelExecInfoARM: */ -#define CL_KERNEL_EXEC_INFO_SVM_PTRS_ARM 0x40B8 -#define CL_KERNEL_EXEC_INFO_SVM_FINE_GRAIN_SYSTEM_ARM 0x40B9 - -/* To be used by clGetEventInfo: */ -#define CL_COMMAND_SVM_FREE_ARM 0x40BA -#define CL_COMMAND_SVM_MEMCPY_ARM 0x40BB -#define CL_COMMAND_SVM_MEMFILL_ARM 0x40BC -#define CL_COMMAND_SVM_MAP_ARM 0x40BD -#define CL_COMMAND_SVM_UNMAP_ARM 0x40BE - -/* Flag values returned by clGetDeviceInfo with CL_DEVICE_SVM_CAPABILITIES_ARM as the param_name. 
*/ -#define CL_DEVICE_SVM_COARSE_GRAIN_BUFFER_ARM (1 << 0) -#define CL_DEVICE_SVM_FINE_GRAIN_BUFFER_ARM (1 << 1) -#define CL_DEVICE_SVM_FINE_GRAIN_SYSTEM_ARM (1 << 2) -#define CL_DEVICE_SVM_ATOMICS_ARM (1 << 3) - -/* Flag values used by clSVMAllocARM: */ -#define CL_MEM_SVM_FINE_GRAIN_BUFFER_ARM (1 << 10) -#define CL_MEM_SVM_ATOMICS_ARM (1 << 11) - -typedef cl_bitfield cl_svm_mem_flags_arm; -typedef cl_uint cl_kernel_exec_info_arm; -typedef cl_bitfield cl_device_svm_capabilities_arm; - -extern CL_API_ENTRY void * CL_API_CALL -clSVMAllocARM(cl_context context, - cl_svm_mem_flags_arm flags, - size_t size, - cl_uint alignment) CL_EXT_SUFFIX__VERSION_1_2; - -extern CL_API_ENTRY void CL_API_CALL -clSVMFreeARM(cl_context context, - void * svm_pointer) CL_EXT_SUFFIX__VERSION_1_2; - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueSVMFreeARM(cl_command_queue command_queue, - cl_uint num_svm_pointers, - void * svm_pointers[], - void (CL_CALLBACK * pfn_free_func)(cl_command_queue queue, - cl_uint num_svm_pointers, - void * svm_pointers[], - void * user_data), - void * user_data, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_EXT_SUFFIX__VERSION_1_2; - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueSVMMemcpyARM(cl_command_queue command_queue, - cl_bool blocking_copy, - void * dst_ptr, - const void * src_ptr, - size_t size, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_EXT_SUFFIX__VERSION_1_2; - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueSVMMemFillARM(cl_command_queue command_queue, - void * svm_ptr, - const void * pattern, - size_t pattern_size, - size_t size, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_EXT_SUFFIX__VERSION_1_2; - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueSVMMapARM(cl_command_queue command_queue, - cl_bool blocking_map, - cl_map_flags flags, - void * svm_ptr, - size_t size, - cl_uint 
num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_EXT_SUFFIX__VERSION_1_2; - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueSVMUnmapARM(cl_command_queue command_queue, - void * svm_ptr, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_EXT_SUFFIX__VERSION_1_2; - -extern CL_API_ENTRY cl_int CL_API_CALL -clSetKernelArgSVMPointerARM(cl_kernel kernel, - cl_uint arg_index, - const void * arg_value) CL_EXT_SUFFIX__VERSION_1_2; - -extern CL_API_ENTRY cl_int CL_API_CALL -clSetKernelExecInfoARM(cl_kernel kernel, - cl_kernel_exec_info_arm param_name, - size_t param_value_size, - const void * param_value) CL_EXT_SUFFIX__VERSION_1_2; - -/******************************** - * cl_arm_get_core_id extension * - ********************************/ - -#ifdef CL_VERSION_1_2 - -#define cl_arm_get_core_id 1 - -/* Device info property for bitfield of cores present */ -#define CL_DEVICE_COMPUTE_UNITS_BITFIELD_ARM 0x40BF - -#endif /* CL_VERSION_1_2 */ - -/********************************* -* cl_arm_job_slot_selection -*********************************/ - -#define cl_arm_job_slot_selection 1 - -/* cl_device_info */ -#define CL_DEVICE_JOB_SLOTS_ARM 0x41E0 - -/* cl_command_queue_properties */ -#define CL_QUEUE_JOB_SLOT_ARM 0x41E1 - -#ifdef __cplusplus -} -#endif - - -#endif /* __CL_EXT_H */ diff --git a/CL/cl_ext_intel.h b/CL/cl_ext_intel.h deleted file mode 100644 index f044684..0000000 --- a/CL/cl_ext_intel.h +++ /dev/null @@ -1,682 +0,0 @@ -/******************************************************************************* - * Copyright (c) 2008-2020 The Khronos Group Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - ******************************************************************************/ -/*****************************************************************************\ - -Copyright (c) 2013-2020 Intel Corporation All Rights Reserved. - -THESE MATERIALS ARE PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS -CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING -NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THESE -MATERIALS, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -File Name: cl_ext_intel.h - -Abstract: - -Notes: - -\*****************************************************************************/ - -#ifndef __CL_EXT_INTEL_H -#define __CL_EXT_INTEL_H - -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -/*************************************** -* cl_intel_thread_local_exec extension * -****************************************/ - -#define cl_intel_thread_local_exec 1 - -#define CL_QUEUE_THREAD_LOCAL_EXEC_ENABLE_INTEL (((cl_bitfield)1) << 31) - -/*********************************************** -* cl_intel_device_partition_by_names extension * -************************************************/ - -#define cl_intel_device_partition_by_names 1 - -#define CL_DEVICE_PARTITION_BY_NAMES_INTEL 0x4052 -#define CL_PARTITION_BY_NAMES_LIST_END_INTEL -1 - -/************************************************ -* cl_intel_accelerator extension * -* cl_intel_motion_estimation extension * -* cl_intel_advanced_motion_estimation extension * -*************************************************/ - -#define cl_intel_accelerator 1 -#define cl_intel_motion_estimation 1 -#define cl_intel_advanced_motion_estimation 1 - -typedef struct _cl_accelerator_intel* cl_accelerator_intel; -typedef cl_uint cl_accelerator_type_intel; -typedef cl_uint cl_accelerator_info_intel; - -typedef struct _cl_motion_estimation_desc_intel { - cl_uint mb_block_type; - cl_uint subpixel_mode; - cl_uint sad_adjust_mode; - cl_uint search_path_type; -} cl_motion_estimation_desc_intel; - -/* error codes */ -#define CL_INVALID_ACCELERATOR_INTEL -1094 -#define CL_INVALID_ACCELERATOR_TYPE_INTEL -1095 -#define CL_INVALID_ACCELERATOR_DESCRIPTOR_INTEL -1096 -#define CL_ACCELERATOR_TYPE_NOT_SUPPORTED_INTEL -1097 - -/* cl_accelerator_type_intel */ -#define CL_ACCELERATOR_TYPE_MOTION_ESTIMATION_INTEL 0x0 - -/* cl_accelerator_info_intel */ -#define CL_ACCELERATOR_DESCRIPTOR_INTEL 0x4090 -#define CL_ACCELERATOR_REFERENCE_COUNT_INTEL 0x4091 -#define CL_ACCELERATOR_CONTEXT_INTEL 0x4092 
-#define CL_ACCELERATOR_TYPE_INTEL 0x4093 - -/* cl_motion_detect_desc_intel flags */ -#define CL_ME_MB_TYPE_16x16_INTEL 0x0 -#define CL_ME_MB_TYPE_8x8_INTEL 0x1 -#define CL_ME_MB_TYPE_4x4_INTEL 0x2 - -#define CL_ME_SUBPIXEL_MODE_INTEGER_INTEL 0x0 -#define CL_ME_SUBPIXEL_MODE_HPEL_INTEL 0x1 -#define CL_ME_SUBPIXEL_MODE_QPEL_INTEL 0x2 - -#define CL_ME_SAD_ADJUST_MODE_NONE_INTEL 0x0 -#define CL_ME_SAD_ADJUST_MODE_HAAR_INTEL 0x1 - -#define CL_ME_SEARCH_PATH_RADIUS_2_2_INTEL 0x0 -#define CL_ME_SEARCH_PATH_RADIUS_4_4_INTEL 0x1 -#define CL_ME_SEARCH_PATH_RADIUS_16_12_INTEL 0x5 - -#define CL_ME_SKIP_BLOCK_TYPE_16x16_INTEL 0x0 -#define CL_ME_CHROMA_INTRA_PREDICT_ENABLED_INTEL 0x1 -#define CL_ME_LUMA_INTRA_PREDICT_ENABLED_INTEL 0x2 -#define CL_ME_SKIP_BLOCK_TYPE_8x8_INTEL 0x4 - -#define CL_ME_FORWARD_INPUT_MODE_INTEL 0x1 -#define CL_ME_BACKWARD_INPUT_MODE_INTEL 0x2 -#define CL_ME_BIDIRECTION_INPUT_MODE_INTEL 0x3 - -#define CL_ME_BIDIR_WEIGHT_QUARTER_INTEL 16 -#define CL_ME_BIDIR_WEIGHT_THIRD_INTEL 21 -#define CL_ME_BIDIR_WEIGHT_HALF_INTEL 32 -#define CL_ME_BIDIR_WEIGHT_TWO_THIRD_INTEL 43 -#define CL_ME_BIDIR_WEIGHT_THREE_QUARTER_INTEL 48 - -#define CL_ME_COST_PENALTY_NONE_INTEL 0x0 -#define CL_ME_COST_PENALTY_LOW_INTEL 0x1 -#define CL_ME_COST_PENALTY_NORMAL_INTEL 0x2 -#define CL_ME_COST_PENALTY_HIGH_INTEL 0x3 - -#define CL_ME_COST_PRECISION_QPEL_INTEL 0x0 -#define CL_ME_COST_PRECISION_HPEL_INTEL 0x1 -#define CL_ME_COST_PRECISION_PEL_INTEL 0x2 -#define CL_ME_COST_PRECISION_DPEL_INTEL 0x3 - -#define CL_ME_LUMA_PREDICTOR_MODE_VERTICAL_INTEL 0x0 -#define CL_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1 -#define CL_ME_LUMA_PREDICTOR_MODE_DC_INTEL 0x2 -#define CL_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_LEFT_INTEL 0x3 - -#define CL_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_RIGHT_INTEL 0x4 -#define CL_ME_LUMA_PREDICTOR_MODE_PLANE_INTEL 0x4 -#define CL_ME_LUMA_PREDICTOR_MODE_VERTICAL_RIGHT_INTEL 0x5 -#define CL_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_DOWN_INTEL 0x6 -#define 
CL_ME_LUMA_PREDICTOR_MODE_VERTICAL_LEFT_INTEL 0x7 -#define CL_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_UP_INTEL 0x8 - -#define CL_ME_CHROMA_PREDICTOR_MODE_DC_INTEL 0x0 -#define CL_ME_CHROMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1 -#define CL_ME_CHROMA_PREDICTOR_MODE_VERTICAL_INTEL 0x2 -#define CL_ME_CHROMA_PREDICTOR_MODE_PLANE_INTEL 0x3 - -/* cl_device_info */ -#define CL_DEVICE_ME_VERSION_INTEL 0x407E - -#define CL_ME_VERSION_LEGACY_INTEL 0x0 -#define CL_ME_VERSION_ADVANCED_VER_1_INTEL 0x1 -#define CL_ME_VERSION_ADVANCED_VER_2_INTEL 0x2 - -extern CL_API_ENTRY cl_accelerator_intel CL_API_CALL -clCreateAcceleratorINTEL( - cl_context context, - cl_accelerator_type_intel accelerator_type, - size_t descriptor_size, - const void* descriptor, - cl_int* errcode_ret) CL_EXT_SUFFIX__VERSION_1_2; - -typedef CL_API_ENTRY cl_accelerator_intel (CL_API_CALL *clCreateAcceleratorINTEL_fn)( - cl_context context, - cl_accelerator_type_intel accelerator_type, - size_t descriptor_size, - const void* descriptor, - cl_int* errcode_ret) CL_EXT_SUFFIX__VERSION_1_2; - -extern CL_API_ENTRY cl_int CL_API_CALL -clGetAcceleratorInfoINTEL( - cl_accelerator_intel accelerator, - cl_accelerator_info_intel param_name, - size_t param_value_size, - void* param_value, - size_t* param_value_size_ret) CL_EXT_SUFFIX__VERSION_1_2; - -typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetAcceleratorInfoINTEL_fn)( - cl_accelerator_intel accelerator, - cl_accelerator_info_intel param_name, - size_t param_value_size, - void* param_value, - size_t* param_value_size_ret) CL_EXT_SUFFIX__VERSION_1_2; - -extern CL_API_ENTRY cl_int CL_API_CALL -clRetainAcceleratorINTEL( - cl_accelerator_intel accelerator) CL_EXT_SUFFIX__VERSION_1_2; - -typedef CL_API_ENTRY cl_int (CL_API_CALL *clRetainAcceleratorINTEL_fn)( - cl_accelerator_intel accelerator) CL_EXT_SUFFIX__VERSION_1_2; - -extern CL_API_ENTRY cl_int CL_API_CALL -clReleaseAcceleratorINTEL( - cl_accelerator_intel accelerator) CL_EXT_SUFFIX__VERSION_1_2; - -typedef CL_API_ENTRY cl_int 
(CL_API_CALL *clReleaseAcceleratorINTEL_fn)( - cl_accelerator_intel accelerator) CL_EXT_SUFFIX__VERSION_1_2; - -/****************************************** -* cl_intel_simultaneous_sharing extension * -*******************************************/ - -#define cl_intel_simultaneous_sharing 1 - -#define CL_DEVICE_SIMULTANEOUS_INTEROPS_INTEL 0x4104 -#define CL_DEVICE_NUM_SIMULTANEOUS_INTEROPS_INTEL 0x4105 - -/*********************************** -* cl_intel_egl_image_yuv extension * -************************************/ - -#define cl_intel_egl_image_yuv 1 - -#define CL_EGL_YUV_PLANE_INTEL 0x4107 - -/******************************** -* cl_intel_packed_yuv extension * -*********************************/ - -#define cl_intel_packed_yuv 1 - -#define CL_YUYV_INTEL 0x4076 -#define CL_UYVY_INTEL 0x4077 -#define CL_YVYU_INTEL 0x4078 -#define CL_VYUY_INTEL 0x4079 - -/******************************************** -* cl_intel_required_subgroup_size extension * -*********************************************/ - -#define cl_intel_required_subgroup_size 1 - -#define CL_DEVICE_SUB_GROUP_SIZES_INTEL 0x4108 -#define CL_KERNEL_SPILL_MEM_SIZE_INTEL 0x4109 -#define CL_KERNEL_COMPILE_SUB_GROUP_SIZE_INTEL 0x410A - -/**************************************** -* cl_intel_driver_diagnostics extension * -*****************************************/ - -#define cl_intel_driver_diagnostics 1 - -typedef cl_uint cl_diagnostics_verbose_level; - -#define CL_CONTEXT_SHOW_DIAGNOSTICS_INTEL 0x4106 - -#define CL_CONTEXT_DIAGNOSTICS_LEVEL_ALL_INTEL ( 0xff ) -#define CL_CONTEXT_DIAGNOSTICS_LEVEL_GOOD_INTEL ( 1 ) -#define CL_CONTEXT_DIAGNOSTICS_LEVEL_BAD_INTEL ( 1 << 1 ) -#define CL_CONTEXT_DIAGNOSTICS_LEVEL_NEUTRAL_INTEL ( 1 << 2 ) - -/******************************** -* cl_intel_planar_yuv extension * -*********************************/ - -#define CL_NV12_INTEL 0x410E - -#define CL_MEM_NO_ACCESS_INTEL ( 1 << 24 ) -#define CL_MEM_ACCESS_FLAGS_UNRESTRICTED_INTEL ( 1 << 25 ) - -#define 
CL_DEVICE_PLANAR_YUV_MAX_WIDTH_INTEL 0x417E -#define CL_DEVICE_PLANAR_YUV_MAX_HEIGHT_INTEL 0x417F - -/******************************************************* -* cl_intel_device_side_avc_motion_estimation extension * -********************************************************/ - -#define CL_DEVICE_AVC_ME_VERSION_INTEL 0x410B -#define CL_DEVICE_AVC_ME_SUPPORTS_TEXTURE_SAMPLER_USE_INTEL 0x410C -#define CL_DEVICE_AVC_ME_SUPPORTS_PREEMPTION_INTEL 0x410D - -#define CL_AVC_ME_VERSION_0_INTEL 0x0 /* No support. */ -#define CL_AVC_ME_VERSION_1_INTEL 0x1 /* First supported version. */ - -#define CL_AVC_ME_MAJOR_16x16_INTEL 0x0 -#define CL_AVC_ME_MAJOR_16x8_INTEL 0x1 -#define CL_AVC_ME_MAJOR_8x16_INTEL 0x2 -#define CL_AVC_ME_MAJOR_8x8_INTEL 0x3 - -#define CL_AVC_ME_MINOR_8x8_INTEL 0x0 -#define CL_AVC_ME_MINOR_8x4_INTEL 0x1 -#define CL_AVC_ME_MINOR_4x8_INTEL 0x2 -#define CL_AVC_ME_MINOR_4x4_INTEL 0x3 - -#define CL_AVC_ME_MAJOR_FORWARD_INTEL 0x0 -#define CL_AVC_ME_MAJOR_BACKWARD_INTEL 0x1 -#define CL_AVC_ME_MAJOR_BIDIRECTIONAL_INTEL 0x2 - -#define CL_AVC_ME_PARTITION_MASK_ALL_INTEL 0x0 -#define CL_AVC_ME_PARTITION_MASK_16x16_INTEL 0x7E -#define CL_AVC_ME_PARTITION_MASK_16x8_INTEL 0x7D -#define CL_AVC_ME_PARTITION_MASK_8x16_INTEL 0x7B -#define CL_AVC_ME_PARTITION_MASK_8x8_INTEL 0x77 -#define CL_AVC_ME_PARTITION_MASK_8x4_INTEL 0x6F -#define CL_AVC_ME_PARTITION_MASK_4x8_INTEL 0x5F -#define CL_AVC_ME_PARTITION_MASK_4x4_INTEL 0x3F - -#define CL_AVC_ME_SEARCH_WINDOW_EXHAUSTIVE_INTEL 0x0 -#define CL_AVC_ME_SEARCH_WINDOW_SMALL_INTEL 0x1 -#define CL_AVC_ME_SEARCH_WINDOW_TINY_INTEL 0x2 -#define CL_AVC_ME_SEARCH_WINDOW_EXTRA_TINY_INTEL 0x3 -#define CL_AVC_ME_SEARCH_WINDOW_DIAMOND_INTEL 0x4 -#define CL_AVC_ME_SEARCH_WINDOW_LARGE_DIAMOND_INTEL 0x5 -#define CL_AVC_ME_SEARCH_WINDOW_RESERVED0_INTEL 0x6 -#define CL_AVC_ME_SEARCH_WINDOW_RESERVED1_INTEL 0x7 -#define CL_AVC_ME_SEARCH_WINDOW_CUSTOM_INTEL 0x8 -#define CL_AVC_ME_SEARCH_WINDOW_16x12_RADIUS_INTEL 0x9 -#define 
CL_AVC_ME_SEARCH_WINDOW_4x4_RADIUS_INTEL 0x2 -#define CL_AVC_ME_SEARCH_WINDOW_2x2_RADIUS_INTEL 0xa - -#define CL_AVC_ME_SAD_ADJUST_MODE_NONE_INTEL 0x0 -#define CL_AVC_ME_SAD_ADJUST_MODE_HAAR_INTEL 0x2 - -#define CL_AVC_ME_SUBPIXEL_MODE_INTEGER_INTEL 0x0 -#define CL_AVC_ME_SUBPIXEL_MODE_HPEL_INTEL 0x1 -#define CL_AVC_ME_SUBPIXEL_MODE_QPEL_INTEL 0x3 - -#define CL_AVC_ME_COST_PRECISION_QPEL_INTEL 0x0 -#define CL_AVC_ME_COST_PRECISION_HPEL_INTEL 0x1 -#define CL_AVC_ME_COST_PRECISION_PEL_INTEL 0x2 -#define CL_AVC_ME_COST_PRECISION_DPEL_INTEL 0x3 - -#define CL_AVC_ME_BIDIR_WEIGHT_QUARTER_INTEL 0x10 -#define CL_AVC_ME_BIDIR_WEIGHT_THIRD_INTEL 0x15 -#define CL_AVC_ME_BIDIR_WEIGHT_HALF_INTEL 0x20 -#define CL_AVC_ME_BIDIR_WEIGHT_TWO_THIRD_INTEL 0x2B -#define CL_AVC_ME_BIDIR_WEIGHT_THREE_QUARTER_INTEL 0x30 - -#define CL_AVC_ME_BORDER_REACHED_LEFT_INTEL 0x0 -#define CL_AVC_ME_BORDER_REACHED_RIGHT_INTEL 0x2 -#define CL_AVC_ME_BORDER_REACHED_TOP_INTEL 0x4 -#define CL_AVC_ME_BORDER_REACHED_BOTTOM_INTEL 0x8 - -#define CL_AVC_ME_SKIP_BLOCK_PARTITION_16x16_INTEL 0x0 -#define CL_AVC_ME_SKIP_BLOCK_PARTITION_8x8_INTEL 0x4000 - -#define CL_AVC_ME_SKIP_BLOCK_16x16_FORWARD_ENABLE_INTEL ( 0x1 << 24 ) -#define CL_AVC_ME_SKIP_BLOCK_16x16_BACKWARD_ENABLE_INTEL ( 0x2 << 24 ) -#define CL_AVC_ME_SKIP_BLOCK_16x16_DUAL_ENABLE_INTEL ( 0x3 << 24 ) -#define CL_AVC_ME_SKIP_BLOCK_8x8_FORWARD_ENABLE_INTEL ( 0x55 << 24 ) -#define CL_AVC_ME_SKIP_BLOCK_8x8_BACKWARD_ENABLE_INTEL ( 0xAA << 24 ) -#define CL_AVC_ME_SKIP_BLOCK_8x8_DUAL_ENABLE_INTEL ( 0xFF << 24 ) -#define CL_AVC_ME_SKIP_BLOCK_8x8_0_FORWARD_ENABLE_INTEL ( 0x1 << 24 ) -#define CL_AVC_ME_SKIP_BLOCK_8x8_0_BACKWARD_ENABLE_INTEL ( 0x2 << 24 ) -#define CL_AVC_ME_SKIP_BLOCK_8x8_1_FORWARD_ENABLE_INTEL ( 0x1 << 26 ) -#define CL_AVC_ME_SKIP_BLOCK_8x8_1_BACKWARD_ENABLE_INTEL ( 0x2 << 26 ) -#define CL_AVC_ME_SKIP_BLOCK_8x8_2_FORWARD_ENABLE_INTEL ( 0x1 << 28 ) -#define CL_AVC_ME_SKIP_BLOCK_8x8_2_BACKWARD_ENABLE_INTEL ( 0x2 << 28 ) -#define 
CL_AVC_ME_SKIP_BLOCK_8x8_3_FORWARD_ENABLE_INTEL ( 0x1 << 30 ) -#define CL_AVC_ME_SKIP_BLOCK_8x8_3_BACKWARD_ENABLE_INTEL ( 0x2 << 30 ) - -#define CL_AVC_ME_BLOCK_BASED_SKIP_4x4_INTEL 0x00 -#define CL_AVC_ME_BLOCK_BASED_SKIP_8x8_INTEL 0x80 - -#define CL_AVC_ME_INTRA_16x16_INTEL 0x0 -#define CL_AVC_ME_INTRA_8x8_INTEL 0x1 -#define CL_AVC_ME_INTRA_4x4_INTEL 0x2 - -#define CL_AVC_ME_INTRA_LUMA_PARTITION_MASK_16x16_INTEL 0x6 -#define CL_AVC_ME_INTRA_LUMA_PARTITION_MASK_8x8_INTEL 0x5 -#define CL_AVC_ME_INTRA_LUMA_PARTITION_MASK_4x4_INTEL 0x3 - -#define CL_AVC_ME_INTRA_NEIGHBOR_LEFT_MASK_ENABLE_INTEL 0x60 -#define CL_AVC_ME_INTRA_NEIGHBOR_UPPER_MASK_ENABLE_INTEL 0x10 -#define CL_AVC_ME_INTRA_NEIGHBOR_UPPER_RIGHT_MASK_ENABLE_INTEL 0x8 -#define CL_AVC_ME_INTRA_NEIGHBOR_UPPER_LEFT_MASK_ENABLE_INTEL 0x4 - -#define CL_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_INTEL 0x0 -#define CL_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1 -#define CL_AVC_ME_LUMA_PREDICTOR_MODE_DC_INTEL 0x2 -#define CL_AVC_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_LEFT_INTEL 0x3 -#define CL_AVC_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_RIGHT_INTEL 0x4 -#define CL_AVC_ME_LUMA_PREDICTOR_MODE_PLANE_INTEL 0x4 -#define CL_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_RIGHT_INTEL 0x5 -#define CL_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_DOWN_INTEL 0x6 -#define CL_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_LEFT_INTEL 0x7 -#define CL_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_UP_INTEL 0x8 -#define CL_AVC_ME_CHROMA_PREDICTOR_MODE_DC_INTEL 0x0 -#define CL_AVC_ME_CHROMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1 -#define CL_AVC_ME_CHROMA_PREDICTOR_MODE_VERTICAL_INTEL 0x2 -#define CL_AVC_ME_CHROMA_PREDICTOR_MODE_PLANE_INTEL 0x3 - -#define CL_AVC_ME_FRAME_FORWARD_INTEL 0x1 -#define CL_AVC_ME_FRAME_BACKWARD_INTEL 0x2 -#define CL_AVC_ME_FRAME_DUAL_INTEL 0x3 - -#define CL_AVC_ME_SLICE_TYPE_PRED_INTEL 0x0 -#define CL_AVC_ME_SLICE_TYPE_BPRED_INTEL 0x1 -#define CL_AVC_ME_SLICE_TYPE_INTRA_INTEL 0x2 - -#define CL_AVC_ME_INTERLACED_SCAN_TOP_FIELD_INTEL 0x0 -#define 
CL_AVC_ME_INTERLACED_SCAN_BOTTOM_FIELD_INTEL 0x1 - -/******************************************* -* cl_intel_unified_shared_memory extension * -********************************************/ - -/* These APIs are in sync with Revision O of the cl_intel_unified_shared_memory spec! */ - -#define cl_intel_unified_shared_memory 1 - -/* cl_device_info */ -#define CL_DEVICE_HOST_MEM_CAPABILITIES_INTEL 0x4190 -#define CL_DEVICE_DEVICE_MEM_CAPABILITIES_INTEL 0x4191 -#define CL_DEVICE_SINGLE_DEVICE_SHARED_MEM_CAPABILITIES_INTEL 0x4192 -#define CL_DEVICE_CROSS_DEVICE_SHARED_MEM_CAPABILITIES_INTEL 0x4193 -#define CL_DEVICE_SHARED_SYSTEM_MEM_CAPABILITIES_INTEL 0x4194 - -typedef cl_bitfield cl_device_unified_shared_memory_capabilities_intel; - -/* cl_device_unified_shared_memory_capabilities_intel - bitfield */ -#define CL_UNIFIED_SHARED_MEMORY_ACCESS_INTEL (1 << 0) -#define CL_UNIFIED_SHARED_MEMORY_ATOMIC_ACCESS_INTEL (1 << 1) -#define CL_UNIFIED_SHARED_MEMORY_CONCURRENT_ACCESS_INTEL (1 << 2) -#define CL_UNIFIED_SHARED_MEMORY_CONCURRENT_ATOMIC_ACCESS_INTEL (1 << 3) - -typedef cl_bitfield cl_mem_properties_intel; - -/* cl_mem_properties_intel */ -#define CL_MEM_ALLOC_FLAGS_INTEL 0x4195 - -typedef cl_bitfield cl_mem_alloc_flags_intel; - -/* cl_mem_alloc_flags_intel - bitfield */ -#define CL_MEM_ALLOC_WRITE_COMBINED_INTEL (1 << 0) - -typedef cl_uint cl_mem_info_intel; - -/* cl_mem_alloc_info_intel */ -#define CL_MEM_ALLOC_TYPE_INTEL 0x419A -#define CL_MEM_ALLOC_BASE_PTR_INTEL 0x419B -#define CL_MEM_ALLOC_SIZE_INTEL 0x419C -#define CL_MEM_ALLOC_DEVICE_INTEL 0x419D -/* Enum values 0x419E-0x419F are reserved for future queries. 
*/ - -typedef cl_uint cl_unified_shared_memory_type_intel; - -/* cl_unified_shared_memory_type_intel */ -#define CL_MEM_TYPE_UNKNOWN_INTEL 0x4196 -#define CL_MEM_TYPE_HOST_INTEL 0x4197 -#define CL_MEM_TYPE_DEVICE_INTEL 0x4198 -#define CL_MEM_TYPE_SHARED_INTEL 0x4199 - -typedef cl_uint cl_mem_advice_intel; - -/* cl_mem_advice_intel */ -/* Enum values 0x4208-0x420F are reserved for future memory advices. */ - -/* cl_kernel_exec_info */ -#define CL_KERNEL_EXEC_INFO_INDIRECT_HOST_ACCESS_INTEL 0x4200 -#define CL_KERNEL_EXEC_INFO_INDIRECT_DEVICE_ACCESS_INTEL 0x4201 -#define CL_KERNEL_EXEC_INFO_INDIRECT_SHARED_ACCESS_INTEL 0x4202 -#define CL_KERNEL_EXEC_INFO_USM_PTRS_INTEL 0x4203 - -/* cl_command_type */ -#define CL_COMMAND_MEMFILL_INTEL 0x4204 -#define CL_COMMAND_MEMCPY_INTEL 0x4205 -#define CL_COMMAND_MIGRATEMEM_INTEL 0x4206 -#define CL_COMMAND_MEMADVISE_INTEL 0x4207 - -extern CL_API_ENTRY void* CL_API_CALL -clHostMemAllocINTEL( - cl_context context, - const cl_mem_properties_intel* properties, - size_t size, - cl_uint alignment, - cl_int* errcode_ret); - -typedef CL_API_ENTRY void* (CL_API_CALL * -clHostMemAllocINTEL_fn)( - cl_context context, - const cl_mem_properties_intel* properties, - size_t size, - cl_uint alignment, - cl_int* errcode_ret); - -extern CL_API_ENTRY void* CL_API_CALL -clDeviceMemAllocINTEL( - cl_context context, - cl_device_id device, - const cl_mem_properties_intel* properties, - size_t size, - cl_uint alignment, - cl_int* errcode_ret); - -typedef CL_API_ENTRY void* (CL_API_CALL * -clDeviceMemAllocINTEL_fn)( - cl_context context, - cl_device_id device, - const cl_mem_properties_intel* properties, - size_t size, - cl_uint alignment, - cl_int* errcode_ret); - -extern CL_API_ENTRY void* CL_API_CALL -clSharedMemAllocINTEL( - cl_context context, - cl_device_id device, - const cl_mem_properties_intel* properties, - size_t size, - cl_uint alignment, - cl_int* errcode_ret); - -typedef CL_API_ENTRY void* (CL_API_CALL * -clSharedMemAllocINTEL_fn)( - 
cl_context context, - cl_device_id device, - const cl_mem_properties_intel* properties, - size_t size, - cl_uint alignment, - cl_int* errcode_ret); - -extern CL_API_ENTRY cl_int CL_API_CALL -clMemFreeINTEL( - cl_context context, - void* ptr); - -typedef CL_API_ENTRY cl_int (CL_API_CALL * -clMemFreeINTEL_fn)( - cl_context context, - void* ptr); - -extern CL_API_ENTRY cl_int CL_API_CALL -clGetMemAllocInfoINTEL( - cl_context context, - const void* ptr, - cl_mem_info_intel param_name, - size_t param_value_size, - void* param_value, - size_t* param_value_size_ret); - -typedef CL_API_ENTRY cl_int (CL_API_CALL * -clGetMemAllocInfoINTEL_fn)( - cl_context context, - const void* ptr, - cl_mem_info_intel param_name, - size_t param_value_size, - void* param_value, - size_t* param_value_size_ret); - -extern CL_API_ENTRY cl_int CL_API_CALL -clSetKernelArgMemPointerINTEL( - cl_kernel kernel, - cl_uint arg_index, - const void* arg_value); - -typedef CL_API_ENTRY cl_int (CL_API_CALL * -clSetKernelArgMemPointerINTEL_fn)( - cl_kernel kernel, - cl_uint arg_index, - const void* arg_value); - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueMemsetINTEL( /* Deprecated */ - cl_command_queue command_queue, - void* dst_ptr, - cl_int value, - size_t size, - cl_uint num_events_in_wait_list, - const cl_event* event_wait_list, - cl_event* event); - -typedef CL_API_ENTRY cl_int (CL_API_CALL * -clEnqueueMemsetINTEL_fn)( /* Deprecated */ - cl_command_queue command_queue, - void* dst_ptr, - cl_int value, - size_t size, - cl_uint num_events_in_wait_list, - const cl_event* event_wait_list, - cl_event* event); - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueMemFillINTEL( - cl_command_queue command_queue, - void* dst_ptr, - const void* pattern, - size_t pattern_size, - size_t size, - cl_uint num_events_in_wait_list, - const cl_event* event_wait_list, - cl_event* event); - -typedef CL_API_ENTRY cl_int (CL_API_CALL * -clEnqueueMemFillINTEL_fn)( - cl_command_queue command_queue, - void* dst_ptr, - 
const void* pattern, - size_t pattern_size, - size_t size, - cl_uint num_events_in_wait_list, - const cl_event* event_wait_list, - cl_event* event); - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueMemcpyINTEL( - cl_command_queue command_queue, - cl_bool blocking, - void* dst_ptr, - const void* src_ptr, - size_t size, - cl_uint num_events_in_wait_list, - const cl_event* event_wait_list, - cl_event* event); - -typedef CL_API_ENTRY cl_int (CL_API_CALL * -clEnqueueMemcpyINTEL_fn)( - cl_command_queue command_queue, - cl_bool blocking, - void* dst_ptr, - const void* src_ptr, - size_t size, - cl_uint num_events_in_wait_list, - const cl_event* event_wait_list, - cl_event* event); - -#ifdef CL_VERSION_1_2 - -/* Because these APIs use cl_mem_migration_flags, they require - OpenCL 1.2: */ - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueMigrateMemINTEL( - cl_command_queue command_queue, - const void* ptr, - size_t size, - cl_mem_migration_flags flags, - cl_uint num_events_in_wait_list, - const cl_event* event_wait_list, - cl_event* event); - -typedef CL_API_ENTRY cl_int (CL_API_CALL * -clEnqueueMigrateMemINTEL_fn)( - cl_command_queue command_queue, - const void* ptr, - size_t size, - cl_mem_migration_flags flags, - cl_uint num_events_in_wait_list, - const cl_event* event_wait_list, - cl_event* event); - -#endif - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueMemAdviseINTEL( - cl_command_queue command_queue, - const void* ptr, - size_t size, - cl_mem_advice_intel advice, - cl_uint num_events_in_wait_list, - const cl_event* event_wait_list, - cl_event* event); - -typedef CL_API_ENTRY cl_int (CL_API_CALL * -clEnqueueMemAdviseINTEL_fn)( - cl_command_queue command_queue, - const void* ptr, - size_t size, - cl_mem_advice_intel advice, - cl_uint num_events_in_wait_list, - const cl_event* event_wait_list, - cl_event* event); - -#ifdef __cplusplus -} -#endif - -#endif /* __CL_EXT_INTEL_H */ diff --git a/CL/cl_gl.h b/CL/cl_gl.h deleted file mode 100644 index b587f02..0000000 
--- a/CL/cl_gl.h +++ /dev/null @@ -1,159 +0,0 @@ -/******************************************************************************* - * Copyright (c) 2008-2020 The Khronos Group Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - ******************************************************************************/ - -#ifndef __OPENCL_CL_GL_H -#define __OPENCL_CL_GL_H - -#include - -#ifdef __cplusplus -extern "C" { -#endif - -typedef cl_uint cl_gl_object_type; -typedef cl_uint cl_gl_texture_info; -typedef cl_uint cl_gl_platform_info; -typedef struct __GLsync *cl_GLsync; - -/* cl_gl_object_type = 0x2000 - 0x200F enum values are currently taken */ -#define CL_GL_OBJECT_BUFFER 0x2000 -#define CL_GL_OBJECT_TEXTURE2D 0x2001 -#define CL_GL_OBJECT_TEXTURE3D 0x2002 -#define CL_GL_OBJECT_RENDERBUFFER 0x2003 -#ifdef CL_VERSION_1_2 -#define CL_GL_OBJECT_TEXTURE2D_ARRAY 0x200E -#define CL_GL_OBJECT_TEXTURE1D 0x200F -#define CL_GL_OBJECT_TEXTURE1D_ARRAY 0x2010 -#define CL_GL_OBJECT_TEXTURE_BUFFER 0x2011 -#endif - -/* cl_gl_texture_info */ -#define CL_GL_TEXTURE_TARGET 0x2004 -#define CL_GL_MIPMAP_LEVEL 0x2005 -#ifdef CL_VERSION_1_2 -#define CL_GL_NUM_SAMPLES 0x2012 -#endif - - -extern CL_API_ENTRY cl_mem CL_API_CALL -clCreateFromGLBuffer(cl_context context, - cl_mem_flags flags, - cl_GLuint bufobj, - cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; - -#ifdef CL_VERSION_1_2 - -extern CL_API_ENTRY cl_mem CL_API_CALL -clCreateFromGLTexture(cl_context context, - cl_mem_flags 
flags, - cl_GLenum target, - cl_GLint miplevel, - cl_GLuint texture, - cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2; - -#endif - -extern CL_API_ENTRY cl_mem CL_API_CALL -clCreateFromGLRenderbuffer(cl_context context, - cl_mem_flags flags, - cl_GLuint renderbuffer, - cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; - -extern CL_API_ENTRY cl_int CL_API_CALL -clGetGLObjectInfo(cl_mem memobj, - cl_gl_object_type * gl_object_type, - cl_GLuint * gl_object_name) CL_API_SUFFIX__VERSION_1_0; - -extern CL_API_ENTRY cl_int CL_API_CALL -clGetGLTextureInfo(cl_mem memobj, - cl_gl_texture_info param_name, - size_t param_value_size, - void * param_value, - size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueAcquireGLObjects(cl_command_queue command_queue, - cl_uint num_objects, - const cl_mem * mem_objects, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_1_0; - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueReleaseGLObjects(cl_command_queue command_queue, - cl_uint num_objects, - const cl_mem * mem_objects, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_1_0; - - -/* Deprecated OpenCL 1.1 APIs */ -extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL -clCreateFromGLTexture2D(cl_context context, - cl_mem_flags flags, - cl_GLenum target, - cl_GLint miplevel, - cl_GLuint texture, - cl_int * errcode_ret) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; - -extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL -clCreateFromGLTexture3D(cl_context context, - cl_mem_flags flags, - cl_GLenum target, - cl_GLint miplevel, - cl_GLuint texture, - cl_int * errcode_ret) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; - -/* cl_khr_gl_sharing extension */ - -#define cl_khr_gl_sharing 1 - -typedef cl_uint cl_gl_context_info; - -/* Additional Error Codes */ -#define 
CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR -1000 - -/* cl_gl_context_info */ -#define CL_CURRENT_DEVICE_FOR_GL_CONTEXT_KHR 0x2006 -#define CL_DEVICES_FOR_GL_CONTEXT_KHR 0x2007 - -/* Additional cl_context_properties */ -#define CL_GL_CONTEXT_KHR 0x2008 -#define CL_EGL_DISPLAY_KHR 0x2009 -#define CL_GLX_DISPLAY_KHR 0x200A -#define CL_WGL_HDC_KHR 0x200B -#define CL_CGL_SHAREGROUP_KHR 0x200C - -extern CL_API_ENTRY cl_int CL_API_CALL -clGetGLContextInfoKHR(const cl_context_properties * properties, - cl_gl_context_info param_name, - size_t param_value_size, - void * param_value, - size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetGLContextInfoKHR_fn)( - const cl_context_properties * properties, - cl_gl_context_info param_name, - size_t param_value_size, - void * param_value, - size_t * param_value_size_ret); - -#ifdef __cplusplus -} -#endif - -#endif /* __OPENCL_CL_GL_H */ diff --git a/CL/cl_gl_ext.h b/CL/cl_gl_ext.h deleted file mode 100644 index 9bb7540..0000000 --- a/CL/cl_gl_ext.h +++ /dev/null @@ -1,40 +0,0 @@ -/******************************************************************************* - * Copyright (c) 2008-2020 The Khronos Group Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- ******************************************************************************/ - -#ifndef __OPENCL_CL_GL_EXT_H -#define __OPENCL_CL_GL_EXT_H - -#ifdef __cplusplus -extern "C" { -#endif - -#include - -/* - * cl_khr_gl_event extension - */ -#define CL_COMMAND_GL_FENCE_SYNC_OBJECT_KHR 0x200D - -extern CL_API_ENTRY cl_event CL_API_CALL -clCreateEventFromGLsyncKHR(cl_context context, - cl_GLsync cl_GLsync, - cl_int * errcode_ret) CL_EXT_SUFFIX__VERSION_1_1; - -#ifdef __cplusplus -} -#endif - -#endif /* __OPENCL_CL_GL_EXT_H */ diff --git a/CL/cl_half.h b/CL/cl_half.h deleted file mode 100644 index f748d9e..0000000 --- a/CL/cl_half.h +++ /dev/null @@ -1,440 +0,0 @@ -/******************************************************************************* - * Copyright (c) 2019-2020 The Khronos Group Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - ******************************************************************************/ - -/** - * This is a header-only utility library that provides OpenCL host code with - * routines for converting to/from cl_half values. - * - * Example usage: - * - * #include - * ... - * cl_half h = cl_half_from_float(0.5f, CL_HALF_RTE); - * cl_float f = cl_half_to_float(h); - */ - -#ifndef OPENCL_CL_HALF_H -#define OPENCL_CL_HALF_H - -#include - -#include - -#ifdef __cplusplus -extern "C" { -#endif - - -/** - * Rounding mode used when converting to cl_half. 
- */ -typedef enum -{ - CL_HALF_RTE, // round to nearest even - CL_HALF_RTZ, // round towards zero - CL_HALF_RTP, // round towards positive infinity - CL_HALF_RTN, // round towards negative infinity -} cl_half_rounding_mode; - - -/* Private utility macros. */ -#define CL_HALF_EXP_MASK 0x7C00 -#define CL_HALF_MAX_FINITE_MAG 0x7BFF - - -/* - * Utility to deal with values that overflow when converting to half precision. - */ -static inline cl_half cl_half_handle_overflow(cl_half_rounding_mode rounding_mode, - uint16_t sign) -{ - if (rounding_mode == CL_HALF_RTZ) - { - // Round overflow towards zero -> largest finite number (preserving sign) - return (sign << 15) | CL_HALF_MAX_FINITE_MAG; - } - else if (rounding_mode == CL_HALF_RTP && sign) - { - // Round negative overflow towards positive infinity -> most negative finite number - return (1 << 15) | CL_HALF_MAX_FINITE_MAG; - } - else if (rounding_mode == CL_HALF_RTN && !sign) - { - // Round positive overflow towards negative infinity -> largest finite number - return CL_HALF_MAX_FINITE_MAG; - } - - // Overflow to infinity - return (sign << 15) | CL_HALF_EXP_MASK; -} - -/* - * Utility to deal with values that underflow when converting to half precision. - */ -static inline cl_half cl_half_handle_underflow(cl_half_rounding_mode rounding_mode, - uint16_t sign) -{ - if (rounding_mode == CL_HALF_RTP && !sign) - { - // Round underflow towards positive infinity -> smallest positive value - return (sign << 15) | 1; - } - else if (rounding_mode == CL_HALF_RTN && sign) - { - // Round underflow towards negative infinity -> largest negative value - return (sign << 15) | 1; - } - - // Flush to zero - return (sign << 15); -} - - -/** - * Convert a cl_float to a cl_half. 
- */ -static inline cl_half cl_half_from_float(cl_float f, cl_half_rounding_mode rounding_mode) -{ - // Type-punning to get direct access to underlying bits - union - { - cl_float f; - uint32_t i; - } f32; - f32.f = f; - - // Extract sign bit - uint16_t sign = f32.i >> 31; - - // Extract FP32 exponent and mantissa - uint32_t f_exp = (f32.i >> (CL_FLT_MANT_DIG - 1)) & 0xFF; - uint32_t f_mant = f32.i & ((1 << (CL_FLT_MANT_DIG - 1)) - 1); - - // Remove FP32 exponent bias - int32_t exp = f_exp - CL_FLT_MAX_EXP + 1; - - // Add FP16 exponent bias - uint16_t h_exp = exp + CL_HALF_MAX_EXP - 1; - - // Position of the bit that will become the FP16 mantissa LSB - uint32_t lsb_pos = CL_FLT_MANT_DIG - CL_HALF_MANT_DIG; - - // Check for NaN / infinity - if (f_exp == 0xFF) - { - if (f_mant) - { - // NaN -> propagate mantissa and silence it - uint16_t h_mant = f_mant >> lsb_pos; - h_mant |= 0x200; - return (sign << 15) | CL_HALF_EXP_MASK | h_mant; - } - else - { - // Infinity -> zero mantissa - return (sign << 15) | CL_HALF_EXP_MASK; - } - } - - // Check for zero - if (!f_exp && !f_mant) - { - return (sign << 15); - } - - // Check for overflow - if (exp >= CL_HALF_MAX_EXP) - { - return cl_half_handle_overflow(rounding_mode, sign); - } - - // Check for underflow - if (exp < (CL_HALF_MIN_EXP - CL_HALF_MANT_DIG - 1)) - { - return cl_half_handle_underflow(rounding_mode, sign); - } - - // Check for value that will become denormal - if (exp < -14) - { - // Denormal -> include the implicit 1 from the FP32 mantissa - h_exp = 0; - f_mant |= 1 << (CL_FLT_MANT_DIG - 1); - - // Mantissa shift amount depends on exponent - lsb_pos = -exp + (CL_FLT_MANT_DIG - 25); - } - - // Generate FP16 mantissa by shifting FP32 mantissa - uint16_t h_mant = f_mant >> lsb_pos; - - // Check whether we need to round - uint32_t halfway = 1 << (lsb_pos - 1); - uint32_t mask = (halfway << 1) - 1; - switch (rounding_mode) - { - case CL_HALF_RTE: - if ((f_mant & mask) > halfway) - { - // More than halfway -> round up 
- h_mant += 1; - } - else if ((f_mant & mask) == halfway) - { - // Exactly halfway -> round to nearest even - if (h_mant & 0x1) - h_mant += 1; - } - break; - case CL_HALF_RTZ: - // Mantissa has already been truncated -> do nothing - break; - case CL_HALF_RTP: - if ((f_mant & mask) && !sign) - { - // Round positive numbers up - h_mant += 1; - } - break; - case CL_HALF_RTN: - if ((f_mant & mask) && sign) - { - // Round negative numbers down - h_mant += 1; - } - break; - } - - // Check for mantissa overflow - if (h_mant & 0x400) - { - h_exp += 1; - h_mant = 0; - } - - return (sign << 15) | (h_exp << 10) | h_mant; -} - - -/** - * Convert a cl_double to a cl_half. - */ -static inline cl_half cl_half_from_double(cl_double d, cl_half_rounding_mode rounding_mode) -{ - // Type-punning to get direct access to underlying bits - union - { - cl_double d; - uint64_t i; - } f64; - f64.d = d; - - // Extract sign bit - uint16_t sign = f64.i >> 63; - - // Extract FP64 exponent and mantissa - uint64_t d_exp = (f64.i >> (CL_DBL_MANT_DIG - 1)) & 0x7FF; - uint64_t d_mant = f64.i & (((uint64_t)1 << (CL_DBL_MANT_DIG - 1)) - 1); - - // Remove FP64 exponent bias - int64_t exp = d_exp - CL_DBL_MAX_EXP + 1; - - // Add FP16 exponent bias - uint16_t h_exp = (uint16_t)(exp + CL_HALF_MAX_EXP - 1); - - // Position of the bit that will become the FP16 mantissa LSB - uint32_t lsb_pos = CL_DBL_MANT_DIG - CL_HALF_MANT_DIG; - - // Check for NaN / infinity - if (d_exp == 0x7FF) - { - if (d_mant) - { - // NaN -> propagate mantissa and silence it - uint16_t h_mant = (uint16_t)(d_mant >> lsb_pos); - h_mant |= 0x200; - return (sign << 15) | CL_HALF_EXP_MASK | h_mant; - } - else - { - // Infinity -> zero mantissa - return (sign << 15) | CL_HALF_EXP_MASK; - } - } - - // Check for zero - if (!d_exp && !d_mant) - { - return (sign << 15); - } - - // Check for overflow - if (exp >= CL_HALF_MAX_EXP) - { - return cl_half_handle_overflow(rounding_mode, sign); - } - - // Check for underflow - if (exp < 
(CL_HALF_MIN_EXP - CL_HALF_MANT_DIG - 1)) - { - return cl_half_handle_underflow(rounding_mode, sign); - } - - // Check for value that will become denormal - if (exp < -14) - { - // Include the implicit 1 from the FP64 mantissa - h_exp = 0; - d_mant |= (uint64_t)1 << (CL_DBL_MANT_DIG - 1); - - // Mantissa shift amount depends on exponent - lsb_pos = (uint32_t)(-exp + (CL_DBL_MANT_DIG - 25)); - } - - // Generate FP16 mantissa by shifting FP64 mantissa - uint16_t h_mant = (uint16_t)(d_mant >> lsb_pos); - - // Check whether we need to round - uint64_t halfway = (uint64_t)1 << (lsb_pos - 1); - uint64_t mask = (halfway << 1) - 1; - switch (rounding_mode) - { - case CL_HALF_RTE: - if ((d_mant & mask) > halfway) - { - // More than halfway -> round up - h_mant += 1; - } - else if ((d_mant & mask) == halfway) - { - // Exactly halfway -> round to nearest even - if (h_mant & 0x1) - h_mant += 1; - } - break; - case CL_HALF_RTZ: - // Mantissa has already been truncated -> do nothing - break; - case CL_HALF_RTP: - if ((d_mant & mask) && !sign) - { - // Round positive numbers up - h_mant += 1; - } - break; - case CL_HALF_RTN: - if ((d_mant & mask) && sign) - { - // Round negative numbers down - h_mant += 1; - } - break; - } - - // Check for mantissa overflow - if (h_mant & 0x400) - { - h_exp += 1; - h_mant = 0; - } - - return (sign << 15) | (h_exp << 10) | h_mant; -} - - -/** - * Convert a cl_half to a cl_float. 
- */ -static inline cl_float cl_half_to_float(cl_half h) -{ - // Type-punning to get direct access to underlying bits - union - { - cl_float f; - uint32_t i; - } f32; - - // Extract sign bit - uint16_t sign = h >> 15; - - // Extract FP16 exponent and mantissa - uint16_t h_exp = (h >> (CL_HALF_MANT_DIG - 1)) & 0x1F; - uint16_t h_mant = h & 0x3FF; - - // Remove FP16 exponent bias - int32_t exp = h_exp - CL_HALF_MAX_EXP + 1; - - // Add FP32 exponent bias - uint32_t f_exp = exp + CL_FLT_MAX_EXP - 1; - - // Check for NaN / infinity - if (h_exp == 0x1F) - { - if (h_mant) - { - // NaN -> propagate mantissa and silence it - uint32_t f_mant = h_mant << (CL_FLT_MANT_DIG - CL_HALF_MANT_DIG); - f_mant |= 0x400000; - f32.i = (sign << 31) | 0x7F800000 | f_mant; - return f32.f; - } - else - { - // Infinity -> zero mantissa - f32.i = (sign << 31) | 0x7F800000; - return f32.f; - } - } - - // Check for zero / denormal - if (h_exp == 0) - { - if (h_mant == 0) - { - // Zero -> zero exponent - f_exp = 0; - } - else - { - // Denormal -> normalize it - // - Shift mantissa to make most-significant 1 implicit - // - Adjust exponent accordingly - uint32_t shift = 0; - while ((h_mant & 0x400) == 0) - { - h_mant <<= 1; - shift++; - } - h_mant &= 0x3FF; - f_exp -= shift - 1; - } - } - - f32.i = (sign << 31) | (f_exp << 23) | (h_mant << 13); - return f32.f; -} - - -#undef CL_HALF_EXP_MASK -#undef CL_HALF_MAX_FINITE_MAG - - -#ifdef __cplusplus -} -#endif - - -#endif /* OPENCL_CL_HALF_H */ diff --git a/CL/cl_icd.h b/CL/cl_icd.h deleted file mode 100644 index 8c74724..0000000 --- a/CL/cl_icd.h +++ /dev/null @@ -1,1287 +0,0 @@ -/******************************************************************************* - * Copyright (c) 2019-2020 The Khronos Group Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - ******************************************************************************/ - -#ifndef OPENCL_CL_ICD_H -#define OPENCL_CL_ICD_H - -#include -#include -#include -#include - -#if defined(_WIN32) -#include -#include -#include -#endif - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * This file contains pointer type definitions for each of the CL API calls as - * well as a type definition for the dispatch table used by the Khronos ICD - * loader (see cl_khr_icd extension specification for background). - */ - -/* API function pointer definitions */ - -// Platform APIs -typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetPlatformIDs)( - cl_uint num_entries, cl_platform_id *platforms, - cl_uint *num_platforms) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetPlatformInfo)( - cl_platform_id platform, cl_platform_info param_name, - size_t param_value_size, void *param_value, - size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; - -// Device APIs -typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetDeviceIDs)( - cl_platform_id platform, cl_device_type device_type, cl_uint num_entries, - cl_device_id *devices, cl_uint *num_devices) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetDeviceInfo)( - cl_device_id device, cl_device_info param_name, size_t param_value_size, - void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; - -#ifdef CL_VERSION_1_2 - -typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clCreateSubDevices)( - cl_device_id in_device, - const 
cl_device_partition_property *partition_properties, - cl_uint num_entries, cl_device_id *out_devices, cl_uint *num_devices); - -typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clRetainDevice)( - cl_device_id device) CL_API_SUFFIX__VERSION_1_2; - -typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clReleaseDevice)( - cl_device_id device) CL_API_SUFFIX__VERSION_1_2; - -#else - -typedef void *cl_api_clCreateSubDevices; -typedef void *cl_api_clRetainDevice; -typedef void *cl_api_clReleaseDevice; - -#endif - -// Context APIs -typedef CL_API_ENTRY cl_context(CL_API_CALL *cl_api_clCreateContext)( - const cl_context_properties *properties, cl_uint num_devices, - const cl_device_id *devices, - void(CL_CALLBACK *pfn_notify)(const char *, const void *, size_t, void *), - void *user_data, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_context(CL_API_CALL *cl_api_clCreateContextFromType)( - const cl_context_properties *properties, cl_device_type device_type, - void(CL_CALLBACK *pfn_notify)(const char *, const void *, size_t, void *), - void *user_data, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clRetainContext)( - cl_context context) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clReleaseContext)( - cl_context context) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetContextInfo)( - cl_context context, cl_context_info param_name, size_t param_value_size, - void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; - -// Command Queue APIs -typedef CL_API_ENTRY cl_command_queue(CL_API_CALL *cl_api_clCreateCommandQueue)( - cl_context context, cl_device_id device, - cl_command_queue_properties properties, - cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0; - -#ifdef CL_VERSION_2_0 - -typedef CL_API_ENTRY -cl_command_queue(CL_API_CALL *cl_api_clCreateCommandQueueWithProperties)( - cl_context /* context */, 
cl_device_id /* device */, - const cl_queue_properties * /* properties */, - cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_2_0; - -#else - -typedef void *cl_api_clCreateCommandQueueWithProperties; - -#endif - -typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clRetainCommandQueue)( - cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clReleaseCommandQueue)( - cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetCommandQueueInfo)( - cl_command_queue command_queue, cl_command_queue_info param_name, - size_t param_value_size, void *param_value, - size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; - -// Memory Object APIs -typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateBuffer)( - cl_context context, cl_mem_flags flags, size_t size, void *host_ptr, - cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0; - -#ifdef CL_VERSION_1_2 - -typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateImage)( - cl_context context, cl_mem_flags flags, const cl_image_format *image_format, - const cl_image_desc *image_desc, void *host_ptr, - cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_2; - -#else - -typedef void *cl_api_clCreateImage; - -#endif - -#ifdef CL_VERSION_3_0 - -typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateBufferWithProperties)( - cl_context context, const cl_mem_properties *properties, cl_mem_flags flags, - size_t size, void *host_ptr, - cl_int *errcode_ret) CL_API_SUFFIX__VERSION_3_0; - -typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateImageWithProperties)( - cl_context context, const cl_mem_properties *properties, cl_mem_flags flags, - const cl_image_format *image_format, const cl_image_desc *image_desc, - void *host_ptr, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_3_0; - -#else - -typedef void *cl_api_clCreateBufferWithProperties; -typedef void *cl_api_clCreateImageWithProperties; - -#endif - -typedef CL_API_ENTRY 
cl_int(CL_API_CALL *cl_api_clRetainMemObject)( - cl_mem memobj) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clReleaseMemObject)( - cl_mem memobj) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetSupportedImageFormats)( - cl_context context, cl_mem_flags flags, cl_mem_object_type image_type, - cl_uint num_entries, cl_image_format *image_formats, - cl_uint *num_image_formats) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetMemObjectInfo)( - cl_mem memobj, cl_mem_info param_name, size_t param_value_size, - void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetImageInfo)( - cl_mem image, cl_image_info param_name, size_t param_value_size, - void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; - -#ifdef CL_VERSION_2_0 - -typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreatePipe)( - cl_context /* context */, cl_mem_flags /* flags */, - cl_uint /* pipe_packet_size */, cl_uint /* pipe_max_packets */, - const cl_pipe_properties * /* properties */, - cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_2_0; - -typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetPipeInfo)( - cl_mem /* pipe */, cl_pipe_info /* param_name */, - size_t /* param_value_size */, void * /* param_value */, - size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_2_0; - -typedef CL_API_ENTRY void *(CL_API_CALL *cl_api_clSVMAlloc)( - cl_context /* context */, cl_svm_mem_flags /* flags */, size_t /* size */, - unsigned int /* alignment */)CL_API_SUFFIX__VERSION_2_0; - -typedef CL_API_ENTRY void(CL_API_CALL *cl_api_clSVMFree)( - cl_context /* context */, - void * /* svm_pointer */) CL_API_SUFFIX__VERSION_2_0; - -#else - -typedef void *cl_api_clCreatePipe; -typedef void *cl_api_clGetPipeInfo; -typedef void *cl_api_clSVMAlloc; -typedef void *cl_api_clSVMFree; - -#endif - -// Sampler APIs 
-typedef CL_API_ENTRY cl_sampler(CL_API_CALL *cl_api_clCreateSampler)( - cl_context context, cl_bool normalized_coords, - cl_addressing_mode addressing_mode, cl_filter_mode filter_mode, - cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clRetainSampler)( - cl_sampler sampler) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clReleaseSampler)( - cl_sampler sampler) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetSamplerInfo)( - cl_sampler sampler, cl_sampler_info param_name, size_t param_value_size, - void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; - -#ifdef CL_VERSION_2_0 - -typedef CL_API_ENTRY -cl_sampler(CL_API_CALL *cl_api_clCreateSamplerWithProperties)( - cl_context /* context */, - const cl_sampler_properties * /* sampler_properties */, - cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_2_0; - -#else - -typedef void *cl_api_clCreateSamplerWithProperties; - -#endif - -// Program Object APIs -typedef CL_API_ENTRY cl_program(CL_API_CALL *cl_api_clCreateProgramWithSource)( - cl_context context, cl_uint count, const char **strings, - const size_t *lengths, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_program(CL_API_CALL *cl_api_clCreateProgramWithBinary)( - cl_context context, cl_uint num_devices, const cl_device_id *device_list, - const size_t *lengths, const unsigned char **binaries, - cl_int *binary_status, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0; - -#ifdef CL_VERSION_1_2 - -typedef CL_API_ENTRY -cl_program(CL_API_CALL *cl_api_clCreateProgramWithBuiltInKernels)( - cl_context context, cl_uint num_devices, const cl_device_id *device_list, - const char *kernel_names, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_2; - -#else - -typedef void *cl_api_clCreateProgramWithBuiltInKernels; - -#endif - -typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clRetainProgram)( - cl_program program) 
CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clReleaseProgram)( - cl_program program) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clBuildProgram)( - cl_program program, cl_uint num_devices, const cl_device_id *device_list, - const char *options, - void(CL_CALLBACK *pfn_notify)(cl_program program, void *user_data), - void *user_data) CL_API_SUFFIX__VERSION_1_0; - -#ifdef CL_VERSION_1_2 - -typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clCompileProgram)( - cl_program program, cl_uint num_devices, const cl_device_id *device_list, - const char *options, cl_uint num_input_headers, - const cl_program *input_headers, const char **header_include_names, - void(CL_CALLBACK *pfn_notify)(cl_program program, void *user_data), - void *user_data) CL_API_SUFFIX__VERSION_1_2; - -typedef CL_API_ENTRY cl_program(CL_API_CALL *cl_api_clLinkProgram)( - cl_context context, cl_uint num_devices, const cl_device_id *device_list, - const char *options, cl_uint num_input_programs, - const cl_program *input_programs, - void(CL_CALLBACK *pfn_notify)(cl_program program, void *user_data), - void *user_data, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_2; - -#else - -typedef void *cl_api_clCompileProgram; -typedef void *cl_api_clLinkProgram; - -#endif - -#ifdef CL_VERSION_2_2 - -typedef CL_API_ENTRY -cl_int(CL_API_CALL *cl_api_clSetProgramSpecializationConstant)( - cl_program program, cl_uint spec_id, size_t spec_size, - const void *spec_value) CL_API_SUFFIX__VERSION_2_2; - -typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clSetProgramReleaseCallback)( - cl_program program, - void(CL_CALLBACK *pfn_notify)(cl_program program, void *user_data), - void *user_data) CL_API_SUFFIX__VERSION_2_2; - -#else - -typedef void *cl_api_clSetProgramSpecializationConstant; -typedef void *cl_api_clSetProgramReleaseCallback; - -#endif - -#ifdef CL_VERSION_1_2 - -typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clUnloadPlatformCompiler)( - 
cl_platform_id platform) CL_API_SUFFIX__VERSION_1_2; - -#else - -typedef void *cl_api_clUnloadPlatformCompiler; - -#endif - -typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetProgramInfo)( - cl_program program, cl_program_info param_name, size_t param_value_size, - void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetProgramBuildInfo)( - cl_program program, cl_device_id device, cl_program_build_info param_name, - size_t param_value_size, void *param_value, - size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; - -// Kernel Object APIs -typedef CL_API_ENTRY cl_kernel(CL_API_CALL *cl_api_clCreateKernel)( - cl_program program, const char *kernel_name, - cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clCreateKernelsInProgram)( - cl_program program, cl_uint num_kernels, cl_kernel *kernels, - cl_uint *num_kernels_ret) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clRetainKernel)( - cl_kernel kernel) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clReleaseKernel)( - cl_kernel kernel) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clSetKernelArg)( - cl_kernel kernel, cl_uint arg_index, size_t arg_size, - const void *arg_value) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetKernelInfo)( - cl_kernel kernel, cl_kernel_info param_name, size_t param_value_size, - void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; - -#ifdef CL_VERSION_1_2 - -typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetKernelArgInfo)( - cl_kernel kernel, cl_uint arg_indx, cl_kernel_arg_info param_name, - size_t param_value_size, void *param_value, - size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_2; - -#else - -typedef void *cl_api_clGetKernelArgInfo; - -#endif - -typedef CL_API_ENTRY 
cl_int(CL_API_CALL *cl_api_clGetKernelWorkGroupInfo)( - cl_kernel kernel, cl_device_id device, cl_kernel_work_group_info param_name, - size_t param_value_size, void *param_value, - size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; - -#ifdef CL_VERSION_2_0 - -typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clSetKernelArgSVMPointer)( - cl_kernel /* kernel */, cl_uint /* arg_index */, - const void * /* arg_value */) CL_API_SUFFIX__VERSION_2_0; - -typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clSetKernelExecInfo)( - cl_kernel /* kernel */, cl_kernel_exec_info /* param_name */, - size_t /* param_value_size */, - const void * /* param_value */) CL_API_SUFFIX__VERSION_2_0; - -typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetKernelSubGroupInfoKHR)( - cl_kernel /* in_kernel */, cl_device_id /*in_device*/, - cl_kernel_sub_group_info /* param_name */, size_t /*input_value_size*/, - const void * /*input_value*/, size_t /*param_value_size*/, - void * /*param_value*/, - size_t * /*param_value_size_ret*/) CL_EXT_SUFFIX__VERSION_2_0; - -#else - -typedef void *cl_api_clSetKernelArgSVMPointer; -typedef void *cl_api_clSetKernelExecInfo; -typedef void *cl_api_clGetKernelSubGroupInfoKHR; - -#endif - -// Event Object APIs -typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clWaitForEvents)( - cl_uint num_events, const cl_event *event_list) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetEventInfo)( - cl_event event, cl_event_info param_name, size_t param_value_size, - void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clRetainEvent)(cl_event event) - CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clReleaseEvent)(cl_event event) - CL_API_SUFFIX__VERSION_1_0; - -// Profiling APIs -typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetEventProfilingInfo)( - cl_event event, cl_profiling_info param_name, size_t param_value_size, - void 
*param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; - -// Flush and Finish APIs -typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clFlush)( - cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clFinish)( - cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0; - -// Enqueued Commands APIs -typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueReadBuffer)( - cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_read, - size_t offset, size_t cb, void *ptr, cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event) CL_API_SUFFIX__VERSION_1_0; - -#ifdef CL_VERSION_1_1 - -typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueReadBufferRect)( - cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_read, - const size_t *buffer_origin, const size_t *host_origin, - const size_t *region, size_t buffer_row_pitch, size_t buffer_slice_pitch, - size_t host_row_pitch, size_t host_slice_pitch, void *ptr, - cl_uint num_events_in_wait_list, const cl_event *event_wait_list, - cl_event *event) CL_API_SUFFIX__VERSION_1_1; - -#else - -typedef void *cl_api_clEnqueueReadBufferRect; - -#endif - -typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueWriteBuffer)( - cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_write, - size_t offset, size_t cb, const void *ptr, cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event) CL_API_SUFFIX__VERSION_1_0; - -#ifdef CL_VERSION_1_1 - -typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueWriteBufferRect)( - cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_read, - const size_t *buffer_origin, const size_t *host_origin, - const size_t *region, size_t buffer_row_pitch, size_t buffer_slice_pitch, - size_t host_row_pitch, size_t host_slice_pitch, const void *ptr, - cl_uint num_events_in_wait_list, const cl_event *event_wait_list, - cl_event *event) 
CL_API_SUFFIX__VERSION_1_1; - -#else - -typedef void *cl_api_clEnqueueWriteBufferRect; - -#endif - -#ifdef CL_VERSION_1_2 - -typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueFillBuffer)( - cl_command_queue command_queue, cl_mem buffer, const void *pattern, - size_t pattern_size, size_t offset, size_t cb, - cl_uint num_events_in_wait_list, const cl_event *event_wait_list, - cl_event *event) CL_API_SUFFIX__VERSION_1_2; - -#else - -typedef void *cl_api_clEnqueueFillBuffer; - -#endif - -typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueCopyBuffer)( - cl_command_queue command_queue, cl_mem src_buffer, cl_mem dst_buffer, - size_t src_offset, size_t dst_offset, size_t cb, - cl_uint num_events_in_wait_list, const cl_event *event_wait_list, - cl_event *event) CL_API_SUFFIX__VERSION_1_0; - -#ifdef CL_VERSION_1_1 - -typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueCopyBufferRect)( - cl_command_queue command_queue, cl_mem src_buffer, cl_mem dst_buffer, - const size_t *src_origin, const size_t *dst_origin, const size_t *region, - size_t src_row_pitch, size_t src_slice_pitch, size_t dst_row_pitch, - size_t dst_slice_pitch, cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event) CL_API_SUFFIX__VERSION_1_1; - -#else - -typedef void *cl_api_clEnqueueCopyBufferRect; - -#endif - -typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueReadImage)( - cl_command_queue command_queue, cl_mem image, cl_bool blocking_read, - const size_t *origin, const size_t *region, size_t row_pitch, - size_t slice_pitch, void *ptr, cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueWriteImage)( - cl_command_queue command_queue, cl_mem image, cl_bool blocking_write, - const size_t *origin, const size_t *region, size_t input_row_pitch, - size_t input_slice_pitch, const void *ptr, cl_uint num_events_in_wait_list, - const cl_event 
*event_wait_list, - cl_event *event) CL_API_SUFFIX__VERSION_1_0; - -#ifdef CL_VERSION_1_2 - -typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueFillImage)( - cl_command_queue command_queue, cl_mem image, const void *fill_color, - const size_t origin[3], const size_t region[3], - cl_uint num_events_in_wait_list, const cl_event *event_wait_list, - cl_event *event) CL_API_SUFFIX__VERSION_1_2; - -#else - -typedef void *cl_api_clEnqueueFillImage; - -#endif - -typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueCopyImage)( - cl_command_queue command_queue, cl_mem src_image, cl_mem dst_image, - const size_t *src_origin, const size_t *dst_origin, const size_t *region, - cl_uint num_events_in_wait_list, const cl_event *event_wait_list, - cl_event *event) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueCopyImageToBuffer)( - cl_command_queue command_queue, cl_mem src_image, cl_mem dst_buffer, - const size_t *src_origin, const size_t *region, size_t dst_offset, - cl_uint num_events_in_wait_list, const cl_event *event_wait_list, - cl_event *event) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueCopyBufferToImage)( - cl_command_queue command_queue, cl_mem src_buffer, cl_mem dst_image, - size_t src_offset, const size_t *dst_origin, const size_t *region, - cl_uint num_events_in_wait_list, const cl_event *event_wait_list, - cl_event *event) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY void *(CL_API_CALL *cl_api_clEnqueueMapBuffer)( - cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_map, - cl_map_flags map_flags, size_t offset, size_t cb, - cl_uint num_events_in_wait_list, const cl_event *event_wait_list, - cl_event *event, cl_int *errcode_ret)CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY void *(CL_API_CALL *cl_api_clEnqueueMapImage)( - cl_command_queue command_queue, cl_mem image, cl_bool blocking_map, - cl_map_flags map_flags, const size_t *origin, const size_t 
*region, - size_t *image_row_pitch, size_t *image_slice_pitch, - cl_uint num_events_in_wait_list, const cl_event *event_wait_list, - cl_event *event, cl_int *errcode_ret)CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueUnmapMemObject)( - cl_command_queue command_queue, cl_mem memobj, void *mapped_ptr, - cl_uint num_events_in_wait_list, const cl_event *event_wait_list, - cl_event *event) CL_API_SUFFIX__VERSION_1_0; - -#ifdef CL_VERSION_1_2 - -typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueMigrateMemObjects)( - cl_command_queue command_queue, cl_uint num_mem_objects, - const cl_mem *mem_objects, cl_mem_migration_flags flags, - cl_uint num_events_in_wait_list, const cl_event *event_wait_list, - cl_event *event) CL_API_SUFFIX__VERSION_1_2; - -#else - -typedef void *cl_api_clEnqueueMigrateMemObjects; - -#endif - -typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueNDRangeKernel)( - cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, - const size_t *global_work_offset, const size_t *global_work_size, - const size_t *local_work_size, cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueTask)( - cl_command_queue command_queue, cl_kernel kernel, - cl_uint num_events_in_wait_list, const cl_event *event_wait_list, - cl_event *event) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueNativeKernel)( - cl_command_queue command_queue, void(CL_CALLBACK *user_func)(void *), - void *args, size_t cb_args, cl_uint num_mem_objects, const cl_mem *mem_list, - const void **args_mem_loc, cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event) CL_API_SUFFIX__VERSION_1_0; - -#ifdef CL_VERSION_1_2 - -typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueMarkerWithWaitList)( - cl_command_queue command_queue, cl_uint 
num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event) CL_API_SUFFIX__VERSION_1_2; - -typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueBarrierWithWaitList)( - cl_command_queue command_queue, cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event) CL_API_SUFFIX__VERSION_1_2; - -typedef CL_API_ENTRY void *( - CL_API_CALL *cl_api_clGetExtensionFunctionAddressForPlatform)( - cl_platform_id platform, - const char *function_name)CL_API_SUFFIX__VERSION_1_2; - -#else - -typedef void *cl_api_clEnqueueMarkerWithWaitList; -typedef void *cl_api_clEnqueueBarrierWithWaitList; -typedef void *cl_api_clGetExtensionFunctionAddressForPlatform; - -#endif - -// Shared Virtual Memory APIs - -#ifdef CL_VERSION_2_0 - -typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueSVMFree)( - cl_command_queue /* command_queue */, cl_uint /* num_svm_pointers */, - void ** /* svm_pointers */, - void(CL_CALLBACK *pfn_free_func)(cl_command_queue /* queue */, - cl_uint /* num_svm_pointers */, - void ** /* svm_pointers[] */, - void * /* user_data */), - void * /* user_data */, cl_uint /* num_events_in_wait_list */, - const cl_event * /* event_wait_list */, - cl_event * /* event */) CL_API_SUFFIX__VERSION_2_0; - -typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueSVMMemcpy)( - cl_command_queue /* command_queue */, cl_bool /* blocking_copy */, - void * /* dst_ptr */, const void * /* src_ptr */, size_t /* size */, - cl_uint /* num_events_in_wait_list */, - const cl_event * /* event_wait_list */, - cl_event * /* event */) CL_API_SUFFIX__VERSION_2_0; - -typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueSVMMemFill)( - cl_command_queue /* command_queue */, void * /* svm_ptr */, - const void * /* pattern */, size_t /* pattern_size */, size_t /* size */, - cl_uint /* num_events_in_wait_list */, - const cl_event * /* event_wait_list */, - cl_event * /* event */) CL_API_SUFFIX__VERSION_2_0; - -typedef CL_API_ENTRY cl_int(CL_API_CALL 
*cl_api_clEnqueueSVMMap)( - cl_command_queue /* command_queue */, cl_bool /* blocking_map */, - cl_map_flags /* map_flags */, void * /* svm_ptr */, size_t /* size */, - cl_uint /* num_events_in_wait_list */, - const cl_event * /* event_wait_list */, - cl_event * /* event */) CL_API_SUFFIX__VERSION_2_0; - -typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueSVMUnmap)( - cl_command_queue /* command_queue */, void * /* svm_ptr */, - cl_uint /* num_events_in_wait_list */, - const cl_event * /* event_wait_list */, - cl_event * /* event */) CL_API_SUFFIX__VERSION_2_0; - -#else - -typedef void *cl_api_clEnqueueSVMFree; -typedef void *cl_api_clEnqueueSVMMemcpy; -typedef void *cl_api_clEnqueueSVMMemFill; -typedef void *cl_api_clEnqueueSVMMap; -typedef void *cl_api_clEnqueueSVMUnmap; - -#endif - -// Deprecated APIs -typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clSetCommandQueueProperty)( - cl_command_queue command_queue, cl_command_queue_properties properties, - cl_bool enable, cl_command_queue_properties *old_properties) - CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED; - -typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateImage2D)( - cl_context context, cl_mem_flags flags, const cl_image_format *image_format, - size_t image_width, size_t image_height, size_t image_row_pitch, - void *host_ptr, cl_int *errcode_ret) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; - -typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateImage3D)( - cl_context context, cl_mem_flags flags, const cl_image_format *image_format, - size_t image_width, size_t image_height, size_t image_depth, - size_t image_row_pitch, size_t image_slice_pitch, void *host_ptr, - cl_int *errcode_ret) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; - -typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clUnloadCompiler)(void) - CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; - -typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueMarker)( - cl_command_queue command_queue, - cl_event *event) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; - -typedef 
CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueWaitForEvents)( - cl_command_queue command_queue, cl_uint num_events, - const cl_event *event_list) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; - -typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueBarrier)( - cl_command_queue command_queue) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; - -typedef CL_API_ENTRY void *(CL_API_CALL *cl_api_clGetExtensionFunctionAddress)( - const char *function_name)CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; - -// GL and other APIs -typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromGLBuffer)( - cl_context context, cl_mem_flags flags, cl_GLuint bufobj, - int *errcode_ret) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromGLTexture)( - cl_context context, cl_mem_flags flags, cl_GLenum target, cl_GLint miplevel, - cl_GLuint texture, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_2; - -typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromGLTexture2D)( - cl_context context, cl_mem_flags flags, cl_GLenum target, cl_GLint miplevel, - cl_GLuint texture, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromGLTexture3D)( - cl_context context, cl_mem_flags flags, cl_GLenum target, cl_GLint miplevel, - cl_GLuint texture, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromGLRenderbuffer)( - cl_context context, cl_mem_flags flags, cl_GLuint renderbuffer, - cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetGLObjectInfo)( - cl_mem memobj, cl_gl_object_type *gl_object_type, - cl_GLuint *gl_object_name) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetGLTextureInfo)( - cl_mem memobj, cl_gl_texture_info param_name, size_t param_value_size, - void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY 
cl_int(CL_API_CALL *cl_api_clEnqueueAcquireGLObjects)( - cl_command_queue command_queue, cl_uint num_objects, - const cl_mem *mem_objects, cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueReleaseGLObjects)( - cl_command_queue command_queue, cl_uint num_objects, - const cl_mem *mem_objects, cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event) CL_API_SUFFIX__VERSION_1_0; - -/* cl_khr_gl_sharing */ -typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetGLContextInfoKHR)( - const cl_context_properties *properties, cl_gl_context_info param_name, - size_t param_value_size, void *param_value, size_t *param_value_size_ret); - -/* cl_khr_gl_event */ -typedef CL_API_ENTRY cl_event(CL_API_CALL *cl_api_clCreateEventFromGLsyncKHR)( - cl_context context, cl_GLsync sync, cl_int *errcode_ret); - -#if defined(_WIN32) - -/* cl_khr_d3d10_sharing */ - -typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetDeviceIDsFromD3D10KHR)( - cl_platform_id platform, cl_d3d10_device_source_khr d3d_device_source, - void *d3d_object, cl_d3d10_device_set_khr d3d_device_set, - cl_uint num_entries, cl_device_id *devices, - cl_uint *num_devices) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromD3D10BufferKHR)( - cl_context context, cl_mem_flags flags, ID3D10Buffer *resource, - cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromD3D10Texture2DKHR)( - cl_context context, cl_mem_flags flags, ID3D10Texture2D *resource, - UINT subresource, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromD3D10Texture3DKHR)( - cl_context context, cl_mem_flags flags, ID3D10Texture3D *resource, - UINT subresource, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY -cl_int(CL_API_CALL 
*cl_api_clEnqueueAcquireD3D10ObjectsKHR)( - cl_command_queue command_queue, cl_uint num_objects, - const cl_mem *mem_objects, cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY -cl_int(CL_API_CALL *cl_api_clEnqueueReleaseD3D10ObjectsKHR)( - cl_command_queue command_queue, cl_uint num_objects, - const cl_mem *mem_objects, cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event) CL_API_SUFFIX__VERSION_1_0; - -extern CL_API_ENTRY cl_int CL_API_CALL clGetDeviceIDsFromD3D10KHR( - cl_platform_id platform, cl_d3d10_device_source_khr d3d_device_source, - void *d3d_object, cl_d3d10_device_set_khr d3d_device_set, - cl_uint num_entries, cl_device_id *devices, cl_uint *num_devices); - -extern CL_API_ENTRY cl_mem CL_API_CALL -clCreateFromD3D10BufferKHR(cl_context context, cl_mem_flags flags, - ID3D10Buffer *resource, cl_int *errcode_ret); - -extern CL_API_ENTRY cl_mem CL_API_CALL clCreateFromD3D10Texture2DKHR( - cl_context context, cl_mem_flags flags, ID3D10Texture2D *resource, - UINT subresource, cl_int *errcode_ret); - -extern CL_API_ENTRY cl_mem CL_API_CALL clCreateFromD3D10Texture3DKHR( - cl_context context, cl_mem_flags flags, ID3D10Texture3D *resource, - UINT subresource, cl_int *errcode_ret); - -extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueAcquireD3D10ObjectsKHR( - cl_command_queue command_queue, cl_uint num_objects, - const cl_mem *mem_objects, cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, cl_event *event); - -extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueReleaseD3D10ObjectsKHR( - cl_command_queue command_queue, cl_uint num_objects, - const cl_mem *mem_objects, cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, cl_event *event); - -/* cl_khr_d3d11_sharing */ -typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetDeviceIDsFromD3D11KHR)( - cl_platform_id platform, cl_d3d11_device_source_khr d3d_device_source, - 
void *d3d_object, cl_d3d11_device_set_khr d3d_device_set, - cl_uint num_entries, cl_device_id *devices, - cl_uint *num_devices) CL_API_SUFFIX__VERSION_1_2; - -typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromD3D11BufferKHR)( - cl_context context, cl_mem_flags flags, ID3D11Buffer *resource, - cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_2; - -typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromD3D11Texture2DKHR)( - cl_context context, cl_mem_flags flags, ID3D11Texture2D *resource, - UINT subresource, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_2; - -typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromD3D11Texture3DKHR)( - cl_context context, cl_mem_flags flags, ID3D11Texture3D *resource, - UINT subresource, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_2; - -typedef CL_API_ENTRY -cl_int(CL_API_CALL *cl_api_clEnqueueAcquireD3D11ObjectsKHR)( - cl_command_queue command_queue, cl_uint num_objects, - const cl_mem *mem_objects, cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event) CL_API_SUFFIX__VERSION_1_2; - -typedef CL_API_ENTRY -cl_int(CL_API_CALL *cl_api_clEnqueueReleaseD3D11ObjectsKHR)( - cl_command_queue command_queue, cl_uint num_objects, - const cl_mem *mem_objects, cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event) CL_API_SUFFIX__VERSION_1_2; - -/* cl_khr_dx9_media_sharing */ -typedef CL_API_ENTRY -cl_int(CL_API_CALL *cl_api_clGetDeviceIDsFromDX9MediaAdapterKHR)( - cl_platform_id platform, cl_uint num_media_adapters, - cl_dx9_media_adapter_type_khr *media_adapters_type, void *media_adapters, - cl_dx9_media_adapter_set_khr media_adapter_set, cl_uint num_entries, - cl_device_id *devices, cl_uint *num_devices) CL_API_SUFFIX__VERSION_1_2; - -typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromDX9MediaSurfaceKHR)( - cl_context context, cl_mem_flags flags, - cl_dx9_media_adapter_type_khr adapter_type, void *surface_info, - cl_uint plane, cl_int *errcode_ret) 
CL_API_SUFFIX__VERSION_1_2; - -typedef CL_API_ENTRY -cl_int(CL_API_CALL *cl_api_clEnqueueAcquireDX9MediaSurfacesKHR)( - cl_command_queue command_queue, cl_uint num_objects, - const cl_mem *mem_objects, cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event) CL_API_SUFFIX__VERSION_1_2; - -typedef CL_API_ENTRY -cl_int(CL_API_CALL *cl_api_clEnqueueReleaseDX9MediaSurfacesKHR)( - cl_command_queue command_queue, cl_uint num_objects, - const cl_mem *mem_objects, cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event) CL_API_SUFFIX__VERSION_1_2; - -/* cl_khr_d3d11_sharing */ -extern CL_API_ENTRY cl_int CL_API_CALL clGetDeviceIDsFromD3D11KHR( - cl_platform_id platform, cl_d3d11_device_source_khr d3d_device_source, - void *d3d_object, cl_d3d11_device_set_khr d3d_device_set, - cl_uint num_entries, cl_device_id *devices, cl_uint *num_devices); - -extern CL_API_ENTRY cl_mem CL_API_CALL -clCreateFromD3D11BufferKHR(cl_context context, cl_mem_flags flags, - ID3D11Buffer *resource, cl_int *errcode_ret); - -extern CL_API_ENTRY cl_mem CL_API_CALL clCreateFromD3D11Texture2DKHR( - cl_context context, cl_mem_flags flags, ID3D11Texture2D *resource, - UINT subresource, cl_int *errcode_ret); - -extern CL_API_ENTRY cl_mem CL_API_CALL clCreateFromD3D11Texture3DKHR( - cl_context context, cl_mem_flags flags, ID3D11Texture3D *resource, - UINT subresource, cl_int *errcode_ret); - -extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueAcquireD3D11ObjectsKHR( - cl_command_queue command_queue, cl_uint num_objects, - const cl_mem *mem_objects, cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, cl_event *event); - -extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueReleaseD3D11ObjectsKHR( - cl_command_queue command_queue, cl_uint num_objects, - const cl_mem *mem_objects, cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, cl_event *event); - -/* cl_khr_dx9_media_sharing */ -extern CL_API_ENTRY cl_int CL_API_CALL 
clGetDeviceIDsFromDX9MediaAdapterKHR( - cl_platform_id platform, cl_uint num_media_adapters, - cl_dx9_media_adapter_type_khr *media_adapter_type, void *media_adapters, - cl_dx9_media_adapter_set_khr media_adapter_set, cl_uint num_entries, - cl_device_id *devices, cl_uint *num_devices); - -extern CL_API_ENTRY cl_mem CL_API_CALL clCreateFromDX9MediaSurfaceKHR( - cl_context context, cl_mem_flags flags, - cl_dx9_media_adapter_type_khr adapter_type, void *surface_info, - cl_uint plane, cl_int *errcode_ret); - -extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueAcquireDX9MediaSurfacesKHR( - cl_command_queue command_queue, cl_uint num_objects, - const cl_mem *mem_objects, cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, cl_event *event); - -extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueReleaseDX9MediaSurfacesKHR( - cl_command_queue command_queue, cl_uint num_objects, - const cl_mem *mem_objects, cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, cl_event *event); - -#else - -/* cl_khr_d3d10_sharing */ -typedef void *cl_api_clGetDeviceIDsFromD3D10KHR; -typedef void *cl_api_clCreateFromD3D10BufferKHR; -typedef void *cl_api_clCreateFromD3D10Texture2DKHR; -typedef void *cl_api_clCreateFromD3D10Texture3DKHR; -typedef void *cl_api_clEnqueueAcquireD3D10ObjectsKHR; -typedef void *cl_api_clEnqueueReleaseD3D10ObjectsKHR; - -/* cl_khr_d3d11_sharing */ -typedef void *cl_api_clGetDeviceIDsFromD3D11KHR; -typedef void *cl_api_clCreateFromD3D11BufferKHR; -typedef void *cl_api_clCreateFromD3D11Texture2DKHR; -typedef void *cl_api_clCreateFromD3D11Texture3DKHR; -typedef void *cl_api_clEnqueueAcquireD3D11ObjectsKHR; -typedef void *cl_api_clEnqueueReleaseD3D11ObjectsKHR; - -/* cl_khr_dx9_media_sharing */ -typedef void *cl_api_clCreateFromDX9MediaSurfaceKHR; -typedef void *cl_api_clEnqueueAcquireDX9MediaSurfacesKHR; -typedef void *cl_api_clEnqueueReleaseDX9MediaSurfacesKHR; -typedef void *cl_api_clGetDeviceIDsFromDX9MediaAdapterKHR; - -#endif - -/* OpenCL 1.1 
*/ - -#ifdef CL_VERSION_1_1 - -typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clSetEventCallback)( - cl_event /* event */, cl_int /* command_exec_callback_type */, - void(CL_CALLBACK * /* pfn_notify */)(cl_event, cl_int, void *), - void * /* user_data */) CL_API_SUFFIX__VERSION_1_1; - -typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateSubBuffer)( - cl_mem /* buffer */, cl_mem_flags /* flags */, - cl_buffer_create_type /* buffer_create_type */, - const void * /* buffer_create_info */, - cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_1; - -typedef CL_API_ENTRY -cl_int(CL_API_CALL *cl_api_clSetMemObjectDestructorCallback)( - cl_mem /* memobj */, - void(CL_CALLBACK * /*pfn_notify*/)(cl_mem /* memobj */, - void * /*user_data*/), - void * /*user_data */) CL_API_SUFFIX__VERSION_1_1; - -typedef CL_API_ENTRY cl_event(CL_API_CALL *cl_api_clCreateUserEvent)( - cl_context /* context */, - cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_1; - -typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clSetUserEventStatus)( - cl_event /* event */, - cl_int /* execution_status */) CL_API_SUFFIX__VERSION_1_1; - -#else - -typedef void *cl_api_clSetEventCallback; -typedef void *cl_api_clCreateSubBuffer; -typedef void *cl_api_clSetMemObjectDestructorCallback; -typedef void *cl_api_clCreateUserEvent; -typedef void *cl_api_clSetUserEventStatus; - -#endif - -typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clCreateSubDevicesEXT)( - cl_device_id in_device, - const cl_device_partition_property_ext *partition_properties, - cl_uint num_entries, cl_device_id *out_devices, cl_uint *num_devices); - -typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clRetainDeviceEXT)( - cl_device_id device) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clReleaseDeviceEXT)( - cl_device_id device) CL_API_SUFFIX__VERSION_1_0; - -/* cl_khr_egl_image */ -typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromEGLImageKHR)( - cl_context context, CLeglDisplayKHR display, 
CLeglImageKHR image, - cl_mem_flags flags, const cl_egl_image_properties_khr *properties, - cl_int *errcode_ret); - -typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueAcquireEGLObjectsKHR)( - cl_command_queue command_queue, cl_uint num_objects, - const cl_mem *mem_objects, cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, cl_event *event); - -typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueReleaseEGLObjectsKHR)( - cl_command_queue command_queue, cl_uint num_objects, - const cl_mem *mem_objects, cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, cl_event *event); - -/* cl_khr_egl_event */ -typedef CL_API_ENTRY cl_event(CL_API_CALL *cl_api_clCreateEventFromEGLSyncKHR)( - cl_context context, CLeglSyncKHR sync, CLeglDisplayKHR display, - cl_int *errcode_ret); - -#ifdef CL_VERSION_2_1 - -typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clSetDefaultDeviceCommandQueue)( - cl_context context, cl_device_id device, - cl_command_queue command_queue) CL_API_SUFFIX__VERSION_2_1; - -typedef CL_API_ENTRY cl_program(CL_API_CALL *cl_api_clCreateProgramWithIL)( - cl_context context, const void *il, size_t length, - cl_int *errcode_ret) CL_API_SUFFIX__VERSION_2_1; - -typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetKernelSubGroupInfo)( - cl_kernel kernel, cl_device_id device, cl_kernel_sub_group_info param_name, - size_t input_value_size, const void *input_value, size_t param_value_size, - void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_2_1; - -typedef CL_API_ENTRY cl_kernel(CL_API_CALL *cl_api_clCloneKernel)( - cl_kernel source_kernel, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_2_1; - -typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueSVMMigrateMem)( - cl_command_queue command_queue, cl_uint num_svm_pointers, - const void **svm_pointers, const size_t *sizes, - cl_mem_migration_flags flags, cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event) 
CL_API_SUFFIX__VERSION_2_1; - -typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetDeviceAndHostTimer)( - cl_device_id device, cl_ulong *device_timestamp, - cl_ulong *host_timestamp) CL_API_SUFFIX__VERSION_2_1; - -typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetHostTimer)( - cl_device_id device, cl_ulong *host_timestamp) CL_API_SUFFIX__VERSION_2_1; - -#else - -typedef void *cl_api_clSetDefaultDeviceCommandQueue; -typedef void *cl_api_clCreateProgramWithIL; -typedef void *cl_api_clGetKernelSubGroupInfo; -typedef void *cl_api_clCloneKernel; -typedef void *cl_api_clEnqueueSVMMigrateMem; -typedef void *cl_api_clGetDeviceAndHostTimer; -typedef void *cl_api_clGetHostTimer; - -#endif - -/* Vendor dispatch table struture */ - -typedef struct _cl_icd_dispatch { - /* OpenCL 1.0 */ - cl_api_clGetPlatformIDs clGetPlatformIDs; - cl_api_clGetPlatformInfo clGetPlatformInfo; - cl_api_clGetDeviceIDs clGetDeviceIDs; - cl_api_clGetDeviceInfo clGetDeviceInfo; - cl_api_clCreateContext clCreateContext; - cl_api_clCreateContextFromType clCreateContextFromType; - cl_api_clRetainContext clRetainContext; - cl_api_clReleaseContext clReleaseContext; - cl_api_clGetContextInfo clGetContextInfo; - cl_api_clCreateCommandQueue clCreateCommandQueue; - cl_api_clRetainCommandQueue clRetainCommandQueue; - cl_api_clReleaseCommandQueue clReleaseCommandQueue; - cl_api_clGetCommandQueueInfo clGetCommandQueueInfo; - cl_api_clSetCommandQueueProperty clSetCommandQueueProperty; - cl_api_clCreateBuffer clCreateBuffer; - cl_api_clCreateImage2D clCreateImage2D; - cl_api_clCreateImage3D clCreateImage3D; - cl_api_clRetainMemObject clRetainMemObject; - cl_api_clReleaseMemObject clReleaseMemObject; - cl_api_clGetSupportedImageFormats clGetSupportedImageFormats; - cl_api_clGetMemObjectInfo clGetMemObjectInfo; - cl_api_clGetImageInfo clGetImageInfo; - cl_api_clCreateSampler clCreateSampler; - cl_api_clRetainSampler clRetainSampler; - cl_api_clReleaseSampler clReleaseSampler; - cl_api_clGetSamplerInfo 
clGetSamplerInfo; - cl_api_clCreateProgramWithSource clCreateProgramWithSource; - cl_api_clCreateProgramWithBinary clCreateProgramWithBinary; - cl_api_clRetainProgram clRetainProgram; - cl_api_clReleaseProgram clReleaseProgram; - cl_api_clBuildProgram clBuildProgram; - cl_api_clUnloadCompiler clUnloadCompiler; - cl_api_clGetProgramInfo clGetProgramInfo; - cl_api_clGetProgramBuildInfo clGetProgramBuildInfo; - cl_api_clCreateKernel clCreateKernel; - cl_api_clCreateKernelsInProgram clCreateKernelsInProgram; - cl_api_clRetainKernel clRetainKernel; - cl_api_clReleaseKernel clReleaseKernel; - cl_api_clSetKernelArg clSetKernelArg; - cl_api_clGetKernelInfo clGetKernelInfo; - cl_api_clGetKernelWorkGroupInfo clGetKernelWorkGroupInfo; - cl_api_clWaitForEvents clWaitForEvents; - cl_api_clGetEventInfo clGetEventInfo; - cl_api_clRetainEvent clRetainEvent; - cl_api_clReleaseEvent clReleaseEvent; - cl_api_clGetEventProfilingInfo clGetEventProfilingInfo; - cl_api_clFlush clFlush; - cl_api_clFinish clFinish; - cl_api_clEnqueueReadBuffer clEnqueueReadBuffer; - cl_api_clEnqueueWriteBuffer clEnqueueWriteBuffer; - cl_api_clEnqueueCopyBuffer clEnqueueCopyBuffer; - cl_api_clEnqueueReadImage clEnqueueReadImage; - cl_api_clEnqueueWriteImage clEnqueueWriteImage; - cl_api_clEnqueueCopyImage clEnqueueCopyImage; - cl_api_clEnqueueCopyImageToBuffer clEnqueueCopyImageToBuffer; - cl_api_clEnqueueCopyBufferToImage clEnqueueCopyBufferToImage; - cl_api_clEnqueueMapBuffer clEnqueueMapBuffer; - cl_api_clEnqueueMapImage clEnqueueMapImage; - cl_api_clEnqueueUnmapMemObject clEnqueueUnmapMemObject; - cl_api_clEnqueueNDRangeKernel clEnqueueNDRangeKernel; - cl_api_clEnqueueTask clEnqueueTask; - cl_api_clEnqueueNativeKernel clEnqueueNativeKernel; - cl_api_clEnqueueMarker clEnqueueMarker; - cl_api_clEnqueueWaitForEvents clEnqueueWaitForEvents; - cl_api_clEnqueueBarrier clEnqueueBarrier; - cl_api_clGetExtensionFunctionAddress clGetExtensionFunctionAddress; - cl_api_clCreateFromGLBuffer clCreateFromGLBuffer; - 
cl_api_clCreateFromGLTexture2D clCreateFromGLTexture2D; - cl_api_clCreateFromGLTexture3D clCreateFromGLTexture3D; - cl_api_clCreateFromGLRenderbuffer clCreateFromGLRenderbuffer; - cl_api_clGetGLObjectInfo clGetGLObjectInfo; - cl_api_clGetGLTextureInfo clGetGLTextureInfo; - cl_api_clEnqueueAcquireGLObjects clEnqueueAcquireGLObjects; - cl_api_clEnqueueReleaseGLObjects clEnqueueReleaseGLObjects; - cl_api_clGetGLContextInfoKHR clGetGLContextInfoKHR; - - /* cl_khr_d3d10_sharing */ - cl_api_clGetDeviceIDsFromD3D10KHR clGetDeviceIDsFromD3D10KHR; - cl_api_clCreateFromD3D10BufferKHR clCreateFromD3D10BufferKHR; - cl_api_clCreateFromD3D10Texture2DKHR clCreateFromD3D10Texture2DKHR; - cl_api_clCreateFromD3D10Texture3DKHR clCreateFromD3D10Texture3DKHR; - cl_api_clEnqueueAcquireD3D10ObjectsKHR clEnqueueAcquireD3D10ObjectsKHR; - cl_api_clEnqueueReleaseD3D10ObjectsKHR clEnqueueReleaseD3D10ObjectsKHR; - - /* OpenCL 1.1 */ - cl_api_clSetEventCallback clSetEventCallback; - cl_api_clCreateSubBuffer clCreateSubBuffer; - cl_api_clSetMemObjectDestructorCallback clSetMemObjectDestructorCallback; - cl_api_clCreateUserEvent clCreateUserEvent; - cl_api_clSetUserEventStatus clSetUserEventStatus; - cl_api_clEnqueueReadBufferRect clEnqueueReadBufferRect; - cl_api_clEnqueueWriteBufferRect clEnqueueWriteBufferRect; - cl_api_clEnqueueCopyBufferRect clEnqueueCopyBufferRect; - - /* cl_ext_device_fission */ - cl_api_clCreateSubDevicesEXT clCreateSubDevicesEXT; - cl_api_clRetainDeviceEXT clRetainDeviceEXT; - cl_api_clReleaseDeviceEXT clReleaseDeviceEXT; - - /* cl_khr_gl_event */ - cl_api_clCreateEventFromGLsyncKHR clCreateEventFromGLsyncKHR; - - /* OpenCL 1.2 */ - cl_api_clCreateSubDevices clCreateSubDevices; - cl_api_clRetainDevice clRetainDevice; - cl_api_clReleaseDevice clReleaseDevice; - cl_api_clCreateImage clCreateImage; - cl_api_clCreateProgramWithBuiltInKernels clCreateProgramWithBuiltInKernels; - cl_api_clCompileProgram clCompileProgram; - cl_api_clLinkProgram clLinkProgram; - 
cl_api_clUnloadPlatformCompiler clUnloadPlatformCompiler; - cl_api_clGetKernelArgInfo clGetKernelArgInfo; - cl_api_clEnqueueFillBuffer clEnqueueFillBuffer; - cl_api_clEnqueueFillImage clEnqueueFillImage; - cl_api_clEnqueueMigrateMemObjects clEnqueueMigrateMemObjects; - cl_api_clEnqueueMarkerWithWaitList clEnqueueMarkerWithWaitList; - cl_api_clEnqueueBarrierWithWaitList clEnqueueBarrierWithWaitList; - cl_api_clGetExtensionFunctionAddressForPlatform - clGetExtensionFunctionAddressForPlatform; - cl_api_clCreateFromGLTexture clCreateFromGLTexture; - - /* cl_khr_d3d11_sharing */ - cl_api_clGetDeviceIDsFromD3D11KHR clGetDeviceIDsFromD3D11KHR; - cl_api_clCreateFromD3D11BufferKHR clCreateFromD3D11BufferKHR; - cl_api_clCreateFromD3D11Texture2DKHR clCreateFromD3D11Texture2DKHR; - cl_api_clCreateFromD3D11Texture3DKHR clCreateFromD3D11Texture3DKHR; - cl_api_clCreateFromDX9MediaSurfaceKHR clCreateFromDX9MediaSurfaceKHR; - cl_api_clEnqueueAcquireD3D11ObjectsKHR clEnqueueAcquireD3D11ObjectsKHR; - cl_api_clEnqueueReleaseD3D11ObjectsKHR clEnqueueReleaseD3D11ObjectsKHR; - - /* cl_khr_dx9_media_sharing */ - cl_api_clGetDeviceIDsFromDX9MediaAdapterKHR - clGetDeviceIDsFromDX9MediaAdapterKHR; - cl_api_clEnqueueAcquireDX9MediaSurfacesKHR - clEnqueueAcquireDX9MediaSurfacesKHR; - cl_api_clEnqueueReleaseDX9MediaSurfacesKHR - clEnqueueReleaseDX9MediaSurfacesKHR; - - /* cl_khr_egl_image */ - cl_api_clCreateFromEGLImageKHR clCreateFromEGLImageKHR; - cl_api_clEnqueueAcquireEGLObjectsKHR clEnqueueAcquireEGLObjectsKHR; - cl_api_clEnqueueReleaseEGLObjectsKHR clEnqueueReleaseEGLObjectsKHR; - - /* cl_khr_egl_event */ - cl_api_clCreateEventFromEGLSyncKHR clCreateEventFromEGLSyncKHR; - - /* OpenCL 2.0 */ - cl_api_clCreateCommandQueueWithProperties clCreateCommandQueueWithProperties; - cl_api_clCreatePipe clCreatePipe; - cl_api_clGetPipeInfo clGetPipeInfo; - cl_api_clSVMAlloc clSVMAlloc; - cl_api_clSVMFree clSVMFree; - cl_api_clEnqueueSVMFree clEnqueueSVMFree; - cl_api_clEnqueueSVMMemcpy 
clEnqueueSVMMemcpy; - cl_api_clEnqueueSVMMemFill clEnqueueSVMMemFill; - cl_api_clEnqueueSVMMap clEnqueueSVMMap; - cl_api_clEnqueueSVMUnmap clEnqueueSVMUnmap; - cl_api_clCreateSamplerWithProperties clCreateSamplerWithProperties; - cl_api_clSetKernelArgSVMPointer clSetKernelArgSVMPointer; - cl_api_clSetKernelExecInfo clSetKernelExecInfo; - - /* cl_khr_sub_groups */ - cl_api_clGetKernelSubGroupInfoKHR clGetKernelSubGroupInfoKHR; - - /* OpenCL 2.1 */ - cl_api_clCloneKernel clCloneKernel; - cl_api_clCreateProgramWithIL clCreateProgramWithIL; - cl_api_clEnqueueSVMMigrateMem clEnqueueSVMMigrateMem; - cl_api_clGetDeviceAndHostTimer clGetDeviceAndHostTimer; - cl_api_clGetHostTimer clGetHostTimer; - cl_api_clGetKernelSubGroupInfo clGetKernelSubGroupInfo; - cl_api_clSetDefaultDeviceCommandQueue clSetDefaultDeviceCommandQueue; - - /* OpenCL 2.2 */ - cl_api_clSetProgramReleaseCallback clSetProgramReleaseCallback; - cl_api_clSetProgramSpecializationConstant clSetProgramSpecializationConstant; - - /* OpenCL 3.0 */ - cl_api_clCreateBufferWithProperties clCreateBufferWithProperties; - cl_api_clCreateImageWithProperties clCreateImageWithProperties; - -} cl_icd_dispatch; - -#ifdef __cplusplus -} -#endif - -#endif /* #ifndef OPENCL_CL_ICD_H */ diff --git a/CL/cl_platform.h b/CL/cl_platform.h deleted file mode 100644 index 1bd7d4b..0000000 --- a/CL/cl_platform.h +++ /dev/null @@ -1,1384 +0,0 @@ -/******************************************************************************* - * Copyright (c) 2008-2020 The Khronos Group Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - ******************************************************************************/ - -#ifndef __CL_PLATFORM_H -#define __CL_PLATFORM_H - -#include - -#ifdef __cplusplus -extern "C" { -#endif - -#if defined(_WIN32) - #define CL_API_ENTRY - #define CL_API_CALL __stdcall - #define CL_CALLBACK __stdcall -#else - #define CL_API_ENTRY - #define CL_API_CALL - #define CL_CALLBACK -#endif - -/* - * Deprecation flags refer to the last version of the header in which the - * feature was not deprecated. - * - * E.g. VERSION_1_1_DEPRECATED means the feature is present in 1.1 without - * deprecation but is deprecated in versions later than 1.1. - */ - -#define CL_EXTENSION_WEAK_LINK -#define CL_API_SUFFIX__VERSION_1_0 -#define CL_EXT_SUFFIX__VERSION_1_0 -#define CL_API_SUFFIX__VERSION_1_1 -#define CL_EXT_SUFFIX__VERSION_1_1 -#define CL_API_SUFFIX__VERSION_1_2 -#define CL_EXT_SUFFIX__VERSION_1_2 -#define CL_API_SUFFIX__VERSION_2_0 -#define CL_EXT_SUFFIX__VERSION_2_0 -#define CL_API_SUFFIX__VERSION_2_1 -#define CL_EXT_SUFFIX__VERSION_2_1 -#define CL_API_SUFFIX__VERSION_2_2 -#define CL_EXT_SUFFIX__VERSION_2_2 -#define CL_API_SUFFIX__VERSION_3_0 -#define CL_EXT_SUFFIX__VERSION_3_0 -#define CL_API_SUFFIX__EXPERIMENTAL -#define CL_EXT_SUFFIX__EXPERIMENTAL - - -#ifdef __GNUC__ - #define CL_EXT_SUFFIX_DEPRECATED __attribute__((deprecated)) - #define CL_EXT_PREFIX_DEPRECATED -#elif defined(_WIN32) - #define CL_EXT_SUFFIX_DEPRECATED - #define CL_EXT_PREFIX_DEPRECATED __declspec(deprecated) -#else - #define CL_EXT_SUFFIX_DEPRECATED - #define CL_EXT_PREFIX_DEPRECATED -#endif - -#ifdef CL_USE_DEPRECATED_OPENCL_1_0_APIS - #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED - #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED -#else - #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED CL_EXT_SUFFIX_DEPRECATED - #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED CL_EXT_PREFIX_DEPRECATED -#endif - -#ifdef 
CL_USE_DEPRECATED_OPENCL_1_1_APIS - #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED - #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED -#else - #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED CL_EXT_SUFFIX_DEPRECATED - #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED CL_EXT_PREFIX_DEPRECATED -#endif - -#ifdef CL_USE_DEPRECATED_OPENCL_1_2_APIS - #define CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED - #define CL_EXT_PREFIX__VERSION_1_2_DEPRECATED -#else - #define CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED CL_EXT_SUFFIX_DEPRECATED - #define CL_EXT_PREFIX__VERSION_1_2_DEPRECATED CL_EXT_PREFIX_DEPRECATED - #endif - -#ifdef CL_USE_DEPRECATED_OPENCL_2_0_APIS - #define CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED - #define CL_EXT_PREFIX__VERSION_2_0_DEPRECATED -#else - #define CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED CL_EXT_SUFFIX_DEPRECATED - #define CL_EXT_PREFIX__VERSION_2_0_DEPRECATED CL_EXT_PREFIX_DEPRECATED -#endif - -#ifdef CL_USE_DEPRECATED_OPENCL_2_1_APIS - #define CL_EXT_SUFFIX__VERSION_2_1_DEPRECATED - #define CL_EXT_PREFIX__VERSION_2_1_DEPRECATED -#else - #define CL_EXT_SUFFIX__VERSION_2_1_DEPRECATED CL_EXT_SUFFIX_DEPRECATED - #define CL_EXT_PREFIX__VERSION_2_1_DEPRECATED CL_EXT_PREFIX_DEPRECATED -#endif - -#ifdef CL_USE_DEPRECATED_OPENCL_2_2_APIS - #define CL_EXT_SUFFIX__VERSION_2_2_DEPRECATED - #define CL_EXT_PREFIX__VERSION_2_2_DEPRECATED -#else - #define CL_EXT_SUFFIX__VERSION_2_2_DEPRECATED CL_EXT_SUFFIX_DEPRECATED - #define CL_EXT_PREFIX__VERSION_2_2_DEPRECATED CL_EXT_PREFIX_DEPRECATED -#endif - -#if (defined (_WIN32) && defined(_MSC_VER)) - -/* scalar types */ -typedef signed __int8 cl_char; -typedef unsigned __int8 cl_uchar; -typedef signed __int16 cl_short; -typedef unsigned __int16 cl_ushort; -typedef signed __int32 cl_int; -typedef unsigned __int32 cl_uint; -typedef signed __int64 cl_long; -typedef unsigned __int64 cl_ulong; - -typedef unsigned __int16 cl_half; -typedef float cl_float; -typedef double cl_double; - -/* Macro names and corresponding values defined by OpenCL */ 
-#define CL_CHAR_BIT 8 -#define CL_SCHAR_MAX 127 -#define CL_SCHAR_MIN (-127-1) -#define CL_CHAR_MAX CL_SCHAR_MAX -#define CL_CHAR_MIN CL_SCHAR_MIN -#define CL_UCHAR_MAX 255 -#define CL_SHRT_MAX 32767 -#define CL_SHRT_MIN (-32767-1) -#define CL_USHRT_MAX 65535 -#define CL_INT_MAX 2147483647 -#define CL_INT_MIN (-2147483647-1) -#define CL_UINT_MAX 0xffffffffU -#define CL_LONG_MAX ((cl_long) 0x7FFFFFFFFFFFFFFFLL) -#define CL_LONG_MIN ((cl_long) -0x7FFFFFFFFFFFFFFFLL - 1LL) -#define CL_ULONG_MAX ((cl_ulong) 0xFFFFFFFFFFFFFFFFULL) - -#define CL_FLT_DIG 6 -#define CL_FLT_MANT_DIG 24 -#define CL_FLT_MAX_10_EXP +38 -#define CL_FLT_MAX_EXP +128 -#define CL_FLT_MIN_10_EXP -37 -#define CL_FLT_MIN_EXP -125 -#define CL_FLT_RADIX 2 -#define CL_FLT_MAX 340282346638528859811704183484516925440.0f -#define CL_FLT_MIN 1.175494350822287507969e-38f -#define CL_FLT_EPSILON 1.1920928955078125e-7f - -#define CL_HALF_DIG 3 -#define CL_HALF_MANT_DIG 11 -#define CL_HALF_MAX_10_EXP +4 -#define CL_HALF_MAX_EXP +16 -#define CL_HALF_MIN_10_EXP -4 -#define CL_HALF_MIN_EXP -13 -#define CL_HALF_RADIX 2 -#define CL_HALF_MAX 65504.0f -#define CL_HALF_MIN 6.103515625e-05f -#define CL_HALF_EPSILON 9.765625e-04f - -#define CL_DBL_DIG 15 -#define CL_DBL_MANT_DIG 53 -#define CL_DBL_MAX_10_EXP +308 -#define CL_DBL_MAX_EXP +1024 -#define CL_DBL_MIN_10_EXP -307 -#define CL_DBL_MIN_EXP -1021 -#define CL_DBL_RADIX 2 -#define CL_DBL_MAX 1.7976931348623158e+308 -#define CL_DBL_MIN 2.225073858507201383090e-308 -#define CL_DBL_EPSILON 2.220446049250313080847e-16 - -#define CL_M_E 2.7182818284590452354 -#define CL_M_LOG2E 1.4426950408889634074 -#define CL_M_LOG10E 0.43429448190325182765 -#define CL_M_LN2 0.69314718055994530942 -#define CL_M_LN10 2.30258509299404568402 -#define CL_M_PI 3.14159265358979323846 -#define CL_M_PI_2 1.57079632679489661923 -#define CL_M_PI_4 0.78539816339744830962 -#define CL_M_1_PI 0.31830988618379067154 -#define CL_M_2_PI 0.63661977236758134308 -#define CL_M_2_SQRTPI 
1.12837916709551257390 -#define CL_M_SQRT2 1.41421356237309504880 -#define CL_M_SQRT1_2 0.70710678118654752440 - -#define CL_M_E_F 2.718281828f -#define CL_M_LOG2E_F 1.442695041f -#define CL_M_LOG10E_F 0.434294482f -#define CL_M_LN2_F 0.693147181f -#define CL_M_LN10_F 2.302585093f -#define CL_M_PI_F 3.141592654f -#define CL_M_PI_2_F 1.570796327f -#define CL_M_PI_4_F 0.785398163f -#define CL_M_1_PI_F 0.318309886f -#define CL_M_2_PI_F 0.636619772f -#define CL_M_2_SQRTPI_F 1.128379167f -#define CL_M_SQRT2_F 1.414213562f -#define CL_M_SQRT1_2_F 0.707106781f - -#define CL_NAN (CL_INFINITY - CL_INFINITY) -#define CL_HUGE_VALF ((cl_float) 1e50) -#define CL_HUGE_VAL ((cl_double) 1e500) -#define CL_MAXFLOAT CL_FLT_MAX -#define CL_INFINITY CL_HUGE_VALF - -#else - -#include - -/* scalar types */ -typedef int8_t cl_char; -typedef uint8_t cl_uchar; -typedef int16_t cl_short; -typedef uint16_t cl_ushort; -typedef int32_t cl_int; -typedef uint32_t cl_uint; -typedef int64_t cl_long; -typedef uint64_t cl_ulong; - -typedef uint16_t cl_half; -typedef float cl_float; -typedef double cl_double; - -/* Macro names and corresponding values defined by OpenCL */ -#define CL_CHAR_BIT 8 -#define CL_SCHAR_MAX 127 -#define CL_SCHAR_MIN (-127-1) -#define CL_CHAR_MAX CL_SCHAR_MAX -#define CL_CHAR_MIN CL_SCHAR_MIN -#define CL_UCHAR_MAX 255 -#define CL_SHRT_MAX 32767 -#define CL_SHRT_MIN (-32767-1) -#define CL_USHRT_MAX 65535 -#define CL_INT_MAX 2147483647 -#define CL_INT_MIN (-2147483647-1) -#define CL_UINT_MAX 0xffffffffU -#define CL_LONG_MAX ((cl_long) 0x7FFFFFFFFFFFFFFFLL) -#define CL_LONG_MIN ((cl_long) -0x7FFFFFFFFFFFFFFFLL - 1LL) -#define CL_ULONG_MAX ((cl_ulong) 0xFFFFFFFFFFFFFFFFULL) - -#define CL_FLT_DIG 6 -#define CL_FLT_MANT_DIG 24 -#define CL_FLT_MAX_10_EXP +38 -#define CL_FLT_MAX_EXP +128 -#define CL_FLT_MIN_10_EXP -37 -#define CL_FLT_MIN_EXP -125 -#define CL_FLT_RADIX 2 -#define CL_FLT_MAX 340282346638528859811704183484516925440.0f -#define CL_FLT_MIN 1.175494350822287507969e-38f 
-#define CL_FLT_EPSILON 1.1920928955078125e-7f - -#define CL_HALF_DIG 3 -#define CL_HALF_MANT_DIG 11 -#define CL_HALF_MAX_10_EXP +4 -#define CL_HALF_MAX_EXP +16 -#define CL_HALF_MIN_10_EXP -4 -#define CL_HALF_MIN_EXP -13 -#define CL_HALF_RADIX 2 -#define CL_HALF_MAX 65504.0f -#define CL_HALF_MIN 6.103515625e-05f -#define CL_HALF_EPSILON 9.765625e-04f - -#define CL_DBL_DIG 15 -#define CL_DBL_MANT_DIG 53 -#define CL_DBL_MAX_10_EXP +308 -#define CL_DBL_MAX_EXP +1024 -#define CL_DBL_MIN_10_EXP -307 -#define CL_DBL_MIN_EXP -1021 -#define CL_DBL_RADIX 2 -#define CL_DBL_MAX 179769313486231570814527423731704356798070567525844996598917476803157260780028538760589558632766878171540458953514382464234321326889464182768467546703537516986049910576551282076245490090389328944075868508455133942304583236903222948165808559332123348274797826204144723168738177180919299881250404026184124858368.0 -#define CL_DBL_MIN 2.225073858507201383090e-308 -#define CL_DBL_EPSILON 2.220446049250313080847e-16 - -#define CL_M_E 2.7182818284590452354 -#define CL_M_LOG2E 1.4426950408889634074 -#define CL_M_LOG10E 0.43429448190325182765 -#define CL_M_LN2 0.69314718055994530942 -#define CL_M_LN10 2.30258509299404568402 -#define CL_M_PI 3.14159265358979323846 -#define CL_M_PI_2 1.57079632679489661923 -#define CL_M_PI_4 0.78539816339744830962 -#define CL_M_1_PI 0.31830988618379067154 -#define CL_M_2_PI 0.63661977236758134308 -#define CL_M_2_SQRTPI 1.12837916709551257390 -#define CL_M_SQRT2 1.41421356237309504880 -#define CL_M_SQRT1_2 0.70710678118654752440 - -#define CL_M_E_F 2.718281828f -#define CL_M_LOG2E_F 1.442695041f -#define CL_M_LOG10E_F 0.434294482f -#define CL_M_LN2_F 0.693147181f -#define CL_M_LN10_F 2.302585093f -#define CL_M_PI_F 3.141592654f -#define CL_M_PI_2_F 1.570796327f -#define CL_M_PI_4_F 0.785398163f -#define CL_M_1_PI_F 0.318309886f -#define CL_M_2_PI_F 0.636619772f -#define CL_M_2_SQRTPI_F 1.128379167f -#define CL_M_SQRT2_F 1.414213562f -#define CL_M_SQRT1_2_F 0.707106781f - -#if 
defined( __GNUC__ ) - #define CL_HUGE_VALF __builtin_huge_valf() - #define CL_HUGE_VAL __builtin_huge_val() - #define CL_NAN __builtin_nanf( "" ) -#else - #define CL_HUGE_VALF ((cl_float) 1e50) - #define CL_HUGE_VAL ((cl_double) 1e500) - float nanf( const char * ); - #define CL_NAN nanf( "" ) -#endif -#define CL_MAXFLOAT CL_FLT_MAX -#define CL_INFINITY CL_HUGE_VALF - -#endif - -#include - -/* Mirror types to GL types. Mirror types allow us to avoid deciding which 87s to load based on whether we are using GL or GLES here. */ -typedef unsigned int cl_GLuint; -typedef int cl_GLint; -typedef unsigned int cl_GLenum; - -/* - * Vector types - * - * Note: OpenCL requires that all types be naturally aligned. - * This means that vector types must be naturally aligned. - * For example, a vector of four floats must be aligned to - * a 16 byte boundary (calculated as 4 * the natural 4-byte - * alignment of the float). The alignment qualifiers here - * will only function properly if your compiler supports them - * and if you don't actively work to defeat them. For example, - * in order for a cl_float4 to be 16 byte aligned in a struct, - * the start of the struct must itself be 16-byte aligned. - * - * Maintaining proper alignment is the user's responsibility. - */ - -/* Define basic vector types */ -#if defined( __VEC__ ) - #include /* may be omitted depending on compiler. AltiVec spec provides no way to detect whether the header is required. 
*/ - typedef __vector unsigned char __cl_uchar16; - typedef __vector signed char __cl_char16; - typedef __vector unsigned short __cl_ushort8; - typedef __vector signed short __cl_short8; - typedef __vector unsigned int __cl_uint4; - typedef __vector signed int __cl_int4; - typedef __vector float __cl_float4; - #define __CL_UCHAR16__ 1 - #define __CL_CHAR16__ 1 - #define __CL_USHORT8__ 1 - #define __CL_SHORT8__ 1 - #define __CL_UINT4__ 1 - #define __CL_INT4__ 1 - #define __CL_FLOAT4__ 1 -#endif - -#if defined( __SSE__ ) - #if defined( __MINGW64__ ) - #include - #else - #include - #endif - #if defined( __GNUC__ ) - typedef float __cl_float4 __attribute__((vector_size(16))); - #else - typedef __m128 __cl_float4; - #endif - #define __CL_FLOAT4__ 1 -#endif - -#if defined( __SSE2__ ) - #if defined( __MINGW64__ ) - #include - #else - #include - #endif - #if defined( __GNUC__ ) - typedef cl_uchar __cl_uchar16 __attribute__((vector_size(16))); - typedef cl_char __cl_char16 __attribute__((vector_size(16))); - typedef cl_ushort __cl_ushort8 __attribute__((vector_size(16))); - typedef cl_short __cl_short8 __attribute__((vector_size(16))); - typedef cl_uint __cl_uint4 __attribute__((vector_size(16))); - typedef cl_int __cl_int4 __attribute__((vector_size(16))); - typedef cl_ulong __cl_ulong2 __attribute__((vector_size(16))); - typedef cl_long __cl_long2 __attribute__((vector_size(16))); - typedef cl_double __cl_double2 __attribute__((vector_size(16))); - #else - typedef __m128i __cl_uchar16; - typedef __m128i __cl_char16; - typedef __m128i __cl_ushort8; - typedef __m128i __cl_short8; - typedef __m128i __cl_uint4; - typedef __m128i __cl_int4; - typedef __m128i __cl_ulong2; - typedef __m128i __cl_long2; - typedef __m128d __cl_double2; - #endif - #define __CL_UCHAR16__ 1 - #define __CL_CHAR16__ 1 - #define __CL_USHORT8__ 1 - #define __CL_SHORT8__ 1 - #define __CL_INT4__ 1 - #define __CL_UINT4__ 1 - #define __CL_ULONG2__ 1 - #define __CL_LONG2__ 1 - #define __CL_DOUBLE2__ 1 -#endif 
- -#if defined( __MMX__ ) - #include - #if defined( __GNUC__ ) - typedef cl_uchar __cl_uchar8 __attribute__((vector_size(8))); - typedef cl_char __cl_char8 __attribute__((vector_size(8))); - typedef cl_ushort __cl_ushort4 __attribute__((vector_size(8))); - typedef cl_short __cl_short4 __attribute__((vector_size(8))); - typedef cl_uint __cl_uint2 __attribute__((vector_size(8))); - typedef cl_int __cl_int2 __attribute__((vector_size(8))); - typedef cl_ulong __cl_ulong1 __attribute__((vector_size(8))); - typedef cl_long __cl_long1 __attribute__((vector_size(8))); - typedef cl_float __cl_float2 __attribute__((vector_size(8))); - #else - typedef __m64 __cl_uchar8; - typedef __m64 __cl_char8; - typedef __m64 __cl_ushort4; - typedef __m64 __cl_short4; - typedef __m64 __cl_uint2; - typedef __m64 __cl_int2; - typedef __m64 __cl_ulong1; - typedef __m64 __cl_long1; - typedef __m64 __cl_float2; - #endif - #define __CL_UCHAR8__ 1 - #define __CL_CHAR8__ 1 - #define __CL_USHORT4__ 1 - #define __CL_SHORT4__ 1 - #define __CL_INT2__ 1 - #define __CL_UINT2__ 1 - #define __CL_ULONG1__ 1 - #define __CL_LONG1__ 1 - #define __CL_FLOAT2__ 1 -#endif - -#if defined( __AVX__ ) - #if defined( __MINGW64__ ) - #include - #else - #include - #endif - #if defined( __GNUC__ ) - typedef cl_float __cl_float8 __attribute__((vector_size(32))); - typedef cl_double __cl_double4 __attribute__((vector_size(32))); - #else - typedef __m256 __cl_float8; - typedef __m256d __cl_double4; - #endif - #define __CL_FLOAT8__ 1 - #define __CL_DOUBLE4__ 1 -#endif - -/* Define capabilities for anonymous struct members. */ -#if !defined(__cplusplus) && defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L -#define __CL_HAS_ANON_STRUCT__ 1 -#define __CL_ANON_STRUCT__ -#elif defined( __GNUC__) && ! 
defined( __STRICT_ANSI__ ) -#define __CL_HAS_ANON_STRUCT__ 1 -#define __CL_ANON_STRUCT__ __extension__ -#elif defined( _WIN32) && defined(_MSC_VER) - #if _MSC_VER >= 1500 - /* Microsoft Developer Studio 2008 supports anonymous structs, but - * complains by default. */ - #define __CL_HAS_ANON_STRUCT__ 1 - #define __CL_ANON_STRUCT__ - /* Disable warning C4201: nonstandard extension used : nameless - * struct/union */ - #pragma warning( push ) - #pragma warning( disable : 4201 ) - #endif -#else -#define __CL_HAS_ANON_STRUCT__ 0 -#define __CL_ANON_STRUCT__ -#endif - -/* Define alignment keys */ -#if defined( __GNUC__ ) - #define CL_ALIGNED(_x) __attribute__ ((aligned(_x))) -#elif defined( _WIN32) && (_MSC_VER) - /* Alignment keys neutered on windows because MSVC can't swallow function arguments with alignment requirements */ - /* http://msdn.microsoft.com/en-us/library/373ak2y1%28VS.71%29.aspx */ - /* #include */ - /* #define CL_ALIGNED(_x) _CRT_ALIGN(_x) */ - #define CL_ALIGNED(_x) -#else - #warning Need to implement some method to align data here - #define CL_ALIGNED(_x) -#endif - -/* Indicate whether .xyzw, .s0123 and .hi.lo are supported */ -#if __CL_HAS_ANON_STRUCT__ - /* .xyzw and .s0123...{f|F} are supported */ - #define CL_HAS_NAMED_VECTOR_FIELDS 1 - /* .hi and .lo are supported */ - #define CL_HAS_HI_LO_VECTOR_FIELDS 1 -#endif - -/* Define cl_vector types */ - -/* ---- cl_charn ---- */ -typedef union -{ - cl_char CL_ALIGNED(2) s[2]; -#if __CL_HAS_ANON_STRUCT__ - __CL_ANON_STRUCT__ struct{ cl_char x, y; }; - __CL_ANON_STRUCT__ struct{ cl_char s0, s1; }; - __CL_ANON_STRUCT__ struct{ cl_char lo, hi; }; -#endif -#if defined( __CL_CHAR2__) - __cl_char2 v2; -#endif -}cl_char2; - -typedef union -{ - cl_char CL_ALIGNED(4) s[4]; -#if __CL_HAS_ANON_STRUCT__ - __CL_ANON_STRUCT__ struct{ cl_char x, y, z, w; }; - __CL_ANON_STRUCT__ struct{ cl_char s0, s1, s2, s3; }; - __CL_ANON_STRUCT__ struct{ cl_char2 lo, hi; }; -#endif -#if defined( __CL_CHAR2__) - __cl_char2 v2[2]; 
-#endif -#if defined( __CL_CHAR4__) - __cl_char4 v4; -#endif -}cl_char4; - -/* cl_char3 is identical in size, alignment and behavior to cl_char4. See section 6.1.5. */ -typedef cl_char4 cl_char3; - -typedef union -{ - cl_char CL_ALIGNED(8) s[8]; -#if __CL_HAS_ANON_STRUCT__ - __CL_ANON_STRUCT__ struct{ cl_char x, y, z, w; }; - __CL_ANON_STRUCT__ struct{ cl_char s0, s1, s2, s3, s4, s5, s6, s7; }; - __CL_ANON_STRUCT__ struct{ cl_char4 lo, hi; }; -#endif -#if defined( __CL_CHAR2__) - __cl_char2 v2[4]; -#endif -#if defined( __CL_CHAR4__) - __cl_char4 v4[2]; -#endif -#if defined( __CL_CHAR8__ ) - __cl_char8 v8; -#endif -}cl_char8; - -typedef union -{ - cl_char CL_ALIGNED(16) s[16]; -#if __CL_HAS_ANON_STRUCT__ - __CL_ANON_STRUCT__ struct{ cl_char x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; - __CL_ANON_STRUCT__ struct{ cl_char s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; - __CL_ANON_STRUCT__ struct{ cl_char8 lo, hi; }; -#endif -#if defined( __CL_CHAR2__) - __cl_char2 v2[8]; -#endif -#if defined( __CL_CHAR4__) - __cl_char4 v4[4]; -#endif -#if defined( __CL_CHAR8__ ) - __cl_char8 v8[2]; -#endif -#if defined( __CL_CHAR16__ ) - __cl_char16 v16; -#endif -}cl_char16; - - -/* ---- cl_ucharn ---- */ -typedef union -{ - cl_uchar CL_ALIGNED(2) s[2]; -#if __CL_HAS_ANON_STRUCT__ - __CL_ANON_STRUCT__ struct{ cl_uchar x, y; }; - __CL_ANON_STRUCT__ struct{ cl_uchar s0, s1; }; - __CL_ANON_STRUCT__ struct{ cl_uchar lo, hi; }; -#endif -#if defined( __cl_uchar2__) - __cl_uchar2 v2; -#endif -}cl_uchar2; - -typedef union -{ - cl_uchar CL_ALIGNED(4) s[4]; -#if __CL_HAS_ANON_STRUCT__ - __CL_ANON_STRUCT__ struct{ cl_uchar x, y, z, w; }; - __CL_ANON_STRUCT__ struct{ cl_uchar s0, s1, s2, s3; }; - __CL_ANON_STRUCT__ struct{ cl_uchar2 lo, hi; }; -#endif -#if defined( __CL_UCHAR2__) - __cl_uchar2 v2[2]; -#endif -#if defined( __CL_UCHAR4__) - __cl_uchar4 v4; -#endif -}cl_uchar4; - -/* cl_uchar3 is identical in size, 
alignment and behavior to cl_uchar4. See section 6.1.5. */ -typedef cl_uchar4 cl_uchar3; - -typedef union -{ - cl_uchar CL_ALIGNED(8) s[8]; -#if __CL_HAS_ANON_STRUCT__ - __CL_ANON_STRUCT__ struct{ cl_uchar x, y, z, w; }; - __CL_ANON_STRUCT__ struct{ cl_uchar s0, s1, s2, s3, s4, s5, s6, s7; }; - __CL_ANON_STRUCT__ struct{ cl_uchar4 lo, hi; }; -#endif -#if defined( __CL_UCHAR2__) - __cl_uchar2 v2[4]; -#endif -#if defined( __CL_UCHAR4__) - __cl_uchar4 v4[2]; -#endif -#if defined( __CL_UCHAR8__ ) - __cl_uchar8 v8; -#endif -}cl_uchar8; - -typedef union -{ - cl_uchar CL_ALIGNED(16) s[16]; -#if __CL_HAS_ANON_STRUCT__ - __CL_ANON_STRUCT__ struct{ cl_uchar x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; - __CL_ANON_STRUCT__ struct{ cl_uchar s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; - __CL_ANON_STRUCT__ struct{ cl_uchar8 lo, hi; }; -#endif -#if defined( __CL_UCHAR2__) - __cl_uchar2 v2[8]; -#endif -#if defined( __CL_UCHAR4__) - __cl_uchar4 v4[4]; -#endif -#if defined( __CL_UCHAR8__ ) - __cl_uchar8 v8[2]; -#endif -#if defined( __CL_UCHAR16__ ) - __cl_uchar16 v16; -#endif -}cl_uchar16; - - -/* ---- cl_shortn ---- */ -typedef union -{ - cl_short CL_ALIGNED(4) s[2]; -#if __CL_HAS_ANON_STRUCT__ - __CL_ANON_STRUCT__ struct{ cl_short x, y; }; - __CL_ANON_STRUCT__ struct{ cl_short s0, s1; }; - __CL_ANON_STRUCT__ struct{ cl_short lo, hi; }; -#endif -#if defined( __CL_SHORT2__) - __cl_short2 v2; -#endif -}cl_short2; - -typedef union -{ - cl_short CL_ALIGNED(8) s[4]; -#if __CL_HAS_ANON_STRUCT__ - __CL_ANON_STRUCT__ struct{ cl_short x, y, z, w; }; - __CL_ANON_STRUCT__ struct{ cl_short s0, s1, s2, s3; }; - __CL_ANON_STRUCT__ struct{ cl_short2 lo, hi; }; -#endif -#if defined( __CL_SHORT2__) - __cl_short2 v2[2]; -#endif -#if defined( __CL_SHORT4__) - __cl_short4 v4; -#endif -}cl_short4; - -/* cl_short3 is identical in size, alignment and behavior to cl_short4. See section 6.1.5. 
*/ -typedef cl_short4 cl_short3; - -typedef union -{ - cl_short CL_ALIGNED(16) s[8]; -#if __CL_HAS_ANON_STRUCT__ - __CL_ANON_STRUCT__ struct{ cl_short x, y, z, w; }; - __CL_ANON_STRUCT__ struct{ cl_short s0, s1, s2, s3, s4, s5, s6, s7; }; - __CL_ANON_STRUCT__ struct{ cl_short4 lo, hi; }; -#endif -#if defined( __CL_SHORT2__) - __cl_short2 v2[4]; -#endif -#if defined( __CL_SHORT4__) - __cl_short4 v4[2]; -#endif -#if defined( __CL_SHORT8__ ) - __cl_short8 v8; -#endif -}cl_short8; - -typedef union -{ - cl_short CL_ALIGNED(32) s[16]; -#if __CL_HAS_ANON_STRUCT__ - __CL_ANON_STRUCT__ struct{ cl_short x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; - __CL_ANON_STRUCT__ struct{ cl_short s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; - __CL_ANON_STRUCT__ struct{ cl_short8 lo, hi; }; -#endif -#if defined( __CL_SHORT2__) - __cl_short2 v2[8]; -#endif -#if defined( __CL_SHORT4__) - __cl_short4 v4[4]; -#endif -#if defined( __CL_SHORT8__ ) - __cl_short8 v8[2]; -#endif -#if defined( __CL_SHORT16__ ) - __cl_short16 v16; -#endif -}cl_short16; - - -/* ---- cl_ushortn ---- */ -typedef union -{ - cl_ushort CL_ALIGNED(4) s[2]; -#if __CL_HAS_ANON_STRUCT__ - __CL_ANON_STRUCT__ struct{ cl_ushort x, y; }; - __CL_ANON_STRUCT__ struct{ cl_ushort s0, s1; }; - __CL_ANON_STRUCT__ struct{ cl_ushort lo, hi; }; -#endif -#if defined( __CL_USHORT2__) - __cl_ushort2 v2; -#endif -}cl_ushort2; - -typedef union -{ - cl_ushort CL_ALIGNED(8) s[4]; -#if __CL_HAS_ANON_STRUCT__ - __CL_ANON_STRUCT__ struct{ cl_ushort x, y, z, w; }; - __CL_ANON_STRUCT__ struct{ cl_ushort s0, s1, s2, s3; }; - __CL_ANON_STRUCT__ struct{ cl_ushort2 lo, hi; }; -#endif -#if defined( __CL_USHORT2__) - __cl_ushort2 v2[2]; -#endif -#if defined( __CL_USHORT4__) - __cl_ushort4 v4; -#endif -}cl_ushort4; - -/* cl_ushort3 is identical in size, alignment and behavior to cl_ushort4. See section 6.1.5. 
*/ -typedef cl_ushort4 cl_ushort3; - -typedef union -{ - cl_ushort CL_ALIGNED(16) s[8]; -#if __CL_HAS_ANON_STRUCT__ - __CL_ANON_STRUCT__ struct{ cl_ushort x, y, z, w; }; - __CL_ANON_STRUCT__ struct{ cl_ushort s0, s1, s2, s3, s4, s5, s6, s7; }; - __CL_ANON_STRUCT__ struct{ cl_ushort4 lo, hi; }; -#endif -#if defined( __CL_USHORT2__) - __cl_ushort2 v2[4]; -#endif -#if defined( __CL_USHORT4__) - __cl_ushort4 v4[2]; -#endif -#if defined( __CL_USHORT8__ ) - __cl_ushort8 v8; -#endif -}cl_ushort8; - -typedef union -{ - cl_ushort CL_ALIGNED(32) s[16]; -#if __CL_HAS_ANON_STRUCT__ - __CL_ANON_STRUCT__ struct{ cl_ushort x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; - __CL_ANON_STRUCT__ struct{ cl_ushort s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; - __CL_ANON_STRUCT__ struct{ cl_ushort8 lo, hi; }; -#endif -#if defined( __CL_USHORT2__) - __cl_ushort2 v2[8]; -#endif -#if defined( __CL_USHORT4__) - __cl_ushort4 v4[4]; -#endif -#if defined( __CL_USHORT8__ ) - __cl_ushort8 v8[2]; -#endif -#if defined( __CL_USHORT16__ ) - __cl_ushort16 v16; -#endif -}cl_ushort16; - - -/* ---- cl_halfn ---- */ -typedef union -{ - cl_half CL_ALIGNED(4) s[2]; -#if __CL_HAS_ANON_STRUCT__ - __CL_ANON_STRUCT__ struct{ cl_half x, y; }; - __CL_ANON_STRUCT__ struct{ cl_half s0, s1; }; - __CL_ANON_STRUCT__ struct{ cl_half lo, hi; }; -#endif -#if defined( __CL_HALF2__) - __cl_half2 v2; -#endif -}cl_half2; - -typedef union -{ - cl_half CL_ALIGNED(8) s[4]; -#if __CL_HAS_ANON_STRUCT__ - __CL_ANON_STRUCT__ struct{ cl_half x, y, z, w; }; - __CL_ANON_STRUCT__ struct{ cl_half s0, s1, s2, s3; }; - __CL_ANON_STRUCT__ struct{ cl_half2 lo, hi; }; -#endif -#if defined( __CL_HALF2__) - __cl_half2 v2[2]; -#endif -#if defined( __CL_HALF4__) - __cl_half4 v4; -#endif -}cl_half4; - -/* cl_half3 is identical in size, alignment and behavior to cl_half4. See section 6.1.5. 
*/ -typedef cl_half4 cl_half3; - -typedef union -{ - cl_half CL_ALIGNED(16) s[8]; -#if __CL_HAS_ANON_STRUCT__ - __CL_ANON_STRUCT__ struct{ cl_half x, y, z, w; }; - __CL_ANON_STRUCT__ struct{ cl_half s0, s1, s2, s3, s4, s5, s6, s7; }; - __CL_ANON_STRUCT__ struct{ cl_half4 lo, hi; }; -#endif -#if defined( __CL_HALF2__) - __cl_half2 v2[4]; -#endif -#if defined( __CL_HALF4__) - __cl_half4 v4[2]; -#endif -#if defined( __CL_HALF8__ ) - __cl_half8 v8; -#endif -}cl_half8; - -typedef union -{ - cl_half CL_ALIGNED(32) s[16]; -#if __CL_HAS_ANON_STRUCT__ - __CL_ANON_STRUCT__ struct{ cl_half x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; - __CL_ANON_STRUCT__ struct{ cl_half s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; - __CL_ANON_STRUCT__ struct{ cl_half8 lo, hi; }; -#endif -#if defined( __CL_HALF2__) - __cl_half2 v2[8]; -#endif -#if defined( __CL_HALF4__) - __cl_half4 v4[4]; -#endif -#if defined( __CL_HALF8__ ) - __cl_half8 v8[2]; -#endif -#if defined( __CL_HALF16__ ) - __cl_half16 v16; -#endif -}cl_half16; - -/* ---- cl_intn ---- */ -typedef union -{ - cl_int CL_ALIGNED(8) s[2]; -#if __CL_HAS_ANON_STRUCT__ - __CL_ANON_STRUCT__ struct{ cl_int x, y; }; - __CL_ANON_STRUCT__ struct{ cl_int s0, s1; }; - __CL_ANON_STRUCT__ struct{ cl_int lo, hi; }; -#endif -#if defined( __CL_INT2__) - __cl_int2 v2; -#endif -}cl_int2; - -typedef union -{ - cl_int CL_ALIGNED(16) s[4]; -#if __CL_HAS_ANON_STRUCT__ - __CL_ANON_STRUCT__ struct{ cl_int x, y, z, w; }; - __CL_ANON_STRUCT__ struct{ cl_int s0, s1, s2, s3; }; - __CL_ANON_STRUCT__ struct{ cl_int2 lo, hi; }; -#endif -#if defined( __CL_INT2__) - __cl_int2 v2[2]; -#endif -#if defined( __CL_INT4__) - __cl_int4 v4; -#endif -}cl_int4; - -/* cl_int3 is identical in size, alignment and behavior to cl_int4. See section 6.1.5. 
*/ -typedef cl_int4 cl_int3; - -typedef union -{ - cl_int CL_ALIGNED(32) s[8]; -#if __CL_HAS_ANON_STRUCT__ - __CL_ANON_STRUCT__ struct{ cl_int x, y, z, w; }; - __CL_ANON_STRUCT__ struct{ cl_int s0, s1, s2, s3, s4, s5, s6, s7; }; - __CL_ANON_STRUCT__ struct{ cl_int4 lo, hi; }; -#endif -#if defined( __CL_INT2__) - __cl_int2 v2[4]; -#endif -#if defined( __CL_INT4__) - __cl_int4 v4[2]; -#endif -#if defined( __CL_INT8__ ) - __cl_int8 v8; -#endif -}cl_int8; - -typedef union -{ - cl_int CL_ALIGNED(64) s[16]; -#if __CL_HAS_ANON_STRUCT__ - __CL_ANON_STRUCT__ struct{ cl_int x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; - __CL_ANON_STRUCT__ struct{ cl_int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; - __CL_ANON_STRUCT__ struct{ cl_int8 lo, hi; }; -#endif -#if defined( __CL_INT2__) - __cl_int2 v2[8]; -#endif -#if defined( __CL_INT4__) - __cl_int4 v4[4]; -#endif -#if defined( __CL_INT8__ ) - __cl_int8 v8[2]; -#endif -#if defined( __CL_INT16__ ) - __cl_int16 v16; -#endif -}cl_int16; - - -/* ---- cl_uintn ---- */ -typedef union -{ - cl_uint CL_ALIGNED(8) s[2]; -#if __CL_HAS_ANON_STRUCT__ - __CL_ANON_STRUCT__ struct{ cl_uint x, y; }; - __CL_ANON_STRUCT__ struct{ cl_uint s0, s1; }; - __CL_ANON_STRUCT__ struct{ cl_uint lo, hi; }; -#endif -#if defined( __CL_UINT2__) - __cl_uint2 v2; -#endif -}cl_uint2; - -typedef union -{ - cl_uint CL_ALIGNED(16) s[4]; -#if __CL_HAS_ANON_STRUCT__ - __CL_ANON_STRUCT__ struct{ cl_uint x, y, z, w; }; - __CL_ANON_STRUCT__ struct{ cl_uint s0, s1, s2, s3; }; - __CL_ANON_STRUCT__ struct{ cl_uint2 lo, hi; }; -#endif -#if defined( __CL_UINT2__) - __cl_uint2 v2[2]; -#endif -#if defined( __CL_UINT4__) - __cl_uint4 v4; -#endif -}cl_uint4; - -/* cl_uint3 is identical in size, alignment and behavior to cl_uint4. See section 6.1.5. 
*/ -typedef cl_uint4 cl_uint3; - -typedef union -{ - cl_uint CL_ALIGNED(32) s[8]; -#if __CL_HAS_ANON_STRUCT__ - __CL_ANON_STRUCT__ struct{ cl_uint x, y, z, w; }; - __CL_ANON_STRUCT__ struct{ cl_uint s0, s1, s2, s3, s4, s5, s6, s7; }; - __CL_ANON_STRUCT__ struct{ cl_uint4 lo, hi; }; -#endif -#if defined( __CL_UINT2__) - __cl_uint2 v2[4]; -#endif -#if defined( __CL_UINT4__) - __cl_uint4 v4[2]; -#endif -#if defined( __CL_UINT8__ ) - __cl_uint8 v8; -#endif -}cl_uint8; - -typedef union -{ - cl_uint CL_ALIGNED(64) s[16]; -#if __CL_HAS_ANON_STRUCT__ - __CL_ANON_STRUCT__ struct{ cl_uint x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; - __CL_ANON_STRUCT__ struct{ cl_uint s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; - __CL_ANON_STRUCT__ struct{ cl_uint8 lo, hi; }; -#endif -#if defined( __CL_UINT2__) - __cl_uint2 v2[8]; -#endif -#if defined( __CL_UINT4__) - __cl_uint4 v4[4]; -#endif -#if defined( __CL_UINT8__ ) - __cl_uint8 v8[2]; -#endif -#if defined( __CL_UINT16__ ) - __cl_uint16 v16; -#endif -}cl_uint16; - -/* ---- cl_longn ---- */ -typedef union -{ - cl_long CL_ALIGNED(16) s[2]; -#if __CL_HAS_ANON_STRUCT__ - __CL_ANON_STRUCT__ struct{ cl_long x, y; }; - __CL_ANON_STRUCT__ struct{ cl_long s0, s1; }; - __CL_ANON_STRUCT__ struct{ cl_long lo, hi; }; -#endif -#if defined( __CL_LONG2__) - __cl_long2 v2; -#endif -}cl_long2; - -typedef union -{ - cl_long CL_ALIGNED(32) s[4]; -#if __CL_HAS_ANON_STRUCT__ - __CL_ANON_STRUCT__ struct{ cl_long x, y, z, w; }; - __CL_ANON_STRUCT__ struct{ cl_long s0, s1, s2, s3; }; - __CL_ANON_STRUCT__ struct{ cl_long2 lo, hi; }; -#endif -#if defined( __CL_LONG2__) - __cl_long2 v2[2]; -#endif -#if defined( __CL_LONG4__) - __cl_long4 v4; -#endif -}cl_long4; - -/* cl_long3 is identical in size, alignment and behavior to cl_long4. See section 6.1.5. 
*/ -typedef cl_long4 cl_long3; - -typedef union -{ - cl_long CL_ALIGNED(64) s[8]; -#if __CL_HAS_ANON_STRUCT__ - __CL_ANON_STRUCT__ struct{ cl_long x, y, z, w; }; - __CL_ANON_STRUCT__ struct{ cl_long s0, s1, s2, s3, s4, s5, s6, s7; }; - __CL_ANON_STRUCT__ struct{ cl_long4 lo, hi; }; -#endif -#if defined( __CL_LONG2__) - __cl_long2 v2[4]; -#endif -#if defined( __CL_LONG4__) - __cl_long4 v4[2]; -#endif -#if defined( __CL_LONG8__ ) - __cl_long8 v8; -#endif -}cl_long8; - -typedef union -{ - cl_long CL_ALIGNED(128) s[16]; -#if __CL_HAS_ANON_STRUCT__ - __CL_ANON_STRUCT__ struct{ cl_long x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; - __CL_ANON_STRUCT__ struct{ cl_long s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; - __CL_ANON_STRUCT__ struct{ cl_long8 lo, hi; }; -#endif -#if defined( __CL_LONG2__) - __cl_long2 v2[8]; -#endif -#if defined( __CL_LONG4__) - __cl_long4 v4[4]; -#endif -#if defined( __CL_LONG8__ ) - __cl_long8 v8[2]; -#endif -#if defined( __CL_LONG16__ ) - __cl_long16 v16; -#endif -}cl_long16; - - -/* ---- cl_ulongn ---- */ -typedef union -{ - cl_ulong CL_ALIGNED(16) s[2]; -#if __CL_HAS_ANON_STRUCT__ - __CL_ANON_STRUCT__ struct{ cl_ulong x, y; }; - __CL_ANON_STRUCT__ struct{ cl_ulong s0, s1; }; - __CL_ANON_STRUCT__ struct{ cl_ulong lo, hi; }; -#endif -#if defined( __CL_ULONG2__) - __cl_ulong2 v2; -#endif -}cl_ulong2; - -typedef union -{ - cl_ulong CL_ALIGNED(32) s[4]; -#if __CL_HAS_ANON_STRUCT__ - __CL_ANON_STRUCT__ struct{ cl_ulong x, y, z, w; }; - __CL_ANON_STRUCT__ struct{ cl_ulong s0, s1, s2, s3; }; - __CL_ANON_STRUCT__ struct{ cl_ulong2 lo, hi; }; -#endif -#if defined( __CL_ULONG2__) - __cl_ulong2 v2[2]; -#endif -#if defined( __CL_ULONG4__) - __cl_ulong4 v4; -#endif -}cl_ulong4; - -/* cl_ulong3 is identical in size, alignment and behavior to cl_ulong4. See section 6.1.5. 
*/ -typedef cl_ulong4 cl_ulong3; - -typedef union -{ - cl_ulong CL_ALIGNED(64) s[8]; -#if __CL_HAS_ANON_STRUCT__ - __CL_ANON_STRUCT__ struct{ cl_ulong x, y, z, w; }; - __CL_ANON_STRUCT__ struct{ cl_ulong s0, s1, s2, s3, s4, s5, s6, s7; }; - __CL_ANON_STRUCT__ struct{ cl_ulong4 lo, hi; }; -#endif -#if defined( __CL_ULONG2__) - __cl_ulong2 v2[4]; -#endif -#if defined( __CL_ULONG4__) - __cl_ulong4 v4[2]; -#endif -#if defined( __CL_ULONG8__ ) - __cl_ulong8 v8; -#endif -}cl_ulong8; - -typedef union -{ - cl_ulong CL_ALIGNED(128) s[16]; -#if __CL_HAS_ANON_STRUCT__ - __CL_ANON_STRUCT__ struct{ cl_ulong x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; - __CL_ANON_STRUCT__ struct{ cl_ulong s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; - __CL_ANON_STRUCT__ struct{ cl_ulong8 lo, hi; }; -#endif -#if defined( __CL_ULONG2__) - __cl_ulong2 v2[8]; -#endif -#if defined( __CL_ULONG4__) - __cl_ulong4 v4[4]; -#endif -#if defined( __CL_ULONG8__ ) - __cl_ulong8 v8[2]; -#endif -#if defined( __CL_ULONG16__ ) - __cl_ulong16 v16; -#endif -}cl_ulong16; - - -/* --- cl_floatn ---- */ - -typedef union -{ - cl_float CL_ALIGNED(8) s[2]; -#if __CL_HAS_ANON_STRUCT__ - __CL_ANON_STRUCT__ struct{ cl_float x, y; }; - __CL_ANON_STRUCT__ struct{ cl_float s0, s1; }; - __CL_ANON_STRUCT__ struct{ cl_float lo, hi; }; -#endif -#if defined( __CL_FLOAT2__) - __cl_float2 v2; -#endif -}cl_float2; - -typedef union -{ - cl_float CL_ALIGNED(16) s[4]; -#if __CL_HAS_ANON_STRUCT__ - __CL_ANON_STRUCT__ struct{ cl_float x, y, z, w; }; - __CL_ANON_STRUCT__ struct{ cl_float s0, s1, s2, s3; }; - __CL_ANON_STRUCT__ struct{ cl_float2 lo, hi; }; -#endif -#if defined( __CL_FLOAT2__) - __cl_float2 v2[2]; -#endif -#if defined( __CL_FLOAT4__) - __cl_float4 v4; -#endif -}cl_float4; - -/* cl_float3 is identical in size, alignment and behavior to cl_float4. See section 6.1.5. 
*/ -typedef cl_float4 cl_float3; - -typedef union -{ - cl_float CL_ALIGNED(32) s[8]; -#if __CL_HAS_ANON_STRUCT__ - __CL_ANON_STRUCT__ struct{ cl_float x, y, z, w; }; - __CL_ANON_STRUCT__ struct{ cl_float s0, s1, s2, s3, s4, s5, s6, s7; }; - __CL_ANON_STRUCT__ struct{ cl_float4 lo, hi; }; -#endif -#if defined( __CL_FLOAT2__) - __cl_float2 v2[4]; -#endif -#if defined( __CL_FLOAT4__) - __cl_float4 v4[2]; -#endif -#if defined( __CL_FLOAT8__ ) - __cl_float8 v8; -#endif -}cl_float8; - -typedef union -{ - cl_float CL_ALIGNED(64) s[16]; -#if __CL_HAS_ANON_STRUCT__ - __CL_ANON_STRUCT__ struct{ cl_float x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; - __CL_ANON_STRUCT__ struct{ cl_float s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; - __CL_ANON_STRUCT__ struct{ cl_float8 lo, hi; }; -#endif -#if defined( __CL_FLOAT2__) - __cl_float2 v2[8]; -#endif -#if defined( __CL_FLOAT4__) - __cl_float4 v4[4]; -#endif -#if defined( __CL_FLOAT8__ ) - __cl_float8 v8[2]; -#endif -#if defined( __CL_FLOAT16__ ) - __cl_float16 v16; -#endif -}cl_float16; - -/* --- cl_doublen ---- */ - -typedef union -{ - cl_double CL_ALIGNED(16) s[2]; -#if __CL_HAS_ANON_STRUCT__ - __CL_ANON_STRUCT__ struct{ cl_double x, y; }; - __CL_ANON_STRUCT__ struct{ cl_double s0, s1; }; - __CL_ANON_STRUCT__ struct{ cl_double lo, hi; }; -#endif -#if defined( __CL_DOUBLE2__) - __cl_double2 v2; -#endif -}cl_double2; - -typedef union -{ - cl_double CL_ALIGNED(32) s[4]; -#if __CL_HAS_ANON_STRUCT__ - __CL_ANON_STRUCT__ struct{ cl_double x, y, z, w; }; - __CL_ANON_STRUCT__ struct{ cl_double s0, s1, s2, s3; }; - __CL_ANON_STRUCT__ struct{ cl_double2 lo, hi; }; -#endif -#if defined( __CL_DOUBLE2__) - __cl_double2 v2[2]; -#endif -#if defined( __CL_DOUBLE4__) - __cl_double4 v4; -#endif -}cl_double4; - -/* cl_double3 is identical in size, alignment and behavior to cl_double4. See section 6.1.5. 
*/ -typedef cl_double4 cl_double3; - -typedef union -{ - cl_double CL_ALIGNED(64) s[8]; -#if __CL_HAS_ANON_STRUCT__ - __CL_ANON_STRUCT__ struct{ cl_double x, y, z, w; }; - __CL_ANON_STRUCT__ struct{ cl_double s0, s1, s2, s3, s4, s5, s6, s7; }; - __CL_ANON_STRUCT__ struct{ cl_double4 lo, hi; }; -#endif -#if defined( __CL_DOUBLE2__) - __cl_double2 v2[4]; -#endif -#if defined( __CL_DOUBLE4__) - __cl_double4 v4[2]; -#endif -#if defined( __CL_DOUBLE8__ ) - __cl_double8 v8; -#endif -}cl_double8; - -typedef union -{ - cl_double CL_ALIGNED(128) s[16]; -#if __CL_HAS_ANON_STRUCT__ - __CL_ANON_STRUCT__ struct{ cl_double x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; - __CL_ANON_STRUCT__ struct{ cl_double s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; - __CL_ANON_STRUCT__ struct{ cl_double8 lo, hi; }; -#endif -#if defined( __CL_DOUBLE2__) - __cl_double2 v2[8]; -#endif -#if defined( __CL_DOUBLE4__) - __cl_double4 v4[4]; -#endif -#if defined( __CL_DOUBLE8__ ) - __cl_double8 v8[2]; -#endif -#if defined( __CL_DOUBLE16__ ) - __cl_double16 v16; -#endif -}cl_double16; - -/* Macro to facilitate debugging - * Usage: - * Place CL_PROGRAM_STRING_DEBUG_INFO on the line before the first line of your source. - * The first line ends with: CL_PROGRAM_STRING_DEBUG_INFO \" - * Each line thereafter of OpenCL C source must end with: \n\ - * The last line ends in "; - * - * Example: - * - * const char *my_program = CL_PROGRAM_STRING_DEBUG_INFO "\ - * kernel void foo( int a, float * b ) \n\ - * { \n\ - * // my comment \n\ - * *b[ get_global_id(0)] = a; \n\ - * } \n\ - * "; - * - * This should correctly set up the line, (column) and file information for your source - * string so you can do source level debugging. 
- */ -#define __CL_STRINGIFY( _x ) # _x -#define _CL_STRINGIFY( _x ) __CL_STRINGIFY( _x ) -#define CL_PROGRAM_STRING_DEBUG_INFO "#line " _CL_STRINGIFY(__LINE__) " \"" __FILE__ "\" \n\n" - -#ifdef __cplusplus -} -#endif - -#undef __CL_HAS_ANON_STRUCT__ -#undef __CL_ANON_STRUCT__ -#if defined( _WIN32) && defined(_MSC_VER) - #if _MSC_VER >=1500 - #pragma warning( pop ) - #endif -#endif - -#endif /* __CL_PLATFORM_H */ diff --git a/CL/cl_va_api_media_sharing_intel.h b/CL/cl_va_api_media_sharing_intel.h deleted file mode 100644 index 0e7cd4d..0000000 --- a/CL/cl_va_api_media_sharing_intel.h +++ /dev/null @@ -1,160 +0,0 @@ -/******************************************************************************* - * Copyright (c) 2008-2020 The Khronos Group Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - ******************************************************************************/ -/*****************************************************************************\ - -Copyright (c) 2013-2019 Intel Corporation All Rights Reserved. - -THESE MATERIALS ARE PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL OR ITS -CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING -NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THESE -MATERIALS, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -File Name: cl_va_api_media_sharing_intel.h - -Abstract: - -Notes: - -\*****************************************************************************/ - - -#ifndef __OPENCL_CL_VA_API_MEDIA_SHARING_INTEL_H -#define __OPENCL_CL_VA_API_MEDIA_SHARING_INTEL_H - -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -/****************************************** -* cl_intel_va_api_media_sharing extension * -*******************************************/ - -#define cl_intel_va_api_media_sharing 1 - -/* error codes */ -#define CL_INVALID_VA_API_MEDIA_ADAPTER_INTEL -1098 -#define CL_INVALID_VA_API_MEDIA_SURFACE_INTEL -1099 -#define CL_VA_API_MEDIA_SURFACE_ALREADY_ACQUIRED_INTEL -1100 -#define CL_VA_API_MEDIA_SURFACE_NOT_ACQUIRED_INTEL -1101 - -/* cl_va_api_device_source_intel */ -#define CL_VA_API_DISPLAY_INTEL 0x4094 - -/* cl_va_api_device_set_intel */ -#define CL_PREFERRED_DEVICES_FOR_VA_API_INTEL 0x4095 -#define CL_ALL_DEVICES_FOR_VA_API_INTEL 0x4096 - -/* cl_context_info */ -#define CL_CONTEXT_VA_API_DISPLAY_INTEL 0x4097 - -/* cl_mem_info */ -#define CL_MEM_VA_API_MEDIA_SURFACE_INTEL 0x4098 - -/* cl_image_info */ -#define CL_IMAGE_VA_API_PLANE_INTEL 0x4099 - -/* cl_command_type */ -#define CL_COMMAND_ACQUIRE_VA_API_MEDIA_SURFACES_INTEL 0x409A -#define CL_COMMAND_RELEASE_VA_API_MEDIA_SURFACES_INTEL 0x409B - -typedef cl_uint cl_va_api_device_source_intel; -typedef cl_uint cl_va_api_device_set_intel; - -extern CL_API_ENTRY cl_int CL_API_CALL 
-clGetDeviceIDsFromVA_APIMediaAdapterINTEL( - cl_platform_id platform, - cl_va_api_device_source_intel media_adapter_type, - void* media_adapter, - cl_va_api_device_set_intel media_adapter_set, - cl_uint num_entries, - cl_device_id* devices, - cl_uint* num_devices) CL_EXT_SUFFIX__VERSION_1_2; - -typedef CL_API_ENTRY cl_int (CL_API_CALL * clGetDeviceIDsFromVA_APIMediaAdapterINTEL_fn)( - cl_platform_id platform, - cl_va_api_device_source_intel media_adapter_type, - void* media_adapter, - cl_va_api_device_set_intel media_adapter_set, - cl_uint num_entries, - cl_device_id* devices, - cl_uint* num_devices) CL_EXT_SUFFIX__VERSION_1_2; - -extern CL_API_ENTRY cl_mem CL_API_CALL -clCreateFromVA_APIMediaSurfaceINTEL( - cl_context context, - cl_mem_flags flags, - VASurfaceID* surface, - cl_uint plane, - cl_int* errcode_ret) CL_EXT_SUFFIX__VERSION_1_2; - -typedef CL_API_ENTRY cl_mem (CL_API_CALL * clCreateFromVA_APIMediaSurfaceINTEL_fn)( - cl_context context, - cl_mem_flags flags, - VASurfaceID* surface, - cl_uint plane, - cl_int* errcode_ret) CL_EXT_SUFFIX__VERSION_1_2; - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueAcquireVA_APIMediaSurfacesINTEL( - cl_command_queue command_queue, - cl_uint num_objects, - const cl_mem* mem_objects, - cl_uint num_events_in_wait_list, - const cl_event* event_wait_list, - cl_event* event) CL_EXT_SUFFIX__VERSION_1_2; - -typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireVA_APIMediaSurfacesINTEL_fn)( - cl_command_queue command_queue, - cl_uint num_objects, - const cl_mem* mem_objects, - cl_uint num_events_in_wait_list, - const cl_event* event_wait_list, - cl_event* event) CL_EXT_SUFFIX__VERSION_1_2; - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueReleaseVA_APIMediaSurfacesINTEL( - cl_command_queue command_queue, - cl_uint num_objects, - const cl_mem* mem_objects, - cl_uint num_events_in_wait_list, - const cl_event* event_wait_list, - cl_event* event) CL_EXT_SUFFIX__VERSION_1_2; - -typedef CL_API_ENTRY cl_int (CL_API_CALL 
*clEnqueueReleaseVA_APIMediaSurfacesINTEL_fn)( - cl_command_queue command_queue, - cl_uint num_objects, - const cl_mem* mem_objects, - cl_uint num_events_in_wait_list, - const cl_event* event_wait_list, - cl_event* event) CL_EXT_SUFFIX__VERSION_1_2; - -#ifdef __cplusplus -} -#endif - -#endif /* __OPENCL_CL_VA_API_MEDIA_SHARING_INTEL_H */ - diff --git a/CL/cl_version.h b/CL/cl_version.h deleted file mode 100644 index f38280a..0000000 --- a/CL/cl_version.h +++ /dev/null @@ -1,81 +0,0 @@ -/******************************************************************************* - * Copyright (c) 2018-2020 The Khronos Group Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - ******************************************************************************/ - -#ifndef __CL_VERSION_H -#define __CL_VERSION_H - -/* Detect which version to target */ -#if !defined(CL_TARGET_OPENCL_VERSION) -#pragma message("cl_version.h: CL_TARGET_OPENCL_VERSION is not defined. Defaulting to 220 (OpenCL 2.2)") -#define CL_TARGET_OPENCL_VERSION 220 -#endif -#if CL_TARGET_OPENCL_VERSION != 100 && \ - CL_TARGET_OPENCL_VERSION != 110 && \ - CL_TARGET_OPENCL_VERSION != 120 && \ - CL_TARGET_OPENCL_VERSION != 200 && \ - CL_TARGET_OPENCL_VERSION != 210 && \ - CL_TARGET_OPENCL_VERSION != 220 && \ - CL_TARGET_OPENCL_VERSION != 300 -#pragma message("cl_version: CL_TARGET_OPENCL_VERSION is not a valid value (100, 110, 120, 200, 210, 220, 300). 
Defaulting to 220 (OpenCL 2.2)") -#undef CL_TARGET_OPENCL_VERSION -#define CL_TARGET_OPENCL_VERSION 220 -#endif - - -/* OpenCL Version */ -#if CL_TARGET_OPENCL_VERSION >= 300 && !defined(CL_VERSION_3_0) -#define CL_VERSION_3_0 1 -#endif -#if CL_TARGET_OPENCL_VERSION >= 220 && !defined(CL_VERSION_2_2) -#define CL_VERSION_2_2 1 -#endif -#if CL_TARGET_OPENCL_VERSION >= 210 && !defined(CL_VERSION_2_1) -#define CL_VERSION_2_1 1 -#endif -#if CL_TARGET_OPENCL_VERSION >= 200 && !defined(CL_VERSION_2_0) -#define CL_VERSION_2_0 1 -#endif -#if CL_TARGET_OPENCL_VERSION >= 120 && !defined(CL_VERSION_1_2) -#define CL_VERSION_1_2 1 -#endif -#if CL_TARGET_OPENCL_VERSION >= 110 && !defined(CL_VERSION_1_1) -#define CL_VERSION_1_1 1 -#endif -#if CL_TARGET_OPENCL_VERSION >= 100 && !defined(CL_VERSION_1_0) -#define CL_VERSION_1_0 1 -#endif - -/* Allow deprecated APIs for older OpenCL versions. */ -#if CL_TARGET_OPENCL_VERSION <= 220 && !defined(CL_USE_DEPRECATED_OPENCL_2_2_APIS) -#define CL_USE_DEPRECATED_OPENCL_2_2_APIS -#endif -#if CL_TARGET_OPENCL_VERSION <= 210 && !defined(CL_USE_DEPRECATED_OPENCL_2_1_APIS) -#define CL_USE_DEPRECATED_OPENCL_2_1_APIS -#endif -#if CL_TARGET_OPENCL_VERSION <= 200 && !defined(CL_USE_DEPRECATED_OPENCL_2_0_APIS) -#define CL_USE_DEPRECATED_OPENCL_2_0_APIS -#endif -#if CL_TARGET_OPENCL_VERSION <= 120 && !defined(CL_USE_DEPRECATED_OPENCL_1_2_APIS) -#define CL_USE_DEPRECATED_OPENCL_1_2_APIS -#endif -#if CL_TARGET_OPENCL_VERSION <= 110 && !defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) -#define CL_USE_DEPRECATED_OPENCL_1_1_APIS -#endif -#if CL_TARGET_OPENCL_VERSION <= 100 && !defined(CL_USE_DEPRECATED_OPENCL_1_0_APIS) -#define CL_USE_DEPRECATED_OPENCL_1_0_APIS -#endif - -#endif /* __CL_VERSION_H */ diff --git a/CL/opencl.h b/CL/opencl.h deleted file mode 100644 index 1c4e10c..0000000 --- a/CL/opencl.h +++ /dev/null @@ -1,33 +0,0 @@ -/******************************************************************************* - * Copyright (c) 2008-2020 The Khronos Group Inc. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - ******************************************************************************/ - -#ifndef __OPENCL_H -#define __OPENCL_H - -#ifdef __cplusplus -extern "C" { -#endif - -#include -#include -#include -#include - -#ifdef __cplusplus -} -#endif - -#endif /* __OPENCL_H */ diff --git a/CMakeLists.txt b/CMakeLists.txt index 797a9c0..ad12dbc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,7 +7,21 @@ project(BabelStream VERSION 3.5 LANGUAGES CXX) # some nicer defaults for standard C++ set(CMAKE_CXX_EXTENSIONS OFF) set(CMAKE_CXX_STANDARD_REQUIRED ON) +include(FetchContent) +FetchContent_Declare( + opencl_header + URL https://github.com/KhronosGroup/OpenCL-Headers/archive/refs/tags/v2021.06.30.zip + URL_HASH MD5=af7ab7918a6a11c60370c8651a9f0192 +) + +macro(setup_opencl_header_includes) + FetchContent_GetProperties(opencl_header) + if (NOT opencl_header_POPULATED) + FetchContent_Populate(opencl_header) + endif () + set(OpenCL_INCLUDE_DIR ${opencl_header_SOURCE_DIR}) # always export the path, even when the headers were already populated +endmacro() #set(MODEL SYCL) #set(SYCL_COMPILER COMPUTECPP) @@ -35,7 +49,7 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON) #set(CXX_EXTRA_FLAGS -O2) #set(CMAKE_CXX_COMPILER /usr/lib/aomp/bin/clang++) -#set(MODEL OMP) +#set(MODEL omp) ##set(OFFLOAD "AMD:gfx803") #set(OFFLOAD "NVIDIA:sm_35") #set(CXX_EXTRA_FLAGS --cuda-path=/opt/cuda-10.2/) @@ -98,21 +112,23 @@ if ((DEFINED CXX_EXTRA_FLAGS) AND (NOT DEFINED CXX_EXTRA_LINK_FLAGS)) endif () # include our macros 
-include(register_models.cmake) +include(cmake/register_models.cmake) # register out models -register_model(OMP OMP OMPStream.cpp) -register_model(OCL OCL OCLStream.cpp) -register_model(STD STD STDStream.cpp) -register_model(STD20 STD20 STD20Stream.cpp) -register_model(HIP HIP HIPStream.cpp) -register_model(CUDA CUDA CUDAStream.cu) -register_model(KOKKOS KOKKOS KokkosStream.cpp) -register_model(SYCL SYCL SYCLStream.cpp) -register_model(ACC ACC ACCStream.cpp) +register_model(omp OMP OMPStream.cpp) +register_model(ocl OCL OCLStream.cpp) +register_model(std STD STDStream.cpp) +register_model(std20 STD20 STD20Stream.cpp) +register_model(hip HIP HIPStream.cpp) +register_model(cuda CUDA CUDAStream.cu) +register_model(kokkos KOKKOS KokkosStream.cpp) +register_model(sycl SYCL SYCLStream.cpp) +register_model(sycl2020 SYCL2020 SYCLStream2020.cpp) +register_model(acc ACC ACCStream.cpp) # defining RAJA collides with the RAJA namespace so USE_RAJA -register_model(RAJA USE_RAJA RAJAStream.cpp) -register_model(TBB TBB TBBStream.cpp) +register_model(raja USE_RAJA RAJAStream.cpp) +register_model(tbb TBB TBBStream.cpp) +register_model(thrust THRUST ThrustStream.cu) # Thrust uses cu, even for rocThrust set(USAGE ON CACHE BOOL "Whether to print all custom flags for the selected model") @@ -148,7 +164,7 @@ message(STATUS "Default ${CMAKE_BUILD_TYPE} flags are `${DEFAULT_${BUILD_TYPE}_F # setup common build flag defaults if there are no overrides if (NOT DEFINED ${BUILD_TYPE}_FLAGS) set(ACTUAL_${BUILD_TYPE}_FLAGS ${DEFAULT_${BUILD_TYPE}_FLAGS}) - elseif() +else () set(ACTUAL_${BUILD_TYPE}_FLAGS ${${BUILD_TYPE}_FLAGS}) endif () @@ -170,7 +186,8 @@ message(STATUS "Executable : ${EXE_NAME}") # below we have all the usual CMake target setup steps -add_executable(${EXE_NAME} ${IMPL_SOURCES} main.cpp) +include_directories(src) +add_executable(${EXE_NAME} ${IMPL_SOURCES} src/main.cpp) target_link_libraries(${EXE_NAME} PUBLIC ${LINK_LIBRARIES}) target_compile_definitions(${EXE_NAME} PUBLIC 
${IMPL_DEFINITIONS}) @@ -185,9 +202,9 @@ target_link_options(${EXE_NAME} PUBLIC LINKER:${CXX_EXTRA_LINKER_FLAGS}) target_link_options(${EXE_NAME} PUBLIC ${LINK_FLAGS} ${CXX_EXTRA_LINK_FLAGS}) # some models require the target to be already specified so they can finish their setup here -# this only happens if the MODEL.cmake definition contains the `setup_target` macro +# this only happens if the model.cmake definition contains the `setup_target` macro if (COMMAND setup_target) setup_target(${EXE_NAME}) endif () -install (TARGETS ${EXE_NAME} DESTINATION bin) \ No newline at end of file +install(TARGETS ${EXE_NAME} DESTINATION bin) diff --git a/CUDA.make b/CUDA.make deleted file mode 100644 index 90aa77c..0000000 --- a/CUDA.make +++ /dev/null @@ -1,40 +0,0 @@ -CXXFLAGS=-O3 -CUDA_CXX=nvcc - - -ifndef NVARCH -define nvarch_help -Set NVARCH to select sm_?? version. -Default: sm_60 - -endef -$(info $(nvarch_help)) -NVARCH=sm_60 -endif - - -ifndef MEM -define mem_help -Set MEM to select memory mode. -Available options: - DEFAULT - allocate host and device memory pointers. - MANAGED - use CUDA Managed Memory. - PAGEFAULT - shared memory, only host pointers allocated. 
- -endef -$(info $(mem_help)) -MEM=DEFAULT -endif - -MEM_MANAGED= -DMANAGED -MEM_PAGEFAULT= -DPAGEFAULT -MEM_MODE = $(MEM_$(MEM)) - - -cuda-stream: main.cpp CUDAStream.cu - $(CUDA_CXX) -std=c++11 $(CXXFLAGS) -arch=$(NVARCH) $(MEM_MODE) -DCUDA $^ $(EXTRA_FLAGS) -o $@ - -.PHONY: clean -clean: - rm -f cuda-stream - diff --git a/HIP.make b/HIP.make deleted file mode 100644 index 7a1196f..0000000 --- a/HIP.make +++ /dev/null @@ -1,11 +0,0 @@ - -HIP_PATH?= /opt/rocm/hip -HIPCC=$(HIP_PATH)/bin/hipcc - -hip-stream: main.cpp HIPStream.cpp - $(HIPCC) $(CXXFLAGS) -O3 -std=c++11 -DHIP $^ $(EXTRA_FLAGS) -o $@ - -.PHONY: clean -clean: - rm -f hip-stream - diff --git a/Kokkos.make b/Kokkos.make deleted file mode 100644 index 7dd6af8..0000000 --- a/Kokkos.make +++ /dev/null @@ -1,98 +0,0 @@ - -default: kokkos-stream - -ifndef DEVICE -define device_help -Set DEVICE to change flags (defaulting to OpenMP). -Available devices are: - OpenMP, Serial, Pthreads, Cuda, HIP - -endef -$(info $(device_help)) -DEVICE="OpenMP" -endif -KOKKOS_DEVICES="$(DEVICE)" - -ifndef ARCH -define arch_help -Set ARCH to change flags (defaulting to empty). -Available architectures are: - AMDAVX - ARMv80 ARMv81 ARMv8-ThunderX - BGQ Power7 Power8 Power9 - WSM SNB HSW BDW SKX KNC KNL - Kepler30 Kepler32 Kepler35 Kepler37 - Maxwell50 Maxwell52 Maxwell53 - Pascal60 Pascal61 - Volta70 Volta72 - -endef -$(info $(arch_help)) -ARCH="" -endif -KOKKOS_ARCH="$(ARCH)" - -ifndef COMPILER -define compiler_help -Set COMPILER to change flags (defaulting to GNU). 
-Available compilers are: - GNU INTEL CRAY PGI ARMCLANG HIPCC - - Note: you may have to do `export CXX=\path\to\hipcc` in case Kokkos detects the wrong compiler - -endef -$(info $(compiler_help)) -COMPILER=GNU -endif - -COMPILER_ARMCLANG = armclang++ -COMPILER_HIPCC = hipcc -COMPILER_GNU = g++ -COMPILER_INTEL = icpc -qopt-streaming-stores=always -COMPILER_CRAY = CC -COMPILER_PGI = pgc++ -CXX = $(COMPILER_$(COMPILER)) - -ifndef TARGET -define target_help -Set TARGET to change to offload device. Defaulting to CPU. -Available targets are: - CPU (default) - GPU - -endef -$(info $(target_help)) -TARGET=CPU -endif - -ifeq ($(TARGET), GPU) -ifneq ($(COMPILER), HIPCC) -CXX = $(NVCC_WRAPPER) -endif -endif - -OBJ = main.o KokkosStream.o -CXXFLAGS = -O3 -LINKFLAGS = # empty for now - - - -ifeq ($(COMPILER), GNU) -ifeq ($(DEVICE), OpenMP) -CXXFLAGS += -fopenmp -LINKFLAGS += -fopenmp -endif -endif - -include $(KOKKOS_PATH)/Makefile.kokkos - -kokkos-stream: $(OBJ) $(KOKKOS_LINK_DEPENDS) - $(CXX) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -DKOKKOS -o $@ - -%.o: %.cpp $(KOKKOS_CPP_DEPENDS) - $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -DKOKKOS -c $< - -.PHONY: clean -clean: - rm -f kokkos-stream main.o KokkosStream.o Kokkos_*.o - diff --git a/OpenACC.make b/OpenACC.make deleted file mode 100644 index 7a75fd0..0000000 --- a/OpenACC.make +++ /dev/null @@ -1,58 +0,0 @@ - -ifndef COMPILER -define compiler_help -Set COMPILER to ensure correct flags are set. -Available compilers are: - PGI GNU -endef -$(info $(compiler_help)) -endif - -COMPILER_ = $(CXX) -COMPILER_PGI = pgc++ -COMPILER_GNU = g++ - -FLAGS_ = -O3 -std=c++11 - -FLAGS_PGI = -std=c++11 -O3 -acc -ifeq ($(COMPILER), PGI) -define target_help -Set a TARGET to ensure PGI targets the correct offload device. 
-Available targets are: - SNB, IVB, HSW, SKL, KNL - PWR9, AMD - KEPLER, MAXWELL, PASCAL, VOLTA - HAWAII -endef -ifndef TARGET -$(error $(target_help)) -endif -TARGET_FLAGS_SNB = -ta=multicore -tp=sandybridge -TARGET_FLAGS_IVB = -ta=multicore -tp=ivybridge -TARGET_FLAGS_HSW = -ta=multicore -tp=haswell -TARGET_FLAGS_SKL = -ta=multicore -tp=skylake -TARGET_FLAGS_KNL = -ta=multicore -tp=knl -TARGET_FLAGS_PWR9 = -ta=multicore -tp=pwr9 -TARGET_FLAGS_AMD = -ta=multicore -tp=zen -TARGET_FLAGS_KEPLER = -ta=nvidia:cc35 -TARGET_FLAGS_MAXWELL = -ta=nvidia:cc50 -TARGET_FLAGS_PASCAL = -ta=nvidia:cc60 -TARGET_FLAGS_VOLTA = -ta=nvidia:cc70 -TARGET_FLAGS_HAWAII = -ta=radeon:hawaii -ifeq ($(TARGET_FLAGS_$(TARGET)),) -$(error $(target_help)) -endif - -FLAGS_PGI += $(TARGET_FLAGS_$(TARGET)) - -endif - -FLAGS_GNU = -O3 -std=c++11 -Drestrict=__restrict -fopenacc -CXXFLAGS = $(FLAGS_$(COMPILER)) - -acc-stream: main.cpp ACCStream.cpp - $(COMPILER_$(COMPILER)) $(CXXFLAGS) -DACC $^ $(EXTRA_FLAGS) -o $@ - -.PHONY: clean -clean: - rm -f acc-stream main.o ACCStream.o diff --git a/OpenCL.make b/OpenCL.make deleted file mode 100644 index 8ad7108..0000000 --- a/OpenCL.make +++ /dev/null @@ -1,39 +0,0 @@ - -ifndef COMPILER -define compiler_help -Set COMPILER to change flags (defaulting to GNU). 
-Available compilers are: - GNU CLANG INTEL CRAY - -endef -$(info $(compiler_help)) -COMPILER=GNU -endif - -COMPILER_GNU = g++ -COMPILER_CLANG = clang++ -COMPILER_INTEL = icpc -COMPILER_CRAY = CC -CXX = $(COMPILER_$(COMPILER)) - -FLAGS_ = -O3 -std=c++11 -FLAGS_GNU = -O3 -std=c++11 -FLAGS_CLANG = -O3 -std=c++11 -FLAGS_INTEL = -O3 -std=c++11 -FLAGS_CRAY = -O3 -hstd=c++11 -CXXFLAGS=$(FLAGS_$(COMPILER)) - -PLATFORM = $(shell uname -s) -ifeq ($(PLATFORM), Darwin) - LIBS = -framework OpenCL -else - LIBS = -lOpenCL -endif - -ocl-stream: main.cpp OCLStream.cpp - $(CXX) $(CXXFLAGS) -DOCL $^ $(EXTRA_FLAGS) $(LIBS) -o $@ - -.PHONY: clean -clean: - rm -f ocl-stream - diff --git a/OpenMP.make b/OpenMP.make deleted file mode 100644 index e81d252..0000000 --- a/OpenMP.make +++ /dev/null @@ -1,103 +0,0 @@ - -ifndef COMPILER -define compiler_help -Set COMPILER to change flags (defaulting to GNU). -Available compilers are: - CLANG CRAY GNU GNU_PPC INTEL XL PGI - NEC ARMCLANG AOMP FUJITSU - -Note: GCC on PPC requires -mcpu=native instead of -march=native so we have a special case for it - -endef -$(info $(compiler_help)) -COMPILER=GNU -endif - -ifndef TARGET -define target_help -Set TARGET to change device (defaulting to CPU). -Available targets are: - CPU NVIDIA AMD INTEL_GPU - -endef -$(info $(target_help)) -TARGET=CPU -endif - -ifeq ("$(COMPILER)", "CLANG") - ifdef TARGET - ifeq ("$(TARGET)", "NVIDIA") - ifndef NVARCH - define nvarch_help - Set NVARCH to select sm_?? version. 
- Default: sm_60 - - endef - $(info $(nvarch_help)) - NVARCH=sm_60 - endif - endif - endif -endif - -COMPILER_ARMCLANG = armclang++ -COMPILER_GNU = g++ -COMPILER_GNU_PPC = g++ -COMPILER_INTEL = icpc -COMPILER_CRAY = CC -COMPILER_CLANG = clang++ -COMPILER_XL = xlc++ -COMPILER_PGI = pgc++ -COMPILER_NEC = /opt/nec/ve/bin/nc++ -COMPILER_AOMP = clang++ -COMPILER_FUJITSU=FCC -CXX = $(COMPILER_$(COMPILER)) - -FLAGS_GNU = -O3 -std=c++11 -march=native -FLAGS_GNU_PPC = -O3 -std=c++11 -mcpu=native -FLAGS_INTEL = -O3 -std=c++11 -FLAGS_CRAY = -O3 -std=c++11 -FLAGS_CLANG = -O3 -std=c++11 -FLAGS_XL = -O5 -qarch=auto -qtune=auto -std=c++11 -FLAGS_PGI = -O3 -std=c++11 -FLAGS_NEC = -O4 -finline -std=c++11 -FLAGS_ARMCLANG = -O3 -std=c++11 -FLAGS_AOMP = -O3 -std=c++11 -FLAGS_FUJITSU=-Kfast -std=c++11 -KA64FX -KSVE -KARMV8_3_A -Kzfill=100 -Kprefetch_sequential=soft -Kprefetch_line=8 -Kprefetch_line_L2=16 -CXXFLAGS = $(FLAGS_$(COMPILER)) - -# OpenMP flags for CPUs -OMP_ARMCLANG_CPU = -fopenmp -OMP_GNU_CPU = -fopenmp -OMP_GNU_PPC_CPU = -fopenmp -OMP_INTEL_CPU = -qopenmp -OMP_CRAY_CPU = -fopenmp -OMP_CLANG_CPU = -fopenmp=libomp -OMP_XL_CPU = -qsmp=omp -qthreaded -OMP_PGI_CPU = -mp -OMP_NEC_CPU = -fopenmp -OMP_FUJITSU_CPU=-Kopenmp - -# OpenMP flags for NVIDIA -OMP_CRAY_NVIDIA = -DOMP_TARGET_GPU -OMP_CLANG_NVIDIA = -DOMP_TARGET_GPU -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda -Xopenmp-target -march=$(NVARCH) -OMP_GNU_NVIDIA = -DOMP_TARGET_GPU -fopenmp -foffload=nvptx-none -OMP_GNU_AMD = -DOMP_TARGET_GPU -fopenmp -foffload=amdgcn-amdhsa - -OMP_INTEL_CPU = -xHOST -qopt-streaming-stores=always -qopenmp -OMP_INTEL_INTEL_GPU = -DOMP_TARGET_GPU -qnextgen -fiopenmp -fopenmp-targets=spir64 - -OMP_AOMP_GPU = -DOMP_TARGET_GPU -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx906 - -ifndef OMP_$(COMPILER)_$(TARGET) -$(error Targeting $(TARGET) with $(COMPILER) not supported) -endif - -OMP = $(OMP_$(COMPILER)_$(TARGET)) - -omp-stream: main.cpp OMPStream.cpp 
- $(CXX) $(CXXFLAGS) -DOMP $^ $(OMP) $(EXTRA_FLAGS) -o $@ - -.PHONY: clean -clean: - rm -f omp-stream diff --git a/RAJA.make b/RAJA.make deleted file mode 100644 index 47aeefb..0000000 --- a/RAJA.make +++ /dev/null @@ -1,58 +0,0 @@ - -ifndef TARGET -define target_help -Set TARGET to change to offload device. Defaulting to CPU. -Available targets are: - CPU (default) - GPU -endef -$(info $(target_help)) -TARGET=CPU -endif - -ifeq ($(TARGET), CPU) - -ifndef COMPILER -define compiler_help -Set COMPILER to change flags (defaulting to GNU). -Available compilers are: - INTEL GNU CRAY XL -endef -$(info $(compiler_help)) -COMPILER=GNU -endif - -CXX_INTEL = icpc -CXX_GNU = g++ -CXX_CRAY = CC -CXX_XL = xlc++ - -CXXFLAGS_INTEL = -O3 -std=c++11 -qopenmp -xHost -qopt-streaming-stores=always -CXXFLAGS_GNU = -O3 -std=c++11 -fopenmp -CXXFLAGS_CRAY = -O3 -hstd=c++11 -CXXFLAGS_XL = -O5 -std=c++11 -qarch=pwr8 -qtune=pwr8 -qsmp=omp -qthreaded - -CXX = $(CXX_$(COMPILER)) -CXXFLAGS = -DRAJA_TARGET_CPU $(CXXFLAGS_$(COMPILER)) - -else ifeq ($(TARGET), GPU) -CXX = nvcc - -ifndef ARCH -define arch_help -Set ARCH to ensure correct GPU architecture. -Example: - ARCH=sm_35 -endef -$(error $(arch_help)) -endif -CXXFLAGS = --expt-extended-lambda -O3 -std=c++11 -x cu -Xcompiler -fopenmp -arch $(ARCH) -endif - -raja-stream: main.cpp RAJAStream.cpp - $(CXX) $(CXXFLAGS) -DUSE_RAJA -I$(RAJA_PATH)/include $^ $(EXTRA_FLAGS) -L$(RAJA_PATH)/lib -lRAJA -o $@ - -.PHONY: clean -clean: - rm -f raja-stream - diff --git a/README.android b/README.android deleted file mode 100644 index edc4a52..0000000 --- a/README.android +++ /dev/null @@ -1,36 +0,0 @@ -Android (outdated instructions) ------------------- - -Assuming you have a recent Android NDK available, you can use the -toolchain that it provides to build GPU-STREAM. 
You should first -use the NDK to generate a standalone toolchain: - - # Select a directory to install the toolchain to - ANDROID_NATIVE_TOOLCHAIN=/path/to/toolchain - - ${NDK}/build/tools/make-standalone-toolchain.sh \ - --platform=android-14 \ - --toolchain=arm-linux-androideabi-4.8 \ - --install-dir=${ANDROID_NATIVE_TOOLCHAIN} - -Make sure that the OpenCL headers and library (libOpenCL.so) are -available in `${ANDROID_NATIVE_TOOLCHAIN}/sysroot/usr/`. - -You should then be able to build GPU-STREAM: - - make CXX=${ANDROID_NATIVE_TOOLCHAIN}/bin/arm-linux-androideabi-g++ - -Copy the executable and OpenCL kernels to the device: - - adb push gpu-stream-ocl /data/local/tmp - adb push ocl-stream-kernels.cl /data/local/tmp - -Run GPU-STREAM from an adb shell: - - adb shell - cd /data/local/tmp - - # Use float if device doesn't support double, and reduce array size - ./gpu-stream-ocl --float -n 6 -s 10000000 - - diff --git a/README.md b/README.md index f9a4415..df95582 100644 --- a/README.md +++ b/README.md @@ -1,31 +1,53 @@ -BabelStream -========== +# BabelStream logo +[![CI](https://github.com/UoB-HPC/BabelStream/actions/workflows/main.yaml/badge.svg?branch=main)](https://github.com/UoB-HPC/BabelStream/actions/workflows/main.yaml) Measure memory transfer rates to/from global device memory on GPUs. This benchmark is similar in spirit, and based on, the STREAM benchmark [1] for CPUs. Unlike other GPU memory bandwidth benchmarks this does *not* include the PCIe transfer time. -There are multiple implementations of this benchmark in a variety of programming models. -Currently implemented are: - - OpenCL - - CUDA - - OpenACC - - OpenMP 3 and 4.5 - - C++ Parallel STL - - Kokkos - - RAJA - - SYCL - - TBB +There are multiple implementations of this benchmark in a variety of [programming models](#programming-models). This code was previously called GPU-STREAM. 
+## Table of Contents +- [Programming Models](#programming-models) +- [How is this different to STREAM?](#how-is-this-different-to-stream) +- [Building](#building) + - [CMake](#cmake) + - [GNU Make (removed)](#gnu-make) +- [Results](#results) +- [Contributing](#contributing) +- [Citing](#citing) + - [Other BabelStream publications](#other-babelstream-publications) -How is this different to STREAM? --------------------------------- + +## Programming Models + +BabelStream is currently implemented in the following parallel programming models, listed in no particular order: + +- OpenCL +- CUDA +- HIP +- OpenACC +- OpenMP 3 and 4.5 +- C++ Parallel STL +- Kokkos +- RAJA +- SYCL and SYCL 2020 +- TBB +- Thrust (via CUDA or HIP) + +This project also contains implementations in alternative languages with different build systems: +* Julia - [JuliaStream.jl](./src/julia/JuliaStream.jl) +* Java - [java-stream](./src/java/java-stream) +* Scala - [scala-stream](./src/scala/scala-stream) +* Rust - [rust-stream](./src/rust/rust-stream) + +## How is this different to STREAM? BabelStream implements the four main kernels of the STREAM benchmark (along with a dot product), but by utilising different programming models expands the platforms which the code can run beyond CPUs. @@ -42,34 +64,46 @@ BabelStream therefore provides a measure of what memory bandwidth performance ca BabelStream also includes the nstream kernel from the Parallel Research Kernels (PRK) project, available on [GitHub](https://github.com/ParRes/Kernels). Details about PRK can be found in the following references: -> Van der Wijngaart, Rob F., and Timothy G. Mattson. The parallel research kernels. IEEE High Performance Extreme Computing Conference (HPEC). IEEE, 2014. +* Van der Wijngaart, Rob F., and Timothy G. Mattson. The parallel research kernels. IEEE High Performance Extreme Computing Conference (HPEC). IEEE, 2014. -> R. F. Van der Wijngaart, A. Kayi, J. R. Hammond, G. Jost, T. St. John, S. Sridharan, T. G. 
Mattson, J. Abercrombie, and J. Nelson. Comparing runtime systems with exascale ambitions using the Parallel Research Kernels. ISC 2016, [DOI: 10.1007/978-3-319-41321-1_17](https://doi.org/10.1007/978-3-319-41321-1_17). +* R. F. Van der Wijngaart, A. Kayi, J. R. Hammond, G. Jost, T. St. John, S. Sridharan, T. G. Mattson, J. Abercrombie, and J. Nelson. Comparing runtime systems with exascale ambitions using the Parallel Research Kernels. ISC 2016, [DOI: 10.1007/978-3-319-41321-1_17](https://doi.org/10.1007/978-3-319-41321-1_17). -> Jeff R. Hammond and Timothy G. Mattson. Evaluating data parallelism in C++ using the Parallel Research Kernels. IWOCL 2019, [DOI: 10.1145/3318170.3318192](https://doi.org/10.1145/3318170.3318192). +* Jeff R. Hammond and Timothy G. Mattson. Evaluating data parallelism in C++ using the Parallel Research Kernels. IWOCL 2019, [DOI: 10.1145/3318170.3318192](https://doi.org/10.1145/3318170.3318192). -Website -------- -[uob-hpc.github.io/BabelStream/](https://uob-hpc.github.io/BabelStream/) - -Usage ------ +## Building Drivers, compiler and software applicable to whichever implementation you would like to build against is required. ### CMake -The project supports building with CMake >= 3.13.0, it can be installed without root via the [official script](https://cmake.org/download/). -As with any CMake project, first configure the project: +The project supports building with CMake >= 3.13.0, which can be installed without root via the [official script](https://cmake.org/download/). + +Each BabelStream implementation (programming model) is built as follows: ```shell -> cd babelstream -> cmake -Bbuild -H. -DMODEL= # configure the build, build type defaults to Release -> cmake --build build # compile it -> ./build/babelstream # executable available at ./build/ +$ cd babelstream + +# configure the build, build type defaults to Release +# The -DMODEL flag is required +$ cmake -Bbuild -H. 
-DMODEL=<model> + +# compile +$ cmake --build build + +# run executables in ./build +$ ./build/<model>-stream ``` +The `MODEL` option selects one implementation of BabelStream to build. +The source for each model's implementation is located in `./src/<model>`. + +Currently available models are: +``` +omp;ocl;std;std20;hip;cuda;kokkos;sycl;sycl2020;acc;raja;tbb;thrust +``` + +#### Overriding default flags By default, we have defined a set of optimal flags for known HPC compilers. There are assigned those to `RELEASE_FLAGS`, and you can override them if required. @@ -77,7 +111,7 @@ To find out what flag each model supports or requires, simply configure while on For example: ```shell > cd babelstream -> cmake -Bbuild -H. -DMODEL=OCL +> cmake -Bbuild -H. -DMODEL=ocl ... - Common Release flags are `-O3`, set RELEASE_FLAGS to override -- CXX_EXTRA_FLAGS: @@ -91,90 +125,60 @@ For example: Use this for linking extra libraries (e.g `-lmylib`, or simply `mylib`) -- CXX_EXTRA_LINKER_FLAGS: Append to linker flags (i.e GCC's `-Wl` or equivalent) --- Available models: OMP;OCL;STD;STD20;HIP;CUDA;KOKKOS;SYCL;ACC;RAJA;TBB --- Selected model : OCL +-- Available models: omp;ocl;std;std20;hip;cuda;kokkos;sycl;sycl2020;acc;raja;tbb;thrust +-- Selected model : ocl -- Supported flags: CMAKE_CXX_COMPILER (optional, default=c++): Any CXX compiler that is supported by CMake detection OpenCL_LIBRARY (optional, default=): Path to OpenCL library, usually called libOpenCL.so ... ``` -Alternatively, refer to the [CI script](./ci-test-compile.sh), which test-compiles most of the models, and see which flags are used there. -*It is recommended that you delete the `build` directory when you change any of the build flags.* +Alternatively, refer to the [CI script](./src/ci-test-compile.sh), which test-compiles most of the models, and see which flags are used there. 
+ +*It is recommended that you delete the `build` directory when you change any of the build flags.* ### GNU Make -We have supplied a series of Makefiles, one for each programming model, to assist with building. -The Makefiles contain common build options, and should be simple to customise for your needs too. +Support for Make has been removed from 4.0 onwards. +However, as the build process only involves a few source files, the required compile commands can be extracted from the CI output. -General usage is `make -f .make` -Common compiler flags and names can be set by passing a `COMPILER` option to Make, e.g. `make COMPILER=GNU`. -Some models allow specifying a CPU or GPU style target, and this can be set by passing a `TARGET` option to Make, e.g. `make TARGET=GPU`. + -Pass in extra flags via the `EXTRA_FLAGS` option. +## Results -The binaries are named in the form `-stream`. +Sample results can be found in the `results` subdirectory. +Newer results are found in our [Performance Portability](https://github.com/UoB-HPC/performance-portability) repository. -#### Building Kokkos for Make -Kokkos version >= 3 requires setting the `KOKKOS_PATH` flag to the *source* directory of a distribution. -For example: - -``` -cd -wget https://github.com/kokkos/kokkos/archive/3.1.01.tar.gz -tar -xvf 3.1.01.tar.gz # should end up with ~/kokkos-3.1.01 -cd BabelStream -make -f Kokkos.make KOKKOS_PATH=~/kokkos-3.1.01 -``` -See make output for more information on supported flags. - -#### Building RAJA for Make - -We use the following command to build RAJA using the Intel Compiler. -``` -cmake .. -DCMAKE_INSTALL_PREFIX= -DCMAKE_C_COMPILER=icc -DCMAKE_CXX_COMPILER=icpc -DRAJA_PTR="RAJA_USE_RESTRICT_ALIGNED_PTR" -DCMAKE_BUILD_TYPE=ICCBuild -DRAJA_ENABLE_TESTS=Off -``` -For building with CUDA support, we use the following command. -``` -cmake .. 
-DCMAKE_INSTALL_PREFIX= -DRAJA_PTR="RAJA_USE_RESTRICT_ALIGNED_PTR" -DRAJA_ENABLE_CUDA=1 -DRAJA_ENABLE_TESTS=Off -``` - -Results -------- - -Sample results can be found in the `results` subdirectory. If you would like to submit updated results, please submit a Pull Request. - -Contributing ------------- +## Contributing As of v4.0, the `main` branch of this repository will hold the latest released version. The `develop` branch will contain unreleased features due for the next (major and/or minor) release of BabelStream. Pull Requests should be made against the `develop` branch. -Citing ------- +## Citing + Please cite BabelStream via this reference: -> Deakin T, Price J, Martineau M, McIntosh-Smith S. GPU-STREAM v2.0: Benchmarking the achievable memory bandwidth of many-core processors across diverse parallel programming models. 2016. Paper presented at P^3MA Workshop at ISC High Performance, Frankfurt, Germany. DOI: 10.1007/978- 3-319-46079-6_34 +Deakin T, Price J, Martineau M, McIntosh-Smith S. GPU-STREAM v2.0: Benchmarking the achievable memory bandwidth of many-core processors across diverse parallel programming models. 2016. Paper presented at P^3MA Workshop at ISC High Performance, Frankfurt, Germany. DOI: 10.1007/978-3-319-46079-6_34 -**Other BabelStream publications:** +### Other BabelStream publications -> Deakin T, Price J, Martineau M, McIntosh-Smith S. Evaluating attainable memory bandwidth of parallel programming models via BabelStream. International Journal of Computational Science and Engineering. Special issue. Vol. 17, No. 3, pp. 247–262. 2018.DOI: 10.1504/IJCSE.2018.095847 +* Deakin T, Price J, Martineau M, McIntosh-Smith S. Evaluating attainable memory bandwidth of parallel programming models via BabelStream. International Journal of Computational Science and Engineering. Special issue. Vol. 17, No. 3, pp. 247–262. 2018. DOI: 10.1504/IJCSE.2018.095847 -> Deakin T, McIntosh-Smith S. 
GPU-STREAM: Benchmarking the achievable memory bandwidth of Graphics Processing Units. 2015. Poster session presented at IEEE/ACM SuperComputing, Austin, United States. -You can view the [Poster and Extended Abstract](http://sc15.supercomputing.org/sites/all/themes/SC15images/tech_poster/tech_poster_pages/post150.html). +* Deakin T, McIntosh-Smith S. GPU-STREAM: Benchmarking the achievable memory bandwidth of Graphics Processing Units. 2015. Poster session presented at IEEE/ACM SuperComputing, Austin, United States. + You can view the [Poster and Extended Abstract](http://sc15.supercomputing.org/sites/all/themes/SC15images/tech_poster/tech_poster_pages/post150.html). -> Deakin T, Price J, Martineau M, McIntosh-Smith S. GPU-STREAM: Now in 2D!. 2016. Poster session presented at IEEE/ACM SuperComputing, Salt Lake City, United States. -You can view the [Poster and Extended Abstract](http://sc16.supercomputing.org/sc-archive/tech_poster/tech_poster_pages/post139.html). +* Deakin T, Price J, Martineau M, McIntosh-Smith S. GPU-STREAM: Now in 2D!. 2016. Poster session presented at IEEE/ACM SuperComputing, Salt Lake City, United States. + You can view the [Poster and Extended Abstract](http://sc16.supercomputing.org/sc-archive/tech_poster/tech_poster_pages/post139.html). -> Raman K, Deakin T, Price J, McIntosh-Smith S. Improving achieved memory bandwidth from C++ codes on Intel Xeon Phi Processor (Knights Landing). IXPUG Spring Meeting, Cambridge, UK, 2017. +* Raman K, Deakin T, Price J, McIntosh-Smith S. Improving achieved memory bandwidth from C++ codes on Intel Xeon Phi Processor (Knights Landing). IXPUG Spring Meeting, Cambridge, UK, 2017. -> Deakin T, Price J, McIntosh-Smith S. Portable methods for measuring cache hierarchy performance. 2017. Poster sessions presented at IEEE/ACM SuperComputing, Denver, United States. 
-You can view the [Poster and Extended Abstract](http://sc17.supercomputing.org/SC17%20Archive/tech_poster/tech_poster_pages/post155.html) +* Deakin T, Price J, McIntosh-Smith S. Portable methods for measuring cache hierarchy performance. 2017. Poster sessions presented at IEEE/ACM SuperComputing, Denver, United States. + You can view the [Poster and Extended Abstract](http://sc17.supercomputing.org/SC17%20Archive/tech_poster/tech_poster_pages/post155.html) [1]: McCalpin, John D., 1995: "Memory Bandwidth and Machine Balance in Current High Performance Computers", IEEE Computer Society Technical Committee on Computer Architecture (TCCA) Newsletter, December 1995. diff --git a/STD.make b/STD.make deleted file mode 100644 index 3225a08..0000000 --- a/STD.make +++ /dev/null @@ -1,14 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# For full license terms please see the LICENSE file distributed with this -# source code - -CXXFLAGS=-O3 -std=c++17 -stdpar -DSTD -STD_CXX=nvc++ - -std-stream: main.cpp STDStream.cpp - $(STD_CXX) $(CXXFLAGS) $^ $(EXTRA_FLAGS) -o $@ - -.PHONY: clean -clean: - rm -f std-stream diff --git a/STD20.make b/STD20.make deleted file mode 100644 index eced9f7..0000000 --- a/STD20.make +++ /dev/null @@ -1,26 +0,0 @@ - -ifndef COMPILER -define compiler_help -Set COMPILER to change flags (defaulting to GNU). -Available compilers are: - GNU - -endef -$(info $(compiler_help)) -COMPILER=GNU -endif - -COMPILER_GNU = g++ -CXX = $(COMPILER_$(COMPILER)) - -FLAGS_GNU = -O3 -std=c++2a -march=native -CXXFLAGS = $(FLAGS_$(COMPILER)) - - -std20-stream: main.cpp STD20Stream.cpp - $(CXX) -DSTD20 $(CXXFLAGS) $^ $(EXTRA_FLAGS) -o $@ - -.PHONY: clean -clean: - rm -f std20-stream - diff --git a/SYCL.make b/SYCL.make deleted file mode 100644 index 58df8d0..0000000 --- a/SYCL.make +++ /dev/null @@ -1,81 +0,0 @@ -ifndef COMPILER -define compiler_help -Set COMPILER to change flags (defaulting to GNU). 
-Available compilers are: - HIPSYCL, DPCPP, COMPUTECPP - - - For HIPSYCL and COMPUTECPP, SYCL_SDK_DIR must be specified, the directory should contain [/lib, /bin, ...] - For DPCPP, the compiler must be on path -endef -$(info $(compiler_help)) -COMPILER=HIPSYCL -endif - -ifndef TARGET -define target_help -Set TARGET to change device (defaulting to CPU). -Available targets are: - CPU AMD NVIDIA - -endef -$(info $(target_help)) -TARGET=CPU -endif - - -ifndef ARCH -define arch_help -Set ARCH to change device (defaulting to ""). -(GPU *only*) Available targets for HIPSYCL are: - For CUDA, the architecture has the form sm_XX, e.g. sm_60 for Pascal. - For ROCm, the architecture has the form gfxYYY, e.g. gfx900 for Vega 10, gfx906 for Vega 20. - -endef - -ifeq ($(COMPILER), HIPSYCL) -ifneq ($(TARGET), CPU) -$(info $(arch_help)) -ARCH= -endif -endif - -endif - -SYCL_COMPUTECPP_SYCLFLAGS = $(shell $(SYCL_SDK_DIR)/bin/computecpp_info --dump-device-compiler-flags) -no-serial-memop -sycl-driver -SYCL_COMPUTECPP_SYCLFLAGS_CPU = $(SYCL_COMPUTECPP_SYCLFLAGS) -SYCL_COMPUTECPP_SYCLFLAGS_AMD = $(SYCL_COMPUTECPP_SYCLFLAGS) -SYCL_COMPUTECPP_SYCLFLAGS_NVIDIA = $(SYCL_COMPUTECPP_SYCLFLAGS) -sycl-target ptx64 -SYCL_COMPUTECPP_SYCLCXX = $(SYCL_SDK_DIR)/bin/compute++ -SYCL_COMPUTECPP_FLAGS = -O3 -std=c++17 -SYCL_COMPUTECPP_LINK_FLAGS = -Wl,-rpath=$(SYCL_SDK_DIR)/lib/ $(SYCL_SDK_DIR)/lib/libComputeCpp.so -lOpenCL -SYCL_COMPUTECPP_INCLUDE = -I$(SYCL_SDK_DIR)/include - -SYCL_HIPSYCL_SYCLFLAGS_CPU = --hipsycl-platform=cpu -SYCL_HIPSYCL_SYCLFLAGS_AMD = --hipsycl-platform=rocm --hipsycl-gpu-arch=$(ARCH) -SYCL_HIPSYCL_SYCLFLAGS_NVIDIA = --hipsycl-platform=cuda --hipsycl-gpu-arch=$(ARCH) -SYCL_HIPSYCL_SYCLCXX = $(SYCL_SDK_DIR)/bin/syclcc -SYCL_HIPSYCL_FLAGS = -O3 --std=c++17 -SYCL_HIPSYCL_LINK_FLAGS = -L$(SYCL_SDK_DIR)/lib -Wl,-rpath,$(SYCL_SDK_DIR)/lib -SYCL_HIPSYCL_INCLUDE = - -SYCL_DPCPP_SYCLFLAGS_NVIDIA = -fsycl -fsycl-targets=nvptx64-nvidia-cuda-sycldevice -fsycl-unnamed-lambda 
-SYCL_DPCPP_SYCLCXX = dpcpp -SYCL_DPCPP_FLAGS = -O3 --std=c++17 -SYCL_DPCPP_LINK_FLAGS = -SYCL_DPCPP_INCLUDE = - - -SYCL_SYCLFLAGS = $(SYCL_$(COMPILER)_SYCLFLAGS_$(TARGET)) -SYCL_SYCLCXX = $(SYCL_$(COMPILER)_SYCLCXX) -SYCL_FLAGS = $(SYCL_$(COMPILER)_FLAGS) -SYCL_LINK_FLAGS = $(SYCL_$(COMPILER)_LINK_FLAGS) -SYCL_INCLUDE = $(SYCL_$(COMPILER)_INCLUDE) - -# only ComputeCpp generates .sycl files which is a bit odd to deal with so we opted to compile everything together -sycl-stream: main.cpp SYCLStream.cpp - $(SYCL_SYCLCXX) $(SYCL_SYCLFLAGS) $(SYCL_FLAGS) $(SYCL_INCLUDE) -DSYCL $(EXTRA_FLAGS) $(SYCL_LINK_FLAGS) $^ -o $@ - -.PHONY: clean -clean: - rm -f sycl-stream diff --git a/TBB.make b/TBB.make deleted file mode 100644 index c224a5a..0000000 --- a/TBB.make +++ /dev/null @@ -1,56 +0,0 @@ - -ifndef COMPILER -define compiler_help -Set COMPILER to change flags (defaulting to GNU). -Available compilers are: - GNU INTEL INTEL_LEGACY - -endef -$(info $(compiler_help)) -COMPILER=GNU -endif - - -CXX_GNU = g++ -CXX_INTEL = icpx -CXX_INTEL_LEGACY = icpc -CXX = $(COMPILER_$(COMPILER)) - -CXXFLAGS_GNU = -march=native -CXXFLAGS_INTEL = -march=native -CXXFLAGS_INTEL_LEGACY = -qopt-streaming-stores=always - -CXX = $(CXX_$(COMPILER)) -CXXFLAGS = -std=c++11 -O3 $(CXXFLAGS_$(COMPILER)) - - - -ifndef PARTITIONER -define partitioner_help -Set PARTITIONER to select TBB's partitioner. -Partitioner specifies how a loop template should partition its work among threads. - -Available options: - AUTO - Optimize range subdivision based on work-stealing events. - AFFINITY - Proportional splitting that optimizes for cache affinity. - STATIC - Distribute work uniformly with no additional load balancing. - SIMPLE - Recursively split its range until it cannot be further subdivided. - -See https://spec.oneapi.com/versions/latest/elements/oneTBB/source/algorithms.html#partitioners -for more details. 
- -endef -$(info $(partitioner_help)) -PARTITIONER=AUTO -endif - -PARTITIONER_MODE = -DPARTITIONER_$(PARTITIONER) - - -tbb-stream: main.cpp TBBStream.cpp - $(CXX) -DTBB $(PARTITIONER_MODE) $(CXXFLAGS) $^ $(EXTRA_FLAGS) -I$(TBB_DIR)/include -Wl,-rpath,$(TBB_DIR)/lib/intel64/gcc4.8 $(TBB_DIR)/lib/intel64/gcc4.8/libtbb.so -o $@ - -.PHONY: clean -clean: - rm -f tbb-stream - diff --git a/register_models.cmake b/cmake/register_models.cmake similarity index 93% rename from register_models.cmake rename to cmake/register_models.cmake index 9f4cb57..f180c03 100644 --- a/register_models.cmake +++ b/cmake/register_models.cmake @@ -131,22 +131,24 @@ endfunction() macro(register_model NAME PREPROCESSOR_NAME) - string(TOUPPER ${NAME} MODEL_UPPER) list(APPEND REGISTERED_MODELS "${NAME}") - list(APPEND IMPL_${MODEL_UPPER}_SOURCES "${ARGN}") + string(TOUPPER ${NAME} MODEL_UPPER) + list(APPEND IMPL_${MODEL_UPPER}_SOURCES "src/${NAME}/${ARGN}") list(APPEND IMPL_${MODEL_UPPER}_DEFINITIONS "${PREPROCESSOR_NAME}") endmacro() macro(load_model MODEL) - string(TOUPPER "${MODEL}" MODEL_UPPER) - if ("${MODEL_UPPER}" IN_LIST REGISTERED_MODELS) - set(MODEL_FILE ${CMAKE_CURRENT_SOURCE_DIR}/${MODEL_UPPER}.cmake) + if ("${MODEL}" IN_LIST REGISTERED_MODELS) + string(TOLOWER "${MODEL}" MODEL_LOWER) + set(MODEL_FILE ${CMAKE_CURRENT_SOURCE_DIR}/src/${MODEL_LOWER}/model.cmake) + include_directories(${CMAKE_CURRENT_SOURCE_DIR}/src/${MODEL_LOWER}) if (NOT EXISTS ${MODEL_FILE}) message(FATAL_ERROR "${MODEL_FILE} not found, perhaps it needs to be implemented?") endif () include(${MODEL_FILE}) + string(TOUPPER "${MODEL}" MODEL_UPPER) list(APPEND IMPL_SOURCES ${IMPL_${MODEL_UPPER}_SOURCES}) list(APPEND IMPL_DEFINITIONS ${IMPL_${MODEL_UPPER}_DEFINITIONS}) diff --git a/legacy/HC.make b/legacy/HC.make deleted file mode 100644 index b902ada..0000000 --- a/legacy/HC.make +++ /dev/null @@ -1,21 +0,0 @@ - -HCC = hcc - -CXXFLAGS+=-O3 $(shell hcc-config --cxxflags) -LDFLAGS+=$(shell hcc-config --ldflags) - -ifdef 
TBSIZE -CXXFLAGS+=-DVIRTUALTILESIZE=$(TBSIZE) -endif - -ifdef NTILES -CXXFLAGS+=-DNTILES=$(TBSIZE) -endif - - -hc-stream: ../main.cpp HCStream.cpp - $(HCC) $(CXXFLAGS) -DHC $^ $(LDFLAGS) $(EXTRA_FLAGS) -o $@ - -.PHONY: clean -clean: - rm -f hc-stream diff --git a/src/.gitignore b/src/.gitignore new file mode 100644 index 0000000..568a953 --- /dev/null +++ b/src/.gitignore @@ -0,0 +1,29 @@ + +**/cuda-stream +**/ocl-stream +**/omp-stream +**/acc-stream +**/raja-stream +**/kokkos-stream +**/std-stream +**/sycl-stream +**/hip-stream + +**/*.o +**/*.bc +**/*.sycl +**/*.tar +**/*.gz +**/*.a + +**/KokkosCore_Config_* + +**/.DS_Store + + +build/ +cmake-build-*/ +CMakeFiles/ +.idea/ +.vscode/ +.directory \ No newline at end of file diff --git a/Stream.h b/src/Stream.h similarity index 100% rename from Stream.h rename to src/Stream.h diff --git a/ACCStream.cpp b/src/acc/ACCStream.cpp similarity index 100% rename from ACCStream.cpp rename to src/acc/ACCStream.cpp diff --git a/ACCStream.h b/src/acc/ACCStream.h similarity index 100% rename from ACCStream.h rename to src/acc/ACCStream.h diff --git a/ACC.cmake b/src/acc/model.cmake similarity index 100% rename from ACC.cmake rename to src/acc/model.cmake diff --git a/ci-prepare-bionic.sh b/src/ci-prepare-bionic.sh similarity index 90% rename from ci-prepare-bionic.sh rename to src/ci-prepare-bionic.sh index fa3b2d2..656d338 100755 --- a/ci-prepare-bionic.sh +++ b/src/ci-prepare-bionic.sh @@ -135,17 +135,20 @@ setup_aocc() { setup_nvhpc() { echo "Preparing Nvidia HPC SDK" local tarball="nvhpc.tar.gz" -# local url="http://localhost:8000/nvhpc_2021_215_Linux_x86_64_cuda_11.3.tar.gz" - local url="https://developer.download.nvidia.com/hpc-sdk/21.5/nvhpc_2021_215_Linux_x86_64_cuda_11.3.tar.gz" +# local url="http://localhost:8000/nvhpc_2021_219_Linux_x86_64_cuda_11.4.tar.gz" + local url="https://developer.download.nvidia.com/hpc-sdk/21.9/nvhpc_2021_219_Linux_x86_64_cuda_11.4.tar.gz" get_and_untar "$tarball" "$url" - local 
sdk_dir="$PWD/nvhpc_2021_215_Linux_x86_64_cuda_11.3/install_components/Linux_x86_64/21.5" + local sdk_dir="$PWD/nvhpc_2021_219_Linux_x86_64_cuda_11.4/install_components/Linux_x86_64/21.9" local bin_dir="$sdk_dir/compilers/bin" "$bin_dir/makelocalrc" "$bin_dir" -x + export_var NVHPC_SDK_DIR "$sdk_dir" + export_var NVHPC_CUDA_DIR "$sdk_dir/cuda/11.4" + export_var NVHPC_NVCXX "$bin_dir/nvc++" - export_var NVHPC_NVCC "$sdk_dir/cuda/11.3/bin/nvcc" - export_var NVHPC_CUDA_DIR "$sdk_dir/cuda/11.3" + export_var NVHPC_NVCC "$sdk_dir/cuda/11.4/bin/nvcc" + echo "Installed CUDA versions:" ls "$sdk_dir/cuda" verify_bin_exists "$NVHPC_NVCXX" @@ -222,10 +225,7 @@ setup_tbb() { setup_clang_gcc() { - echo "deb http://archive.ubuntu.com/ubuntu focal main universe" | sudo tee -a /etc/apt/sources.list - - sudo apt-get update -qq - sudo apt-get install -y -qq gcc-10-offload-nvptx gcc-10-offload-amdgcn libtbb2 libtbb-dev g++-10 + sudo apt-get install -y -qq gcc-10-offload-nvptx gcc-10-offload-amdgcn libtbb2 libtbb-dev g++-10 clang libomp-dev export_var GCC_CXX "$(which g++-10)" verify_bin_exists "$GCC_CXX" @@ -248,11 +248,9 @@ setup_clang_gcc() { } setup_rocm() { - wget -q -O - "https://repo.radeon.com/rocm/rocm.gpg.key" | sudo apt-key add - - echo 'deb [arch=amd64] https://repo.radeon.com/rocm/apt/debian/ xenial main' | sudo tee /etc/apt/sources.list.d/rocm.list - sudo apt-get update -qq - sudo apt-get install -y -qq rocm-dev + sudo apt-get install -y -qq rocm-dev rocthrust-dev export_var ROCM_PATH "/opt/rocm" + export_var PATH "$ROCM_PATH/bin:$PATH" # ROCm needs this for many of their libraries' CMake build to work export_var HIP_CXX "$ROCM_PATH/bin/hipcc" verify_bin_exists "$HIP_CXX" "$HIP_CXX" --version @@ -316,9 +314,21 @@ if [ "${GITHUB_ACTIONS:-false}" = true ]; then echo "Running in GitHub Actions, defaulting to special export" TERM=xterm export TERM=xterm + + # drop the lock in case we got one from a failed run + rm /var/lib/dpkg/lock-frontend || true + rm 
/var/cache/apt/archives/lock || true + + wget -q -O - "https://repo.radeon.com/rocm/rocm.gpg.key" | sudo apt-key add - + echo "deb http://archive.ubuntu.com/ubuntu focal main universe" | sudo tee -a /etc/apt/sources.list + echo 'deb [arch=amd64] https://repo.radeon.com/rocm/apt/4.5 ubuntu main' | sudo tee /etc/apt/sources.list.d/rocm.list + + sudo apt-get update -qq + sudo apt-get install -y -qq cmake + if [ "$SETUP" = true ]; then - echo "Deleting extra packages for space in 5 seconds..." - sleep 5 + echo "Deleting extra packages for space in 2 seconds..." + sleep 2 echo "Starting apt-get remove:" sudo apt-get remove -y azure-cli google-cloud-sdk hhvm google-chrome-stable firefox powershell mono-devel sudo apt-get autoremove -y diff --git a/ci-test-compile.sh b/src/ci-test-compile.sh similarity index 50% rename from ci-test-compile.sh rename to src/ci-test-compile.sh index 00ca718..3f54aaf 100755 --- a/ci-test-compile.sh +++ b/src/ci-test-compile.sh @@ -57,7 +57,7 @@ run_build() { local cmake_code=$? "$CMAKE_BIN" --build "$build" -j "$(nproc)" &>>"$log" - "$CMAKE_BIN" --build "$build" --target install -j "$(nproc)" &>>"$log" + "$CMAKE_BIN" --build "$build" --target install -j "$(nproc)" &>>"$log" local cmake_code=$? 
set -e @@ -88,32 +88,30 @@ run_build() { ### # KOKKOS_SRC="/home/tom/Downloads/kokkos-3.3.00" # RAJA_SRC="/home/tom/Downloads/RAJA-v0.13.0" - -# GCC_CXX="/usr/bin/g++" +# +# GCC_CXX="$(which g++-10)" # CLANG_CXX="/usr/bin/clang++" - -# NVSDK="/home/tom/Downloads/nvhpc_2021_212_Linux_x86_64_cuda_11.2/install_components/Linux_x86_64/21.2/" -# NVHPC_NVCXX="$NVSDK/compilers/bin/nvc++" -# NVHPC_NVCC="$NVSDK/cuda/11.2/bin/nvcc" -# NVHPC_CUDA_DIR="$NVSDK/cuda/11.2" -# "$NVSDK/compilers/bin/makelocalrc" "$NVSDK/compilers/bin/" -x - +# +# NVHPC_SDK_DIR="/home/tom/Downloads/nvhpc_2021_219_Linux_x86_64_cuda_multi/install_components/Linux_x86_64/21.9/" +# NVHPC_NVCXX="$NVHPC_SDK_DIR/compilers/bin/nvc++" +# NVHPC_NVCC="$NVHPC_SDK_DIR/cuda/11.4/bin/nvcc" +# NVHPC_CUDA_DIR="$NVHPC_SDK_DIR/cuda/11.4" +# "$NVHPC_SDK_DIR/compilers/bin/makelocalrc" "$NVHPC_SDK_DIR/compilers/bin/" -x +# # AOCC_CXX="/opt/AMD/aocc-compiler-2.3.0/bin/clang++" # AOMP_CXX="/usr/lib/aomp/bin/clang++" # OCL_LIB="/home/tom/Downloads/oclcpuexp-2020.11.11.0.04_rel/x64/libOpenCL.so" - +# # # AMD needs this rocm_path thing exported... 
-# export ROCM_PATH="/opt/rocm-4.0.0" -# HIP_CXX="/opt/rocm-4.0.0/bin/hipcc" -# COMPUTECPP_DIR="/home/tom/Desktop/computecpp_archive/ComputeCpp-CE-2.3.0-x86_64-linux-gnu" +# export ROCM_PATH="/opt/rocm-4.5.0" +# HIP_CXX="/opt/rocm-4.5.0/bin/hipcc" +# COMPUTECPP_DIR="/home/tom/Downloads/ComputeCpp-CE-2.7.0-x86_64-linux-gnu/" # DPCPP_DIR="/home/tom/Downloads/dpcpp_compiler" # HIPSYCL_DIR="/opt/hipsycl/cff515c/" - -# ICPX_CXX="/opt/intel/oneapi/compiler/2021.1.2/linux/bin/icpx" -# ICPC_CXX="/opt/intel/oneapi/compiler/2021.1.2/linux/bin/intel64/icpc" - -# TBB_LIB="/home/tom/Downloads/oneapi-tbb-2021.1.1/" - +# +# ICPX_CXX="/opt/intel/oneapi/compiler/2021.4.0/linux/bin/icpx" +# ICPC_CXX="/opt/intel/oneapi/compiler/2021.4.0/linux/bin/intel64/icpc"# TBB_LIB="/home/tom/Downloads/oneapi-tbb-2021.1.1/" +# # GCC_STD_PAR_LIB="tbb" # CLANG_STD_PAR_LIB="tbb" # GCC_OMP_OFFLOAD_AMD=false @@ -124,102 +122,136 @@ run_build() { AMD_ARCH="gfx_903" NV_ARCH="sm_70" -NV_ARCH_CCXY="cuda11.3,cc80" +NV_ARCH_CCXY="cuda11.4,cc80" build_gcc() { local name="gcc_build" local cxx="-DCMAKE_CXX_COMPILER=${GCC_CXX:?}" - run_build $name "${GCC_CXX:?}" OMP "$cxx" + run_build $name "${GCC_CXX:?}" omp "$cxx" if [ "$MODEL" = "all" ] || [ "$MODEL" = "OMP" ]; then # sanity check that it at least runs - echo "Sanity checking GCC OMP build..." - "./$BUILD_DIR/OMP_$name/omp-stream" -s 1048576 -n 10 + echo "Sanity checking GCC omp build..." 
+ "./$BUILD_DIR/omp_$name/omp-stream" -s 1048576 -n 10 fi # some distributions like Ubuntu bionic implements std par with TBB, so conditionally link it here - run_build $name "${GCC_CXX:?}" STD "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}" - run_build $name "${GCC_CXX:?}" STD20 "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}" + run_build $name "${GCC_CXX:?}" std "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}" + run_build $name "${GCC_CXX:?}" std20 "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}" - run_build $name "${GCC_CXX:?}" TBB "$cxx -DONE_TBB_DIR=$TBB_LIB" - run_build $name "${GCC_CXX:?}" TBB "$cxx" # build TBB again with the system TBB + run_build $name "${GCC_CXX:?}" tbb "$cxx -DONE_TBB_DIR=$TBB_LIB" + run_build $name "${GCC_CXX:?}" tbb "$cxx" # build TBB again with the system TBB if [ "${GCC_OMP_OFFLOAD_AMD:-false}" != "false" ]; then - run_build "amd_$name" "${GCC_CXX:?}" ACC "$cxx -DCXX_EXTRA_FLAGS=-foffload=amdgcn-amdhsa" - run_build "amd_$name" "${GCC_CXX:?}" OMP "$cxx -DOFFLOAD=AMD:$AMD_ARCH" + run_build "amd_$name" "${GCC_CXX:?}" acc "$cxx -DCXX_EXTRA_FLAGS=-foffload=amdgcn-amdhsa" + run_build "amd_$name" "${GCC_CXX:?}" omp "$cxx -DOFFLOAD=AMD:$AMD_ARCH" fi if [ "${GCC_OMP_OFFLOAD_NVIDIA:-false}" != "false" ]; then - run_build "nvidia_$name" "${GCC_CXX:?}" ACC "$cxx -DCXX_EXTRA_FLAGS=-foffload=nvptx-none" - run_build "nvidia_$name" "${GCC_CXX:?}" OMP "$cxx -DOFFLOAD=NVIDIA:$NV_ARCH" + run_build "nvidia_$name" "${GCC_CXX:?}" acc "$cxx -DCXX_EXTRA_FLAGS=-foffload=nvptx-none" + run_build "nvidia_$name" "${GCC_CXX:?}" omp "$cxx -DOFFLOAD=NVIDIA:$NV_ARCH" fi - run_build $name "${GCC_CXX:?}" CUDA "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH" - run_build $name "${GCC_CXX:?}" CUDA "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DMEM=MANAGED" - run_build $name "${GCC_CXX:?}" CUDA "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DMEM=PAGEFAULT" - # run_build $name "${CC_CXX:?}" KOKKOS "$cxx 
-DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_CUDA=ON" - run_build "cuda_$name" "${GCC_CXX:?}" KOKKOS "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON" - run_build $name "${GCC_CXX:?}" OCL "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}" - run_build $name "${GCC_CXX:?}" RAJA "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?}" + run_build $name "${GCC_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH" + run_build $name "${GCC_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DMEM=MANAGED" + run_build $name "${GCC_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DMEM=PAGEFAULT" + # run_build $name "${CC_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_CUDA=ON" + run_build "cuda_$name" "${GCC_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON" + run_build $name "${GCC_CXX:?}" ocl "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}" + run_build $name "${GCC_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?}" # FIXME fails due to https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100102 # FIXME we also got https://github.com/NVIDIA/nccl/issues/494 -# run_build "cuda_$name" "${GCC_CXX:?}" RAJA "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?} \ +# run_build "cuda_$name" "${GCC_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?} \ # -DENABLE_CUDA=ON \ # -DTARGET=NVIDIA \ # -DCUDA_TOOLKIT_ROOT_DIR=${NVHPC_CUDA_DIR:?} \ # -DCUDA_ARCH=$NV_ARCH" + + # CMake >= 3.15 only due to Nvidia's Thrust CMake requirements + local current=$("$CMAKE_BIN" --version | head -n 1 | cut -d ' ' -f3) + local required="3.15.0" + if [ "$(printf '%s\n' "$required" "$current" | sort -V | head -n1)" = "$required" ]; then + run_build $name "${GCC_CXX:?}" thrust "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/include -DTHRUST_IMPL=CUDA -DBACKEND=CUDA" + run_build $name "${GCC_CXX:?}" thrust "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH 
-DSDK_DIR=$NVHPC_CUDA_DIR/include -DTHRUST_IMPL=CUDA -DBACKEND=OMP" + run_build $name "${GCC_CXX:?}" thrust "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/include -DTHRUST_IMPL=CUDA -DBACKEND=CPP" + + # FIXME CUDA Thrust + TBB throws the following error: + # /usr/lib/gcc/x86_64-linux-gnu/9/include/avx512fintrin.h(9146): error: identifier "__builtin_ia32_rndscaless_round" is undefined + # /usr/lib/gcc/x86_64-linux-gnu/9/include/avx512fintrin.h(9155): error: identifier "__builtin_ia32_rndscalesd_round" is undefined + # /usr/lib/gcc/x86_64-linux-gnu/9/include/avx512fintrin.h(14797): error: identifier "__builtin_ia32_rndscaless_round" is undefined + # /usr/lib/gcc/x86_64-linux-gnu/9/include/avx512fintrin.h(14806): error: identifier "__builtin_ia32_rndscalesd_round" is undefined + # /usr/lib/gcc/x86_64-linux-gnu/9/include/avx512dqintrin.h(1365): error: identifier "__builtin_ia32_fpclassss" is undefined + # /usr/lib/gcc/x86_64-linux-gnu/9/include/avx512dqintrin.h(1372): error: identifier "__builtin_ia32_fpclasssd" is undefined + + # run_build $name "${GCC_CXX:?}" THRUST "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/include -DTHRUST_IMPL=CUDA -DBACKEND=TBB" + else + echo "CMake version ${current} < ${required}, skipping Thrust models" + fi + } build_clang() { local name="clang_build" local cxx="-DCMAKE_CXX_COMPILER=${CLANG_CXX:?}" - run_build $name "${CLANG_CXX:?}" OMP "$cxx" + run_build $name "${CLANG_CXX:?}" omp "$cxx" if [ "${CLANG_OMP_OFFLOAD_AMD:-false}" != "false" ]; then - run_build "amd_$name" "${GCC_CXX:?}" OMP "$cxx -DOFFLOAD=AMD:$AMD_ARCH" + run_build "amd_$name" "${GCC_CXX:?}" omp "$cxx -DOFFLOAD=AMD:$AMD_ARCH" fi if [ "${CLANG_OMP_OFFLOAD_NVIDIA:-false}" != "false" ]; then - run_build "nvidia_$name" "${GCC_CXX:?}" OMP "$cxx -DOFFLOAD=NVIDIA:$NV_ARCH" + run_build "nvidia_$name" "${GCC_CXX:?}" omp "$cxx -DOFFLOAD=NVIDIA:$NV_ARCH" fi - run_build $name "${CLANG_CXX:?}" CUDA 
"$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH" - run_build $name "${CLANG_CXX:?}" CUDA "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DMEM=MANAGED" - run_build $name "${CLANG_CXX:?}" CUDA "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DMEM=PAGEFAULT" - run_build $name "${CLANG_CXX:?}" KOKKOS "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON" - run_build $name "${CLANG_CXX:?}" OCL "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}" - run_build $name "${CLANG_CXX:?}" STD "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" - # run_build $name "${LANG_CXX:?}" STD20 "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" # not yet supported - - run_build $name "${CLANG_CXX:?}" TBB "$cxx -DONE_TBB_DIR=$TBB_LIB" - run_build $name "${CLANG_CXX:?}" TBB "$cxx" # build TBB again with the system TBB + run_build $name "${CLANG_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH" + run_build $name "${CLANG_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DMEM=MANAGED" + run_build $name "${CLANG_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DMEM=PAGEFAULT" + run_build $name "${CLANG_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON" + run_build $name "${CLANG_CXX:?}" ocl "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}" + run_build $name "${CLANG_CXX:?}" std "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" + # run_build $name "${LANG_CXX:?}" std20 "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" # not yet supported + run_build $name "${CLANG_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?}" + run_build $name "${CLANG_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH" + run_build $name "${CLANG_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DMEM=MANAGED" + run_build $name "${CLANG_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH 
-DMEM=PAGEFAULT" + run_build $name "${CLANG_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON" + run_build $name "${CLANG_CXX:?}" ocl "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}" + run_build $name "${CLANG_CXX:?}" std "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" + # run_build $name "${LANG_CXX:?}" std20 "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" # not yet supported - run_build $name "${CLANG_CXX:?}" RAJA "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?}" + run_build $name "${CLANG_CXX:?}" tbb "$cxx -DONE_TBB_DIR=$TBB_LIB" + run_build $name "${CLANG_CXX:?}" tbb "$cxx" # build TBB again with the system TBB + + run_build $name "${CLANG_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?}" # no clang /w RAJA+cuda because it needs nvcc which needs gcc } build_nvhpc() { local name="nvhpc_build" local cxx="-DCMAKE_CXX_COMPILER=${NVHPC_NVCXX:?}" - run_build $name "${NVHPC_NVCXX:?}" STD "$cxx -DNVHPC_OFFLOAD=$NV_ARCH_CCXY" - run_build $name "${NVHPC_NVCXX:?}" ACC "$cxx -DTARGET_DEVICE=gpu -DTARGET_PROCESSOR=px -DCUDA_ARCH=$NV_ARCH_CCXY" - run_build $name "${NVHPC_NVCXX:?}" ACC "$cxx -DTARGET_DEVICE=multicore -DTARGET_PROCESSOR=zen" + run_build $name "${NVHPC_NVCXX:?}" std "$cxx -DNVHPC_OFFLOAD=$NV_ARCH_CCXY" + run_build $name "${NVHPC_NVCXX:?}" acc "$cxx -DTARGET_DEVICE=gpu -DTARGET_PROCESSOR=px -DCUDA_ARCH=$NV_ARCH_CCXY" + run_build $name "${NVHPC_NVCXX:?}" acc "$cxx -DTARGET_DEVICE=multicore -DTARGET_PROCESSOR=zen" } build_aocc() { - run_build aocc_build "${AOCC_CXX:?}" OMP "-DCMAKE_CXX_COMPILER=${AOCC_CXX:?}" + run_build aocc_build "${AOCC_CXX:?}" omp "-DCMAKE_CXX_COMPILER=${AOCC_CXX:?}" } build_aomp() { - run_build aomp_amd_build "${AOMP_CXX:?}" OMP "-DCMAKE_CXX_COMPILER=${AOMP_CXX:?} -DOFFLOAD=AMD:gfx906" + run_build aomp_amd_build "${AOMP_CXX:?}" omp "-DCMAKE_CXX_COMPILER=${AOMP_CXX:?} -DOFFLOAD=AMD:gfx906" #run_build aomp_nvidia_build "-DCMAKE_CXX_COMPILER=${AOMP_CXX:?} -DOFFLOAD=NVIDIA:$NV_ARCH" } build_hip() { - run_build hip_build "${HIP_CXX:?}" 
HIP "-DCMAKE_CXX_COMPILER=${HIP_CXX:?}" + local name="hip_build" + + run_build $name "${HIP_CXX:?}" hip "-DCMAKE_CXX_COMPILER=${HIP_CXX:?}" + + run_build $name "${GCC_CXX:?}" thrust "-DCMAKE_CXX_COMPILER=${HIP_CXX:?} -DSDK_DIR=$ROCM_PATH -DTHRUST_IMPL=ROCM" } build_icpx() { @@ -227,7 +259,7 @@ build_icpx() { set +u source /opt/intel/oneapi/setvars.sh -force || true set -u - run_build intel_build "${ICPX_CXX:?}" OMP "-DCMAKE_CXX_COMPILER=${ICPX_CXX:?} -DOFFLOAD=INTEL" + run_build intel_build "${ICPX_CXX:?}" omp "-DCMAKE_CXX_COMPILER=${ICPX_CXX:?} -DOFFLOAD=INTEL" } build_icpc() { @@ -237,31 +269,31 @@ build_icpc() { set -u local name="intel_build" local cxx="-DCMAKE_CXX_COMPILER=${ICPC_CXX:?}" - run_build $name "${ICPC_CXX:?}" OMP "$cxx" - run_build $name "${ICPC_CXX:?}" OCL "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}" - run_build $name "${ICPC_CXX:?}" RAJA "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?}" - run_build $name "${ICPC_CXX:?}" KOKKOS "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON" + run_build $name "${ICPC_CXX:?}" omp "$cxx" + run_build $name "${ICPC_CXX:?}" ocl "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}" + run_build $name "${ICPC_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?}" + run_build $name "${ICPC_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON" } build_computecpp() { - run_build computecpp_build "compute++" SYCL "-DCMAKE_CXX_COMPILER=${GCC_CXX:?} \ + run_build computecpp_build "compute++" sycl "-DCMAKE_CXX_COMPILER=${GCC_CXX:?} \ -DSYCL_COMPILER=COMPUTECPP \ -DSYCL_COMPILER_DIR=${COMPUTECPP_DIR:?} \ -DOpenCL_LIBRARY=${OCL_LIB:?}" } build_dpcpp() { - run_build intel_build "${DPCPP_DIR:?}" SYCL "-DCMAKE_CXX_COMPILER=${GCC_CXX:?} \ + run_build intel_build "${DPCPP_DIR:?}" sycl "-DCMAKE_CXX_COMPILER=${GCC_CXX:?} \ -DSYCL_COMPILER=DPCPP \ -DSYCL_COMPILER_DIR=${DPCPP_DIR:?}" # for oneAPI BaseKit: # source /opt/intel/oneapi/setvars.sh -force - # run_build intel_build "dpcpp" SYCL "-DCMAKE_CXX_COMPILER=${GCC_CXX:?} 
-DSYCL_COMPILER=ONEAPI-DPCPP" + # run_build intel_build "dpcpp" sycl "-DCMAKE_CXX_COMPILER=${GCC_CXX:?} -DSYCL_COMPILER=ONEAPI-DPCPP" } build_hipsycl() { - run_build hipsycl_build "syclcc" SYCL " + run_build hipsycl_build "syclcc" sycl " -DSYCL_COMPILER=HIPSYCL \ -DSYCL_COMPILER_DIR=${HIPSYCL_DIR:?}" } diff --git a/CUDAStream.cu b/src/cuda/CUDAStream.cu similarity index 100% rename from CUDAStream.cu rename to src/cuda/CUDAStream.cu diff --git a/CUDAStream.h b/src/cuda/CUDAStream.h similarity index 100% rename from CUDAStream.h rename to src/cuda/CUDAStream.h diff --git a/CUDA.cmake b/src/cuda/model.cmake similarity index 100% rename from CUDA.cmake rename to src/cuda/model.cmake diff --git a/HIPStream.cpp b/src/hip/HIPStream.cpp similarity index 100% rename from HIPStream.cpp rename to src/hip/HIPStream.cpp diff --git a/HIPStream.h b/src/hip/HIPStream.h similarity index 100% rename from HIPStream.h rename to src/hip/HIPStream.h diff --git a/HIP.cmake b/src/hip/model.cmake similarity index 100% rename from HIP.cmake rename to src/hip/model.cmake diff --git a/src/java/java-stream/.gitignore b/src/java/java-stream/.gitignore new file mode 100644 index 0000000..2ed994a --- /dev/null +++ b/src/java/java-stream/.gitignore @@ -0,0 +1,128 @@ +## File-based project format: +.idea +*.iws +*.iml + +## Plugin-specific files: + +# IntelliJ +/out/ + +# mpeltonen/sbt-idea plugin +.idea_modules/ + +# JIRA plugin +atlassian-ide-plugin.xml + +# Crashlytics plugin (for Android Studio and IntelliJ) +com_crashlytics_export_strings.xml +crashlytics.properties +crashlytics-build.properties +fabric.properties +### VisualStudioCode template +.vscode/* +!.vscode/settings.json +!.vscode/tasks.json +!.vscode/launch.json +!.vscode/extensions.json +### Linux template +*~ + +# temporary files which can be created if a process still has a handle open of a deleted file +.fuse_hidden* + +# KDE directory preferences +.directory + +# Linux trash folder which might appear on any partition or disk 
+.Trash-* + +# .nfs files are created when an open file is removed but is still being accessed +.nfs* + +# Windows thumbnail cache files +Thumbs.db +ehthumbs.db +ehthumbs_vista.db + +# Folder config file +Desktop.ini + +# Recycle Bin used on file shares +$RECYCLE.BIN/ + +# Windows Installer files +*.cab +*.msi +*.msm +*.msp + +# Windows shortcuts +*.lnk +### Maven template +target/ +pom.xml.tag +pom.xml.releaseBackup +pom.xml.versionsBackup +pom.xml.next +release.properties +dependency-reduced-pom.xml +buildNumber.properties +.mvn/timing.properties + +# Avoid ignoring Maven wrapper jar file (.jar files are usually ignored) +!/.mvn/wrapper/maven-wrapper.jar +### Java template +# Compiled class file +*.class + +# Log file +*.log + +# BlueJ files +*.ctxt + +# Mobile Tools for Java (J2ME) +.mtj.tmp/ + +# Package Files # +*.jar +*.war +*.ear +*.zip +*.tar.gz +*.rar + +# virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml +hs_err_pid* +### macOS template +*.DS_Store +.AppleDouble +.LSOverride + +# Icon must end with two \r +Icon + +# Thumbnails +._* + +# Files that might appear in the root of a volume +.DocumentRevisions-V100 +.fseventsd +.Spotlight-V100 +.TemporaryItems +.Trashes +.VolumeIcon.icns +.com.apple.timemachine.donotpresent + +# Directories potentially created on remote AFP share +.AppleDB +.AppleDesktop +Network Trash Folder +Temporary Items +.apdisk + + +!.mvn/**/* + +settings.xml diff --git a/src/java/java-stream/.mvn/wrapper/maven-wrapper.jar b/src/java/java-stream/.mvn/wrapper/maven-wrapper.jar new file mode 100644 index 0000000..9cc84ea Binary files /dev/null and b/src/java/java-stream/.mvn/wrapper/maven-wrapper.jar differ diff --git a/src/java/java-stream/.mvn/wrapper/maven-wrapper.properties b/src/java/java-stream/.mvn/wrapper/maven-wrapper.properties new file mode 100644 index 0000000..56bb016 --- /dev/null +++ b/src/java/java-stream/.mvn/wrapper/maven-wrapper.properties @@ -0,0 +1 @@ 
+distributionUrl=https://repo1.maven.org/maven2/org/apache/maven/apache-maven/3.5.0/apache-maven-3.5.0-bin.zip \ No newline at end of file diff --git a/src/java/java-stream/README.md b/src/java/java-stream/README.md new file mode 100644 index 0000000..6c233da --- /dev/null +++ b/src/java/java-stream/README.md @@ -0,0 +1,172 @@ +java-stream +=========== + +This is an implementation of BabelStream in Java 8 which contains the following implementations: + +* `jdk-plain` - Single-threaded `for` +* `jdk-stream` - Threaded implementation using JDK8's parallel stream API +* `tornadovm` - A [TornadoVM](https://github.com/beehive-lab/TornadoVM) implementation for + PTX/OpenCL +* `aparapi` - An [Aparapi](https://git.qoto.org/aparapi/aparapi) implementation for OpenCL + +### Build & Run + +Prerequisites + +* JDK >= 8 + +To run the benchmark, first create a binary: + +```shell +> cd java-stream +> ./mvnw clean package +``` + +The binary will be located at `./target/java-stream.jar`. Run it with: + +```shell +> java -version +openjdk version "11.0.11" 2021-04-20 +OpenJDK Runtime Environment GraalVM CE 21.1.0 (build 11.0.11+8-jvmci-21.1-b05) +OpenJDK 64-Bit Server VM GraalVM CE 21.1.0 (build 11.0.11+8-jvmci-21.1-b05, mixed mode) +> java -jar target/java-stream.jar --help +``` + +For best results, benchmark with the following JVM flags: + +``` +-XX:-UseOnStackReplacement # disable OSR, not useful for this benchmark as we are measuring peak performance +-XX:-TieredCompilation # disable C1, go straight to C2 +-XX:ReservedCodeCacheSize=512m # don't flush compiled code out of cache at any point +``` + +Worked example: + +```shell +> java -XX:-UseOnStackReplacement -XX:-TieredCompilation -XX:ReservedCodeCacheSize=512m -jar target/java-stream.jar +BabelStream +Version: 3.4 +Implementation: jdk-stream; (Java 11.0.11;Red Hat, Inc.; home=/usr/lib/jvm/java-11-openjdk-11.0.11.0.9-4.fc33.x86_64) +Running all 100 times +Precision: double +Array 
size: 268.4 MB (=0.3 GB) +Total size: 805.3 MB (=0.8 GB) +Function MBytes/sec Min (sec) Max Average +Copy 17145.538 0.03131 0.04779 0.03413 +Mul 16759.092 0.03203 0.04752 0.03579 +Add 19431.954 0.04144 0.05866 0.04503 +Triad 19763.970 0.04075 0.05388 0.04510 +Dot 26646.894 0.02015 0.03013 0.02259 +``` + +If your OpenCL/CUDA installation is not at the default location, TornadoVM and Aparapi may fail to +detect your devices. In those cases, you may specify the library directly, for example: + +```shell +> LD_PRELOAD=/opt/rocm-4.0.0/opencl/lib/libOpenCL.so.1.2 java -jar target/java-stream.jar ... +``` + +### Instructions for TornadoVM + +The TornadoVM implementation requires you to run the binary with a patched JVM. Follow the +official [instructions](https://github.com/beehive-lab/TornadoVM/blob/master/assembly/src/docs/10_INSTALL_WITH_GRAALVM.md) +or use the following simplified instructions: + +Prerequisites + +* CMake >= 3.6 +* GCC or clang/LLVM (GCC >= 5.5) +* Python >= 2.7 +* Maven >= 3.6.3 +* OpenCL headers >= 1.2 and/or CUDA SDK >= 9.0 + +First, get a copy of the TornadoVM source: + +```shell +> cd +> git clone https://github.com/beehive-lab/TornadoVM tornadovm +``` + +Take note of the required GraalVM version +in `tornadovm/assembly/src/docs/10_INSTALL_WITH_GRAALVM.md`. We'll use `21.1.0` in this example. +Now, obtain a copy of GraalVM and make sure the version matches the one required by TornadoVM: + +```shell +> wget https://github.com/graalvm/graalvm-ce-builds/releases/download/vm-21.1.0/graalvm-ce-java11-linux-amd64-21.1.0.tar.gz +> tar -xf graalvm-ce-java11-linux-amd64-21.1.0.tar.gz +``` + +Next, create `~/tornadovm/etc/sources.env` and populate the file with the following: + +```shell +#!/bin/bash +export JAVA_HOME= +export PATH=$PWD/bin/bin:$PATH +export TORNADO_SDK=$PWD/bin/sdk +export CMAKE_ROOT=/usr # path to CMake binary +``` + +Proceed to compile TornadoVM: + +```shell +> cd ~/tornadovm +> . 
etc/sources.env +> make graal-jdk-11-plus BACKEND={ptx,opencl} +``` + +To test your build, source the environment file from within `~/tornadovm` (its paths are relative to `$PWD`): + +```shell +> source ~/tornadovm/etc/sources.env +> LD_PRELOAD=/opt/rocm-4.0.0/opencl/lib/libOpenCL.so.1.2 tornado --devices +Number of Tornado drivers: 1 +Total number of OpenCL devices : 3 +Tornado device=0:0 + AMD Accelerated Parallel Processing -- gfx1012 + Global Memory Size: 4.0 GB + Local Memory Size: 64.0 KB + Workgroup Dimensions: 3 + Max WorkGroup Configuration: [1024, 1024, 1024] + Device OpenCL C version: OpenCL C 2.0 + +Tornado device=0:1 + Portable Computing Language -- pthread-AMD Ryzen 9 3900X 12-Core Processor + Global Memory Size: 60.7 GB + Local Memory Size: 8.0 MB + Workgroup Dimensions: 3 + Max WorkGroup Configuration: [4096, 4096, 4096] + Device OpenCL C version: OpenCL C 1.2 pocl + +Tornado device=0:2 + NVIDIA CUDA -- NVIDIA GeForce GT 710 + Global Memory Size: 981.3 MB + Local Memory Size: 48.0 KB + Workgroup Dimensions: 3 + Max WorkGroup Configuration: [1024, 1024, 64] + Device OpenCL C version: OpenCL C 1.2 +``` + +You can now use TornadoVM to run java-stream: + +```shell +> tornado -jar ~/java-stream/target/java-stream.jar --impl tornadovm --arraysize 65536 +BabelStream +Version: 3.4 +Implementation: tornadovm; (Java 11.0.11;GraalVM Community; home=~/graalvm-ce-java11-21.1.0) +Running all 100 times +Precision: double +Array size: 0.5 MB (=0.0 GB) +Total size: 1.6 MB (=0.0 GB) +Using TornadoVM device: + - Name : NVIDIA GeForce GT 710 CL_DEVICE_TYPE_GPU (available) + - Id : opencl-0-0 + - Platform : NVIDIA CUDA + - Backend : OpenCL +Function MBytes/sec Min (sec) Max Average +Copy 8791.100 0.00012 0.00079 0.00015 +Mul 8774.107 0.00012 0.00061 0.00014 +Add 9903.313 0.00016 0.00030 0.00018 +Triad 9861.031 0.00016 0.00030 0.00018 +Dot 2799.465 0.00037 0.00056 0.00041 +``` + diff --git a/src/java/java-stream/mvnw b/src/java/java-stream/mvnw new file mode 100755 index 
0000000..5bf251c --- /dev/null +++ b/src/java/java-stream/mvnw @@ -0,0 +1,225 @@ +#!/bin/sh +# ---------------------------------------------------------------------------- +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# ---------------------------------------------------------------------------- + +# ---------------------------------------------------------------------------- +# Maven2 Start Up Batch script +# +# Required ENV vars: +# ------------------ +# JAVA_HOME - location of a JDK home dir +# +# Optional ENV vars +# ----------------- +# M2_HOME - location of maven2's installed home dir +# MAVEN_OPTS - parameters passed to the Java VM when running Maven +# e.g. to debug Maven itself, use +# set MAVEN_OPTS=-Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=y,address=8000 +# MAVEN_SKIP_RC - flag to disable loading of mavenrc files +# ---------------------------------------------------------------------------- + +if [ -z "$MAVEN_SKIP_RC" ] ; then + + if [ -f /etc/mavenrc ] ; then + . /etc/mavenrc + fi + + if [ -f "$HOME/.mavenrc" ] ; then + . "$HOME/.mavenrc" + fi + +fi + +# OS specific support. $var _must_ be set to either true or false. 
+cygwin=false; +darwin=false; +mingw=false +case "`uname`" in + CYGWIN*) cygwin=true ;; + MINGW*) mingw=true;; + Darwin*) darwin=true + # Use /usr/libexec/java_home if available, otherwise fall back to /Library/Java/Home + # See https://developer.apple.com/library/mac/qa/qa1170/_index.html + if [ -z "$JAVA_HOME" ]; then + if [ -x "/usr/libexec/java_home" ]; then + export JAVA_HOME="`/usr/libexec/java_home`" + else + export JAVA_HOME="/Library/Java/Home" + fi + fi + ;; +esac + +if [ -z "$JAVA_HOME" ] ; then + if [ -r /etc/gentoo-release ] ; then + JAVA_HOME=`java-config --jre-home` + fi +fi + +if [ -z "$M2_HOME" ] ; then + ## resolve links - $0 may be a link to maven's home + PRG="$0" + + # need this for relative symlinks + while [ -h "$PRG" ] ; do + ls=`ls -ld "$PRG"` + link=`expr "$ls" : '.*-> \(.*\)$'` + if expr "$link" : '/.*' > /dev/null; then + PRG="$link" + else + PRG="`dirname "$PRG"`/$link" + fi + done + + saveddir=`pwd` + + M2_HOME=`dirname "$PRG"`/.. + + # make it fully qualified + M2_HOME=`cd "$M2_HOME" && pwd` + + cd "$saveddir" + # echo Using m2 at $M2_HOME +fi + +# For Cygwin, ensure paths are in UNIX format before anything is touched +if $cygwin ; then + [ -n "$M2_HOME" ] && + M2_HOME=`cygpath --unix "$M2_HOME"` + [ -n "$JAVA_HOME" ] && + JAVA_HOME=`cygpath --unix "$JAVA_HOME"` + [ -n "$CLASSPATH" ] && + CLASSPATH=`cygpath --path --unix "$CLASSPATH"` +fi + +# For Migwn, ensure paths are in UNIX format before anything is touched +if $mingw ; then + [ -n "$M2_HOME" ] && + M2_HOME="`(cd "$M2_HOME"; pwd)`" + [ -n "$JAVA_HOME" ] && + JAVA_HOME="`(cd "$JAVA_HOME"; pwd)`" + # TODO classpath? +fi + +if [ -z "$JAVA_HOME" ]; then + javaExecutable="`which javac`" + if [ -n "$javaExecutable" ] && ! [ "`expr \"$javaExecutable\" : '\([^ ]*\)'`" = "no" ]; then + # readlink(1) is not available as standard on Solaris 10. + readLink=`which readlink` + if [ ! 
`expr "$readLink" : '\([^ ]*\)'` = "no" ]; then + if $darwin ; then + javaHome="`dirname \"$javaExecutable\"`" + javaExecutable="`cd \"$javaHome\" && pwd -P`/javac" + else + javaExecutable="`readlink -f \"$javaExecutable\"`" + fi + javaHome="`dirname \"$javaExecutable\"`" + javaHome=`expr "$javaHome" : '\(.*\)/bin'` + JAVA_HOME="$javaHome" + export JAVA_HOME + fi + fi +fi + +if [ -z "$JAVACMD" ] ; then + if [ -n "$JAVA_HOME" ] ; then + if [ -x "$JAVA_HOME/jre/sh/java" ] ; then + # IBM's JDK on AIX uses strange locations for the executables + JAVACMD="$JAVA_HOME/jre/sh/java" + else + JAVACMD="$JAVA_HOME/bin/java" + fi + else + JAVACMD="`which java`" + fi +fi + +if [ ! -x "$JAVACMD" ] ; then + echo "Error: JAVA_HOME is not defined correctly." >&2 + echo " We cannot execute $JAVACMD" >&2 + exit 1 +fi + +if [ -z "$JAVA_HOME" ] ; then + echo "Warning: JAVA_HOME environment variable is not set." +fi + +CLASSWORLDS_LAUNCHER=org.codehaus.plexus.classworlds.launcher.Launcher + +# traverses directory structure from process work directory to filesystem root +# first directory with .mvn subdirectory is considered project base directory +find_maven_basedir() { + + if [ -z "$1" ] + then + echo "Path not specified to find_maven_basedir" + return 1 + fi + + basedir="$1" + wdir="$1" + while [ "$wdir" != '/' ] ; do + if [ -d "$wdir"/.mvn ] ; then + basedir=$wdir + break + fi + # workaround for JBEAP-8937 (on Solaris 10/Sparc) + if [ -d "${wdir}" ]; then + wdir=`cd "$wdir/.."; pwd` + fi + # end of workaround + done + echo "${basedir}" +} + +# concatenates all lines of a file +concat_lines() { + if [ -f "$1" ]; then + echo "$(tr -s '\n' ' ' < "$1")" + fi +} + +BASE_DIR=`find_maven_basedir "$(pwd)"` +if [ -z "$BASE_DIR" ]; then + exit 1; +fi + +export MAVEN_PROJECTBASEDIR=${MAVEN_BASEDIR:-"$BASE_DIR"} +echo $MAVEN_PROJECTBASEDIR +MAVEN_OPTS="$(concat_lines "$MAVEN_PROJECTBASEDIR/.mvn/jvm.config") $MAVEN_OPTS" + +# For Cygwin, switch paths to Windows format before running java +if 
$cygwin; then + [ -n "$M2_HOME" ] && + M2_HOME=`cygpath --path --windows "$M2_HOME"` + [ -n "$JAVA_HOME" ] && + JAVA_HOME=`cygpath --path --windows "$JAVA_HOME"` + [ -n "$CLASSPATH" ] && + CLASSPATH=`cygpath --path --windows "$CLASSPATH"` + [ -n "$MAVEN_PROJECTBASEDIR" ] && + MAVEN_PROJECTBASEDIR=`cygpath --path --windows "$MAVEN_PROJECTBASEDIR"` +fi + +WRAPPER_LAUNCHER=org.apache.maven.wrapper.MavenWrapperMain + +exec "$JAVACMD" \ + $MAVEN_OPTS \ + -classpath "$MAVEN_PROJECTBASEDIR/.mvn/wrapper/maven-wrapper.jar" \ + "-Dmaven.home=${M2_HOME}" "-Dmaven.multiModuleProjectDirectory=${MAVEN_PROJECTBASEDIR}" \ + ${WRAPPER_LAUNCHER} $MAVEN_CONFIG "$@" diff --git a/src/java/java-stream/mvnw.cmd b/src/java/java-stream/mvnw.cmd new file mode 100644 index 0000000..019bd74 --- /dev/null +++ b/src/java/java-stream/mvnw.cmd @@ -0,0 +1,143 @@ +@REM ---------------------------------------------------------------------------- +@REM Licensed to the Apache Software Foundation (ASF) under one +@REM or more contributor license agreements. See the NOTICE file +@REM distributed with this work for additional information +@REM regarding copyright ownership. The ASF licenses this file +@REM to you under the Apache License, Version 2.0 (the +@REM "License"); you may not use this file except in compliance +@REM with the License. You may obtain a copy of the License at +@REM +@REM http://www.apache.org/licenses/LICENSE-2.0 +@REM +@REM Unless required by applicable law or agreed to in writing, +@REM software distributed under the License is distributed on an +@REM "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +@REM KIND, either express or implied. See the License for the +@REM specific language governing permissions and limitations +@REM under the License. 
+@REM ---------------------------------------------------------------------------- + +@REM ---------------------------------------------------------------------------- +@REM Maven2 Start Up Batch script +@REM +@REM Required ENV vars: +@REM JAVA_HOME - location of a JDK home dir +@REM +@REM Optional ENV vars +@REM M2_HOME - location of maven2's installed home dir +@REM MAVEN_BATCH_ECHO - set to 'on' to enable the echoing of the batch commands +@REM MAVEN_BATCH_PAUSE - set to 'on' to wait for a key stroke before ending +@REM MAVEN_OPTS - parameters passed to the Java VM when running Maven +@REM e.g. to debug Maven itself, use +@REM set MAVEN_OPTS=-Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=y,address=8000 +@REM MAVEN_SKIP_RC - flag to disable loading of mavenrc files +@REM ---------------------------------------------------------------------------- + +@REM Begin all REM lines with '@' in case MAVEN_BATCH_ECHO is 'on' +@echo off +@REM enable echoing my setting MAVEN_BATCH_ECHO to 'on' +@if "%MAVEN_BATCH_ECHO%" == "on" echo %MAVEN_BATCH_ECHO% + +@REM set %HOME% to equivalent of $HOME +if "%HOME%" == "" (set "HOME=%HOMEDRIVE%%HOMEPATH%") + +@REM Execute a user defined script before this one +if not "%MAVEN_SKIP_RC%" == "" goto skipRcPre +@REM check for pre script, once with legacy .bat ending and once with .cmd ending +if exist "%HOME%\mavenrc_pre.bat" call "%HOME%\mavenrc_pre.bat" +if exist "%HOME%\mavenrc_pre.cmd" call "%HOME%\mavenrc_pre.cmd" +:skipRcPre + +@setlocal + +set ERROR_CODE=0 + +@REM To isolate internal variables from possible post scripts, we use another setlocal +@setlocal + +@REM ==== START VALIDATION ==== +if not "%JAVA_HOME%" == "" goto OkJHome + +echo. +echo Error: JAVA_HOME not found in your environment. >&2 +echo Please set the JAVA_HOME variable in your environment to match the >&2 +echo location of your Java installation. >&2 +echo. +goto error + +:OkJHome +if exist "%JAVA_HOME%\bin\java.exe" goto init + +echo. 
+echo Error: JAVA_HOME is set to an invalid directory. >&2 +echo JAVA_HOME = "%JAVA_HOME%" >&2 +echo Please set the JAVA_HOME variable in your environment to match the >&2 +echo location of your Java installation. >&2 +echo. +goto error + +@REM ==== END VALIDATION ==== + +:init + +@REM Find the project base dir, i.e. the directory that contains the folder ".mvn". +@REM Fallback to current working directory if not found. + +set MAVEN_PROJECTBASEDIR=%MAVEN_BASEDIR% +IF NOT "%MAVEN_PROJECTBASEDIR%"=="" goto endDetectBaseDir + +set EXEC_DIR=%CD% +set WDIR=%EXEC_DIR% +:findBaseDir +IF EXIST "%WDIR%"\.mvn goto baseDirFound +cd .. +IF "%WDIR%"=="%CD%" goto baseDirNotFound +set WDIR=%CD% +goto findBaseDir + +:baseDirFound +set MAVEN_PROJECTBASEDIR=%WDIR% +cd "%EXEC_DIR%" +goto endDetectBaseDir + +:baseDirNotFound +set MAVEN_PROJECTBASEDIR=%EXEC_DIR% +cd "%EXEC_DIR%" + +:endDetectBaseDir + +IF NOT EXIST "%MAVEN_PROJECTBASEDIR%\.mvn\jvm.config" goto endReadAdditionalConfig + +@setlocal EnableExtensions EnableDelayedExpansion +for /F "usebackq delims=" %%a in ("%MAVEN_PROJECTBASEDIR%\.mvn\jvm.config") do set JVM_CONFIG_MAVEN_PROPS=!JVM_CONFIG_MAVEN_PROPS! 
%%a +@endlocal & set JVM_CONFIG_MAVEN_PROPS=%JVM_CONFIG_MAVEN_PROPS% + +:endReadAdditionalConfig + +SET MAVEN_JAVA_EXE="%JAVA_HOME%\bin\java.exe" + +set WRAPPER_JAR="%MAVEN_PROJECTBASEDIR%\.mvn\wrapper\maven-wrapper.jar" +set WRAPPER_LAUNCHER=org.apache.maven.wrapper.MavenWrapperMain + +%MAVEN_JAVA_EXE% %JVM_CONFIG_MAVEN_PROPS% %MAVEN_OPTS% %MAVEN_DEBUG_OPTS% -classpath %WRAPPER_JAR% "-Dmaven.multiModuleProjectDirectory=%MAVEN_PROJECTBASEDIR%" %WRAPPER_LAUNCHER% %MAVEN_CONFIG% %* +if ERRORLEVEL 1 goto error +goto end + +:error +set ERROR_CODE=1 + +:end +@endlocal & set ERROR_CODE=%ERROR_CODE% + +if not "%MAVEN_SKIP_RC%" == "" goto skipRcPost +@REM check for post script, once with legacy .bat ending and once with .cmd ending +if exist "%HOME%\mavenrc_post.bat" call "%HOME%\mavenrc_post.bat" +if exist "%HOME%\mavenrc_post.cmd" call "%HOME%\mavenrc_post.cmd" +:skipRcPost + +@REM pause the script if MAVEN_BATCH_PAUSE is set to 'on' +if "%MAVEN_BATCH_PAUSE%" == "on" pause + +if "%MAVEN_TERMINATE_CMD%" == "on" exit %ERROR_CODE% + +exit /B %ERROR_CODE% diff --git a/src/java/java-stream/pom.xml b/src/java/java-stream/pom.xml new file mode 100644 index 0000000..ffaee72 --- /dev/null +++ b/src/java/java-stream/pom.xml @@ -0,0 +1,133 @@ + + + + 4.0.0 + + java-stream + javastream + 3.4.0 + + + UTF-8 + UTF-8 + 5.7.2 + + + + + universityOfManchester-graal + https://raw.githubusercontent.com/beehive-lab/tornado/maven-tornadovm + + + + + + + com.beust + jcommander + 1.81 + + + + tornado + tornado-api + 0.9 + + + + com.aparapi + aparapi + 2.0.0 + + + + org.scala-lang + scala-library + + + + + + org.junit.jupiter + junit-jupiter-engine + ${junit.version} + test + + + org.junit.jupiter + junit-jupiter-params + ${junit.version} + test + + + + + + + + maven-compiler-plugin + 3.8.1 + + 1.8 + 1.8 + -Xlint:all + true + true + + + + org.apache.maven.plugins + maven-surefire-plugin + 3.0.0-M5 + + + + + maven-shade-plugin + 3.2.4 + + + package + + shade + + + + + javastream.Main + + + + + 
*:* + + META-INF/*.MF + + + + ${project.artifactId} + + + + + + + com.coveo + fmt-maven-plugin + 2.9.1 + + + + format + + + + + + + + + \ No newline at end of file diff --git a/src/java/java-stream/src/main/java/javastream/FractionalMaths.java b/src/java/java-stream/src/main/java/javastream/FractionalMaths.java new file mode 100644 index 0000000..982a28a --- /dev/null +++ b/src/java/java-stream/src/main/java/javastream/FractionalMaths.java @@ -0,0 +1,45 @@ +package javastream; + +/** + * This class represents our Fractional typeclass. Java's type system isn't unified so we have to do + * insane things for parametric operations on fractional types. + */ +@SuppressWarnings("unchecked") +public final class FractionalMaths { + + private FractionalMaths() { + throw new AssertionError(); + } + + public static T from(Class evidence, Number n) { + if (evidence == Double.TYPE || evidence == Double.class) + return (T) Double.valueOf(n.doubleValue()); + else if (evidence == Float.TYPE || evidence == Float.class) + return (T) Float.valueOf(n.floatValue()); + throw new IllegalArgumentException(); + } + + public static T plus(T x, T y) { + if (x instanceof Double) return (T) Double.valueOf(x.doubleValue() + y.doubleValue()); + else if (x instanceof Float) return (T) Float.valueOf(x.floatValue() + y.floatValue()); + throw new IllegalArgumentException(); + } + + static T minus(T x, T y) { + if (x instanceof Double) return (T) Double.valueOf(x.doubleValue() - y.doubleValue()); + else if (x instanceof Float) return (T) Float.valueOf(x.floatValue() - y.floatValue()); + throw new IllegalArgumentException(); + } + + public static T times(T x, T y) { + if (x instanceof Double) return (T) Double.valueOf(x.doubleValue() * y.doubleValue()); + else if (x instanceof Float) return (T) Float.valueOf(x.floatValue() * y.floatValue()); + throw new IllegalArgumentException(); + } + + static T divide(T x, T y) { + if (x instanceof Double) return (T) Double.valueOf(x.doubleValue() / 
y.doubleValue()); + else if (x instanceof Float) return (T) Float.valueOf(x.floatValue() / y.floatValue()); + throw new IllegalArgumentException(); + } +} diff --git a/src/java/java-stream/src/main/java/javastream/JavaStream.java b/src/java/java-stream/src/main/java/javastream/JavaStream.java new file mode 100644 index 0000000..7ab96cb --- /dev/null +++ b/src/java/java-stream/src/main/java/javastream/JavaStream.java @@ -0,0 +1,172 @@ +package javastream; + +import java.time.Duration; +import java.util.AbstractMap; +import java.util.AbstractMap.SimpleImmutableEntry; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Map.Entry; +import java.util.Objects; +import java.util.function.Function; +import java.util.function.Supplier; +import java.util.stream.Collectors; +import java.util.stream.IntStream; +import javastream.Main.Config; + +public abstract class JavaStream { + + public static final class Data { + final T[] a, b, c; + + public Data(T[] a, T[] b, T[] c) { + this.a = Objects.requireNonNull(a); + this.b = Objects.requireNonNull(b); + this.c = Objects.requireNonNull(c); + } + } + + static final class Timings { + final List copy = new ArrayList<>(); + final List mul = new ArrayList<>(); + final List add = new ArrayList<>(); + final List triad = new ArrayList<>(); + final List dot = new ArrayList<>(); + } + + protected final Config config; + + protected JavaStream(Config config) { + this.config = config; + } + + protected abstract List listDevices(); + + protected abstract void initArrays(); + + protected abstract void copy(); + + protected abstract void mul(); + + protected abstract void add(); + + protected abstract void triad(); + + protected abstract void nstream(); + + protected abstract T dot(); + + protected abstract Data data(); + + public static class EnumeratedStream extends JavaStream { + + protected final JavaStream actual; + private final Entry, JavaStream>>[] options; + + @SafeVarargs + 
@SuppressWarnings("varargs") + public EnumeratedStream( + Config config, Entry, JavaStream>>... options) { + super(config); + this.actual = options[config.options.device].getValue().apply(config); + this.options = options; + } + + @Override + protected List listDevices() { + return Arrays.stream(options).map(Entry::getKey).collect(Collectors.toList()); + } + + @Override + public void initArrays() { + actual.initArrays(); + } + + @Override + public void copy() { + actual.copy(); + } + + @Override + public void mul() { + actual.mul(); + } + + @Override + public void add() { + actual.add(); + } + + @Override + public void triad() { + actual.triad(); + } + + @Override + public void nstream() { + actual.nstream(); + } + + @Override + public T dot() { + return actual.dot(); + } + + @Override + public Data data() { + return actual.data(); + } + } + + public static Double[] boxed(double[] xs) { + return Arrays.stream(xs).boxed().toArray(Double[]::new); + } + + public static Float[] boxed(float[] xs) { + return IntStream.range(0, xs.length).mapToObj(i -> xs[i]).toArray(Float[]::new); + } + + private static AbstractMap.SimpleImmutableEntry timed(Supplier f) { + long start = System.nanoTime(); + T r = f.get(); + long end = System.nanoTime(); + return new AbstractMap.SimpleImmutableEntry<>(Duration.ofNanos(end - start), r); + } + + private static Duration timed(Runnable f) { + long start = System.nanoTime(); + f.run(); + long end = System.nanoTime(); + return Duration.ofNanos(end - start); + } + + final SimpleImmutableEntry, T> runAll(int times) { + Timings timings = new Timings<>(); + T lastSum = null; + for (int i = 0; i < times; i++) { + timings.copy.add(timed(this::copy)); + timings.mul.add(timed(this::mul)); + timings.add.add(timed(this::add)); + timings.triad.add(timed(this::triad)); + SimpleImmutableEntry dot = timed(this::dot); + timings.dot.add(dot.getKey()); + lastSum = dot.getValue(); + } + return new SimpleImmutableEntry<>(timings, lastSum); + } + + final Duration 
runTriad(int times) { + return timed( + () -> { + for (int i = 0; i < times; i++) { + triad(); + } + }); + } + + final List runNStream(int times) { + return IntStream.range(0, times) + .mapToObj(i -> timed(this::nstream)) + .collect(Collectors.toList()); + } +} diff --git a/src/java/java-stream/src/main/java/javastream/Main.java b/src/java/java-stream/src/main/java/javastream/Main.java new file mode 100644 index 0000000..32b67a4 --- /dev/null +++ b/src/java/java-stream/src/main/java/javastream/Main.java @@ -0,0 +1,425 @@ +package javastream; + +import static javastream.FractionalMaths.divide; +import static javastream.FractionalMaths.from; +import static javastream.FractionalMaths.minus; +import static javastream.FractionalMaths.plus; +import static javastream.FractionalMaths.times; + +import com.beust.jcommander.JCommander; +import com.beust.jcommander.Parameter; +import java.time.Duration; +import java.util.AbstractMap.SimpleImmutableEntry; +import java.util.Arrays; +import java.util.DoubleSummaryStatistics; +import java.util.List; +import java.util.Map.Entry; +import java.util.Objects; +import java.util.Optional; +import java.util.concurrent.TimeUnit; +import java.util.function.Function; +import java.util.stream.Collectors; +import javastream.JavaStream.Data; +import javastream.JavaStream.Timings; +import javastream.aparapi.AparapiStreams; +import javastream.jdk.JdkStreams; +import javastream.jdk.PlainStream; +import javastream.tornadovm.TornadoVMStreams; + +public class Main { + + enum Benchmark { + NSTREAM, + TRIAD, + ALL + } + + public static class Options { + + @Parameter(names = "--list", description = "List available devices for all implementations") + boolean list = false; + + @Parameter( + names = "--device", + description = "Select device at , see --list for options") + public int device = 0; + + @Parameter( + names = "--impl", + description = "Select implementation at , see --list for options") + public String impl = ""; + + @Parameter( + names = 
{"--numtimes", "-n"}, + description = "Run the test times (NUM >= 2)") + public int numtimes = 100; + + @Parameter( + names = {"--arraysize", "-s"}, + description = "Use elements in the array") + public int arraysize = 33554432; + + @Parameter(names = "--float", description = "Use floats (rather than doubles)") + public boolean useFloat = false; + + @Parameter(names = "--triad-only", description = "Only run triad") + public boolean triadOnly = false; + + @Parameter(names = "--nstream-only", description = "Only run nstream") + public boolean nstreamOnly = false; + + @Parameter(names = "--csv", description = "Output as csv table") + public boolean csv = false; + + @Parameter( + names = "--mibibytes", + description = "Use MiB=2^20 for bandwidth calculation (default MB=10^6)") + public boolean mibibytes = false; + + @Parameter(names = "--dot-tolerance", description = "Tolerance for dot kernel verification") + public double dotTolerance = 1.0e-8; + + public boolean isVerboseBenchmark() { + return !list && !csv; + } + } + + public static final class Config { + public final Options options; + public final Benchmark benchmark; + public final int typeSize; + public final Class evidence; + public final T ulp, scalar, initA, initB, initC; + + public Config( + Options options, + Benchmark benchmark, + int typeSize, + Class evidence, + T ulp, + T scalar, + T initA, + T initB, + T initC) { + this.options = Objects.requireNonNull(options); + this.benchmark = Objects.requireNonNull(benchmark); + this.typeSize = typeSize; + this.evidence = Objects.requireNonNull(evidence); + this.ulp = Objects.requireNonNull(ulp); + this.scalar = Objects.requireNonNull(scalar); + this.initA = Objects.requireNonNull(initA); + this.initB = Objects.requireNonNull(initB); + this.initC = Objects.requireNonNull(initC); + } + } + + static final class Implementation { + final String name; + final Function, JavaStream> makeFloat; + final Function, JavaStream> makeDouble; + + Implementation( + String name, + 
Function, JavaStream> makeFloat, + Function, JavaStream> makeDouble) { + this.name = Objects.requireNonNull(name); + this.makeFloat = Objects.requireNonNull(makeFloat); + this.makeDouble = Objects.requireNonNull(makeDouble); + } + } + + static boolean run( // prints the run header (unless --csv), executes the selected benchmark, returns true when validation passed + String name, Config config, Function, JavaStream> mkStream) { + + Options opt = config.options; + + int arrayBytes = opt.arraysize * config.typeSize; // NOTE(review): int arithmetic, wraps once a single array reaches 2 GiB + int totalBytes = arrayBytes * 3; + + String megaSuffix = opt.mibibytes ? "MiB" : "MB"; + String gigaSuffix = opt.mibibytes ? "GiB" : "GB"; + + double megaScale = opt.mibibytes ? Math.pow(2.0, -20) : 1.0e-6; + double gigaScale = opt.mibibytes ? Math.pow(2.0, -30) : 1.0e-9; + + if (!opt.csv) { + + String vendor = System.getProperty("java.vendor"); + String ver = System.getProperty("java.version"); + String home = System.getProperty("java.home"); + + System.out.println("BabelStream"); + System.out.printf("Version: %s%n", VERSION); + System.out.printf( + "Implementation: %s (Java %s; %s; JAVA_HOME=%s)%n", name, ver, vendor, home); + final String benchmarkName; + switch (config.benchmark) { + case NSTREAM: + benchmarkName = "nstream"; + break; + case TRIAD: + benchmarkName = "triad"; + break; + case ALL: + benchmarkName = "all"; + break; + default: + throw new AssertionError("Unexpected value: " + config.benchmark); + } + System.out.println("Running " + benchmarkName + " " + opt.numtimes + " times"); + + if (config.benchmark == Benchmark.TRIAD) { + System.out.println("Number of elements: " + opt.arraysize); + } + + System.out.println("Precision: " + (opt.useFloat ? 
"float" : "double")); + System.out.printf( + "Array size: %.1f %s (=%.1f %s)%n", + (megaScale * arrayBytes), megaSuffix, (gigaScale * arrayBytes), gigaSuffix); + System.out.printf( + "Total size: %.1f %s (=%.1f %s)%n", + (megaScale * totalBytes), megaSuffix, (gigaScale * totalBytes), gigaSuffix); + } + + JavaStream stream = mkStream.apply(config); + + stream.initArrays(); + + final boolean ok; + switch (config.benchmark) { + case ALL: + Entry, T> results = stream.runAll(opt.numtimes); + ok = checkSolutions(stream.data(), config, Optional.of(results.getValue())); + Timings timings = results.getKey(); + tabulateCsv( + opt.csv, + mkCsvRow(timings.copy, "Copy", 2 * arrayBytes, megaScale, opt), + mkCsvRow(timings.mul, "Mul", 2 * arrayBytes, megaScale, opt), + mkCsvRow(timings.add, "Add", 3 * arrayBytes, megaScale, opt), + mkCsvRow(timings.triad, "Triad", 3 * arrayBytes, megaScale, opt), + mkCsvRow(timings.dot, "Dot", 2 * arrayBytes, megaScale, opt)); + break; + case NSTREAM: + List nstreamResults = stream.runNStream(opt.numtimes); + ok = checkSolutions(stream.data(), config, Optional.empty()); + tabulateCsv(opt.csv, mkCsvRow(nstreamResults, "Nstream", 4 * arrayBytes, megaScale, opt)); + break; + case TRIAD: + Duration triadResult = stream.runTriad(opt.numtimes); + ok = checkSolutions(stream.data(), config, Optional.empty()); + long triadTotalBytes = 3L * arrayBytes * opt.numtimes; // long: 3 * bytes * runs exceeds the int range at the default sizes + double bandwidth = gigaScale * (triadTotalBytes / durationToSeconds(triadResult)); // giga scale, to agree with the gigaSuffix label printed below + System.out.printf("Runtime (seconds): %.5f%n", durationToSeconds(triadResult)); + System.out.printf("Bandwidth (%s/s): %.3f%n", gigaSuffix, bandwidth); + break; + default: + throw new AssertionError(); + } + return ok; + } + + private static boolean checkWithinTolerance( + String name, T[] xs, T gold, T tolerance) { + // it's ok to default to double for error calculation + double error = + Arrays.stream(xs) + .mapToDouble(x -> Math.abs(minus(x, gold).doubleValue())) + .summaryStatistics() + .getAverage(); + 
boolean failed = error > tolerance.doubleValue(); + if (failed) { + System.err.printf("Validation failed on %s. Average error %s%n", name, error); + } + return !failed; + } + + @SuppressWarnings("OptionalUsedAsFieldOrParameterType") + static boolean checkSolutions( + Data data, Config config, Optional dotSum) { + T goldA = config.initA; + T goldB = config.initB; + T goldC = config.initC; + + for (int i = 0; i < config.options.numtimes; i++) { + switch (config.benchmark) { + case ALL: + goldC = goldA; + goldB = times(config.scalar, goldC); + goldC = plus(goldA, goldB); + goldA = plus(goldB, times(config.scalar, goldC)); + break; + case TRIAD: + goldA = plus(goldB, times(config.scalar, goldC)); + break; + case NSTREAM: + goldA = plus(goldA, plus(goldB, times(config.scalar, goldC))); + break; + } + } + + T tolerance = times(config.ulp, from(config.evidence, 100)); + boolean aValid = checkWithinTolerance("a", data.a, goldA, tolerance); + boolean bValid = checkWithinTolerance("b", data.b, goldB, tolerance); + boolean cValid = checkWithinTolerance("c", data.c, goldC, tolerance); + + final T finalGoldA = goldA; + final T finalGoldB = goldB; + boolean sumValid = + dotSum + .map( + actual -> { + T goldSum = + times( + times(finalGoldA, finalGoldB), + from(config.evidence, config.options.arraysize)); + double error = Math.abs(divide(minus(actual, goldSum), goldSum).doubleValue()); + boolean failed = error > config.options.dotTolerance; + if (failed) { + System.err.printf( + "Validation failed on sum. 
Error %s \nSum was %s but should be %s%n", + error, actual, goldSum); + } + return !failed; + }) + .orElse(true); + + return aValid && bValid && cValid && sumValid; + } + + private static double durationToSeconds(Duration d) { + return d.toNanos() / (double) TimeUnit.SECONDS.toNanos(1); + } + + private static List> mkCsvRow( + List xs, String name, int totalBytes, double megaScale, Options opt) { + DoubleSummaryStatistics stats = + xs.stream().skip(1).mapToDouble(Main::durationToSeconds).summaryStatistics(); + if (stats.getCount() <= 0) { + throw new IllegalArgumentException("No min/max for " + name + "(size=" + totalBytes + ")"); + } + double mbps = megaScale * (double) totalBytes / stats.getMin(); + return opt.csv + ? Arrays.asList( + new SimpleImmutableEntry<>("function", name), + new SimpleImmutableEntry<>("num_times", opt.numtimes + ""), + new SimpleImmutableEntry<>("n_elements", opt.arraysize + ""), + new SimpleImmutableEntry<>("sizeof", totalBytes + ""), + new SimpleImmutableEntry<>( + "max_m" + (opt.mibibytes ? "i" : "") + "bytes_per_sec", mbps + ""), + new SimpleImmutableEntry<>("min_runtime", stats.getMin() + ""), + new SimpleImmutableEntry<>("max_runtime", stats.getMax() + ""), + new SimpleImmutableEntry<>("avg_runtime", stats.getAverage() + "")) + : Arrays.asList( + new SimpleImmutableEntry<>("Function", name), + new SimpleImmutableEntry<>( + "M" + (opt.mibibytes ? "i" : "") + "Bytes/sec", String.format("%.3f", mbps)), + new SimpleImmutableEntry<>("Min (sec)", String.format("%.5f", stats.getMin())), + new SimpleImmutableEntry<>("Max", String.format("%.5f", stats.getMax())), + new SimpleImmutableEntry<>("Average", String.format("%.5f", stats.getAverage()))); + } + + private static String padSpace(String s, int length) { + if (length == 0) return s; + return String.format("%1$-" + length + "s", s); + } + + @SafeVarargs + @SuppressWarnings("varargs") + private static void tabulateCsv(boolean csv, List>... 
rows) { + if (rows.length == 0) throw new IllegalArgumentException("Empty tabulation"); + int padding = csv ? 0 : 12; + String sep = csv ? "," : ""; + System.out.println( + rows[0].stream().map(x -> padSpace(x.getKey(), padding)).collect(Collectors.joining(sep))); + for (List> row : rows) { + System.out.println( + row.stream().map(x -> padSpace(x.getValue(), padding)).collect(Collectors.joining(sep))); + } + } + + private static final String VERSION = "3.4"; + + private static final float START_SCALAR = 0.4f; + private static final float START_A = 0.1f; + private static final float START_B = 0.2f; + private static final float START_C = 0.0f; + + private static final List IMPLEMENTATIONS = + Arrays.asList( + new Implementation("jdk-stream", JdkStreams.FLOAT, JdkStreams.DOUBLE), + new Implementation("jdk-plain", PlainStream.FLOAT, PlainStream.DOUBLE), + new Implementation("tornadovm", TornadoVMStreams.FLOAT, TornadoVMStreams.DOUBLE), + new Implementation("aparapi", AparapiStreams.FLOAT, AparapiStreams.DOUBLE)); + + public static int run(String[] args) { + Options opt = new Options(); + JCommander.newBuilder().addObject(opt).build().parse(args); + + final Benchmark benchmark; + if (opt.nstreamOnly && opt.triadOnly) + throw new RuntimeException( + "Both triad and nstream are enabled, pick one or omit both to run all benchmarks"); + else if (opt.nstreamOnly) benchmark = Benchmark.NSTREAM; + else if (opt.triadOnly) benchmark = Benchmark.TRIAD; + else benchmark = Benchmark.ALL; + + final Config floatConfig = + new Config<>( + opt, + benchmark, + Float.BYTES, + Float.class, // XXX not Float.TYPE, we want the boxed one + Math.ulp(1.f), + START_SCALAR, + START_A, + START_B, + START_C); + final Config doubleConfig = + new Config<>( + opt, + benchmark, + Double.BYTES, + Double.class, // XXX not Double.TYPE, we want the boxed one + Math.ulp(1.d), + (double) START_SCALAR, + (double) START_A, + (double) START_B, + (double) START_C); + + if (opt.list) { + System.out.println("Set 
implementation with --impl and device with --device :"); + for (Implementation entry : IMPLEMENTATIONS) { + System.out.println("Implementation: " + entry.name); + try { + List devices = entry.makeDouble.apply(doubleConfig).listDevices(); + for (int i = 0; i < devices.size(); i++) { + System.out.println("\t[" + i + "] " + devices.get(i)); + } + } catch (Exception e) { + System.out.println("\t(Unsupported: " + e.getMessage() + ")"); + } + } + return 0; + } + + String implName = (opt.impl.isEmpty()) ? IMPLEMENTATIONS.get(0).name : opt.impl; + Implementation impl = + IMPLEMENTATIONS.stream() + .filter(x -> implName.compareToIgnoreCase(x.name) == 0) + .findFirst() + .orElseThrow( + () -> + new IllegalArgumentException("Implementation " + opt.impl + " does not exist")); + + boolean ok = + opt.useFloat + ? run(impl.name, floatConfig, impl.makeFloat) + : run(impl.name, doubleConfig, impl.makeDouble); + + return ok ? 0 : 1; + } + + public static void main(String[] args) { + System.exit(run(args)); + } +} diff --git a/src/java/java-stream/src/main/java/javastream/aparapi/AparapiStreams.java b/src/java/java-stream/src/main/java/javastream/aparapi/AparapiStreams.java new file mode 100644 index 0000000..ab2de52 --- /dev/null +++ b/src/java/java-stream/src/main/java/javastream/aparapi/AparapiStreams.java @@ -0,0 +1,129 @@ +package javastream.aparapi; + +import com.aparapi.device.Device; +import com.aparapi.device.Device.TYPE; +import com.aparapi.device.JavaDevice; +import com.aparapi.device.OpenCLDevice; +import com.aparapi.internal.kernel.KernelManager; +import java.util.Collection; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.function.Function; +import java.util.stream.Collectors; +import java.util.stream.Stream; +import javastream.JavaStream; +import javastream.Main.Config; + +public final class AparapiStreams { + + private AparapiStreams() {} + + public static final Function, JavaStream> DOUBLE = + config -> new Generic<>(config, 
SpecialisedDoubleKernel::new); + + public static final Function, JavaStream> FLOAT = + config -> new Generic<>(config, SpecialisedFloatKernel::new); + + private static List enumerateDevices() { + + // JavaDevice.SEQUENTIAL doesn't work when arraysize > 1, so we omit it entirely + Stream cpuDevices = Stream.of(JavaDevice.ALTERNATIVE_ALGORITHM); + + Stream clDevices = + Stream.of(TYPE.values()).map(OpenCLDevice::listDevices).flatMap(Collection::stream); + + return Stream.concat(clDevices, cpuDevices).collect(Collectors.toList()); + } + + private static String deviceName(Device device) { + return device.toString(); + } + + private static final class Generic extends JavaStream { + + private final GenericAparapiStreamKernel kernels; + + Generic(Config config, GenericAparapiStreamKernel.Factory factory) { + super(config); + Device device = enumerateDevices().get(config.options.device); + + final int numGroups; + final int workGroupSize; + if (device instanceof JavaDevice) { + numGroups = Runtime.getRuntime().availableProcessors(); + workGroupSize = + config.typeSize * 2; // closest thing to CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE + + } else if (device instanceof OpenCLDevice) { + numGroups = ((OpenCLDevice) device).getMaxComputeUnits(); + workGroupSize = device.getMaxWorkGroupSize(); + } else { + throw new AssertionError("Unknown device type " + device.getClass()); + } + + if (config.options.isVerboseBenchmark()) { + System.out.println("Using Aparapi OpenCL device: " + device); + System.out.println(" - numGroups : " + numGroups); + System.out.println(" - workGroupSize : " + workGroupSize); + String showCL = System.getProperty("com.aparapi.enableShowGeneratedOpenCL"); + if (showCL == null || !showCL.equals("true")) { + System.out.println( + "(Add `-Dcom.aparapi.enableShowGeneratedOpenCL=true` to show generated OpenCL source)"); + } + } + + LinkedHashSet candidate = new LinkedHashSet<>(); + candidate.add(device); + + kernels = factory.create(config, numGroups, workGroupSize); 
+ KernelManager.instance().setPreferredDevices(kernels, candidate); + } + + @Override + public List listDevices() { + return enumerateDevices().stream() + .map(AparapiStreams::deviceName) + .collect(Collectors.toList()); + } + + @Override + public void initArrays() { + kernels.init(); + } + + @Override + public void copy() { + kernels.copy(); + } + + @Override + public void mul() { + kernels.mul(); + } + + @Override + public void add() { + kernels.add(); + } + + @Override + public void triad() { + kernels.triad(); + } + + @Override + public void nstream() { + kernels.nstream(); + } + + @Override + public T dot() { + return kernels.dot(); + } + + @Override + public Data data() { + return kernels.syncAndDispose(); + } + } +} diff --git a/src/java/java-stream/src/main/java/javastream/aparapi/GenericAparapiStreamKernel.java b/src/java/java-stream/src/main/java/javastream/aparapi/GenericAparapiStreamKernel.java new file mode 100644 index 0000000..526b472 --- /dev/null +++ b/src/java/java-stream/src/main/java/javastream/aparapi/GenericAparapiStreamKernel.java @@ -0,0 +1,68 @@ +package javastream.aparapi; + +import com.aparapi.Kernel; +import com.aparapi.Range; +import javastream.JavaStream.Data; +import javastream.Main.Config; + +abstract class GenericAparapiStreamKernel extends Kernel { + + protected static final int FN_COPY = 1; + protected static final int FN_MUL = 2; + protected static final int FN_ADD = 3; + protected static final int FN_TRIAD = 4; + protected static final int FN_NSTREAM = 5; + protected static final int FN_DOT = 6; + protected final Config config; + protected final int arraysize, numGroups, workGroupSize; + + interface Factory { + GenericAparapiStreamKernel create(Config config, int numGroups, int workGroupSize); + } + + GenericAparapiStreamKernel(Config config, int numGroups, int workGroupSize) { + this.config = config; + this.arraysize = config.options.arraysize; + this.numGroups = numGroups; + this.workGroupSize = workGroupSize; + 
setExplicit(true); + } + + protected int function; + + public abstract void init(); + + public void copy() { + function = FN_COPY; + execute(arraysize); + } + + public void mul() { + function = FN_MUL; + execute(arraysize); + } + + public void add() { + function = FN_ADD; + execute(arraysize); + } + + public void triad() { + function = FN_TRIAD; + execute(arraysize); + } + + public void nstream() { + function = FN_NSTREAM; + execute(arraysize); + } + + protected Kernel partialDot() { + function = FN_DOT; + return execute(Range.create(numGroups * workGroupSize, workGroupSize)); + } + + abstract T dot(); + + abstract Data syncAndDispose(); +} diff --git a/src/java/java-stream/src/main/java/javastream/aparapi/SpecialisedDoubleKernel.java b/src/java/java-stream/src/main/java/javastream/aparapi/SpecialisedDoubleKernel.java new file mode 100644 index 0000000..56a59af --- /dev/null +++ b/src/java/java-stream/src/main/java/javastream/aparapi/SpecialisedDoubleKernel.java @@ -0,0 +1,74 @@ +package javastream.aparapi; + +import java.util.Arrays; +import javastream.JavaStream; +import javastream.JavaStream.Data; +import javastream.Main.Config; + +final class SpecialisedDoubleKernel extends GenericAparapiStreamKernel { + private final double scalar; + final double[] a, b, c; + private final double[] partialSum; + @Local private final double[] workGroupSum; + + SpecialisedDoubleKernel(Config config, int numGroups, int workGroupSize) { + super(config, numGroups, workGroupSize); + this.scalar = config.scalar; + this.a = new double[this.arraysize]; + this.b = new double[this.arraysize]; + this.c = new double[this.arraysize]; + + this.partialSum = new double[numGroups]; + this.workGroupSum = new double[workGroupSize]; + } + + @SuppressWarnings("DuplicatedCode") + @Override + public void run() { + int i = getGlobalId(); + if (function == FN_COPY) { + c[i] = a[i]; + } else if (function == FN_MUL) { + b[i] = scalar * c[i]; + } else if (function == FN_ADD) { + c[i] = a[i] + b[i]; + } 
else if (function == FN_TRIAD) { + a[i] = b[i] + scalar * c[i]; + } else if (function == FN_NSTREAM) { + a[i] += b[i] + scalar * c[i]; + } else if (function == FN_DOT) { + int localId = getLocalId(0); + workGroupSum[localId] = 0.0; + for (; i < arraysize; i += getGlobalSize(0)) workGroupSum[localId] += a[i] * b[i]; + for (int offset = getLocalSize(0) / 2; offset > 0; offset /= 2) { + localBarrier(); + if (localId < offset) { + workGroupSum[localId] += workGroupSum[localId + offset]; + } + } + if (localId == 0) partialSum[getGroupId(0)] = workGroupSum[localId]; + } + } + + @Override + public void init() { + Arrays.fill(a, config.initA); + Arrays.fill(b, config.initB); + Arrays.fill(c, config.initC); + put(a).put(b).put(c); + } + + @Override + public Double dot() { + partialDot().get(partialSum); + double sum = 0; + for (double v : partialSum) sum += v; + return sum; + } + + @Override + public Data syncAndDispose() { + get(a).get(b).get(c).dispose(); + return new Data<>(JavaStream.boxed(a), JavaStream.boxed(b), JavaStream.boxed(c)); + } +} diff --git a/src/java/java-stream/src/main/java/javastream/aparapi/SpecialisedFloatKernel.java b/src/java/java-stream/src/main/java/javastream/aparapi/SpecialisedFloatKernel.java new file mode 100644 index 0000000..6919f06 --- /dev/null +++ b/src/java/java-stream/src/main/java/javastream/aparapi/SpecialisedFloatKernel.java @@ -0,0 +1,75 @@ +package javastream.aparapi; + +import static javastream.JavaStream.boxed; + +import java.util.Arrays; +import javastream.JavaStream.Data; +import javastream.Main.Config; + +final class SpecialisedFloatKernel extends GenericAparapiStreamKernel { + private final float scalar; + final float[] a, b, c; + private final float[] partialSum; + @Local private final float[] workGroupSum; + + SpecialisedFloatKernel(Config config, int numGroups, int workGroupSize) { + super(config, numGroups, workGroupSize); + this.scalar = config.scalar; + this.a = new float[this.arraysize]; + this.b = new 
float[this.arraysize]; + this.c = new float[this.arraysize]; + + this.partialSum = new float[numGroups]; + this.workGroupSum = new float[workGroupSize]; + } + + @SuppressWarnings("DuplicatedCode") + @Override + public void run() { + int i = getGlobalId(); + if (function == FN_COPY) { + c[i] = a[i]; + } else if (function == FN_MUL) { + b[i] = scalar * c[i]; + } else if (function == FN_ADD) { + c[i] = a[i] + b[i]; + } else if (function == FN_TRIAD) { + a[i] = b[i] + scalar * c[i]; + } else if (function == FN_NSTREAM) { + a[i] += b[i] + scalar * c[i]; + } else if (function == FN_DOT) { + int localId = getLocalId(0); + workGroupSum[localId] = 0.f; + for (; i < arraysize; i += getGlobalSize(0)) workGroupSum[localId] += a[i] * b[i]; + for (int offset = getLocalSize(0) / 2; offset > 0; offset /= 2) { + localBarrier(); + if (localId < offset) { + workGroupSum[localId] += workGroupSum[localId + offset]; + } + } + if (localId == 0) partialSum[getGroupId(0)] = workGroupSum[localId]; + } + } + + @Override + public void init() { + Arrays.fill(a, config.initA); + Arrays.fill(b, config.initB); + Arrays.fill(c, config.initC); + put(a).put(b).put(c); + } + + @Override + public Float dot() { + partialDot().get(partialSum); + float sum = 0; + for (float v : partialSum) sum += v; + return sum; + } + + @Override + public Data syncAndDispose() { + get(a).get(b).get(c).dispose(); + return new Data<>(boxed(a), boxed(b), boxed(c)); + } +} diff --git a/src/java/java-stream/src/main/java/javastream/jdk/GenericPlainStream.java b/src/java/java-stream/src/main/java/javastream/jdk/GenericPlainStream.java new file mode 100644 index 0000000..7f210fa --- /dev/null +++ b/src/java/java-stream/src/main/java/javastream/jdk/GenericPlainStream.java @@ -0,0 +1,92 @@ +package javastream.jdk; + +import static javastream.FractionalMaths.from; +import static javastream.FractionalMaths.plus; +import static javastream.FractionalMaths.times; + +import java.lang.reflect.Array; +import java.util.Collections; 
+import java.util.List;
+import javastream.JavaStream;
+import javastream.Main.Config;
+// Stream kernels written as plain sequential for-loops over reference-typed T[] arrays.
+final class GenericPlainStream extends JavaStream {
+  // NOTE(review): T is used below but never declared here; the generic arguments look stripped from this text - confirm.
+  private final T[] a;
+  private final T[] b;
+  private final T[] c;
+  // Allocates a, b and c reflectively: element type config.evidence, length config.options.arraysize.
+  @SuppressWarnings("unchecked")
+  GenericPlainStream(Config config) {
+    super(config);
+    this.a = (T[]) Array.newInstance(config.evidence, config.options.arraysize);
+    this.b = (T[]) Array.newInstance(config.evidence, config.options.arraysize);
+    this.c = (T[]) Array.newInstance(config.evidence, config.options.arraysize);
+  }
+  // The only "device" this implementation reports is the JVM itself.
+  @Override
+  public List listDevices() {
+    return Collections.singletonList("JVM");
+  }
+  // Sets every element of a, b and c to config.initA, config.initB and config.initC respectively.
+  @Override
+  public void initArrays() {
+    for (int i = 0; i < config.options.arraysize; i++) {
+      a[i] = config.initA;
+      b[i] = config.initB;
+      c[i] = config.initC;
+    }
+  }
+  // copy kernel: c[i] = a[i]
+  @SuppressWarnings("ManualArrayCopy")
+  @Override
+  public void copy() {
+    for (int i = 0; i < config.options.arraysize; i++) {
+      c[i] = a[i];
+    }
+  }
+  // mul kernel: b[i] = times(scalar, c[i])
+  @Override
+  public void mul() {
+    for (int i = 0; i < config.options.arraysize; i++) {
+      b[i] = times(config.scalar, c[i]);
+    }
+  }
+  // add kernel: c[i] = plus(a[i], b[i])
+  @Override
+  public void add() {
+
+    for (int i = 0; i < config.options.arraysize; i++) {
+      c[i] = plus(a[i], b[i]);
+    }
+  }
+  // triad kernel: a[i] = plus(b[i], times(scalar, c[i]))
+  @Override
+  public void triad() {
+
+    for (int i = 0; i < config.options.arraysize; i++) {
+      a[i] = plus(b[i], times(config.scalar, c[i]));
+    }
+  }
+  // nstream kernel: a[i] = plus(a[i], plus(b[i], times(scalar, c[i])))
+  @Override
+  public void nstream() {
+    for (int i = 0; i < config.options.arraysize; i++) {
+      a[i] = plus(a[i], plus(b[i], times(config.scalar, c[i])));
+    }
+  }
+  // dot kernel: folds times(a[i], b[i]) into an accumulator seeded with from(config.evidence, 0).
+  @Override
+  public T dot() {
+    T acc = from(config.evidence, 0);
+    for (int i = 0; i < config.options.arraysize; i++) {
+      acc = plus(acc, times(a[i], b[i]));
+    }
+    return acc;
+  }
+  // Hands the backing arrays themselves (no copy) to a new Data.
+  @Override
+  public Data data() {
+    return new Data<>(a, b, c);
+  }
+}
diff --git a/src/java/java-stream/src/main/java/javastream/jdk/GenericStream.java b/src/java/java-stream/src/main/java/javastream/jdk/GenericStream.java
new file mode 100644
index 0000000..1e65b8f
--- /dev/null
+++ b/src/java/java-stream/src/main/java/javastream/jdk/GenericStream.java
@@ -0,0 +1,86 @@
+package javastream.jdk;
+
+import static javastream.FractionalMaths.from;
+import static javastream.FractionalMaths.plus;
+import static javastream.FractionalMaths.times;
+
+import java.lang.reflect.Array;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+import java.util.stream.IntStream;
+import javastream.FractionalMaths;
+import javastream.JavaStream;
+import javastream.Main.Config;
+
+/**
+ * Stream kernels over reference-typed {@code T[]} arrays, run in parallel through the JDK.
+ *
+ * <p>We use {@code Arrays.parallelSetAll} here as it internally calls
+ *
+ * <pre>{@code
+ * IntStream.range(0, array.length).parallel().forEach(...)
+ * }</pre>
+ */
+final class GenericStream extends JavaStream {
+  // NOTE(review): T is used below but never declared here; the generic arguments look stripped from this text - confirm.
+  private final T[] a, b, c;
+  // Allocates a, b and c reflectively: element type config.evidence, length config.options.arraysize.
+  @SuppressWarnings("unchecked")
+  GenericStream(Config config) {
+    super(config);
+    this.a = (T[]) Array.newInstance(config.evidence, config.options.arraysize);
+    this.b = (T[]) Array.newInstance(config.evidence, config.options.arraysize);
+    this.c = (T[]) Array.newInstance(config.evidence, config.options.arraysize);
+  }
+  // The only "device" this implementation reports is the JVM itself.
+  @Override
+  public List listDevices() {
+    return Collections.singletonList("JVM");
+  }
+  // Fills a, b and c with config.initA, config.initB and config.initC, one parallelSetAll per array.
+  @Override
+  public void initArrays() {
+    Arrays.parallelSetAll(a, i -> config.initA);
+    Arrays.parallelSetAll(b, i -> config.initB);
+    Arrays.parallelSetAll(c, i -> config.initC);
+  }
+  // copy kernel: c[i] = a[i]
+  @Override
+  public void copy() {
+    Arrays.parallelSetAll(c, i -> a[i]);
+  }
+  // mul kernel: b[i] = times(scalar, c[i])
+  @Override
+  public void mul() {
+    Arrays.parallelSetAll(b, i -> times(config.scalar, c[i]));
+  }
+  // add kernel: c[i] = plus(a[i], b[i])
+  @Override
+  public void add() {
+    Arrays.parallelSetAll(c, i -> plus(a[i], b[i]));
+  }
+  // triad kernel: a[i] = plus(b[i], times(scalar, c[i]))
+  @Override
+  public void triad() {
+    Arrays.parallelSetAll(a, i -> plus(b[i], times(config.scalar, c[i])));
+  }
+  // nstream kernel: a[i] = plus(a[i], plus(b[i], times(scalar, c[i]))); each index reads only its own a[i].
+  @Override
+  public void nstream() {
+    Arrays.parallelSetAll(a, i -> plus(a[i], plus(b[i], times(config.scalar, c[i]))));
+  }
+  // dot kernel: parallel reduce of times(a[i], b[i]) with FractionalMaths::plus, identity from(config.evidence, 0).
+  @Override
+  public T dot() {
+    return IntStream.range(0, config.options.arraysize)
+        .parallel()
+        .mapToObj(i -> times(a[i], b[i]))
+        .reduce(from(config.evidence, 0), FractionalMaths::plus);
+  }
+  // Hands the backing arrays themselves (no copy) to a new Data.
+  @Override
+  public Data data() {
+    return new Data<>(a, b, c);
+  }
+}
diff --git a/src/java/java-stream/src/main/java/javastream/jdk/JdkStreams.java b/src/java/java-stream/src/main/java/javastream/jdk/JdkStreams.java
new file mode 100644
index 0000000..5b58be7
--- /dev/null
+++ b/src/java/java-stream/src/main/java/javastream/jdk/JdkStreams.java
@@ -0,0 +1,26 @@
+package javastream.jdk;
+
+import java.util.AbstractMap.SimpleImmutableEntry;
+import java.util.function.Function;
+import javastream.JavaStream;
+import javastream.JavaStream.EnumeratedStream;
+import 
javastream.Main.Config;
+// Factories for the parallel JDK implementations; not instantiable (private constructor).
+public final class JdkStreams {
+
+  private JdkStreams() {}
+  // Each factory wraps two named variants, "specialised" and "generic", in an EnumeratedStream. NOTE(review): how EnumeratedStream picks between them is defined in JavaStream, outside this text - confirm.
+  public static final Function, JavaStream> FLOAT =
+      config ->
+          new EnumeratedStream<>(
+              config,
+              new SimpleImmutableEntry<>("specialised", SpecialisedFloatStream::new),
+              new SimpleImmutableEntry<>("generic", GenericStream::new));
+  // Same pairing for doubles: SpecialisedDoubleStream plus the shared GenericStream.
+  public static final Function, JavaStream> DOUBLE =
+      config ->
+          new EnumeratedStream<>(
+              config,
+              new SimpleImmutableEntry<>("specialised", SpecialisedDoubleStream::new),
+              new SimpleImmutableEntry<>("generic", GenericStream::new));
+}
diff --git a/src/java/java-stream/src/main/java/javastream/jdk/PlainStream.java b/src/java/java-stream/src/main/java/javastream/jdk/PlainStream.java
new file mode 100644
index 0000000..f9281e8
--- /dev/null
+++ b/src/java/java-stream/src/main/java/javastream/jdk/PlainStream.java
@@ -0,0 +1,26 @@
+package javastream.jdk;
+
+import java.util.AbstractMap.SimpleImmutableEntry;
+import java.util.function.Function;
+import javastream.JavaStream;
+import javastream.JavaStream.EnumeratedStream;
+import javastream.Main.Config;
+// Factories for the plain for-loop implementations; not instantiable (private constructor).
+public final class PlainStream {
+
+  private PlainStream() {}
+  // Pairs the "specialised" SpecialisedPlainFloatStream with the "generic" GenericPlainStream.
+  public static final Function, JavaStream> FLOAT =
+      config ->
+          new EnumeratedStream<>(
+              config,
+              new SimpleImmutableEntry<>("specialised", SpecialisedPlainFloatStream::new),
+              new SimpleImmutableEntry<>("generic", GenericPlainStream::new));
+  // Pairs the "specialised" SpecialisedPlainDoubleStream with the "generic" GenericPlainStream.
+  public static final Function, JavaStream> DOUBLE =
+      config ->
+          new EnumeratedStream<>(
+              config,
+              new SimpleImmutableEntry<>("specialised", SpecialisedPlainDoubleStream::new),
+              new SimpleImmutableEntry<>("generic", GenericPlainStream::new));
+}
diff --git a/src/java/java-stream/src/main/java/javastream/jdk/SpecialisedDoubleStream.java b/src/java/java-stream/src/main/java/javastream/jdk/SpecialisedDoubleStream.java
new file mode 100644
index 0000000..26406a6
--- /dev/null
+++ b/src/java/java-stream/src/main/java/javastream/jdk/SpecialisedDoubleStream.java
@@ -0,0 +1,84 @@
+package 
javastream.jdk;
+
+import java.util.Collections;
+import java.util.List;
+import java.util.stream.IntStream;
+import javastream.JavaStream;
+import javastream.Main.Config;
+// Stream kernels over primitive double[] arrays, each run as a parallel IntStream over the index range.
+final class SpecialisedDoubleStream extends JavaStream {
+  // NOTE(review): JavaStream/Config/List/Data appear without type arguments in this text - presumably stripped; confirm.
+  private final double[] a, b, c;
+  // Allocates three arrays of config.options.arraysize doubles.
+  SpecialisedDoubleStream(Config config) {
+    super(config);
+    this.a = new double[config.options.arraysize];
+    this.b = new double[config.options.arraysize];
+    this.c = new double[config.options.arraysize];
+  }
+  // The only "device" this implementation reports is the JVM itself.
+  @Override
+  public List listDevices() {
+    return Collections.singletonList("JVM");
+  }
+  // Sets a[i], b[i], c[i] to config.initA, config.initB, config.initC in a single parallel pass.
+  @Override
+  public void initArrays() {
+    IntStream.range(0, config.options.arraysize) //
+        .parallel()
+        .forEach(
+            i -> {
+              a[i] = config.initA;
+              b[i] = config.initB;
+              c[i] = config.initC;
+            });
+  }
+  // copy kernel: c[i] = a[i]
+  @Override
+  public void copy() {
+    IntStream.range(0, config.options.arraysize) //
+        .parallel()
+        .forEach(i -> c[i] = a[i]);
+  }
+  // mul kernel: b[i] = scalar * c[i]
+  @Override
+  public void mul() {
+    IntStream.range(0, config.options.arraysize) //
+        .parallel()
+        .forEach(i -> b[i] = config.scalar * c[i]);
+  }
+  // add kernel: c[i] = a[i] + b[i]
+  @Override
+  public void add() {
+    IntStream.range(0, config.options.arraysize) //
+        .parallel()
+        .forEach(i -> c[i] = a[i] + b[i]);
+  }
+  // triad kernel: a[i] = b[i] + scalar * c[i]
+  @Override
+  public void triad() {
+    IntStream.range(0, config.options.arraysize) //
+        .parallel()
+        .forEach(i -> a[i] = b[i] + config.scalar * c[i]);
+  }
+  // nstream kernel: a[i] += b[i] + scalar * c[i]
+  @Override
+  public void nstream() {
+    IntStream.range(0, config.options.arraysize) //
+        .parallel()
+        .forEach(i -> a[i] += b[i] + config.scalar * c[i]);
+  }
+  // dot kernel: parallel sum of a[i] * b[i]; the float literal 0f is widened to the double identity 0.0.
+  @Override
+  public Double dot() {
+    return IntStream.range(0, config.options.arraysize)
+        .parallel()
+        .mapToDouble(i -> a[i] * b[i])
+        .reduce(0f, Double::sum);
+  }
+  // Returns a, b and c passed through boxed(...) and wrapped in a new Data.
+  @Override
+  public Data data() {
+    return new Data<>(boxed(a), boxed(b), boxed(c));
+  }
+}
diff --git a/src/java/java-stream/src/main/java/javastream/jdk/SpecialisedFloatStream.java b/src/java/java-stream/src/main/java/javastream/jdk/SpecialisedFloatStream.java
new file mode 100644
index 
0000000..6c414c1
--- /dev/null
+++ b/src/java/java-stream/src/main/java/javastream/jdk/SpecialisedFloatStream.java
@@ -0,0 +1,84 @@
+package javastream.jdk;
+
+import java.util.Collections;
+import java.util.List;
+import java.util.stream.IntStream;
+import javastream.JavaStream;
+import javastream.Main.Config;
+// Stream kernels over primitive float[] arrays, each run as a parallel IntStream over the index range.
+final class SpecialisedFloatStream extends JavaStream {
+  // NOTE(review): JavaStream/Config/List/Data appear without type arguments in this text - presumably stripped; confirm.
+  private final float[] a, b, c;
+  // Allocates three arrays of config.options.arraysize floats.
+  SpecialisedFloatStream(Config config) {
+    super(config);
+    this.a = new float[config.options.arraysize];
+    this.b = new float[config.options.arraysize];
+    this.c = new float[config.options.arraysize];
+  }
+  // The only "device" this implementation reports is the JVM itself.
+  @Override
+  public List listDevices() {
+    return Collections.singletonList("JVM");
+  }
+  // Sets a[i], b[i], c[i] to config.initA, config.initB, config.initC in a single parallel pass.
+  @Override
+  public void initArrays() {
+    IntStream.range(0, config.options.arraysize) //
+        .parallel()
+        .forEach(
+            i -> {
+              a[i] = config.initA;
+              b[i] = config.initB;
+              c[i] = config.initC;
+            });
+  }
+  // copy kernel: c[i] = a[i]
+  @Override
+  public void copy() {
+    IntStream.range(0, config.options.arraysize) //
+        .parallel()
+        .forEach(i -> c[i] = a[i]);
+  }
+  // mul kernel: b[i] = scalar * c[i]
+  @Override
+  public void mul() {
+    IntStream.range(0, config.options.arraysize) //
+        .parallel()
+        .forEach(i -> b[i] = config.scalar * c[i]);
+  }
+  // add kernel: c[i] = a[i] + b[i]
+  @Override
+  public void add() {
+    IntStream.range(0, config.options.arraysize) //
+        .parallel()
+        .forEach(i -> c[i] = a[i] + b[i]);
+  }
+  // triad kernel: a[i] = b[i] + scalar * c[i]
+  @Override
+  public void triad() {
+    IntStream.range(0, config.options.arraysize) //
+        .parallel()
+        .forEach(i -> a[i] = b[i] + config.scalar * c[i]);
+  }
+  // nstream kernel: a[i] += b[i] + scalar * c[i]
+  @Override
+  public void nstream() {
+    IntStream.range(0, config.options.arraysize) //
+        .parallel()
+        .forEach(i -> a[i] += b[i] + config.scalar * c[i]);
+  }
+  // dot kernel: each product goes through mapToObj (an object stream, so values are boxed) and is summed with Float::sum.
+  @Override
+  public Float dot() {
+    return IntStream.range(0, config.options.arraysize) //
+        .parallel()
+        .mapToObj(i -> a[i] * b[i]) // XXX there isn't a specialised Stream for floats
+        .reduce(0f, Float::sum);
+  }
+  // Returns a, b and c passed through boxed(...) and wrapped in a new Data.
+  @Override
+  public Data data() {
+    return new Data<>(boxed(a), boxed(b), boxed(c));
+  }
+}
diff --git 
a/src/java/java-stream/src/main/java/javastream/jdk/SpecialisedPlainDoubleStream.java b/src/java/java-stream/src/main/java/javastream/jdk/SpecialisedPlainDoubleStream.java new file mode 100644 index 0000000..afda2ef --- /dev/null +++ b/src/java/java-stream/src/main/java/javastream/jdk/SpecialisedPlainDoubleStream.java @@ -0,0 +1,84 @@ +package javastream.jdk; + +import java.util.Collections; +import java.util.List; +import javastream.JavaStream; +import javastream.Main.Config; + +final class SpecialisedPlainDoubleStream extends JavaStream { + + private final double[] a; + private final double[] b; + private final double[] c; + + SpecialisedPlainDoubleStream(Config config) { + super(config); + this.a = new double[config.options.arraysize]; + this.b = new double[config.options.arraysize]; + this.c = new double[config.options.arraysize]; + } + + @Override + public List listDevices() { + return Collections.singletonList("JVM"); + } + + @Override + public void initArrays() { + for (int i = 0; i < config.options.arraysize; i++) { + a[i] = config.initA; + b[i] = config.initB; + c[i] = config.initC; + } + } + + @SuppressWarnings("ManualArrayCopy") + @Override + public void copy() { + for (int i = 0; i < config.options.arraysize; i++) { + c[i] = a[i]; + } + } + + @Override + public void mul() { + for (int i = 0; i < config.options.arraysize; i++) { + b[i] = config.scalar * c[i]; + } + } + + @Override + public void add() { + for (int i = 0; i < config.options.arraysize; i++) { + c[i] = a[i] + b[i]; + } + } + + @Override + public void triad() { + for (int i = 0; i < config.options.arraysize; i++) { + a[i] = b[i] + config.scalar * c[i]; + } + } + + @Override + public void nstream() { + for (int i = 0; i < config.options.arraysize; i++) { + a[i] += b[i] + config.scalar * c[i]; + } + } + + @Override + public Double dot() { + double acc = 0f; + for (int i = 0; i < config.options.arraysize; i++) { + acc += a[i] * b[i]; + } + return acc; + } + + @Override + public Data data() { + 
return new Data<>(boxed(a), boxed(b), boxed(c)); + } +} diff --git a/src/java/java-stream/src/main/java/javastream/jdk/SpecialisedPlainFloatStream.java b/src/java/java-stream/src/main/java/javastream/jdk/SpecialisedPlainFloatStream.java new file mode 100644 index 0000000..9ccee53 --- /dev/null +++ b/src/java/java-stream/src/main/java/javastream/jdk/SpecialisedPlainFloatStream.java @@ -0,0 +1,84 @@ +package javastream.jdk; + +import java.util.Collections; +import java.util.List; +import javastream.JavaStream; +import javastream.Main.Config; + +final class SpecialisedPlainFloatStream extends JavaStream { + + private final float[] a; + private final float[] b; + private final float[] c; + + SpecialisedPlainFloatStream(Config config) { + super(config); + this.a = new float[config.options.arraysize]; + this.b = new float[config.options.arraysize]; + this.c = new float[config.options.arraysize]; + } + + @Override + public List listDevices() { + return Collections.singletonList("JVM"); + } + + @Override + public void initArrays() { + for (int i = 0; i < config.options.arraysize; i++) { + a[i] = config.initA; + b[i] = config.initB; + c[i] = config.initC; + } + } + + @SuppressWarnings("ManualArrayCopy") + @Override + public void copy() { + for (int i = 0; i < config.options.arraysize; i++) { + c[i] = a[i]; + } + } + + @Override + public void mul() { + for (int i = 0; i < config.options.arraysize; i++) { + b[i] = config.scalar * c[i]; + } + } + + @Override + public void add() { + for (int i = 0; i < config.options.arraysize; i++) { + c[i] = a[i] + b[i]; + } + } + + @Override + public void triad() { + for (int i = 0; i < config.options.arraysize; i++) { + a[i] = b[i] + config.scalar * c[i]; + } + } + + @Override + public void nstream() { + for (int i = 0; i < config.options.arraysize; i++) { + a[i] += b[i] + config.scalar * c[i]; + } + } + + @Override + public Float dot() { + float acc = 0f; + for (int i = 0; i < config.options.arraysize; i++) { + acc += a[i] * b[i]; + } + 
return acc; + } + + @Override + public Data data() { + return new Data<>(boxed(a), boxed(b), boxed(c)); + } +} diff --git a/src/java/java-stream/src/main/java/javastream/tornadovm/GenericTornadoVMStream.java b/src/java/java-stream/src/main/java/javastream/tornadovm/GenericTornadoVMStream.java new file mode 100644 index 0000000..d936df6 --- /dev/null +++ b/src/java/java-stream/src/main/java/javastream/tornadovm/GenericTornadoVMStream.java @@ -0,0 +1,98 @@ +package javastream.tornadovm; + +import java.util.List; +import java.util.stream.Collectors; +import javastream.JavaStream; +import javastream.Main.Config; +import uk.ac.manchester.tornado.api.TaskSchedule; +import uk.ac.manchester.tornado.api.TornadoRuntimeCI; +import uk.ac.manchester.tornado.api.common.TornadoDevice; +import uk.ac.manchester.tornado.api.runtime.TornadoRuntime; + +abstract class GenericTornadoVMStream extends JavaStream { + + protected final TornadoDevice device; + + protected TaskSchedule copyTask; + protected TaskSchedule mulTask; + protected TaskSchedule addTask; + protected TaskSchedule triadTask; + protected TaskSchedule nstreamTask; + protected TaskSchedule dotTask; + + GenericTornadoVMStream(Config config) { + super(config); + + try { + TornadoRuntimeCI runtime = TornadoRuntime.getTornadoRuntime(); + List devices = TornadoVMStreams.enumerateDevices(runtime); + device = devices.get(config.options.device); + + if (config.options.isVerboseBenchmark()) { + System.out.println("Using TornadoVM device:"); + System.out.println(" - Name : " + device.getDescription()); + System.out.println(" - Id : " + device.getDeviceName()); + System.out.println(" - Platform : " + device.getPlatformName()); + System.out.println(" - Backend : " + device.getTornadoVMBackend().name()); + } + } catch (Throwable e) { + throw new RuntimeException( + "Unable to initialise TornadoVM, make sure you are running the binary with the `tornado -jar ...` wrapper and not `java -jar ...`", + e); + } + } + + protected static 
TaskSchedule mkSchedule() { + return new TaskSchedule(""); + } + + @Override + public List listDevices() { + return TornadoVMStreams.enumerateDevices(TornadoRuntime.getTornadoRuntime()).stream() + .map(d -> d.getDescription() + "(" + d.getDeviceName() + ")") + .collect(Collectors.toList()); + } + + @Override + public void initArrays() { + this.copyTask.warmup(); + this.mulTask.warmup(); + this.addTask.warmup(); + this.triadTask.warmup(); + this.nstreamTask.warmup(); + this.dotTask.warmup(); + } + + @Override + public void copy() { + this.copyTask.execute(); + } + + @Override + public void mul() { + this.mulTask.execute(); + } + + @Override + public void add() { + this.addTask.execute(); + } + + @Override + public void triad() { + this.triadTask.execute(); + } + + @Override + public void nstream() { + this.nstreamTask.execute(); + } + + protected abstract T getSum(); + + @Override + public T dot() { + this.dotTask.execute(); + return getSum(); + } +} diff --git a/src/java/java-stream/src/main/java/javastream/tornadovm/SpecialisedDouble.java b/src/java/java-stream/src/main/java/javastream/tornadovm/SpecialisedDouble.java new file mode 100644 index 0000000..7712e31 --- /dev/null +++ b/src/java/java-stream/src/main/java/javastream/tornadovm/SpecialisedDouble.java @@ -0,0 +1,88 @@ +package javastream.tornadovm; + +import java.util.Arrays; +import javastream.Main.Config; +import uk.ac.manchester.tornado.api.annotations.Parallel; +import uk.ac.manchester.tornado.api.annotations.Reduce; + +final class SpecialisedDouble extends GenericTornadoVMStream { + + @SuppressWarnings("ManualArrayCopy") + private static void copy(int size, double[] a, double[] c) { + for (@Parallel int i = 0; i < size; i++) { + c[i] = a[i]; + } + } + + private static void mul(int size, double[] b, double[] c, double scalar) { + for (@Parallel int i = 0; i < size; i++) { + b[i] = scalar * c[i]; + } + } + + private static void add(int size, double[] a, double[] b, double[] c) { + for (@Parallel int i = 
0; i < size; i++) { + c[i] = a[i] + b[i]; + } + } + + private static void triad(int size, double[] a, double[] b, double[] c, double scalar) { + for (@Parallel int i = 0; i < size; i++) { + a[i] = b[i] + scalar * c[i]; + } + } + + private static void nstream(int size, double[] a, double[] b, double[] c, double scalar) { + for (@Parallel int i = 0; i < size; i++) { + a[i] = b[i] * scalar * c[i]; + } + } + + private static void dot_( + double[] a, double[] b, @Reduce double[] acc) { // prevent name clash with CL's dot + acc[0] = 0; + for (@Parallel int i = 0; i < a.length; i++) { + acc[0] += a[i] * b[i]; + } + } + + private final double[] a, b, c; + private final double[] dotSum; + + @SuppressWarnings({"PrimitiveArrayArgumentToVarargsMethod", "DuplicatedCode"}) + SpecialisedDouble(Config config) { + super(config); + final int size = config.options.arraysize; + final double scalar = config.scalar; + a = new double[size]; + b = new double[size]; + c = new double[size]; + dotSum = new double[1]; + this.copyTask = mkSchedule().task("", SpecialisedDouble::copy, size, a, c); + this.mulTask = mkSchedule().task("", SpecialisedDouble::mul, size, b, c, scalar); + this.addTask = mkSchedule().task("", SpecialisedDouble::add, size, a, b, c); + this.triadTask = mkSchedule().task("", SpecialisedDouble::triad, size, a, b, c, scalar); + this.nstreamTask = mkSchedule().task("", SpecialisedDouble::nstream, size, a, b, c, scalar); + this.dotTask = mkSchedule().task("", SpecialisedDouble::dot_, a, b, dotSum).streamOut(dotSum); + } + + @Override + public void initArrays() { + super.initArrays(); + Arrays.fill(a, config.initA); + Arrays.fill(b, config.initB); + Arrays.fill(c, config.initC); + TornadoVMStreams.xferToDevice(device, a, b, c); + } + + @Override + protected Double getSum() { + return dotSum[0]; + } + + @Override + public Data data() { + TornadoVMStreams.xferFromDevice(device, a, b, c); + return new Data<>(boxed(a), boxed(b), boxed(c)); + } +} diff --git 
a/src/java/java-stream/src/main/java/javastream/tornadovm/SpecialisedFloat.java b/src/java/java-stream/src/main/java/javastream/tornadovm/SpecialisedFloat.java new file mode 100644 index 0000000..e61cfe9 --- /dev/null +++ b/src/java/java-stream/src/main/java/javastream/tornadovm/SpecialisedFloat.java @@ -0,0 +1,88 @@ +package javastream.tornadovm; + +import java.util.Arrays; +import javastream.Main.Config; +import uk.ac.manchester.tornado.api.annotations.Parallel; +import uk.ac.manchester.tornado.api.annotations.Reduce; + +final class SpecialisedFloat extends GenericTornadoVMStream { + + @SuppressWarnings("ManualArrayCopy") + private static void copy(int size, float[] a, float[] c) { + for (@Parallel int i = 0; i < size; i++) { + c[i] = a[i]; + } + } + + private static void mul(int size, float[] b, float[] c, float scalar) { + for (@Parallel int i = 0; i < size; i++) { + b[i] = scalar * c[i]; + } + } + + private static void add(int size, float[] a, float[] b, float[] c) { + for (@Parallel int i = 0; i < size; i++) { + c[i] = a[i] + b[i]; + } + } + + private static void triad(int size, float[] a, float[] b, float[] c, float scalar) { + for (@Parallel int i = 0; i < size; i++) { + a[i] = b[i] + scalar * c[i]; + } + } + + private static void nstream(int size, float[] a, float[] b, float[] c, float scalar) { + for (@Parallel int i = 0; i < size; i++) { + a[i] = b[i] * scalar * c[i]; + } + } + + private static void dot_( + float[] a, float[] b, @Reduce float[] acc) { // prevent name clash with CL's dot + acc[0] = 0; + for (@Parallel int i = 0; i < a.length; i++) { + acc[0] += a[i] * b[i]; + } + } + + private final float[] a, b, c; + private final float[] dotSum; + + @SuppressWarnings({"PrimitiveArrayArgumentToVarargsMethod", "DuplicatedCode"}) + SpecialisedFloat(Config config) { + super(config); + final int size = config.options.arraysize; + final float scalar = config.scalar; + a = new float[size]; + b = new float[size]; + c = new float[size]; + dotSum = new float[1]; 
+ this.copyTask = mkSchedule().task("", SpecialisedFloat::copy, size, a, c); + this.mulTask = mkSchedule().task("", SpecialisedFloat::mul, size, b, c, scalar); + this.addTask = mkSchedule().task("", SpecialisedFloat::add, size, a, b, c); + this.triadTask = mkSchedule().task("", SpecialisedFloat::triad, size, a, b, c, scalar); + this.nstreamTask = mkSchedule().task("", SpecialisedFloat::nstream, size, a, b, c, scalar); + this.dotTask = mkSchedule().task("", SpecialisedFloat::dot_, a, b, dotSum).streamOut(dotSum); + } + + @Override + public void initArrays() { + super.initArrays(); + Arrays.fill(a, config.initA); + Arrays.fill(b, config.initB); + Arrays.fill(c, config.initC); + TornadoVMStreams.xferToDevice(device, a, b, c); + } + + @Override + protected Float getSum() { + return dotSum[0]; + } + + @Override + public Data data() { + TornadoVMStreams.xferFromDevice(device, a, b, c); + return new Data<>(boxed(a), boxed(b), boxed(c)); + } +} diff --git a/src/java/java-stream/src/main/java/javastream/tornadovm/TornadoVMStreams.java b/src/java/java-stream/src/main/java/javastream/tornadovm/TornadoVMStreams.java new file mode 100644 index 0000000..68eecad --- /dev/null +++ b/src/java/java-stream/src/main/java/javastream/tornadovm/TornadoVMStreams.java @@ -0,0 +1,42 @@ +package javastream.tornadovm; + +import java.util.List; +import java.util.function.Function; +import java.util.stream.Collectors; +import java.util.stream.IntStream; +import javastream.JavaStream; +import javastream.Main.Config; +import uk.ac.manchester.tornado.api.TornadoRuntimeCI; +import uk.ac.manchester.tornado.api.common.TornadoDevice; +import uk.ac.manchester.tornado.api.mm.TornadoGlobalObjectState; +import uk.ac.manchester.tornado.api.runtime.TornadoRuntime; + +public final class TornadoVMStreams { + + private TornadoVMStreams() {} + + static void xferToDevice(TornadoDevice device, Object... 
xs) { + for (Object x : xs) { + TornadoGlobalObjectState state = TornadoRuntime.getTornadoRuntime().resolveObject(x); + List writeEvent = device.ensurePresent(x, state.getDeviceState(device), null, 0, 0); + if (writeEvent != null) writeEvent.forEach(e -> device.resolveEvent(e).waitOn()); + } + } + + static void xferFromDevice(TornadoDevice device, Object... xs) { + for (Object x : xs) { + TornadoGlobalObjectState state = TornadoRuntime.getTornadoRuntime().resolveObject(x); + device.resolveEvent(device.streamOut(x, 0, state.getDeviceState(device), null)).waitOn(); + } + } + + static List enumerateDevices(TornadoRuntimeCI runtime) { + return IntStream.range(0, runtime.getNumDrivers()) + .mapToObj(runtime::getDriver) + .flatMap(d -> IntStream.range(0, d.getDeviceCount()).mapToObj(d::getDevice)) + .collect(Collectors.toList()); + } + + public static final Function, JavaStream> FLOAT = SpecialisedFloat::new; + public static final Function, JavaStream> DOUBLE = SpecialisedDouble::new; +} diff --git a/src/java/java-stream/src/test/java/javastream/SmokeTest.java b/src/java/java-stream/src/test/java/javastream/SmokeTest.java new file mode 100644 index 0000000..2ceca44 --- /dev/null +++ b/src/java/java-stream/src/test/java/javastream/SmokeTest.java @@ -0,0 +1,93 @@ +package javastream; + +import java.io.ByteArrayOutputStream; +import java.io.PrintStream; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.stream.Collectors; +import java.util.stream.Stream; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; + +public class SmokeTest { + + // taken from https://stackoverflow.com/a/32146095/896997 + private static Stream> ofCombinations( + List> collections, List current) { + return 
collections.isEmpty() + ? Stream.of(current) + : collections.get(0).stream() + .flatMap( + e -> { + List list = new ArrayList<>(current); + list.add(e); + return ofCombinations(collections.subList(1, collections.size()), list); + }); + } + + @SuppressWarnings("unused") + private static Stream options() { + + LinkedHashMap> impls = new LinkedHashMap<>(); + impls.put("jdk-stream", Arrays.asList(0, 1)); + impls.put("jdk-plain", Arrays.asList(0, 1)); + // skip aparapi as none of the jdk fallbacks work correctly + // skip tornadovm as it has no jdk fallback + + List configs = + impls.entrySet().stream() + .flatMap( + e -> + Stream.concat(Stream.of(""), e.getValue().stream().map(i -> "--device " + i)) + .map(d -> "--impl " + e.getKey() + " " + d)) + .collect(Collectors.toList()); + + return ofCombinations( + new ArrayList<>( + Arrays.asList( + configs, + Arrays.asList("", "--csv"), + // XXX floats usually have a 1.0^-5 error which misses 10^-8 + Arrays.asList("", "--float --dot-tolerance 1.0e-5"), + Arrays.asList("", "--triad-only", "--nstream-only"), + Arrays.asList("", "--mibibytes"))), + Collections.emptyList()) + .map( + xs -> + Arguments.of( + xs.stream() // + .map(String::trim) // + .collect(Collectors.joining(" ")) + .trim())); + } + + @ParameterizedTest + @MethodSource("options") + void testIt(String args) { + String line = "--arraysize 2048 " + args; + + // redirect stdout/stderr and only print if anything fails + ByteArrayOutputStream outContent = new ByteArrayOutputStream(); + ByteArrayOutputStream errContent = new ByteArrayOutputStream(); + PrintStream originalOut = System.out; + PrintStream originalErr = System.err; + + System.setOut(new PrintStream(outContent)); + System.setErr(new PrintStream(errContent)); + int run = Main.run(line.split("\\s+")); + System.setOut(originalOut); + System.setErr(originalErr); + + if (run != 0) { + System.out.println(outContent); + System.err.println(errContent); + Assertions.assertEquals(0, run, "`" + line + "` did not return 
0"); + } + } +} diff --git a/src/julia/JuliaStream.jl/.JuliaFormatter.toml b/src/julia/JuliaStream.jl/.JuliaFormatter.toml new file mode 100644 index 0000000..ac95ddd --- /dev/null +++ b/src/julia/JuliaStream.jl/.JuliaFormatter.toml @@ -0,0 +1,2 @@ +indent = 2 +margin = 100 \ No newline at end of file diff --git a/src/julia/JuliaStream.jl/.gitignore b/src/julia/JuliaStream.jl/.gitignore new file mode 100644 index 0000000..12b143b --- /dev/null +++ b/src/julia/JuliaStream.jl/.gitignore @@ -0,0 +1,5 @@ +*.jl.cov +*.jl.*.cov +*.jl.mem +/docs/build/ +/docs/Manifest.toml \ No newline at end of file diff --git a/src/julia/JuliaStream.jl/AMDGPU/Manifest.toml b/src/julia/JuliaStream.jl/AMDGPU/Manifest.toml new file mode 100644 index 0000000..6525501 --- /dev/null +++ b/src/julia/JuliaStream.jl/AMDGPU/Manifest.toml @@ -0,0 +1,415 @@ +# This file is machine-generated - editing it directly is not advised + +[[AMDGPU]] +deps = ["AbstractFFTs", "Adapt", "BinaryProvider", "CEnum", "GPUArrays", "GPUCompiler", "HIP_jll", "LLVM", "Libdl", "LinearAlgebra", "MacroTools", "Pkg", "Printf", "ROCmDeviceLibs_jll", "Random", "Requires", "Setfield", "hsa_rocr_jll"] +git-tree-sha1 = "d64c97447a753cfbf0158d6c7be513f34526d559" +uuid = "21141c5a-9bdb-4563-92ae-f87d6854732e" +version = "0.2.12" + +[[AbstractFFTs]] +deps = ["LinearAlgebra"] +git-tree-sha1 = "485ee0867925449198280d4af84bdb46a2a404d0" +uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c" +version = "1.0.1" + +[[Adapt]] +deps = ["LinearAlgebra"] +git-tree-sha1 = "84918055d15b3114ede17ac6a7182f68870c16f7" +uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" +version = "3.3.1" + +[[ArgParse]] +deps = ["Logging", "TextWrap"] +git-tree-sha1 = "3102bce13da501c9104df33549f511cd25264d7d" +uuid = "c7e460c6-2fb9-53a9-8c5b-16f535851c63" +version = "1.1.4" + +[[ArgTools]] +uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f" + +[[Artifacts]] +uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" + +[[Base64]] +uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" + 
+[[BinaryProvider]] +deps = ["Libdl", "Logging", "SHA"] +git-tree-sha1 = "ecdec412a9abc8db54c0efc5548c64dfce072058" +uuid = "b99e7846-7c00-51b0-8f62-c81ae34c0232" +version = "0.5.10" + +[[Bzip2_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "19a35467a82e236ff51bc17a3a44b69ef35185a2" +uuid = "6e34b625-4abd-537c-b88f-471c36dfa7a0" +version = "1.0.8+0" + +[[CEnum]] +git-tree-sha1 = "215a9aa4a1f23fbd05b92769fdd62559488d70e9" +uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82" +version = "0.4.1" + +[[ConstructionBase]] +deps = ["LinearAlgebra"] +git-tree-sha1 = "f74e9d5388b8620b4cee35d4c5a618dd4dc547f4" +uuid = "187b0558-2788-49d3-abe0-74a17ed4e7c9" +version = "1.3.0" + +[[Dates]] +deps = ["Printf"] +uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" + +[[Downloads]] +deps = ["ArgTools", "LibCURL", "NetworkOptions"] +uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6" + +[[Elfutils_jll]] +deps = ["Artifacts", "Bzip2_jll", "JLLWrappers", "Libdl", "Pkg", "XZ_jll", "Zlib_jll", "argp_standalone_jll", "fts_jll", "obstack_jll"] +git-tree-sha1 = "8f9fcde6d89b0a3ca51cb2028beab462705c5436" +uuid = "ab5a07f8-06af-567f-a878-e8bb879eba5a" +version = "0.182.0+0" + +[[ExprTools]] +git-tree-sha1 = "b7e3d17636b348f005f11040025ae8c6f645fe92" +uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04" +version = "0.1.6" + +[[Future]] +deps = ["Random"] +uuid = "9fa8497b-333b-5362-9e8d-4d0656e87820" + +[[GPUArrays]] +deps = ["AbstractFFTs", "Adapt", "LinearAlgebra", "Printf", "Random", "Serialization", "Statistics"] +git-tree-sha1 = "ececbf05f8904c92814bdbd0aafd5540b0bf2e9a" +uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" +version = "7.0.1" + +[[GPUCompiler]] +deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "TimerOutputs", "UUIDs"] +git-tree-sha1 = "4ed2616d5e656c8716736b64da86755467f26cf5" +uuid = "61eb1bfa-7361-4325-ad38-22787b887f55" +version = "0.12.9" + +[[HIP_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "ROCmCompilerSupport_jll", 
"ROCmDeviceLibs_jll", "ROCmOpenCLRuntime_jll", "hsa_rocr_jll"] +git-tree-sha1 = "5097d8f7b6842156ab0928371b3d03fefd8decab" +uuid = "2696aab5-0948-5276-aa9a-2a86a37016b8" +version = "4.0.0+1" + +[[InteractiveUtils]] +deps = ["Markdown"] +uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" + +[[JLLWrappers]] +deps = ["Preferences"] +git-tree-sha1 = "642a199af8b68253517b80bd3bfd17eb4e84df6e" +uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210" +version = "1.3.0" + +[[LLVM]] +deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Printf", "Unicode"] +git-tree-sha1 = "23a47d417a3cd9c2e73c854bac7dd4731c105ef7" +uuid = "929cbde3-209d-540e-8aea-75f648917ca0" +version = "4.4.0" + +[[LLVMExtra_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "9c360e5ce980b88bb31a7b086dbb19469008154b" +uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab" +version = "0.0.10+0" + +[[LibCURL]] +deps = ["LibCURL_jll", "MozillaCACerts_jll"] +uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21" + +[[LibCURL_jll]] +deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"] +uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0" + +[[LibGit2]] +deps = ["Base64", "NetworkOptions", "Printf", "SHA"] +uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" + +[[LibSSH2_jll]] +deps = ["Artifacts", "Libdl", "MbedTLS_jll"] +uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8" + +[[Libdl]] +uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" + +[[Libgcrypt_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Libgpg_error_jll", "Pkg"] +git-tree-sha1 = "64613c82a59c120435c067c2b809fc61cf5166ae" +uuid = "d4300ac3-e22c-5743-9152-c294e39db1e4" +version = "1.8.7+0" + +[[Libglvnd_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libX11_jll", "Xorg_libXext_jll"] +git-tree-sha1 = "7739f837d6447403596a75d19ed01fd08d6f56bf" +uuid = "7e76a0d4-f3c7-5321-8279-8d96eeed0f29" +version = "1.3.0+3" + +[[Libgpg_error_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = 
"c333716e46366857753e273ce6a69ee0945a6db9" +uuid = "7add5ba3-2f88-524e-9cd5-f83b8a55f7b8" +version = "1.42.0+0" + +[[Libiconv_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "42b62845d70a619f063a7da093d995ec8e15e778" +uuid = "94ce4f54-9a6c-5748-9c1c-f9c7231a4531" +version = "1.16.1+1" + +[[LinearAlgebra]] +deps = ["Libdl"] +uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" + +[[Logging]] +uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" + +[[MacroTools]] +deps = ["Markdown", "Random"] +git-tree-sha1 = "0fb723cd8c45858c22169b2e42269e53271a6df7" +uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" +version = "0.5.7" + +[[Markdown]] +deps = ["Base64"] +uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" + +[[MbedTLS_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1" + +[[MozillaCACerts_jll]] +uuid = "14a3606d-f60d-562e-9121-12d972cd8159" + +[[NUMA_jll]] +deps = ["Libdl", "Pkg"] +git-tree-sha1 = "778f9bd14400cff2c32ed357e12766ac0e3d766e" +uuid = "7f51dc2b-bb24-59f8-b771-bb1490e4195d" +version = "2.0.13+1" + +[[NetworkOptions]] +uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908" + +[[OrderedCollections]] +git-tree-sha1 = "85f8e6578bf1f9ee0d11e7bb1b1456435479d47c" +uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" +version = "1.4.1" + +[[Parameters]] +deps = ["OrderedCollections", "UnPack"] +git-tree-sha1 = "2276ac65f1e236e0a6ea70baff3f62ad4c625345" +uuid = "d96e819e-fc66-5662-9728-84c9c7592b0a" +version = "0.12.2" + +[[Pkg]] +deps = ["Artifacts", "Dates", "Downloads", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"] +uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" + +[[Preferences]] +deps = ["TOML"] +git-tree-sha1 = "00cfd92944ca9c760982747e9a1d0d5d86ab1e5a" +uuid = "21216c6a-2e73-6563-6e65-726566657250" +version = "1.2.2" + +[[Printf]] +deps = ["Unicode"] +uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" + +[[REPL]] +deps = ["InteractiveUtils", "Markdown", 
"Sockets", "Unicode"] +uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" + +[[ROCmCompilerSupport_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "ROCmDeviceLibs_jll", "hsa_rocr_jll"] +git-tree-sha1 = "56ddcfb5d8b60c9f8c1bc619886f8d363fd1926d" +uuid = "8fbdd1d2-db62-5cd0-981e-905da1486e17" +version = "4.0.0+1" + +[[ROCmDeviceLibs_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Zlib_jll"] +git-tree-sha1 = "d764f0f28b5af89aa004871a6a38e5d061f77257" +uuid = "873c0968-716b-5aa7-bb8d-d1e2e2aeff2d" +version = "4.0.0+0" + +[[ROCmOpenCLRuntime_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Libglvnd_jll", "Pkg", "ROCmCompilerSupport_jll", "ROCmDeviceLibs_jll", "Xorg_libX11_jll", "Xorg_xorgproto_jll", "hsa_rocr_jll"] +git-tree-sha1 = "f9e3e2cb40a7990535efa7da9b9dd0e0b458a973" +uuid = "10ae2a08-2eea-53f8-8c20-eec175020e9f" +version = "4.0.0+1" + +[[Random]] +deps = ["Serialization"] +uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" + +[[Requires]] +deps = ["UUIDs"] +git-tree-sha1 = "4036a3bd08ac7e968e27c203d45f5fff15020621" +uuid = "ae029012-a4dd-5104-9daa-d747884805df" +version = "1.1.3" + +[[SHA]] +uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" + +[[Serialization]] +uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" + +[[Setfield]] +deps = ["ConstructionBase", "Future", "MacroTools", "Requires"] +git-tree-sha1 = "fca29e68c5062722b5b4435594c3d1ba557072a3" +uuid = "efcf1570-3423-57d1-acb7-fd33fddbac46" +version = "0.7.1" + +[[Sockets]] +uuid = "6462fe0b-24de-5631-8697-dd941f90decc" + +[[SparseArrays]] +deps = ["LinearAlgebra", "Random"] +uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" + +[[Statistics]] +deps = ["LinearAlgebra", "SparseArrays"] +uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" + +[[TOML]] +deps = ["Dates"] +uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76" + +[[Tar]] +deps = ["ArgTools", "SHA"] +uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e" + +[[TextWrap]] +git-tree-sha1 = "9250ef9b01b66667380cf3275b3f7488d0e25faf" +uuid = 
"b718987f-49a8-5099-9789-dcd902bef87d" +version = "1.0.1" + +[[TimerOutputs]] +deps = ["ExprTools", "Printf"] +git-tree-sha1 = "209a8326c4f955e2442c07b56029e88bb48299c7" +uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" +version = "0.5.12" + +[[UUIDs]] +deps = ["Random", "SHA"] +uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" + +[[UnPack]] +git-tree-sha1 = "387c1f73762231e86e0c9c5443ce3b4a0a9a0c2b" +uuid = "3a884ed6-31ef-47d7-9d2a-63182c4928ed" +version = "1.0.2" + +[[Unicode]] +uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" + +[[XML2_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Libiconv_jll", "Pkg", "Zlib_jll"] +git-tree-sha1 = "1acf5bdf07aa0907e0a37d3718bb88d4b687b74a" +uuid = "02c8fc9c-b97f-50b9-bbe4-9be30ff0a78a" +version = "2.9.12+0" + +[[XSLT_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Libgcrypt_jll", "Libgpg_error_jll", "Libiconv_jll", "Pkg", "XML2_jll", "Zlib_jll"] +git-tree-sha1 = "91844873c4085240b95e795f692c4cec4d805f8a" +uuid = "aed1982a-8fda-507f-9586-7b0439959a61" +version = "1.1.34+0" + +[[XZ_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "a921669cd9a45c23031fd4eb904f5cc3d20de415" +uuid = "ffd25f8a-64ca-5728-b0f7-c24cf3aae800" +version = "5.2.5+2" + +[[Xorg_libX11_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libxcb_jll", "Xorg_xtrans_jll"] +git-tree-sha1 = "5be649d550f3f4b95308bf0183b82e2582876527" +uuid = "4f6342f7-b3d2-589e-9d20-edeb45f2b2bc" +version = "1.6.9+4" + +[[Xorg_libXau_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "4e490d5c960c314f33885790ed410ff3a94ce67e" +uuid = "0c0b7dd1-d40b-584c-a123-a41640f87eec" +version = "1.0.9+4" + +[[Xorg_libXdmcp_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "4fe47bd2247248125c428978740e18a681372dd4" +uuid = "a3789734-cfe1-5b06-b2d0-1dd0d9d62d05" +version = "1.1.3+4" + +[[Xorg_libXext_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libX11_jll"] +git-tree-sha1 = 
"b7c0aa8c376b31e4852b360222848637f481f8c3" +uuid = "1082639a-0dae-5f34-9b06-72781eeb8cb3" +version = "1.3.4+4" + +[[Xorg_libpthread_stubs_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "6783737e45d3c59a4a4c4091f5f88cdcf0908cbb" +uuid = "14d82f49-176c-5ed1-bb49-ad3f5cbd8c74" +version = "0.1.0+3" + +[[Xorg_libxcb_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "XSLT_jll", "Xorg_libXau_jll", "Xorg_libXdmcp_jll", "Xorg_libpthread_stubs_jll"] +git-tree-sha1 = "daf17f441228e7a3833846cd048892861cff16d6" +uuid = "c7cfdc94-dc32-55de-ac96-5a1b8d977c5b" +version = "1.13.0+3" + +[[Xorg_xorgproto_jll]] +deps = ["Libdl", "Pkg"] +git-tree-sha1 = "9a9eb8ce756fe0bca01b4be16da770e18d264972" +uuid = "c4d99508-4286-5418-9131-c86396af500b" +version = "2019.2.0+2" + +[[Xorg_xtrans_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "79c31e7844f6ecf779705fbc12146eb190b7d845" +uuid = "c5fb5394-a638-5e4d-96e5-b29de1b5cf10" +version = "1.4.0+3" + +[[Zlib_jll]] +deps = ["Libdl"] +uuid = "83775a58-1f1d-513f-b197-d71354ab007a" + +[[argp_standalone_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "feaf9f6293003c2bf53056fd6930d677ed340b34" +uuid = "c53206cc-00f7-50bf-ad1e-3ae1f6e49bc3" +version = "1.3.1+0" + +[[fts_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "78732b942383d2cb521df8a1a0814911144e663d" +uuid = "d65627f6-89bd-53e8-8ab5-8b75ff535eee" +version = "1.2.7+1" + +[[hsa_rocr_jll]] +deps = ["Artifacts", "Elfutils_jll", "JLLWrappers", "Libdl", "NUMA_jll", "Pkg", "Zlib_jll", "hsakmt_roct_jll"] +git-tree-sha1 = "df8d73efec8b1e53ad527d208f5343c0368f0fcd" +uuid = "dd59ff1a-a01a-568d-8b29-0669330f116a" +version = "4.0.0+0" + +[[hsakmt_roct_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "NUMA_jll", "Pkg"] +git-tree-sha1 = "80e0c9940e15cfd6f1f1e9d9f3953ec4d48d3d4a" +uuid = "1cecccd7-a9b6-5045-9cdc-a44c19b16d76" +version = "4.0.0+0" + +[[nghttp2_jll]] +deps = ["Artifacts", 
"Libdl"] +uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d" + +[[obstack_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "1c4a6b66e934fc6db4649cb2910c72f53bbfea7e" +uuid = "c88a4935-d25e-5644-aacc-5db6f1b8ef79" +version = "1.2.2+0" + +[[p7zip_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0" diff --git a/src/julia/JuliaStream.jl/AMDGPU/Project.toml b/src/julia/JuliaStream.jl/AMDGPU/Project.toml new file mode 100644 index 0000000..5ab8447 --- /dev/null +++ b/src/julia/JuliaStream.jl/AMDGPU/Project.toml @@ -0,0 +1,7 @@ +[deps] +AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" +ArgParse = "c7e460c6-2fb9-53a9-8c5b-16f535851c63" +Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a" + +[compat] +julia = "1.6" diff --git a/src/julia/JuliaStream.jl/CUDA/Manifest.toml b/src/julia/JuliaStream.jl/CUDA/Manifest.toml new file mode 100644 index 0000000..ef6da14 --- /dev/null +++ b/src/julia/JuliaStream.jl/CUDA/Manifest.toml @@ -0,0 +1,316 @@ +# This file is machine-generated - editing it directly is not advised + +[[AbstractFFTs]] +deps = ["LinearAlgebra"] +git-tree-sha1 = "485ee0867925449198280d4af84bdb46a2a404d0" +uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c" +version = "1.0.1" + +[[Adapt]] +deps = ["LinearAlgebra"] +git-tree-sha1 = "84918055d15b3114ede17ac6a7182f68870c16f7" +uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" +version = "3.3.1" + +[[ArgParse]] +deps = ["Logging", "TextWrap"] +git-tree-sha1 = "3102bce13da501c9104df33549f511cd25264d7d" +uuid = "c7e460c6-2fb9-53a9-8c5b-16f535851c63" +version = "1.1.4" + +[[ArgTools]] +uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f" + +[[Artifacts]] +uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" + +[[BFloat16s]] +deps = ["LinearAlgebra", "Test"] +git-tree-sha1 = "4af69e205efc343068dc8722b8dfec1ade89254a" +uuid = "ab4f0b2a-ad5b-11e8-123f-65d77653426b" +version = "0.1.0" + +[[Base64]] +uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" + +[[CEnum]] +git-tree-sha1 = 
"215a9aa4a1f23fbd05b92769fdd62559488d70e9" +uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82" +version = "0.4.1" + +[[CUDA]] +deps = ["AbstractFFTs", "Adapt", "BFloat16s", "CEnum", "CompilerSupportLibraries_jll", "ExprTools", "GPUArrays", "GPUCompiler", "LLVM", "LazyArtifacts", "Libdl", "LinearAlgebra", "Logging", "Printf", "Random", "Random123", "RandomNumbers", "Reexport", "Requires", "SparseArrays", "SpecialFunctions", "TimerOutputs"] +git-tree-sha1 = "c583f3ccdce071b8a8bce9bf3d5d5409eaf36d2b" +uuid = "052768ef-5323-5732-b1bb-66c8b64840ba" +version = "3.4.1" + +[[ChainRulesCore]] +deps = ["Compat", "LinearAlgebra", "SparseArrays"] +git-tree-sha1 = "bdc0937269321858ab2a4f288486cb258b9a0af7" +uuid = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" +version = "1.3.0" + +[[Compat]] +deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "SHA", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"] +git-tree-sha1 = "727e463cfebd0c7b999bbf3e9e7e16f254b94193" +uuid = "34da2185-b29b-5c13-b0c7-acf172513d20" +version = "3.34.0" + +[[CompilerSupportLibraries_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae" + +[[Dates]] +deps = ["Printf"] +uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" + +[[DelimitedFiles]] +deps = ["Mmap"] +uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab" + +[[Distributed]] +deps = ["Random", "Serialization", "Sockets"] +uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b" + +[[DocStringExtensions]] +deps = ["LibGit2"] +git-tree-sha1 = "a32185f5428d3986f47c2ab78b1f216d5e6cc96f" +uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae" +version = "0.8.5" + +[[Downloads]] +deps = ["ArgTools", "LibCURL", "NetworkOptions"] +uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6" + +[[ExprTools]] +git-tree-sha1 = "b7e3d17636b348f005f11040025ae8c6f645fe92" +uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04" +version 
= "0.1.6" + +[[GPUArrays]] +deps = ["Adapt", "LinearAlgebra", "Printf", "Random", "Serialization", "Statistics"] +git-tree-sha1 = "8fac1cf7d6ce0f2249c7acaf25d22e1e85c4a07f" +uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" +version = "8.0.2" + +[[GPUCompiler]] +deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "TimerOutputs", "UUIDs"] +git-tree-sha1 = "4ed2616d5e656c8716736b64da86755467f26cf5" +uuid = "61eb1bfa-7361-4325-ad38-22787b887f55" +version = "0.12.9" + +[[InteractiveUtils]] +deps = ["Markdown"] +uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" + +[[IrrationalConstants]] +git-tree-sha1 = "f76424439413893a832026ca355fe273e93bce94" +uuid = "92d709cd-6900-40b7-9082-c6be49f344b6" +version = "0.1.0" + +[[JLLWrappers]] +deps = ["Preferences"] +git-tree-sha1 = "642a199af8b68253517b80bd3bfd17eb4e84df6e" +uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210" +version = "1.3.0" + +[[LLVM]] +deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Printf", "Unicode"] +git-tree-sha1 = "23a47d417a3cd9c2e73c854bac7dd4731c105ef7" +uuid = "929cbde3-209d-540e-8aea-75f648917ca0" +version = "4.4.0" + +[[LLVMExtra_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "9c360e5ce980b88bb31a7b086dbb19469008154b" +uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab" +version = "0.0.10+0" + +[[LazyArtifacts]] +deps = ["Artifacts", "Pkg"] +uuid = "4af54fe1-eca0-43a8-85a7-787d91b784e3" + +[[LibCURL]] +deps = ["LibCURL_jll", "MozillaCACerts_jll"] +uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21" + +[[LibCURL_jll]] +deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"] +uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0" + +[[LibGit2]] +deps = ["Base64", "NetworkOptions", "Printf", "SHA"] +uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" + +[[LibSSH2_jll]] +deps = ["Artifacts", "Libdl", "MbedTLS_jll"] +uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8" + +[[Libdl]] +uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" + +[[LinearAlgebra]] +deps = ["Libdl"] +uuid = 
"37e2e46d-f89d-539d-b4ee-838fcccc9c8e" + +[[LogExpFunctions]] +deps = ["DocStringExtensions", "IrrationalConstants", "LinearAlgebra"] +git-tree-sha1 = "3d682c07e6dd250ed082f883dc88aee7996bf2cc" +uuid = "2ab3a3ac-af41-5b50-aa03-7779005ae688" +version = "0.3.0" + +[[Logging]] +uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" + +[[Markdown]] +deps = ["Base64"] +uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" + +[[MbedTLS_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1" + +[[Mmap]] +uuid = "a63ad114-7e13-5084-954f-fe012c677804" + +[[MozillaCACerts_jll]] +uuid = "14a3606d-f60d-562e-9121-12d972cd8159" + +[[NetworkOptions]] +uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908" + +[[OpenSpecFun_jll]] +deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "13652491f6856acfd2db29360e1bbcd4565d04f1" +uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e" +version = "0.5.5+0" + +[[OrderedCollections]] +git-tree-sha1 = "85f8e6578bf1f9ee0d11e7bb1b1456435479d47c" +uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" +version = "1.4.1" + +[[Parameters]] +deps = ["OrderedCollections", "UnPack"] +git-tree-sha1 = "2276ac65f1e236e0a6ea70baff3f62ad4c625345" +uuid = "d96e819e-fc66-5662-9728-84c9c7592b0a" +version = "0.12.2" + +[[Pkg]] +deps = ["Artifacts", "Dates", "Downloads", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"] +uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" + +[[Preferences]] +deps = ["TOML"] +git-tree-sha1 = "00cfd92944ca9c760982747e9a1d0d5d86ab1e5a" +uuid = "21216c6a-2e73-6563-6e65-726566657250" +version = "1.2.2" + +[[Printf]] +deps = ["Unicode"] +uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" + +[[REPL]] +deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"] +uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" + +[[Random]] +deps = ["Serialization"] +uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" + +[[Random123]] +deps = ["Libdl", 
"Random", "RandomNumbers"] +git-tree-sha1 = "0e8b146557ad1c6deb1367655e052276690e71a3" +uuid = "74087812-796a-5b5d-8853-05524746bad3" +version = "1.4.2" + +[[RandomNumbers]] +deps = ["Random", "Requires"] +git-tree-sha1 = "043da614cc7e95c703498a491e2c21f58a2b8111" +uuid = "e6cf234a-135c-5ec9-84dd-332b85af5143" +version = "1.5.3" + +[[Reexport]] +git-tree-sha1 = "45e428421666073eab6f2da5c9d310d99bb12f9b" +uuid = "189a3867-3050-52da-a836-e630ba90ab69" +version = "1.2.2" + +[[Requires]] +deps = ["UUIDs"] +git-tree-sha1 = "4036a3bd08ac7e968e27c203d45f5fff15020621" +uuid = "ae029012-a4dd-5104-9daa-d747884805df" +version = "1.1.3" + +[[SHA]] +uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" + +[[Serialization]] +uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" + +[[SharedArrays]] +deps = ["Distributed", "Mmap", "Random", "Serialization"] +uuid = "1a1011a3-84de-559e-8e89-a11a2f7dc383" + +[[Sockets]] +uuid = "6462fe0b-24de-5631-8697-dd941f90decc" + +[[SparseArrays]] +deps = ["LinearAlgebra", "Random"] +uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" + +[[SpecialFunctions]] +deps = ["ChainRulesCore", "LogExpFunctions", "OpenSpecFun_jll"] +git-tree-sha1 = "a322a9493e49c5f3a10b50df3aedaf1cdb3244b7" +uuid = "276daf66-3868-5448-9aa4-cd146d93841b" +version = "1.6.1" + +[[Statistics]] +deps = ["LinearAlgebra", "SparseArrays"] +uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" + +[[TOML]] +deps = ["Dates"] +uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76" + +[[Tar]] +deps = ["ArgTools", "SHA"] +uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e" + +[[Test]] +deps = ["InteractiveUtils", "Logging", "Random", "Serialization"] +uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" + +[[TextWrap]] +git-tree-sha1 = "9250ef9b01b66667380cf3275b3f7488d0e25faf" +uuid = "b718987f-49a8-5099-9789-dcd902bef87d" +version = "1.0.1" + +[[TimerOutputs]] +deps = ["ExprTools", "Printf"] +git-tree-sha1 = "209a8326c4f955e2442c07b56029e88bb48299c7" +uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" +version = "0.5.12" + +[[UUIDs]] 
+deps = ["Random", "SHA"] +uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" + +[[UnPack]] +git-tree-sha1 = "387c1f73762231e86e0c9c5443ce3b4a0a9a0c2b" +uuid = "3a884ed6-31ef-47d7-9d2a-63182c4928ed" +version = "1.0.2" + +[[Unicode]] +uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" + +[[Zlib_jll]] +deps = ["Libdl"] +uuid = "83775a58-1f1d-513f-b197-d71354ab007a" + +[[nghttp2_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d" + +[[p7zip_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0" diff --git a/src/julia/JuliaStream.jl/CUDA/Project.toml b/src/julia/JuliaStream.jl/CUDA/Project.toml new file mode 100644 index 0000000..e50582e --- /dev/null +++ b/src/julia/JuliaStream.jl/CUDA/Project.toml @@ -0,0 +1,7 @@ +[deps] +ArgParse = "c7e460c6-2fb9-53a9-8c5b-16f535851c63" +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" +Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a" + +[compat] +julia = "1.6" diff --git a/src/julia/JuliaStream.jl/KernelAbstractions/Manifest.toml b/src/julia/JuliaStream.jl/KernelAbstractions/Manifest.toml new file mode 100644 index 0000000..bfc562f --- /dev/null +++ b/src/julia/JuliaStream.jl/KernelAbstractions/Manifest.toml @@ -0,0 +1,547 @@ +# This file is machine-generated - editing it directly is not advised + +[[AMDGPU]] +deps = ["AbstractFFTs", "Adapt", "BinaryProvider", "CEnum", "GPUArrays", "GPUCompiler", "HIP_jll", "LLVM", "Libdl", "LinearAlgebra", "MacroTools", "Pkg", "Printf", "ROCmDeviceLibs_jll", "Random", "Requires", "Setfield", "hsa_rocr_jll"] +git-tree-sha1 = "d64c97447a753cfbf0158d6c7be513f34526d559" +uuid = "21141c5a-9bdb-4563-92ae-f87d6854732e" +version = "0.2.12" + +[[AbstractFFTs]] +deps = ["LinearAlgebra"] +git-tree-sha1 = "485ee0867925449198280d4af84bdb46a2a404d0" +uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c" +version = "1.0.1" + +[[Adapt]] +deps = ["LinearAlgebra"] +git-tree-sha1 = "84918055d15b3114ede17ac6a7182f68870c16f7" +uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" 
+version = "3.3.1" + +[[ArgParse]] +deps = ["Logging", "TextWrap"] +git-tree-sha1 = "3102bce13da501c9104df33549f511cd25264d7d" +uuid = "c7e460c6-2fb9-53a9-8c5b-16f535851c63" +version = "1.1.4" + +[[ArgTools]] +uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f" + +[[Artifacts]] +uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" + +[[BFloat16s]] +deps = ["LinearAlgebra", "Test"] +git-tree-sha1 = "4af69e205efc343068dc8722b8dfec1ade89254a" +uuid = "ab4f0b2a-ad5b-11e8-123f-65d77653426b" +version = "0.1.0" + +[[Base64]] +uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" + +[[BinaryProvider]] +deps = ["Libdl", "Logging", "SHA"] +git-tree-sha1 = "ecdec412a9abc8db54c0efc5548c64dfce072058" +uuid = "b99e7846-7c00-51b0-8f62-c81ae34c0232" +version = "0.5.10" + +[[Bzip2_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "19a35467a82e236ff51bc17a3a44b69ef35185a2" +uuid = "6e34b625-4abd-537c-b88f-471c36dfa7a0" +version = "1.0.8+0" + +[[CEnum]] +git-tree-sha1 = "215a9aa4a1f23fbd05b92769fdd62559488d70e9" +uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82" +version = "0.4.1" + +[[CUDA]] +deps = ["AbstractFFTs", "Adapt", "BFloat16s", "CEnum", "CompilerSupportLibraries_jll", "DataStructures", "ExprTools", "GPUArrays", "GPUCompiler", "LLVM", "LazyArtifacts", "Libdl", "LinearAlgebra", "Logging", "Printf", "Random", "Random123", "RandomNumbers", "Reexport", "Requires", "SparseArrays", "SpecialFunctions", "TimerOutputs"] +git-tree-sha1 = "5e696e37e51b01ae07bd9f700afe6cbd55250bce" +uuid = "052768ef-5323-5732-b1bb-66c8b64840ba" +version = "3.3.4" + +[[CUDAKernels]] +deps = ["Adapt", "CUDA", "Cassette", "KernelAbstractions", "SpecialFunctions", "StaticArrays"] +git-tree-sha1 = "81f76297b63c67723b1d60f5e7e002ae3393974b" +uuid = "72cfdca4-0801-4ab0-bf6a-d52aa10adc57" +version = "0.3.0" + +[[Cassette]] +git-tree-sha1 = "b4b1d61ebbae2bc69a45e3a6b8439b4e411bc131" +uuid = "7057c7e9-c182-5462-911a-8362d720325c" +version = "0.3.8" + +[[ChainRulesCore]] +deps = ["Compat", "LinearAlgebra", 
"SparseArrays"] +git-tree-sha1 = "bdc0937269321858ab2a4f288486cb258b9a0af7" +uuid = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" +version = "1.3.0" + +[[Compat]] +deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "SHA", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"] +git-tree-sha1 = "727e463cfebd0c7b999bbf3e9e7e16f254b94193" +uuid = "34da2185-b29b-5c13-b0c7-acf172513d20" +version = "3.34.0" + +[[CompilerSupportLibraries_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae" + +[[ConstructionBase]] +deps = ["LinearAlgebra"] +git-tree-sha1 = "f74e9d5388b8620b4cee35d4c5a618dd4dc547f4" +uuid = "187b0558-2788-49d3-abe0-74a17ed4e7c9" +version = "1.3.0" + +[[DataStructures]] +deps = ["Compat", "InteractiveUtils", "OrderedCollections"] +git-tree-sha1 = "7d9d316f04214f7efdbb6398d545446e246eff02" +uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" +version = "0.18.10" + +[[Dates]] +deps = ["Printf"] +uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" + +[[DelimitedFiles]] +deps = ["Mmap"] +uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab" + +[[Distributed]] +deps = ["Random", "Serialization", "Sockets"] +uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b" + +[[DocStringExtensions]] +deps = ["LibGit2"] +git-tree-sha1 = "a32185f5428d3986f47c2ab78b1f216d5e6cc96f" +uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae" +version = "0.8.5" + +[[Downloads]] +deps = ["ArgTools", "LibCURL", "NetworkOptions"] +uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6" + +[[Elfutils_jll]] +deps = ["Artifacts", "Bzip2_jll", "JLLWrappers", "Libdl", "Pkg", "XZ_jll", "Zlib_jll", "argp_standalone_jll", "fts_jll", "obstack_jll"] +git-tree-sha1 = "8f9fcde6d89b0a3ca51cb2028beab462705c5436" +uuid = "ab5a07f8-06af-567f-a878-e8bb879eba5a" +version = "0.182.0+0" + +[[ExprTools]] +git-tree-sha1 = "b7e3d17636b348f005f11040025ae8c6f645fe92" 
+uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04" +version = "0.1.6" + +[[Future]] +deps = ["Random"] +uuid = "9fa8497b-333b-5362-9e8d-4d0656e87820" + +[[GPUArrays]] +deps = ["AbstractFFTs", "Adapt", "LinearAlgebra", "Printf", "Random", "Serialization", "Statistics"] +git-tree-sha1 = "ececbf05f8904c92814bdbd0aafd5540b0bf2e9a" +uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" +version = "7.0.1" + +[[GPUCompiler]] +deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "TimerOutputs", "UUIDs"] +git-tree-sha1 = "4ed2616d5e656c8716736b64da86755467f26cf5" +uuid = "61eb1bfa-7361-4325-ad38-22787b887f55" +version = "0.12.9" + +[[HIP_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "ROCmCompilerSupport_jll", "ROCmDeviceLibs_jll", "ROCmOpenCLRuntime_jll", "hsa_rocr_jll"] +git-tree-sha1 = "5097d8f7b6842156ab0928371b3d03fefd8decab" +uuid = "2696aab5-0948-5276-aa9a-2a86a37016b8" +version = "4.0.0+1" + +[[InteractiveUtils]] +deps = ["Markdown"] +uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" + +[[IrrationalConstants]] +git-tree-sha1 = "f76424439413893a832026ca355fe273e93bce94" +uuid = "92d709cd-6900-40b7-9082-c6be49f344b6" +version = "0.1.0" + +[[JLLWrappers]] +deps = ["Preferences"] +git-tree-sha1 = "642a199af8b68253517b80bd3bfd17eb4e84df6e" +uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210" +version = "1.3.0" + +[[KernelAbstractions]] +deps = ["Adapt", "Cassette", "InteractiveUtils", "MacroTools", "SpecialFunctions", "StaticArrays", "UUIDs"] +git-tree-sha1 = "5e6c70389c1b1e40adb81664ca8cea6ce8127afc" +uuid = "63c18a36-062a-441e-b654-da1e3ab1ce7c" +version = "0.7.0" + +[[LLVM]] +deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Printf", "Unicode"] +git-tree-sha1 = "23a47d417a3cd9c2e73c854bac7dd4731c105ef7" +uuid = "929cbde3-209d-540e-8aea-75f648917ca0" +version = "4.4.0" + +[[LLVMExtra_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "9c360e5ce980b88bb31a7b086dbb19469008154b" +uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab" +version = "0.0.10+0" 
+ +[[LazyArtifacts]] +deps = ["Artifacts", "Pkg"] +uuid = "4af54fe1-eca0-43a8-85a7-787d91b784e3" + +[[LibCURL]] +deps = ["LibCURL_jll", "MozillaCACerts_jll"] +uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21" + +[[LibCURL_jll]] +deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"] +uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0" + +[[LibGit2]] +deps = ["Base64", "NetworkOptions", "Printf", "SHA"] +uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" + +[[LibSSH2_jll]] +deps = ["Artifacts", "Libdl", "MbedTLS_jll"] +uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8" + +[[Libdl]] +uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" + +[[Libgcrypt_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Libgpg_error_jll", "Pkg"] +git-tree-sha1 = "64613c82a59c120435c067c2b809fc61cf5166ae" +uuid = "d4300ac3-e22c-5743-9152-c294e39db1e4" +version = "1.8.7+0" + +[[Libglvnd_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libX11_jll", "Xorg_libXext_jll"] +git-tree-sha1 = "7739f837d6447403596a75d19ed01fd08d6f56bf" +uuid = "7e76a0d4-f3c7-5321-8279-8d96eeed0f29" +version = "1.3.0+3" + +[[Libgpg_error_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "c333716e46366857753e273ce6a69ee0945a6db9" +uuid = "7add5ba3-2f88-524e-9cd5-f83b8a55f7b8" +version = "1.42.0+0" + +[[Libiconv_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "42b62845d70a619f063a7da093d995ec8e15e778" +uuid = "94ce4f54-9a6c-5748-9c1c-f9c7231a4531" +version = "1.16.1+1" + +[[LinearAlgebra]] +deps = ["Libdl"] +uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" + +[[LogExpFunctions]] +deps = ["DocStringExtensions", "IrrationalConstants", "LinearAlgebra"] +git-tree-sha1 = "3d682c07e6dd250ed082f883dc88aee7996bf2cc" +uuid = "2ab3a3ac-af41-5b50-aa03-7779005ae688" +version = "0.3.0" + +[[Logging]] +uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" + +[[MacroTools]] +deps = ["Markdown", "Random"] +git-tree-sha1 = "0fb723cd8c45858c22169b2e42269e53271a6df7" 
+uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" +version = "0.5.7" + +[[Markdown]] +deps = ["Base64"] +uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" + +[[MbedTLS_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1" + +[[Mmap]] +uuid = "a63ad114-7e13-5084-954f-fe012c677804" + +[[MozillaCACerts_jll]] +uuid = "14a3606d-f60d-562e-9121-12d972cd8159" + +[[NUMA_jll]] +deps = ["Libdl", "Pkg"] +git-tree-sha1 = "778f9bd14400cff2c32ed357e12766ac0e3d766e" +uuid = "7f51dc2b-bb24-59f8-b771-bb1490e4195d" +version = "2.0.13+1" + +[[NetworkOptions]] +uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908" + +[[OpenSpecFun_jll]] +deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "13652491f6856acfd2db29360e1bbcd4565d04f1" +uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e" +version = "0.5.5+0" + +[[OrderedCollections]] +git-tree-sha1 = "85f8e6578bf1f9ee0d11e7bb1b1456435479d47c" +uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" +version = "1.4.1" + +[[Parameters]] +deps = ["OrderedCollections", "UnPack"] +git-tree-sha1 = "2276ac65f1e236e0a6ea70baff3f62ad4c625345" +uuid = "d96e819e-fc66-5662-9728-84c9c7592b0a" +version = "0.12.2" + +[[Pkg]] +deps = ["Artifacts", "Dates", "Downloads", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"] +uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" + +[[Preferences]] +deps = ["TOML"] +git-tree-sha1 = "00cfd92944ca9c760982747e9a1d0d5d86ab1e5a" +uuid = "21216c6a-2e73-6563-6e65-726566657250" +version = "1.2.2" + +[[Printf]] +deps = ["Unicode"] +uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" + +[[REPL]] +deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"] +uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" + +[[ROCKernels]] +deps = ["AMDGPU", "Adapt", "Cassette", "KernelAbstractions", "SpecialFunctions", "StaticArrays"] +git-tree-sha1 = "41105b861342637dde17797bdd9aaa537aca646b" +uuid = 
"7eb9e9f0-4bd3-4c4c-8bef-26bd9629d9b9" +version = "0.2.0" + +[[ROCmCompilerSupport_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "ROCmDeviceLibs_jll", "hsa_rocr_jll"] +git-tree-sha1 = "56ddcfb5d8b60c9f8c1bc619886f8d363fd1926d" +uuid = "8fbdd1d2-db62-5cd0-981e-905da1486e17" +version = "4.0.0+1" + +[[ROCmDeviceLibs_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Zlib_jll"] +git-tree-sha1 = "d764f0f28b5af89aa004871a6a38e5d061f77257" +uuid = "873c0968-716b-5aa7-bb8d-d1e2e2aeff2d" +version = "4.0.0+0" + +[[ROCmOpenCLRuntime_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Libglvnd_jll", "Pkg", "ROCmCompilerSupport_jll", "ROCmDeviceLibs_jll", "Xorg_libX11_jll", "Xorg_xorgproto_jll", "hsa_rocr_jll"] +git-tree-sha1 = "f9e3e2cb40a7990535efa7da9b9dd0e0b458a973" +uuid = "10ae2a08-2eea-53f8-8c20-eec175020e9f" +version = "4.0.0+1" + +[[Random]] +deps = ["Serialization"] +uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" + +[[Random123]] +deps = ["Libdl", "Random", "RandomNumbers"] +git-tree-sha1 = "0e8b146557ad1c6deb1367655e052276690e71a3" +uuid = "74087812-796a-5b5d-8853-05524746bad3" +version = "1.4.2" + +[[RandomNumbers]] +deps = ["Random", "Requires"] +git-tree-sha1 = "043da614cc7e95c703498a491e2c21f58a2b8111" +uuid = "e6cf234a-135c-5ec9-84dd-332b85af5143" +version = "1.5.3" + +[[Reexport]] +git-tree-sha1 = "45e428421666073eab6f2da5c9d310d99bb12f9b" +uuid = "189a3867-3050-52da-a836-e630ba90ab69" +version = "1.2.2" + +[[Requires]] +deps = ["UUIDs"] +git-tree-sha1 = "4036a3bd08ac7e968e27c203d45f5fff15020621" +uuid = "ae029012-a4dd-5104-9daa-d747884805df" +version = "1.1.3" + +[[SHA]] +uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" + +[[Serialization]] +uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" + +[[Setfield]] +deps = ["ConstructionBase", "Future", "MacroTools", "Requires"] +git-tree-sha1 = "fca29e68c5062722b5b4435594c3d1ba557072a3" +uuid = "efcf1570-3423-57d1-acb7-fd33fddbac46" +version = "0.7.1" + +[[SharedArrays]] +deps = ["Distributed", "Mmap", 
"Random", "Serialization"] +uuid = "1a1011a3-84de-559e-8e89-a11a2f7dc383" + +[[Sockets]] +uuid = "6462fe0b-24de-5631-8697-dd941f90decc" + +[[SparseArrays]] +deps = ["LinearAlgebra", "Random"] +uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" + +[[SpecialFunctions]] +deps = ["ChainRulesCore", "LogExpFunctions", "OpenSpecFun_jll"] +git-tree-sha1 = "a322a9493e49c5f3a10b50df3aedaf1cdb3244b7" +uuid = "276daf66-3868-5448-9aa4-cd146d93841b" +version = "1.6.1" + +[[StaticArrays]] +deps = ["LinearAlgebra", "Random", "Statistics"] +git-tree-sha1 = "3240808c6d463ac46f1c1cd7638375cd22abbccb" +uuid = "90137ffa-7385-5640-81b9-e52037218182" +version = "1.2.12" + +[[Statistics]] +deps = ["LinearAlgebra", "SparseArrays"] +uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" + +[[TOML]] +deps = ["Dates"] +uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76" + +[[Tar]] +deps = ["ArgTools", "SHA"] +uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e" + +[[Test]] +deps = ["InteractiveUtils", "Logging", "Random", "Serialization"] +uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" + +[[TextWrap]] +git-tree-sha1 = "9250ef9b01b66667380cf3275b3f7488d0e25faf" +uuid = "b718987f-49a8-5099-9789-dcd902bef87d" +version = "1.0.1" + +[[TimerOutputs]] +deps = ["ExprTools", "Printf"] +git-tree-sha1 = "209a8326c4f955e2442c07b56029e88bb48299c7" +uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" +version = "0.5.12" + +[[UUIDs]] +deps = ["Random", "SHA"] +uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" + +[[UnPack]] +git-tree-sha1 = "387c1f73762231e86e0c9c5443ce3b4a0a9a0c2b" +uuid = "3a884ed6-31ef-47d7-9d2a-63182c4928ed" +version = "1.0.2" + +[[Unicode]] +uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" + +[[XML2_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Libiconv_jll", "Pkg", "Zlib_jll"] +git-tree-sha1 = "1acf5bdf07aa0907e0a37d3718bb88d4b687b74a" +uuid = "02c8fc9c-b97f-50b9-bbe4-9be30ff0a78a" +version = "2.9.12+0" + +[[XSLT_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Libgcrypt_jll", "Libgpg_error_jll", "Libiconv_jll", 
"Pkg", "XML2_jll", "Zlib_jll"] +git-tree-sha1 = "91844873c4085240b95e795f692c4cec4d805f8a" +uuid = "aed1982a-8fda-507f-9586-7b0439959a61" +version = "1.1.34+0" + +[[XZ_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "a921669cd9a45c23031fd4eb904f5cc3d20de415" +uuid = "ffd25f8a-64ca-5728-b0f7-c24cf3aae800" +version = "5.2.5+2" + +[[Xorg_libX11_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libxcb_jll", "Xorg_xtrans_jll"] +git-tree-sha1 = "5be649d550f3f4b95308bf0183b82e2582876527" +uuid = "4f6342f7-b3d2-589e-9d20-edeb45f2b2bc" +version = "1.6.9+4" + +[[Xorg_libXau_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "4e490d5c960c314f33885790ed410ff3a94ce67e" +uuid = "0c0b7dd1-d40b-584c-a123-a41640f87eec" +version = "1.0.9+4" + +[[Xorg_libXdmcp_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "4fe47bd2247248125c428978740e18a681372dd4" +uuid = "a3789734-cfe1-5b06-b2d0-1dd0d9d62d05" +version = "1.1.3+4" + +[[Xorg_libXext_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libX11_jll"] +git-tree-sha1 = "b7c0aa8c376b31e4852b360222848637f481f8c3" +uuid = "1082639a-0dae-5f34-9b06-72781eeb8cb3" +version = "1.3.4+4" + +[[Xorg_libpthread_stubs_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "6783737e45d3c59a4a4c4091f5f88cdcf0908cbb" +uuid = "14d82f49-176c-5ed1-bb49-ad3f5cbd8c74" +version = "0.1.0+3" + +[[Xorg_libxcb_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "XSLT_jll", "Xorg_libXau_jll", "Xorg_libXdmcp_jll", "Xorg_libpthread_stubs_jll"] +git-tree-sha1 = "daf17f441228e7a3833846cd048892861cff16d6" +uuid = "c7cfdc94-dc32-55de-ac96-5a1b8d977c5b" +version = "1.13.0+3" + +[[Xorg_xorgproto_jll]] +deps = ["Libdl", "Pkg"] +git-tree-sha1 = "9a9eb8ce756fe0bca01b4be16da770e18d264972" +uuid = "c4d99508-4286-5418-9131-c86396af500b" +version = "2019.2.0+2" + +[[Xorg_xtrans_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] 
+git-tree-sha1 = "79c31e7844f6ecf779705fbc12146eb190b7d845" +uuid = "c5fb5394-a638-5e4d-96e5-b29de1b5cf10" +version = "1.4.0+3" + +[[Zlib_jll]] +deps = ["Libdl"] +uuid = "83775a58-1f1d-513f-b197-d71354ab007a" + +[[argp_standalone_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "feaf9f6293003c2bf53056fd6930d677ed340b34" +uuid = "c53206cc-00f7-50bf-ad1e-3ae1f6e49bc3" +version = "1.3.1+0" + +[[fts_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "78732b942383d2cb521df8a1a0814911144e663d" +uuid = "d65627f6-89bd-53e8-8ab5-8b75ff535eee" +version = "1.2.7+1" + +[[hsa_rocr_jll]] +deps = ["Artifacts", "Elfutils_jll", "JLLWrappers", "Libdl", "NUMA_jll", "Pkg", "Zlib_jll", "hsakmt_roct_jll"] +git-tree-sha1 = "df8d73efec8b1e53ad527d208f5343c0368f0fcd" +uuid = "dd59ff1a-a01a-568d-8b29-0669330f116a" +version = "4.0.0+0" + +[[hsakmt_roct_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "NUMA_jll", "Pkg"] +git-tree-sha1 = "80e0c9940e15cfd6f1f1e9d9f3953ec4d48d3d4a" +uuid = "1cecccd7-a9b6-5045-9cdc-a44c19b16d76" +version = "4.0.0+0" + +[[nghttp2_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d" + +[[obstack_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "1c4a6b66e934fc6db4649cb2910c72f53bbfea7e" +uuid = "c88a4935-d25e-5644-aacc-5db6f1b8ef79" +version = "1.2.2+0" + +[[p7zip_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0" diff --git a/src/julia/JuliaStream.jl/KernelAbstractions/Project.toml b/src/julia/JuliaStream.jl/KernelAbstractions/Project.toml new file mode 100644 index 0000000..71715ff --- /dev/null +++ b/src/julia/JuliaStream.jl/KernelAbstractions/Project.toml @@ -0,0 +1,11 @@ +[deps] +AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" +ArgParse = "c7e460c6-2fb9-53a9-8c5b-16f535851c63" +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" +CUDAKernels = "72cfdca4-0801-4ab0-bf6a-d52aa10adc57" +KernelAbstractions = 
"63c18a36-062a-441e-b654-da1e3ab1ce7c" +Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a" +ROCKernels = "7eb9e9f0-4bd3-4c4c-8bef-26bd9629d9b9" + +[compat] +julia = "1.6" diff --git a/src/julia/JuliaStream.jl/Manifest.toml b/src/julia/JuliaStream.jl/Manifest.toml new file mode 100644 index 0000000..c096e05 --- /dev/null +++ b/src/julia/JuliaStream.jl/Manifest.toml @@ -0,0 +1,493 @@ +# This file is machine-generated - editing it directly is not advised + +[[AMDGPU]] +deps = ["AbstractFFTs", "Adapt", "BinaryProvider", "CEnum", "GPUArrays", "GPUCompiler", "LLVM", "Libdl", "LinearAlgebra", "MacroTools", "Printf", "Random", "Requires", "Setfield", "hsa_rocr_jll", "hsakmt_roct_jll"] +git-tree-sha1 = "04fdb3923ac6f55fa7347dce0f0f6f10e321e2e9" +uuid = "21141c5a-9bdb-4563-92ae-f87d6854732e" +version = "0.2.7" + +[[AbstractFFTs]] +deps = ["LinearAlgebra"] +git-tree-sha1 = "485ee0867925449198280d4af84bdb46a2a404d0" +uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c" +version = "1.0.1" + +[[Adapt]] +deps = ["LinearAlgebra"] +git-tree-sha1 = "84918055d15b3114ede17ac6a7182f68870c16f7" +uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" +version = "3.3.1" + +[[ArgParse]] +deps = ["Logging", "TextWrap"] +git-tree-sha1 = "3102bce13da501c9104df33549f511cd25264d7d" +uuid = "c7e460c6-2fb9-53a9-8c5b-16f535851c63" +version = "1.1.4" + +[[ArgTools]] +uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f" + +[[Artifacts]] +uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" + +[[BFloat16s]] +deps = ["LinearAlgebra", "Test"] +git-tree-sha1 = "4af69e205efc343068dc8722b8dfec1ade89254a" +uuid = "ab4f0b2a-ad5b-11e8-123f-65d77653426b" +version = "0.1.0" + +[[Base64]] +uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" + +[[BinaryProvider]] +deps = ["Libdl", "Logging", "SHA"] +git-tree-sha1 = "ecdec412a9abc8db54c0efc5548c64dfce072058" +uuid = "b99e7846-7c00-51b0-8f62-c81ae34c0232" +version = "0.5.10" + +[[Bzip2_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = 
"19a35467a82e236ff51bc17a3a44b69ef35185a2" +uuid = "6e34b625-4abd-537c-b88f-471c36dfa7a0" +version = "1.0.8+0" + +[[CEnum]] +git-tree-sha1 = "215a9aa4a1f23fbd05b92769fdd62559488d70e9" +uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82" +version = "0.4.1" + +[[CUDA]] +deps = ["AbstractFFTs", "Adapt", "BFloat16s", "CEnum", "CompilerSupportLibraries_jll", "DataStructures", "ExprTools", "GPUArrays", "GPUCompiler", "LLVM", "LazyArtifacts", "Libdl", "LinearAlgebra", "Logging", "MacroTools", "Memoize", "Printf", "Random", "Random123", "RandomNumbers", "Reexport", "Requires", "SparseArrays", "SpecialFunctions", "TimerOutputs"] +git-tree-sha1 = "364179416eabc34c9ca32126a6bdb431680c3bad" +uuid = "052768ef-5323-5732-b1bb-66c8b64840ba" +version = "3.2.1" + +[[CUDAKernels]] +deps = ["Adapt", "CUDA", "Cassette", "KernelAbstractions", "SpecialFunctions", "StaticArrays"] +git-tree-sha1 = "81f76297b63c67723b1d60f5e7e002ae3393974b" +uuid = "72cfdca4-0801-4ab0-bf6a-d52aa10adc57" +version = "0.3.0" + +[[Cassette]] +git-tree-sha1 = "b4b1d61ebbae2bc69a45e3a6b8439b4e411bc131" +uuid = "7057c7e9-c182-5462-911a-8362d720325c" +version = "0.3.8" + +[[ChainRulesCore]] +deps = ["Compat", "LinearAlgebra", "SparseArrays"] +git-tree-sha1 = "bdc0937269321858ab2a4f288486cb258b9a0af7" +uuid = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" +version = "1.3.0" + +[[Compat]] +deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "SHA", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"] +git-tree-sha1 = "727e463cfebd0c7b999bbf3e9e7e16f254b94193" +uuid = "34da2185-b29b-5c13-b0c7-acf172513d20" +version = "3.34.0" + +[[CompilerSupportLibraries_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae" + +[[ConstructionBase]] +deps = ["LinearAlgebra"] +git-tree-sha1 = "f74e9d5388b8620b4cee35d4c5a618dd4dc547f4" +uuid = 
"187b0558-2788-49d3-abe0-74a17ed4e7c9" +version = "1.3.0" + +[[DataStructures]] +deps = ["Compat", "InteractiveUtils", "OrderedCollections"] +git-tree-sha1 = "7d9d316f04214f7efdbb6398d545446e246eff02" +uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" +version = "0.18.10" + +[[Dates]] +deps = ["Printf"] +uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" + +[[DelimitedFiles]] +deps = ["Mmap"] +uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab" + +[[Distributed]] +deps = ["Random", "Serialization", "Sockets"] +uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b" + +[[DocStringExtensions]] +deps = ["LibGit2"] +git-tree-sha1 = "a32185f5428d3986f47c2ab78b1f216d5e6cc96f" +uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae" +version = "0.8.5" + +[[Downloads]] +deps = ["ArgTools", "LibCURL", "NetworkOptions"] +uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6" + +[[Elfutils_jll]] +deps = ["Artifacts", "Bzip2_jll", "JLLWrappers", "Libdl", "Pkg", "XZ_jll", "Zlib_jll", "argp_standalone_jll", "fts_jll", "obstack_jll"] +git-tree-sha1 = "8f9fcde6d89b0a3ca51cb2028beab462705c5436" +uuid = "ab5a07f8-06af-567f-a878-e8bb879eba5a" +version = "0.182.0+0" + +[[ExprTools]] +git-tree-sha1 = "b7e3d17636b348f005f11040025ae8c6f645fe92" +uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04" +version = "0.1.6" + +[[Future]] +deps = ["Random"] +uuid = "9fa8497b-333b-5362-9e8d-4d0656e87820" + +[[GPUArrays]] +deps = ["AbstractFFTs", "Adapt", "LinearAlgebra", "Printf", "Random", "Serialization", "Statistics"] +git-tree-sha1 = "df5b8569904c5c10e84c640984cfff054b18c086" +uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" +version = "6.4.1" + +[[GPUCompiler]] +deps = ["DataStructures", "ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "Scratch", "Serialization", "TimerOutputs", "UUIDs"] +git-tree-sha1 = "42d635f6d87af125b86288df3819f805fb4d851a" +uuid = "61eb1bfa-7361-4325-ad38-22787b887f55" +version = "0.11.5" + +[[InteractiveUtils]] +deps = ["Markdown"] +uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" + +[[IrrationalConstants]] 
+git-tree-sha1 = "f76424439413893a832026ca355fe273e93bce94" +uuid = "92d709cd-6900-40b7-9082-c6be49f344b6" +version = "0.1.0" + +[[JLLWrappers]] +deps = ["Preferences"] +git-tree-sha1 = "642a199af8b68253517b80bd3bfd17eb4e84df6e" +uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210" +version = "1.3.0" + +[[KernelAbstractions]] +deps = ["Adapt", "Cassette", "InteractiveUtils", "MacroTools", "SpecialFunctions", "StaticArrays", "UUIDs"] +git-tree-sha1 = "5e6c70389c1b1e40adb81664ca8cea6ce8127afc" +uuid = "63c18a36-062a-441e-b654-da1e3ab1ce7c" +version = "0.7.0" + +[[LLVM]] +deps = ["CEnum", "Libdl", "Printf", "Unicode"] +git-tree-sha1 = "f57ac3fd2045b50d3db081663837ac5b4096947e" +uuid = "929cbde3-209d-540e-8aea-75f648917ca0" +version = "3.9.0" + +[[LazyArtifacts]] +deps = ["Artifacts", "Pkg"] +uuid = "4af54fe1-eca0-43a8-85a7-787d91b784e3" + +[[LibCURL]] +deps = ["LibCURL_jll", "MozillaCACerts_jll"] +uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21" + +[[LibCURL_jll]] +deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"] +uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0" + +[[LibGit2]] +deps = ["Base64", "NetworkOptions", "Printf", "SHA"] +uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" + +[[LibSSH2_jll]] +deps = ["Artifacts", "Libdl", "MbedTLS_jll"] +uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8" + +[[Libdl]] +uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" + +[[LinearAlgebra]] +deps = ["Libdl"] +uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" + +[[LogExpFunctions]] +deps = ["DocStringExtensions", "IrrationalConstants", "LinearAlgebra"] +git-tree-sha1 = "3d682c07e6dd250ed082f883dc88aee7996bf2cc" +uuid = "2ab3a3ac-af41-5b50-aa03-7779005ae688" +version = "0.3.0" + +[[Logging]] +uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" + +[[MacroTools]] +deps = ["Markdown", "Random"] +git-tree-sha1 = "0fb723cd8c45858c22169b2e42269e53271a6df7" +uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" +version = "0.5.7" + +[[Markdown]] +deps = ["Base64"] +uuid = 
"d6f4376e-aef5-505a-96c1-9c027394607a" + +[[MbedTLS_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1" + +[[Memoize]] +deps = ["MacroTools"] +git-tree-sha1 = "2b1dfcba103de714d31c033b5dacc2e4a12c7caa" +uuid = "c03570c3-d221-55d1-a50c-7939bbd78826" +version = "0.4.4" + +[[Mmap]] +uuid = "a63ad114-7e13-5084-954f-fe012c677804" + +[[MozillaCACerts_jll]] +uuid = "14a3606d-f60d-562e-9121-12d972cd8159" + +[[NEO_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "gmmlib_jll", "libigc_jll", "oneAPI_Level_Zero_Headers_jll"] +git-tree-sha1 = "c753dd029eb0837658bf8eaee041c19e4ce5bb8c" +uuid = "700fe977-ac61-5f37-bbc8-c6c4b2b6a9fd" +version = "21.12.19358+0" + +[[NUMA_jll]] +deps = ["Libdl", "Pkg"] +git-tree-sha1 = "778f9bd14400cff2c32ed357e12766ac0e3d766e" +uuid = "7f51dc2b-bb24-59f8-b771-bb1490e4195d" +version = "2.0.13+1" + +[[NetworkOptions]] +uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908" + +[[OpenSpecFun_jll]] +deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "13652491f6856acfd2db29360e1bbcd4565d04f1" +uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e" +version = "0.5.5+0" + +[[OrderedCollections]] +git-tree-sha1 = "85f8e6578bf1f9ee0d11e7bb1b1456435479d47c" +uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" +version = "1.4.1" + +[[Parameters]] +deps = ["OrderedCollections", "UnPack"] +git-tree-sha1 = "2276ac65f1e236e0a6ea70baff3f62ad4c625345" +uuid = "d96e819e-fc66-5662-9728-84c9c7592b0a" +version = "0.12.2" + +[[Pkg]] +deps = ["Artifacts", "Dates", "Downloads", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"] +uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" + +[[Preferences]] +deps = ["TOML"] +git-tree-sha1 = "00cfd92944ca9c760982747e9a1d0d5d86ab1e5a" +uuid = "21216c6a-2e73-6563-6e65-726566657250" +version = "1.2.2" + +[[Printf]] +deps = ["Unicode"] +uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" + +[[REPL]] 
+deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"] +uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" + +[[ROCKernels]] +deps = ["AMDGPU", "Adapt", "Cassette", "KernelAbstractions", "SpecialFunctions", "StaticArrays"] +git-tree-sha1 = "41105b861342637dde17797bdd9aaa537aca646b" +uuid = "7eb9e9f0-4bd3-4c4c-8bef-26bd9629d9b9" +version = "0.2.0" + +[[Random]] +deps = ["Serialization"] +uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" + +[[Random123]] +deps = ["Libdl", "Random", "RandomNumbers"] +git-tree-sha1 = "0e8b146557ad1c6deb1367655e052276690e71a3" +uuid = "74087812-796a-5b5d-8853-05524746bad3" +version = "1.4.2" + +[[RandomNumbers]] +deps = ["Random", "Requires"] +git-tree-sha1 = "043da614cc7e95c703498a491e2c21f58a2b8111" +uuid = "e6cf234a-135c-5ec9-84dd-332b85af5143" +version = "1.5.3" + +[[Reexport]] +git-tree-sha1 = "45e428421666073eab6f2da5c9d310d99bb12f9b" +uuid = "189a3867-3050-52da-a836-e630ba90ab69" +version = "1.2.2" + +[[Requires]] +deps = ["UUIDs"] +git-tree-sha1 = "4036a3bd08ac7e968e27c203d45f5fff15020621" +uuid = "ae029012-a4dd-5104-9daa-d747884805df" +version = "1.1.3" + +[[SHA]] +uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" + +[[SPIRV_LLVM_Translator_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "8cca87d57f6ddf19373cc9791fddc741406c8fbf" +uuid = "4a5d46fc-d8cf-5151-a261-86b458210efb" +version = "11.0.0+2" + +[[SPIRV_Tools_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "c0324b7e07bc4649f755bfe7e00f7c6ed6aa353f" +uuid = "6ac6d60f-d740-5983-97d7-a4482c0689f4" +version = "2021.2.0+0" + +[[Scratch]] +deps = ["Dates"] +git-tree-sha1 = "0b4b7f1393cff97c33891da2a0bf69c6ed241fda" +uuid = "6c6a2e73-6563-6170-7368-637461726353" +version = "1.1.0" + +[[Serialization]] +uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" + +[[Setfield]] +deps = ["ConstructionBase", "Future", "MacroTools", "Requires"] +git-tree-sha1 = "fca29e68c5062722b5b4435594c3d1ba557072a3" +uuid = 
"efcf1570-3423-57d1-acb7-fd33fddbac46" +version = "0.7.1" + +[[SharedArrays]] +deps = ["Distributed", "Mmap", "Random", "Serialization"] +uuid = "1a1011a3-84de-559e-8e89-a11a2f7dc383" + +[[Sockets]] +uuid = "6462fe0b-24de-5631-8697-dd941f90decc" + +[[SparseArrays]] +deps = ["LinearAlgebra", "Random"] +uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" + +[[SpecialFunctions]] +deps = ["ChainRulesCore", "LogExpFunctions", "OpenSpecFun_jll"] +git-tree-sha1 = "a322a9493e49c5f3a10b50df3aedaf1cdb3244b7" +uuid = "276daf66-3868-5448-9aa4-cd146d93841b" +version = "1.6.1" + +[[StaticArrays]] +deps = ["LinearAlgebra", "Random", "Statistics"] +git-tree-sha1 = "3240808c6d463ac46f1c1cd7638375cd22abbccb" +uuid = "90137ffa-7385-5640-81b9-e52037218182" +version = "1.2.12" + +[[Statistics]] +deps = ["LinearAlgebra", "SparseArrays"] +uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" + +[[TOML]] +deps = ["Dates"] +uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76" + +[[Tar]] +deps = ["ArgTools", "SHA"] +uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e" + +[[Test]] +deps = ["InteractiveUtils", "Logging", "Random", "Serialization"] +uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" + +[[TextWrap]] +git-tree-sha1 = "9250ef9b01b66667380cf3275b3f7488d0e25faf" +uuid = "b718987f-49a8-5099-9789-dcd902bef87d" +version = "1.0.1" + +[[TimerOutputs]] +deps = ["ExprTools", "Printf"] +git-tree-sha1 = "209a8326c4f955e2442c07b56029e88bb48299c7" +uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" +version = "0.5.12" + +[[UUIDs]] +deps = ["Random", "SHA"] +uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" + +[[UnPack]] +git-tree-sha1 = "387c1f73762231e86e0c9c5443ce3b4a0a9a0c2b" +uuid = "3a884ed6-31ef-47d7-9d2a-63182c4928ed" +version = "1.0.2" + +[[Unicode]] +uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" + +[[XZ_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "a921669cd9a45c23031fd4eb904f5cc3d20de415" +uuid = "ffd25f8a-64ca-5728-b0f7-c24cf3aae800" +version = "5.2.5+2" + +[[Zlib_jll]] +deps = ["Libdl"] 
+uuid = "83775a58-1f1d-513f-b197-d71354ab007a" + +[[argp_standalone_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "feaf9f6293003c2bf53056fd6930d677ed340b34" +uuid = "c53206cc-00f7-50bf-ad1e-3ae1f6e49bc3" +version = "1.3.1+0" + +[[fts_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "78732b942383d2cb521df8a1a0814911144e663d" +uuid = "d65627f6-89bd-53e8-8ab5-8b75ff535eee" +version = "1.2.7+1" + +[[gmmlib_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "4067ef455d4fa67febe26efc3f9565a9bb7ba911" +uuid = "09858cae-167c-5acb-9302-fddc6874d481" +version = "20.3.2+0" + +[[hsa_rocr_jll]] +deps = ["Artifacts", "Elfutils_jll", "JLLWrappers", "Libdl", "NUMA_jll", "Pkg", "Zlib_jll", "hsakmt_roct_jll"] +git-tree-sha1 = "42189f176d6ae4f37c0c0e652fec339bb0bfab5d" +uuid = "dd59ff1a-a01a-568d-8b29-0669330f116a" +version = "3.7.0+1" + +[[hsakmt_roct_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "NUMA_jll", "Pkg"] +git-tree-sha1 = "8a9ee6c091e952e4ea6585d15131d43f789ae041" +uuid = "1cecccd7-a9b6-5045-9cdc-a44c19b16d76" +version = "3.8.0+0" + +[[libigc_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "6140dbf267f7ab57fb791b49f2114374218b5c20" +uuid = "94295238-5935-5bd7-bb0f-b00942e9bdd5" +version = "1.0.6712+0" + +[[nghttp2_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d" + +[[obstack_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "1c4a6b66e934fc6db4649cb2910c72f53bbfea7e" +uuid = "c88a4935-d25e-5644-aacc-5db6f1b8ef79" +version = "1.2.2+0" + +[[oneAPI]] +deps = ["Adapt", "CEnum", "ExprTools", "GPUArrays", "GPUCompiler", "LLVM", "LinearAlgebra", "NEO_jll", "Printf", "Random", "SPIRV_LLVM_Translator_jll", "SPIRV_Tools_jll", "SpecialFunctions", "oneAPI_Level_Zero_Loader_jll"] +git-tree-sha1 = "b4a4b84c864e75fe885a1643525f0c97ce310dd9" +uuid = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b" +version = "0.1.3" + 
+[[oneAPI_Level_Zero_Headers_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "48982fbfd2f3d0a30d644563dcf96892d252b395" +uuid = "f4bc562b-d309-54f8-9efb-476e56f0410d" +version = "1.1.2+1" + +[[oneAPI_Level_Zero_Loader_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "oneAPI_Level_Zero_Headers_jll"] +git-tree-sha1 = "1fa53dfdd32a732f09c254c86403e1abab653fb2" +uuid = "13eca655-d68d-5b81-8367-6d99d727ab01" +version = "1.3.6+0" + +[[p7zip_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0" diff --git a/src/julia/JuliaStream.jl/Project.toml b/src/julia/JuliaStream.jl/Project.toml new file mode 100644 index 0000000..9c7d49d --- /dev/null +++ b/src/julia/JuliaStream.jl/Project.toml @@ -0,0 +1,19 @@ +name = "JuliaStream" +uuid = "1bdcc9b7-f5ed-4705-bc7b-be1b748ec681" +authors = ["Wei-Chen Lin "] +version = "3.4.0" + +[deps] +AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" +ArgParse = "c7e460c6-2fb9-53a9-8c5b-16f535851c63" +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" +CUDAKernels = "72cfdca4-0801-4ab0-bf6a-d52aa10adc57" +Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b" +ExprTools = "e2ba6199-217a-4e67-a87a-7c52f15ade04" +KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" +Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a" +ROCKernels = "7eb9e9f0-4bd3-4c4c-8bef-26bd9629d9b9" +oneAPI = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b" + +[compat] +julia = "1.6" diff --git a/src/julia/JuliaStream.jl/README.md b/src/julia/JuliaStream.jl/README.md new file mode 100644 index 0000000..6204da7 --- /dev/null +++ b/src/julia/JuliaStream.jl/README.md @@ -0,0 +1,67 @@ +JuliaStream.jl +============== + +This is an implementation of BabelStream in Julia which contains the following variants: + + * `PlainStream.jl` - Single threaded `for` + * `ThreadedStream.jl` - Threaded implementation with `Threads.@threads` macros + * `DistributedStream.jl` - Process based parallelism with `@distributed` macros + * 
`CUDAStream.jl` - Direct port of BabelStream's native CUDA implementation using [CUDA.jl](https://github.com/JuliaGPU/CUDA.jl) + * `AMDGPUStream.jl` - Direct port of BabelStream's native HIP implementation using [AMDGPU.jl](https://github.com/JuliaGPU/AMDGPU.jl) + * `oneAPIStream.jl` - Direct port of BabelStream's native SYCL implementation using [oneAPI.jl](https://github.com/JuliaGPU/oneAPI.jl) + * `KernelAbstractionsStream.jl` - Direct port of BabelStream's native CUDA implementation using [KernelAbstractions.jl](https://github.com/JuliaGPU/KernelAbstractions.jl) + +### Build & Run + +Prerequisites + + * Julia >= 1.6 + +A set of reduced dependency projects is available for the following backends and implementations: + + * `AMDGPU` supports: + - `AMDGPUStream.jl` + * `CUDA` supports: + - `CUDAStream.jl` + * `oneAPI` supports: + - `oneAPIStream.jl` + * `KernelAbstractions` supports: + - `KernelAbstractionsStream.jl` + * `Threaded` supports: + - `PlainStream.jl` + - `ThreadedStream.jl` + - `DistributedStream.jl` + +With Julia on path, run your selected benchmark with: + +```shell +> cd JuliaStream.jl +> julia --project=<BACKEND> -e 'import Pkg; Pkg.instantiate()' # only required on first run +> julia --project=<BACKEND> src/<IMPL>Stream.jl +``` + +For example, to run the CUDA implementation: + +```shell +> cd JuliaStream.jl +> julia --project=CUDA -e 'import Pkg; Pkg.instantiate()' +> julia --project=CUDA src/CUDAStream.jl +``` + +**Important:** + * Julia is 1-indexed, so N >= 1 in `--device N`. + * Thread count for `ThreadedStream` must be set via the `JULIA_NUM_THREADS` environment variable (e.g. `export JULIA_NUM_THREADS=$(nproc)`) or the `--threads` flag, otherwise it defaults to 1. + * Worker count for `DistributedStream` is set with `-p <N>` as per the [documentation](https://docs.julialang.org/en/v1/manual/distributed-computing). + * Certain implementations such as CUDA and AMDGPU will do hardware detection at runtime and may download and/or compile further software packages for the platform. 
+ +*** + +Alternatively, the top-level project `Project.toml` contains all dependencies needed to run all implementations in `src`. +There may be instances where some packages are locked to an older version because of transitive dependency requirements. + +To run a benchmark using the top-level project: +```shell +> cd JuliaStream.jl +> julia --project -e 'import Pkg; Pkg.instantiate()' +> julia --project src/<IMPL>Stream.jl +``` \ No newline at end of file diff --git a/src/julia/JuliaStream.jl/Threaded/Manifest.toml b/src/julia/JuliaStream.jl/Threaded/Manifest.toml new file mode 100644 index 0000000..608e2da --- /dev/null +++ b/src/julia/JuliaStream.jl/Threaded/Manifest.toml @@ -0,0 +1,31 @@ +# This file is machine-generated - editing it directly is not advised + +[[ArgParse]] +deps = ["Logging", "TextWrap"] +git-tree-sha1 = "3102bce13da501c9104df33549f511cd25264d7d" +uuid = "c7e460c6-2fb9-53a9-8c5b-16f535851c63" +version = "1.1.4" + +[[Logging]] +uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" + +[[OrderedCollections]] +git-tree-sha1 = "85f8e6578bf1f9ee0d11e7bb1b1456435479d47c" +uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" +version = "1.4.1" + +[[Parameters]] +deps = ["OrderedCollections", "UnPack"] +git-tree-sha1 = "2276ac65f1e236e0a6ea70baff3f62ad4c625345" +uuid = "d96e819e-fc66-5662-9728-84c9c7592b0a" +version = "0.12.2" + +[[TextWrap]] +git-tree-sha1 = "9250ef9b01b66667380cf3275b3f7488d0e25faf" +uuid = "b718987f-49a8-5099-9789-dcd902bef87d" +version = "1.0.1" + +[[UnPack]] +git-tree-sha1 = "387c1f73762231e86e0c9c5443ce3b4a0a9a0c2b" +uuid = "3a884ed6-31ef-47d7-9d2a-63182c4928ed" +version = "1.0.2" diff --git a/src/julia/JuliaStream.jl/Threaded/Project.toml b/src/julia/JuliaStream.jl/Threaded/Project.toml new file mode 100644 index 0000000..b65bdf5 --- /dev/null +++ b/src/julia/JuliaStream.jl/Threaded/Project.toml @@ -0,0 +1,6 @@ +[deps] +ArgParse = "c7e460c6-2fb9-53a9-8c5b-16f535851c63" +Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a" + 
+[compat] +julia = "1.6" diff --git a/src/julia/JuliaStream.jl/oneAPI/Manifest.toml b/src/julia/JuliaStream.jl/oneAPI/Manifest.toml new file mode 100644 index 0000000..82c40fd --- /dev/null +++ b/src/julia/JuliaStream.jl/oneAPI/Manifest.toml @@ -0,0 +1,319 @@ +# This file is machine-generated - editing it directly is not advised + +[[Adapt]] +deps = ["LinearAlgebra"] +git-tree-sha1 = "84918055d15b3114ede17ac6a7182f68870c16f7" +uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" +version = "3.3.1" + +[[ArgParse]] +deps = ["Logging", "TextWrap"] +git-tree-sha1 = "3102bce13da501c9104df33549f511cd25264d7d" +uuid = "c7e460c6-2fb9-53a9-8c5b-16f535851c63" +version = "1.1.4" + +[[ArgTools]] +uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f" + +[[Artifacts]] +uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" + +[[Base64]] +uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" + +[[CEnum]] +git-tree-sha1 = "215a9aa4a1f23fbd05b92769fdd62559488d70e9" +uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82" +version = "0.4.1" + +[[ChainRulesCore]] +deps = ["Compat", "LinearAlgebra", "SparseArrays"] +git-tree-sha1 = "bdc0937269321858ab2a4f288486cb258b9a0af7" +uuid = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" +version = "1.3.0" + +[[Compat]] +deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "SHA", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"] +git-tree-sha1 = "727e463cfebd0c7b999bbf3e9e7e16f254b94193" +uuid = "34da2185-b29b-5c13-b0c7-acf172513d20" +version = "3.34.0" + +[[CompilerSupportLibraries_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae" + +[[Dates]] +deps = ["Printf"] +uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" + +[[DelimitedFiles]] +deps = ["Mmap"] +uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab" + +[[Distributed]] +deps = ["Random", "Serialization", "Sockets"] +uuid = 
"8ba89e20-285c-5b6f-9357-94700520ee1b" + +[[DocStringExtensions]] +deps = ["LibGit2"] +git-tree-sha1 = "a32185f5428d3986f47c2ab78b1f216d5e6cc96f" +uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae" +version = "0.8.5" + +[[Downloads]] +deps = ["ArgTools", "LibCURL", "NetworkOptions"] +uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6" + +[[ExprTools]] +git-tree-sha1 = "b7e3d17636b348f005f11040025ae8c6f645fe92" +uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04" +version = "0.1.6" + +[[GPUArrays]] +deps = ["Adapt", "LinearAlgebra", "Printf", "Random", "Serialization", "Statistics"] +git-tree-sha1 = "8fac1cf7d6ce0f2249c7acaf25d22e1e85c4a07f" +uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" +version = "8.0.2" + +[[GPUCompiler]] +deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "TimerOutputs", "UUIDs"] +git-tree-sha1 = "4ed2616d5e656c8716736b64da86755467f26cf5" +uuid = "61eb1bfa-7361-4325-ad38-22787b887f55" +version = "0.12.9" + +[[InteractiveUtils]] +deps = ["Markdown"] +uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" + +[[IrrationalConstants]] +git-tree-sha1 = "f76424439413893a832026ca355fe273e93bce94" +uuid = "92d709cd-6900-40b7-9082-c6be49f344b6" +version = "0.1.0" + +[[JLLWrappers]] +deps = ["Preferences"] +git-tree-sha1 = "642a199af8b68253517b80bd3bfd17eb4e84df6e" +uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210" +version = "1.3.0" + +[[LLVM]] +deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Printf", "Unicode"] +git-tree-sha1 = "23a47d417a3cd9c2e73c854bac7dd4731c105ef7" +uuid = "929cbde3-209d-540e-8aea-75f648917ca0" +version = "4.4.0" + +[[LLVMExtra_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "9c360e5ce980b88bb31a7b086dbb19469008154b" +uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab" +version = "0.0.10+0" + +[[LibCURL]] +deps = ["LibCURL_jll", "MozillaCACerts_jll"] +uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21" + +[[LibCURL_jll]] +deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"] +uuid = 
"deac9b47-8bc7-5906-a0fe-35ac56dc84c0" + +[[LibGit2]] +deps = ["Base64", "NetworkOptions", "Printf", "SHA"] +uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" + +[[LibSSH2_jll]] +deps = ["Artifacts", "Libdl", "MbedTLS_jll"] +uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8" + +[[Libdl]] +uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" + +[[LinearAlgebra]] +deps = ["Libdl"] +uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" + +[[LogExpFunctions]] +deps = ["DocStringExtensions", "IrrationalConstants", "LinearAlgebra"] +git-tree-sha1 = "3d682c07e6dd250ed082f883dc88aee7996bf2cc" +uuid = "2ab3a3ac-af41-5b50-aa03-7779005ae688" +version = "0.3.0" + +[[Logging]] +uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" + +[[Markdown]] +deps = ["Base64"] +uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" + +[[MbedTLS_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1" + +[[Mmap]] +uuid = "a63ad114-7e13-5084-954f-fe012c677804" + +[[MozillaCACerts_jll]] +uuid = "14a3606d-f60d-562e-9121-12d972cd8159" + +[[NEO_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "gmmlib_jll", "libigc_jll", "oneAPI_Level_Zero_Headers_jll"] +git-tree-sha1 = "2bfc354b5684821dcc88f1e477cefd0dd03c60b5" +uuid = "700fe977-ac61-5f37-bbc8-c6c4b2b6a9fd" +version = "21.31.20514+0" + +[[NetworkOptions]] +uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908" + +[[OpenSpecFun_jll]] +deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "13652491f6856acfd2db29360e1bbcd4565d04f1" +uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e" +version = "0.5.5+0" + +[[OrderedCollections]] +git-tree-sha1 = "85f8e6578bf1f9ee0d11e7bb1b1456435479d47c" +uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" +version = "1.4.1" + +[[Parameters]] +deps = ["OrderedCollections", "UnPack"] +git-tree-sha1 = "2276ac65f1e236e0a6ea70baff3f62ad4c625345" +uuid = "d96e819e-fc66-5662-9728-84c9c7592b0a" +version = "0.12.2" + +[[Pkg]] +deps = ["Artifacts", "Dates", "Downloads", "LibGit2", "Libdl", "Logging", 
"Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"] +uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" + +[[Preferences]] +deps = ["TOML"] +git-tree-sha1 = "00cfd92944ca9c760982747e9a1d0d5d86ab1e5a" +uuid = "21216c6a-2e73-6563-6e65-726566657250" +version = "1.2.2" + +[[Printf]] +deps = ["Unicode"] +uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" + +[[REPL]] +deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"] +uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" + +[[Random]] +deps = ["Serialization"] +uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" + +[[SHA]] +uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" + +[[SPIRV_LLVM_Translator_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "8cca87d57f6ddf19373cc9791fddc741406c8fbf" +uuid = "4a5d46fc-d8cf-5151-a261-86b458210efb" +version = "11.0.0+2" + +[[SPIRV_Tools_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "c0324b7e07bc4649f755bfe7e00f7c6ed6aa353f" +uuid = "6ac6d60f-d740-5983-97d7-a4482c0689f4" +version = "2021.2.0+0" + +[[Serialization]] +uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" + +[[SharedArrays]] +deps = ["Distributed", "Mmap", "Random", "Serialization"] +uuid = "1a1011a3-84de-559e-8e89-a11a2f7dc383" + +[[Sockets]] +uuid = "6462fe0b-24de-5631-8697-dd941f90decc" + +[[SparseArrays]] +deps = ["LinearAlgebra", "Random"] +uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" + +[[SpecialFunctions]] +deps = ["ChainRulesCore", "LogExpFunctions", "OpenSpecFun_jll"] +git-tree-sha1 = "a322a9493e49c5f3a10b50df3aedaf1cdb3244b7" +uuid = "276daf66-3868-5448-9aa4-cd146d93841b" +version = "1.6.1" + +[[Statistics]] +deps = ["LinearAlgebra", "SparseArrays"] +uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" + +[[TOML]] +deps = ["Dates"] +uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76" + +[[Tar]] +deps = ["ArgTools", "SHA"] +uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e" + +[[Test]] +deps = ["InteractiveUtils", "Logging", "Random", 
"Serialization"] +uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" + +[[TextWrap]] +git-tree-sha1 = "9250ef9b01b66667380cf3275b3f7488d0e25faf" +uuid = "b718987f-49a8-5099-9789-dcd902bef87d" +version = "1.0.1" + +[[TimerOutputs]] +deps = ["ExprTools", "Printf"] +git-tree-sha1 = "209a8326c4f955e2442c07b56029e88bb48299c7" +uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" +version = "0.5.12" + +[[UUIDs]] +deps = ["Random", "SHA"] +uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" + +[[UnPack]] +git-tree-sha1 = "387c1f73762231e86e0c9c5443ce3b4a0a9a0c2b" +uuid = "3a884ed6-31ef-47d7-9d2a-63182c4928ed" +version = "1.0.2" + +[[Unicode]] +uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" + +[[Zlib_jll]] +deps = ["Libdl"] +uuid = "83775a58-1f1d-513f-b197-d71354ab007a" + +[[gmmlib_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "0d5e5461d21b14853b4c332045c57d2601c403bd" +uuid = "09858cae-167c-5acb-9302-fddc6874d481" +version = "21.2.1+0" + +[[libigc_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "b30a895e7ea52991a3f984ab0302c42858d766c0" +uuid = "94295238-5935-5bd7-bb0f-b00942e9bdd5" +version = "1.0.8173+0" + +[[nghttp2_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d" + +[[oneAPI]] +deps = ["Adapt", "CEnum", "ExprTools", "GPUArrays", "GPUCompiler", "LLVM", "LinearAlgebra", "NEO_jll", "Printf", "Random", "SPIRV_LLVM_Translator_jll", "SPIRV_Tools_jll", "SpecialFunctions", "oneAPI_Level_Zero_Headers_jll", "oneAPI_Level_Zero_Loader_jll"] +git-tree-sha1 = "92e8eefdd4694597994590230ab329545804bdb3" +uuid = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b" +version = "0.2.0" + +[[oneAPI_Level_Zero_Headers_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "e1d123ff9ada6c469a1eaf57e33a74c3cb26a5a4" +uuid = "f4bc562b-d309-54f8-9efb-476e56f0410d" +version = "1.2.13+0" + +[[oneAPI_Level_Zero_Loader_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "oneAPI_Level_Zero_Headers_jll"] 
+git-tree-sha1 = "50124857f7e87420655929a9c8ca86749826af11" +uuid = "13eca655-d68d-5b81-8367-6d99d727ab01" +version = "1.4.1+0" + +[[p7zip_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0" diff --git a/src/julia/JuliaStream.jl/oneAPI/Project.toml b/src/julia/JuliaStream.jl/oneAPI/Project.toml new file mode 100644 index 0000000..9f89f82 --- /dev/null +++ b/src/julia/JuliaStream.jl/oneAPI/Project.toml @@ -0,0 +1,7 @@ +[deps] +ArgParse = "c7e460c6-2fb9-53a9-8c5b-16f535851c63" +Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a" +oneAPI = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b" + +[compat] +julia = "1.6" diff --git a/src/julia/JuliaStream.jl/src/AMDGPUStream.jl b/src/julia/JuliaStream.jl/src/AMDGPUStream.jl new file mode 100644 index 0000000..4dd220c --- /dev/null +++ b/src/julia/JuliaStream.jl/src/AMDGPUStream.jl @@ -0,0 +1,167 @@ +# AMDGPU.jl doesn't support CPU agents, so this isn't a feature-complete ROCmStream, only AMD GPUs +include("Stream.jl") +using AMDGPU + +const ROCData = StreamData{T,ROCArray{T}} where {T} +const TBSize = 1024::Int +const DotBlocks = 256::Int + +function devices()::Vector{DeviceWithRepr} + try + # AMDGPU.agents()'s internal iteration order isn't stable + sorted = sort(AMDGPU.get_agents(:gpu), by = repr) + map(x -> (x, repr(x), "AMDGPU.jl"), sorted) + catch + # probably unsupported + String[] + end +end + +function make_stream( + arraysize::Int, + scalar::T, + device::DeviceWithRepr, + silent::Bool, +)::Tuple{ROCData{T},Nothing} where {T} + + if arraysize % TBSize != 0 + error("arraysize ($(arraysize)) must be divisible by $(TBSize)!") + end + + # XXX AMDGPU doesn't expose an API for setting the default like CUDA.device!() + # but AMDGPU.get_default_agent returns DEFAULT_AGENT so we can do it by hand + AMDGPU.DEFAULT_AGENT[] = device[1] + selected = AMDGPU.get_default_agent() + if !silent + println("Using GPU HSA device: $(AMDGPU.get_name(selected)) ($(repr(selected)))") + println("Kernel parameters : 
<<<$(arraysize),$(TBSize)>>>") + end + return ( + ROCData{T}( + ROCArray{T}(undef, arraysize), + ROCArray{T}(undef, arraysize), + ROCArray{T}(undef, arraysize), + scalar, + arraysize, + ), + nothing, + ) +end + +function init_arrays!(data::ROCData{T}, _, init::Tuple{T,T,T}) where {T} + AMDGPU.fill!(data.a, init[1]) + AMDGPU.fill!(data.b, init[2]) + AMDGPU.fill!(data.c, init[3]) +end + +function copy!(data::ROCData{T}, _) where {T} + function kernel(a::AbstractArray{T}, c::AbstractArray{T}) + i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x # only workgroupIdx starts at 1 + @inbounds c[i] = a[i] + return + end + AMDGPU.wait( + @roc groupsize = TBSize gridsize = data.size kernel(data.a, data.c) + ) +end + +function mul!(data::ROCData{T}, _) where {T} + function kernel(b::AbstractArray{T}, c::AbstractArray{T}, scalar::T) + i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x # only workgroupIdx starts at 1 + @inbounds b[i] = scalar * c[i] + return + end + AMDGPU.wait( + @roc groupsize = TBSize gridsize = data.size kernel(data.b, data.c, data.scalar) + ) +end + +function add!(data::ROCData{T}, _) where {T} + function kernel(a::AbstractArray{T}, b::AbstractArray{T}, c::AbstractArray{T}) + i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x # only workgroupIdx starts at 1 + @inbounds c[i] = a[i] + b[i] + return + end + AMDGPU.wait( + @roc groupsize = TBSize gridsize = data.size kernel(data.a, data.b, data.c) + ) +end + +function triad!(data::ROCData{T}, _) where {T} + function kernel(a::AbstractArray{T}, b::AbstractArray{T}, c::AbstractArray{T}, scalar::T) + i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x # only workgroupIdx starts at 1 + @inbounds a[i] = b[i] + (scalar * c[i]) + return + end + AMDGPU.wait( + @roc groupsize = TBSize gridsize = data.size kernel( + data.a, + data.b, + data.c, + data.scalar, + ) + ) +end + +function nstream!(data::ROCData{T}, _) where {T} + function kernel(a::AbstractArray{T}, 
b::AbstractArray{T}, c::AbstractArray{T}, scalar::T) + i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x # only workgroupIdx starts at 1 + @inbounds a[i] += b[i] + scalar * c[i] + return + end + AMDGPU.wait( + @roc groupsize = TBSize gridsize = data.size kernel( + data.a, + data.b, + data.c, + data.scalar, + ) + ) +end + +function dot(data::ROCData{T}, _) where {T} + function kernel(a::AbstractArray{T}, b::AbstractArray{T}, size::Int, partial::AbstractArray{T}) + tb_sum = ROCDeviceArray((TBSize,), alloc_local(:reduce, T, TBSize)) + local_i = workitemIdx().x + @inbounds tb_sum[local_i] = 0.0 + + # do dot first + i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x # only workgroupIdx starts at 1 + while i <= size + @inbounds tb_sum[local_i] += a[i] * b[i] + i += TBSize * DotBlocks # XXX don't use (workgroupDim().x * gridDimWG().x) here + end + + # then tree reduction + offset = workgroupDim().x ÷ 2 + while offset > 0 + sync_workgroup() + if (local_i - 1) < offset + @inbounds tb_sum[local_i] += tb_sum[local_i+offset] + end + offset ÷= 2 + end + + if (local_i == 1) + @inbounds partial[workgroupIdx().x] = tb_sum[local_i] + end + + return + end + partial_sum = ROCArray{T}(undef, DotBlocks) + AMDGPU.wait( + @roc groupsize = TBSize gridsize = TBSize * DotBlocks kernel( + data.a, + data.b, + data.size, + partial_sum, + ) + ) + return sum(partial_sum) +end + +function read_data(data::ROCData{T}, _)::VectorData{T} where {T} + return VectorData{T}(data.a, data.b, data.c, data.scalar, data.size) +end + +main() \ No newline at end of file diff --git a/src/julia/JuliaStream.jl/src/CUDAStream.jl b/src/julia/JuliaStream.jl/src/CUDAStream.jl new file mode 100644 index 0000000..da3698e --- /dev/null +++ b/src/julia/JuliaStream.jl/src/CUDAStream.jl @@ -0,0 +1,152 @@ +include("Stream.jl") +using CUDA + +const CuData = StreamData{T,CuArray{T}} where {T} +const TBSize = 1024::Int +const DotBlocks = 256::Int + +function devices()::Vector{DeviceWithRepr} + return 
!CUDA.functional(false) ? String[] : + map(d -> (d, "$(CUDA.name(d)) ($(repr(d)))", "CUDA.jl"), CUDA.devices()) +end + +function make_stream( + arraysize::Int, + scalar::T, + device::DeviceWithRepr, + silent::Bool, +)::Tuple{CuData{T},Nothing} where {T} + + if arraysize % TBSize != 0 + error("arraysize ($(arraysize)) must be divisible by $(TBSize)!") + end + + CUDA.device!(device[1]) + selected = CUDA.device() + # show_reason is set to true here so it dumps CUDA info + # for us regardless of whether it's functional + if !CUDA.functional(true) + error("Non-functional CUDA configuration") + end + if !silent + println("Using CUDA device: $(CUDA.name(selected)) ($(repr(selected)))") + println("Kernel parameters: <<<$(arraysize ÷ TBSize),$(TBSize)>>>") + end + return ( + CuData{T}( + CuArray{T}(undef, arraysize), + CuArray{T}(undef, arraysize), + CuArray{T}(undef, arraysize), + scalar, + arraysize, + ), + nothing, + ) +end + +function init_arrays!(data::CuData{T}, _, init::Tuple{T,T,T}) where {T} + fill!(data.a, init[1]) + fill!(data.b, init[2]) + fill!(data.c, init[3]) +end + +function copy!(data::CuData{T}, _) where {T} + function kernel(a::AbstractArray{T}, c::AbstractArray{T}) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + @inbounds c[i] = a[i] + return + end + @cuda blocks = data.size ÷ TBSize threads = TBSize kernel(data.a, data.c) + CUDA.synchronize() +end + +function mul!(data::CuData{T}, _) where {T} + function kernel(b::AbstractArray{T}, c::AbstractArray{T}, scalar::T) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + @inbounds b[i] = scalar * c[i] + return + end + @cuda blocks = data.size ÷ TBSize threads = TBSize kernel(data.b, data.c, data.scalar) + CUDA.synchronize() +end + +function add!(data::CuData{T}, _) where {T} + function kernel(a::AbstractArray{T}, b::AbstractArray{T}, c::AbstractArray{T}) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + @inbounds c[i] = a[i] + b[i] + return + end + @cuda blocks = data.size ÷ TBSize 
threads = TBSize kernel(data.a, data.b, data.c) + CUDA.synchronize() +end + +function triad!(data::CuData{T}, _) where {T} + function kernel(a::AbstractArray{T}, b::AbstractArray{T}, c::AbstractArray{T}, scalar::T) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + @inbounds a[i] = b[i] + (scalar * c[i]) + return + end + @cuda blocks = data.size ÷ TBSize threads = TBSize kernel( + data.a, + data.b, + data.c, + data.scalar, + ) + CUDA.synchronize() +end + +function nstream!(data::CuData{T}, _) where {T} + function kernel(a::AbstractArray{T}, b::AbstractArray{T}, c::AbstractArray{T}, scalar::T) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + @inbounds a[i] += b[i] + scalar * c[i] + return + end + @cuda blocks = data.size ÷ TBSize threads = TBSize kernel( + data.a, + data.b, + data.c, + data.scalar, + ) + CUDA.synchronize() +end + +function dot(data::CuData{T}, _) where {T} + # direct port of the reduction in CUDAStream.cu + function kernel(a::AbstractArray{T}, b::AbstractArray{T}, size::Int, partial::AbstractArray{T}) + tb_sum = @cuStaticSharedMem(T, TBSize) + local_i = threadIdx().x + @inbounds tb_sum[local_i] = 0.0 + + # do dot first + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + while i <= size + @inbounds tb_sum[local_i] += a[i] * b[i] + i += blockDim().x * gridDim().x + end + + # then tree reduction + offset = blockDim().x ÷ 2 + while offset > 0 + sync_threads() + if (local_i - 1) < offset + @inbounds tb_sum[local_i] += tb_sum[local_i+offset] + end + offset ÷= 2 + end + + if (local_i == 1) + @inbounds partial[blockIdx().x] = tb_sum[local_i] + end + + return + end + partial_sum = CuArray{T}(undef, DotBlocks) + @cuda blocks = DotBlocks threads = TBSize kernel(data.a, data.b, data.size, partial_sum) + return sum(partial_sum) +end + +function read_data(data::CuData{T}, _)::VectorData{T} where {T} + return VectorData{T}(data.a, data.b, data.c, data.scalar, data.size) +end + +main() \ No newline at end of file diff --git 
a/src/julia/JuliaStream.jl/src/DistributedStream.jl b/src/julia/JuliaStream.jl/src/DistributedStream.jl new file mode 100644 index 0000000..2e80168 --- /dev/null +++ b/src/julia/JuliaStream.jl/src/DistributedStream.jl @@ -0,0 +1,85 @@ +using Distributed + +@everywhere using Pkg +@everywhere Pkg.activate("."; io = devnull) # don't spam `Activating environment at...` +@everywhere include("StreamData.jl") +@everywhere include("Stream.jl") +@everywhere using SharedArrays +@everywhere const SharedArrayData = StreamData{T,SharedArray{T}} where {T} + +function devices()::Vector{DeviceWithRepr} + return [(undef, "CPU (localhost) $(nworkers())P", "Distributed.jl")] +end + +function make_stream( + arraysize::Int, + scalar::T, + _::DeviceWithRepr, + silent::Bool, +)::Tuple{SharedArrayData{T},Nothing} where {T} + + if !silent + println("Using max $(nworkers()) process(es) + 1 master") + end + return ( + SharedArrayData{T}( + SharedArray{T}(arraysize), + SharedArray{T}(arraysize), + SharedArray{T}(arraysize), + scalar, + arraysize, + ), + nothing, + ) +end + +function init_arrays!(data::SharedArrayData{T}, _, init::Tuple{T,T,T}) where {T} + + @sync @distributed for i = 1:data.size + @inbounds data.a[i] = init[1] + @inbounds data.b[i] = init[2] + @inbounds data.c[i] = init[3] + end +end + +function copy!(data::SharedArrayData{T}, _) where {T} + @sync @distributed for i = 1:data.size + @inbounds data.c[i] = data.a[i] + end +end + +function mul!(data::SharedArrayData{T}, _) where {T} + @sync @distributed for i = 1:data.size + @inbounds data.b[i] = data.scalar * data.c[i] + end +end + +function add!(data::SharedArrayData{T}, _) where {T} + @sync @distributed for i = 1:data.size + @inbounds data.c[i] = data.a[i] + data.b[i] + end +end + +function triad!(data::SharedArrayData{T}, _) where {T} + @sync @distributed for i = 1:data.size + @inbounds data.a[i] = data.b[i] + (data.scalar * data.c[i]) + end +end + +function nstream!(data::SharedArrayData{T}, _) where {T} + @sync @distributed 
for i = 1:data.size + @inbounds data.a[i] += data.b[i] + data.scalar * data.c[i] + end +end + +function dot(data::SharedArrayData{T}, _) where {T} + return @distributed (+) for i = 1:data.size + @inbounds data.a[i] * data.b[i] + end +end + +function read_data(data::SharedArrayData{T}, _)::VectorData{T} where {T} + return VectorData{T}(data.a, data.b, data.c, data.scalar, data.size) +end + +main() \ No newline at end of file diff --git a/src/julia/JuliaStream.jl/src/JuliaStream.jl b/src/julia/JuliaStream.jl/src/JuliaStream.jl new file mode 100644 index 0000000..e01d46d --- /dev/null +++ b/src/julia/JuliaStream.jl/src/JuliaStream.jl @@ -0,0 +1,4 @@ +module JuliaStream +end + +println("Please run benchmarks directly via `julia --project src/Stream.jl`") \ No newline at end of file diff --git a/src/julia/JuliaStream.jl/src/KernelAbstractionsStream.jl b/src/julia/JuliaStream.jl/src/KernelAbstractionsStream.jl new file mode 100644 index 0000000..2b9d9ad --- /dev/null +++ b/src/julia/JuliaStream.jl/src/KernelAbstractionsStream.jl @@ -0,0 +1,255 @@ +using ROCKernels, CUDAKernels, KernelAbstractions, CUDA, AMDGPU +include("Stream.jl") + +const CuData = StreamData{T,CUDA.CuArray{T}} where {T} +const ROCData = StreamData{T,AMDGPU.ROCArray{T}} where {T} + +const TBSize = 1024::Int +const DotBlocks = 256::Int + +@enum Backend cuda rocm cpu + +struct Context + backend::Backend + device::Device +end + +function list_rocm_devices()::Vector{DeviceWithRepr} + try + # AMDGPU.agents()'s internal iteration order isn't stable + sorted = sort(AMDGPU.get_agents(:gpu), by = repr) + map(x -> (x, repr(x), rocm), sorted) + catch + # probably unsupported + [] + end +end + +function list_cuda_devices()::Vector{DeviceWithRepr} + return !CUDA.functional(false) ? 
String[] : + map(d -> (d, "$(CUDA.name(d)) ($(repr(d)))", cuda), CUDA.devices()) +end + +function devices()::Vector{DeviceWithRepr} + cudas = list_cuda_devices() + rocms = list_rocm_devices() + cpus = [(undef, "$(Sys.cpu_info()[1].model) ($(Threads.nthreads())T)", cpu)] + vcat(cpus, cudas, rocms) +end + +function make_stream( + arraysize::Int, + scalar::T, + device::DeviceWithRepr, + silent::Bool, +) where {T} + + if arraysize % TBSize != 0 + error("arraysize ($(arraysize)) must be divisible by $(TBSize)!") + end + + (selected, _, backend) = device + if backend == cpu + if !silent + println("Using CPU with max $(Threads.nthreads()) threads") + end + partialsum = Vector{T}(undef, DotBlocks) + data = VectorData{T}( + Vector{T}(undef, arraysize), + Vector{T}(undef, arraysize), + Vector{T}(undef, arraysize), + scalar, + arraysize, + ) + backenddevice = CPU() + elseif backend == cuda + CUDA.device!(selected) + if CUDA.device() != selected + error("Cannot select CUDA device, expecting $selected, but got $(CUDA.device())") + end + if !CUDA.functional(true) + error("Non-functional CUDA configuration") + end + if !silent + println("Using CUDA device: $(CUDA.name(selected)) ($(repr(selected)))") + end + partialsum = CuArray{T}(undef, DotBlocks) + data = CuData{T}( + CuArray{T}(undef, arraysize), + CuArray{T}(undef, arraysize), + CuArray{T}(undef, arraysize), + scalar, + arraysize, + ) + backenddevice = CUDADevice() + elseif backend == rocm + AMDGPU.DEFAULT_AGENT[] = selected + if AMDGPU.get_default_agent() != selected + error( + "Cannot select HSA device, expecting $selected, but got $(AMDGPU.get_default_agent())", + ) + end + if !silent + println("Using GPU HSA device: $(AMDGPU.get_name(selected)) ($(repr(selected)))") + end + partialsum = ROCArray{T}(undef, DotBlocks) + data = ROCData{T}( + ROCArray{T}(undef, arraysize), + ROCArray{T}(undef, arraysize), + ROCArray{T}(undef, arraysize), + scalar, + arraysize, + ) + backenddevice = ROCDevice() + else + error("unsupported 
backend $(backend)") + end + + if !silent + println("Kernel parameters : <<<$(data.size),$(TBSize)>>>") + end + return (data, Context(backend, backenddevice)) +end + +function init_arrays!( + data::StreamData{T,C}, + context::Context, + init::Tuple{T,T,T}, +) where {T,C} + if context.backend == cpu + Threads.@threads for i = 1:data.size + @inbounds data.a[i] = init[1] + @inbounds data.b[i] = init[2] + @inbounds data.c[i] = init[3] + end + elseif context.backend == cuda + CUDA.fill!(data.a, init[1]) + CUDA.fill!(data.b, init[2]) + CUDA.fill!(data.c, init[3]) + elseif context.backend == rocm + AMDGPU.fill!(data.a, init[1]) + AMDGPU.fill!(data.b, init[2]) + AMDGPU.fill!(data.c, init[3]) + else + error("unsupported backend $(context.backend)") # no local `backend` is bound in this function + end +end + +function copy!(data::StreamData{T,C}, context::Context) where {T,C} + @kernel function kernel(@Const(a::AbstractArray{T}), c) + i = @index(Global) + @inbounds c[i] = a[i] + end + wait(kernel(context.device, TBSize)(data.a, data.c, ndrange = data.size)) +end + +function mul!(data::StreamData{T,C}, context::Context) where {T,C} + @kernel function kernel(b::AbstractArray{T}, @Const(c::AbstractArray{T}), scalar::T) + i = @index(Global) + @inbounds b[i] = scalar * c[i] + end + wait(kernel(context.device, TBSize)(data.b, data.c, data.scalar, ndrange = data.size)) +end + +function add!(data::StreamData{T,C}, context::Context) where {T,C} + @kernel function kernel(@Const(a::AbstractArray{T}), @Const(b::AbstractArray{T}), c) + i = @index(Global) + @inbounds c[i] = a[i] + b[i] + end + wait(kernel(context.device, TBSize)(data.a, data.b, data.c, ndrange = data.size)) +end + +function triad!(data::StreamData{T,C}, context::Context) where {T,C} + @kernel function kernel(a::AbstractArray{T}, @Const(b::AbstractArray{T}), @Const(c), scalar::T) + i = @index(Global) + @inbounds a[i] = b[i] + (scalar * c[i]) + end + wait( + kernel(context.device, TBSize)( + data.a, + data.b, + data.c, + data.scalar, + ndrange = data.size, + ), + ) +end + 
+function nstream!(data::StreamData{T,C}, context::Context) where {T,C} + @kernel function kernel(a::AbstractArray{T}, @Const(b::AbstractArray{T}), @Const(c), scalar::T) + i = @index(Global) + @inbounds a[i] += b[i] + scalar * c[i] + end + wait( + kernel(context.device, TBSize)( + data.a, + data.b, + data.c, + data.scalar, + ndrange = data.size, + ), + ) +end + +function dot(data::StreamData{T,C}, context::Context) where {T,C} + @kernel function kernel(@Const(a::AbstractArray{T}), @Const(b::AbstractArray{T}), size::Int, partial::AbstractArray{T}) + local_i = @index(Local) + group_i = @index(Group) + tb_sum = @localmem T TBSize + @inbounds tb_sum[local_i] = 0.0 + + # do dot first + i = @index(Global) + while i <= size + @inbounds tb_sum[local_i] += a[i] * b[i] + i += TBSize * DotBlocks + end + + # then tree reduction + # FIXME this does not compile when targeting CPUs: + # see https://github.com/JuliaGPU/KernelAbstractions.jl/issues/262 + offset = @private Int64 (1,) + @inbounds begin + offset[1] = @groupsize()[1] ÷ 2 + while offset[1] > 0 + @synchronize + if (local_i - 1) < offset[1] + tb_sum[local_i] += tb_sum[local_i+offset[1]] + end + offset[1] ÷= 2 + end + end + + if (local_i == 1) + @inbounds partial[group_i] = tb_sum[local_i] + end + end + + if context.backend == cpu + partial_sum = Vector{T}(undef, DotBlocks) + elseif context.backend == cuda + partial_sum = CuArray{T}(undef, DotBlocks) + elseif context.backend == rocm + partial_sum = ROCArray{T}(undef, DotBlocks) + else + error("unsupported backend $(backend)") + end + + wait( + kernel(context.device, TBSize)( + data.a, + data.b, + data.size, + partial_sum, + ndrange = TBSize * DotBlocks, + ), + ) + + return sum(partial_sum) +end + +function read_data(data::StreamData{T,C}, _::Context)::VectorData{T} where {T,C} + return VectorData{T}(data.a, data.b, data.c, data.scalar, data.size) +end + +main() diff --git a/src/julia/JuliaStream.jl/src/PlainStream.jl b/src/julia/JuliaStream.jl/src/PlainStream.jl new file 
mode 100644
index 0000000..654d6eb
--- /dev/null
+++ b/src/julia/JuliaStream.jl/src/PlainStream.jl
@@ -0,0 +1,75 @@
+include("Stream.jl")
+
+# The plain backend exposes a single pseudo-device: the host CPU.
+function devices()::Vector{DeviceWithRepr}
+  return [(undef, "CPU", "Plain")]
+end
+
+# Allocate the three uninitialised host arrays; this backend needs no context (nothing).
+function make_stream(
+  arraysize::Int,
+  scalar::T,
+  _::DeviceWithRepr,
+  silent::Bool,
+)::Tuple{VectorData{T},Nothing} where {T}
+  return (
+    VectorData{T}(
+      Vector{T}(undef, arraysize),
+      Vector{T}(undef, arraysize),
+      Vector{T}(undef, arraysize),
+      scalar,
+      arraysize,
+    ),
+    nothing
+  )
+end
+
+# Fill a, b and c with the three initial values.
+function init_arrays!(data::VectorData{T}, _, init::Tuple{T,T,T}) where {T}
+  for i = 1:data.size
+    @inbounds data.a[i] = init[1]
+    @inbounds data.b[i] = init[2]
+    @inbounds data.c[i] = init[3]
+  end
+end
+
+# c[i] = a[i]
+function copy!(data::VectorData{T}, _) where {T}
+  for i = 1:data.size
+    @inbounds data.c[i] = data.a[i]
+  end
+end
+
+# b[i] = scalar * c[i]
+function mul!(data::VectorData{T}, _) where {T}
+  for i = 1:data.size
+    @inbounds data.b[i] = data.scalar * data.c[i]
+  end
+end
+
+# c[i] = a[i] + b[i]
+function add!(data::VectorData{T}, _) where {T}
+  for i = 1:data.size
+    @inbounds data.c[i] = data.a[i] + data.b[i]
+  end
+end
+
+# a[i] = b[i] + scalar * c[i]
+function triad!(data::VectorData{T}, _) where {T}
+  for i = 1:data.size
+    @inbounds data.a[i] = data.b[i] + (data.scalar * data.c[i])
+  end
+end
+
+# a[i] += b[i] + scalar * c[i]
+function nstream!(data::VectorData{T}, _) where {T}
+  for i = 1:data.size
+    @inbounds data.a[i] += data.b[i] + data.scalar * data.c[i]
+  end
+end
+
+# Sequential dot product of a and b.
+function dot(data::VectorData{T}, _) where {T}
+  acc = zero(T) # named `acc` so Base.sum is not shadowed
+  for i = 1:data.size
+    @inbounds acc += data.a[i] * data.b[i]
+  end
+  return acc
+end
+
+# The data already lives in host vectors, so it is returned unchanged.
+function read_data(data::VectorData{T}, _)::VectorData{T} where {T}
+  return data
+end
+
+main()
\ No newline at end of file
diff --git a/src/julia/JuliaStream.jl/src/Stream.jl b/src/julia/JuliaStream.jl/src/Stream.jl
new file mode 100644
index 0000000..1905c81
--- /dev/null
+++ b/src/julia/JuliaStream.jl/src/Stream.jl
@@ -0,0 +1,300 @@
+using ArgParse
+using Parameters
+using Printf
+using Base: Float64, Int
+
+include("StreamData.jl")
+
+# Stream data backed by plain host vectors.
+const VectorData = StreamData{T,Vector{T}} where {T}
+
+# (device handle, human-readable description, backend tag)
+const DeviceWithRepr = Tuple{Any,String,Any}
+
+# Per-iteration wall-clock timings (seconds) for each kernel of the full benchmark.
+struct Timings
+  copy::Vector{Float64}
+  mul::Vector{Float64}
+  add::Vector{Float64}
+  triad::Vector{Float64}
+  dot::Vector{Float64}
+  Timings(n) = new(zeros(n), zeros(n), zeros(n), zeros(n), zeros(n))
+end
+
+@enum Benchmark All Triad Nstream
+
+# Run copy/mul/add/triad/dot `times` times, timing each kernel separately.
+# Returns the timings and the result of the last dot.
+function run_all!(data::StreamData{T,C}, context, times::Int)::Tuple{Timings,T} where {T,C}
+  timings = Timings(times)
+  lastSum::T = 0
+  for i = 1:times
+    @inbounds timings.copy[i] = @elapsed copy!(data, context)
+    @inbounds timings.mul[i] = @elapsed mul!(data, context)
+    @inbounds timings.add[i] = @elapsed add!(data, context)
+    @inbounds timings.triad[i] = @elapsed triad!(data, context)
+    @inbounds timings.dot[i] = @elapsed lastSum = dot(data, context)
+  end
+  return (timings, lastSum)
+end
+
+# Run triad `times` times and return the total elapsed seconds for the whole loop.
+function run_triad!(data::StreamData{T,C}, context, times::Int)::Float64 where {T,C}
+  return @elapsed for _ = 1:times
+    triad!(data, context)
+  end
+end
+
+# Run nstream `times` times and return the per-iteration elapsed seconds.
+function run_nstream!(
+  data::StreamData{T,C},
+  context,
+  times::Int,
+)::Vector{Float64} where {T,C}
+  timings::Vector{Float64} = zeros(times)
+  for i = 1:times
+    @inbounds timings[i] = @elapsed nstream!(data, context)
+  end
+  return timings
+end
+
+# Replay the selected benchmark on scalars to obtain the expected ("gold")
+# values, then compare them with the arrays in `data` and, when `dot` is not
+# `nothing`, with the dot result. Prints a message for every failed check and
+# returns true only if all checks pass.
+function check_solutions(
+  data::StreamData{T,C},
+  times::Int,
+  init::Tuple{T,T,T},
+  benchmark::Benchmark,
+  dot::Union{T,Nothing},
+) where {T,C}
+  (gold_a, gold_b, gold_c) = init
+  for _ = 1:times
+    if benchmark == All
+      gold_c = gold_a
+      gold_b = data.scalar * gold_c
+      gold_c = gold_a + gold_b
+      gold_a = gold_b + data.scalar * gold_c
+    elseif benchmark == Triad
+      gold_a = gold_b + data.scalar * gold_c
+    elseif benchmark == Nstream
+      gold_a += gold_b + data.scalar * gold_c
+    else
+      error("Unknown benchmark $(benchmark)")
+    end
+  end
+
+  # No local below may be called `error`: assigning to that name anywhere in
+  # this function would make it a local for the whole body and break the
+  # `error(...)` call above.
+  tolerance = eps(T) * 100
+  function validate_xs(name::String, xs::AbstractArray{T}, from::T)
+    avg_err = (map(x -> abs(x - from), xs) |> sum) / length(xs)
+    xs_failed = avg_err > tolerance
+    if xs_failed
+      println("Validation failed on $name. Average error $avg_err")
+    end
+    !xs_failed
+  end
+  a_valid = validate_xs("a", data.a, gold_a)
+  b_valid = validate_xs("b", data.b, gold_b)
+  c_valid = validate_xs("c", data.c, gold_c)
+  dot_valid =
+    dot !== nothing ?
+    begin
+      gold_sum = gold_a * gold_b * data.size
+      dot_err = abs((dot - gold_sum) / gold_sum)
+      dot_failed = dot_err > 1.0e-8
+      if dot_failed
+        println(
+          "Validation failed on sum. Error $dot_err \nSum was $dot but should be $gold_sum",
+        )
+      end
+      !dot_failed
+    end : true
+
+  a_valid && b_valid && c_valid && dot_valid
+end
+
+# Command-line configuration together with its default values.
+@with_kw mutable struct Config
+  list::Bool = false
+  device::Int = 1
+  numtimes::Int = 100
+  arraysize::Int = 33554432
+  float::Bool = false
+  triad_only::Bool = false
+  nstream_only::Bool = false
+  csv::Bool = false
+  mibibytes::Bool = false
+end
+
+# Parse the command line and overwrite the fields of `given` in place; the
+# values already in `given` serve as defaults for the options that take one.
+function parse_options(given::Config)
+  s = ArgParseSettings()
+  @add_arg_table s begin
+    "--list"
+    help = "List available devices"
+    action = :store_true
+    "--device", "-d"
+    help = "Select device at DEVICE, NOTE: Julia is 1-indexed"
+    arg_type = Int
+    default = given.device
+    "--numtimes", "-n"
+    help = "Run the test NUMTIMES times (NUM >= 2)"
+    arg_type = Int
+    default = given.numtimes
+    "--arraysize", "-s"
+    help = "Use ARRAYSIZE elements in the array"
+    arg_type = Int
+    default = given.arraysize
+    "--float"
+    help = "Use floats (rather than doubles)"
+    action = :store_true
+    "--triad_only"
+    help = "Only run triad"
+    action = :store_true
+    "--nstream_only"
+    help = "Only run nstream"
+    action = :store_true
+    "--csv"
+    help = "Output as csv table"
+    action = :store_true
+    "--mibibytes"
+    help = "Use MiB=2^20 for bandwidth calculation (default MB=10^6)"
+    action = :store_true
+  end
+  args = parse_args(s)
+  # surely there's a better way than doing this:
+  for (arg, val) in args
+    setproperty!(given, Symbol(arg), val)
+  end
+end
+
+const DefaultInit = (0.1, 0.2, 0.0)
+const DefaultScalar = 0.4
+const Version = "3.4.0"
+
+# Entry point shared by every backend: parse options, pick the device, run
+# the selected benchmark, print the results and exit with status 1 when
+# validation fails.
+function main()
+
+  config::Config = Config()
+  parse_options(config)
+
+  if config.list
+    for (i, (_,repr, impl)) in enumerate(devices())
+      println("[$i] ($impl) $repr")
+    end
+    exit(0)
+  end
+
+  ds = devices()
+  # TODO implement substring device match
+  if config.device < 1 || config.device > length(ds)
+    error(
+      "Device $(config.device) out of range (1..$(length(ds))), NOTE: Julia is 1-indexed",
+    )
+  else
+    device = ds[config.device]
+  end
+
+  type = config.float ? Float32 : Float64
+
+  if config.nstream_only && !config.triad_only
+    benchmark = Nstream
+  elseif !config.nstream_only && config.triad_only
+    benchmark = Triad
+  elseif !config.nstream_only && !config.triad_only
+    benchmark = All
+  elseif config.nstream_only && config.triad_only
+    error("Both triad and nstream are enabled, pick one or omit both to run all benchmarks")
+  else
+    error("Invalid config: $(repr(config))")
+  end
+
+  array_bytes = config.arraysize * sizeof(type)
+  total_bytes = array_bytes * 3
+  (mega_scale, mega_suffix, giga_scale, giga_suffix) =
+    !config.mibibytes ? (1.0e-6, "MB", 1.0e-9, "GB") : (2^-20, "MiB", 2^-30, "GiB")
+
+  if !config.csv
+    println("""BabelStream
+               Version: $Version
+               Implementation: Julia; $(PROGRAM_FILE)""")
+    println("Running kernels $(config.numtimes) times")
+    if benchmark == Triad
+      println("Number of elements: $(config.arraysize)")
+    end
+    println("Precision: $(config.float ? "float" : "double")")
+    r1 = n -> round(n; digits = 1)
+    println(
+      "Array size: $(r1(mega_scale * array_bytes)) $mega_suffix(=$(r1(giga_scale * array_bytes)) $giga_suffix)",
+    )
+    println(
+      "Total size: $(r1(mega_scale * total_bytes)) $mega_suffix(=$(r1(giga_scale * total_bytes)) $giga_suffix)",
+    )
+  end
+
+  # Build one result row (column name => value) for kernel `name`, which
+  # moves `total_bytes` bytes per iteration.
+  function mk_row(xs::Vector{Float64}, name::String, total_bytes::Int)
+    # Exclude the first iteration from the statistics whenever more than one
+    # sample exists; `Base.rest(xs)` without a state kept every sample.
+    tail = length(xs) > 1 ? xs[2:end] : xs
+    min = Iterators.minimum(tail)
+    max = Iterators.maximum(tail)
+    avg = Iterators.sum(tail) / Iterators.length(tail)
+    mbps = mega_scale * total_bytes / min
+    if config.csv
+      return [
+        ("function", name),
+        ("num_times", config.numtimes),
+        ("n_elements", config.arraysize),
+        ("sizeof", total_bytes),
+        ("max_m$( config.mibibytes ? "i" : "")bytes_per_sec", mbps),
+        ("min_runtime", min),
+        ("max_runtime", max),
+        ("avg_runtime", avg),
+      ]
+    else
+      return [
+        ("Function", name),
+        ("M$(config.mibibytes ? "i" : "")Bytes/sec", round(mbps; digits = 3)),
+        ("Min (sec)", round(min; digits = 5)),
+        ("Max", round(max; digits = 5)),
+        ("Average", round(avg; digits = 5)),
+      ]
+    end
+  end
+
+  # Print the header taken from the first row, then one line per row
+  # (comma-separated for csv, padded columns otherwise).
+  function tabulate(rows::Vector{Tuple{String,Any}}...)
+    header = Base.first(rows)
+    padding = config.csv ? 0 : 12
+    sep = config.csv ? "," : ""
+    map(x -> rpad(x[1], padding), header) |> x -> join(x, sep) |> println
+    for row in rows
+      map(x -> rpad(x[2], padding), row) |> x -> join(x, sep) |> println
+    end
+  end
+
+  init::Tuple{type,type,type} = DefaultInit
+  scalar::type = DefaultScalar
+
+  GC.enable(false)
+
+  (data, context) = make_stream(config.arraysize, scalar, device, config.csv)
+  init_arrays!(data, context, init)
+  if benchmark == All
+    (timings, sum) = run_all!(data, context, config.numtimes)
+    valid = check_solutions(read_data(data, context), config.numtimes, init, benchmark, sum)
+    tabulate(
+      mk_row(timings.copy, "Copy", 2 * array_bytes),
+      mk_row(timings.mul, "Mul", 2 * array_bytes),
+      mk_row(timings.add, "Add", 3 * array_bytes),
+      mk_row(timings.triad, "Triad", 3 * array_bytes),
+      mk_row(timings.dot, "Dot", 2 * array_bytes),
+    )
+  elseif benchmark == Nstream
+    timings = run_nstream!(data, context, config.numtimes)
+    valid =
+      check_solutions(read_data(data, context), config.numtimes, init, benchmark, nothing)
+    tabulate(mk_row(timings, "Nstream", 4 * array_bytes))
+  elseif benchmark == Triad
+    elapsed = run_triad!(data, context, config.numtimes)
+    valid =
+      check_solutions(read_data(data, context), config.numtimes, init, benchmark, nothing)
+    total_bytes = 3 * array_bytes * config.numtimes
+    # The value is printed with the giga suffix, so it must use the giga scale
+    bandwidth = giga_scale * (total_bytes / elapsed)
+    println("Runtime (seconds): $(round(elapsed; digits=5))")
+    println("Bandwidth ($giga_suffix/s): $(round(bandwidth; digits=3)) ")
+  else
+    error("Bad benchmark $(benchmark)")
+  end
+
+  GC.enable(true)
+
+  if !valid
+    exit(1)
+  end
+
+end
diff --git a/src/julia/JuliaStream.jl/src/StreamData.jl b/src/julia/JuliaStream.jl/src/StreamData.jl
new file mode 100644
index 0000000..55e055a
--- /dev/null
+++ b/src/julia/JuliaStream.jl/src/StreamData.jl
@@ -0,0 +1,7 @@
+struct StreamData{T,C<:AbstractArray{T}}
+  a::C
+  b::C
+  c::C
+  scalar::T
+  size::Int
+end
diff --git a/src/julia/JuliaStream.jl/src/ThreadedStream.jl
b/src/julia/JuliaStream.jl/src/ThreadedStream.jl
new file mode 100644
index 0000000..f282fda
--- /dev/null
+++ b/src/julia/JuliaStream.jl/src/ThreadedStream.jl
@@ -0,0 +1,112 @@
+include("Stream.jl")
+
+# The threaded backend exposes a single pseudo-device: the host CPU with its thread count.
+function devices()::Vector{DeviceWithRepr}
+  return [(undef, "$(Sys.cpu_info()[1].model) ($(Threads.nthreads())T)", "Threaded")]
+end
+
+# Allocate the three uninitialised host arrays; this backend needs no context (nothing).
+function make_stream(
+  arraysize::Int,
+  scalar::T,
+  _::DeviceWithRepr,
+  silent::Bool,
+)::Tuple{VectorData{T},Nothing} where {T}
+  if !silent
+    println("Using max $(Threads.nthreads()) threads")
+  end
+  return (
+    VectorData{T}(
+      Vector{T}(undef, arraysize),
+      Vector{T}(undef, arraysize),
+      Vector{T}(undef, arraysize),
+      scalar,
+      arraysize,
+    ),
+    nothing
+  )
+end
+
+# Fill a, b and c with the three initial values.
+function init_arrays!(data::VectorData{T}, _, init::Tuple{T,T,T}) where {T}
+  Threads.@threads for i = 1:data.size
+    @inbounds data.a[i] = init[1]
+    @inbounds data.b[i] = init[2]
+    @inbounds data.c[i] = init[3]
+  end
+end
+
+# c[i] = a[i]
+function copy!(data::VectorData{T}, _) where {T}
+  Threads.@threads for i = 1:data.size
+    @inbounds data.c[i] = data.a[i]
+  end
+end
+
+# b[i] = scalar * c[i]
+function mul!(data::VectorData{T}, _) where {T}
+  Threads.@threads for i = 1:data.size
+    @inbounds data.b[i] = data.scalar * data.c[i]
+  end
+end
+
+# c[i] = a[i] + b[i]
+function add!(data::VectorData{T}, _) where {T}
+  Threads.@threads for i = 1:data.size
+    @inbounds data.c[i] = data.a[i] + data.b[i]
+  end
+end
+
+# a[i] = b[i] + scalar * c[i]
+function triad!(data::VectorData{T}, _) where {T}
+  Threads.@threads for i = 1:data.size
+    @inbounds data.a[i] = data.b[i] + (data.scalar * data.c[i])
+  end
+end
+
+# a[i] += b[i] + scalar * c[i]
+function nstream!(data::VectorData{T}, _) where {T}
+  Threads.@threads for i = 1:data.size
+    @inbounds data.a[i] += data.b[i] + data.scalar * data.c[i]
+  end
+end
+
+# Threads.@threads/Threads.@spawn doesn't support OpenMP's firstprivate, etc
+# Split 1:range into `n` contiguous chunks (the first `range % n` chunks get
+# one extra element) and call f(group, first_index, last_index) for each chunk
+# on a sticky task assigned to thread id `group - 1`, waiting for all of them.
+function static_par_ranged(f::Function, range::Int, n::Int)
+  stride = range ÷ n
+  rem = range % n
+  # exactly one (offset, width) pair per group; 0:n would build an unused extra one
+  strides = map(0:n-1) do i
+    width = stride + (i < rem ? 1 : 0)
+    offset = i < rem ? (stride + 1) * i : ((stride + 1) * rem) + (stride * (i - rem))
+    (offset, width)
+  end
+  ccall(:jl_enter_threaded_region, Cvoid, ())
+  try
+    foreach(wait, map(1:n) do group
+      (offset, size) = strides[group]
+      task = Task(() -> f(group, offset+1, offset+size))
+      task.sticky = true
+      ccall(:jl_set_task_tid, Cvoid, (Any, Cint), task, group-1) # ccall, so 0-based for group
+      schedule(task)
+    end)
+  finally
+    ccall(:jl_exit_threaded_region, Cvoid, ())
+  end
+end
+
+# Dot product of a and b: one partial sum per thread over a static chunk, summed at the end.
+function dot(data::VectorData{T}, _) where {T}
+  partial = Vector{T}(undef, Threads.nthreads())
+  static_par_ranged(data.size, Threads.nthreads()) do group, startidx, endidx
+    acc = zero(T)
+    @simd for i = startidx:endidx
+      @inbounds acc += data.a[i] * data.b[i]
+    end
+    @inbounds partial[group] = acc
+  end
+  return sum(partial)
+  # This doesn't do well on aarch64 because of the excessive Threads.threadid() ccall
+  # and inhibited vectorisation from the lack of @simd
+  # partial = zeros(T, Threads.nthreads())
+  # Threads.@threads for i = 1:data.size
+  #   @inbounds partial[Threads.threadid()] += (data.a[i] * data.b[i])
+  # end
+  # return sum(partial)
+end
+
+# The data already lives in host vectors, so it is returned unchanged.
+function read_data(data::VectorData{T}, _)::VectorData{T} where {T}
+  return data
+end
+
+main()
\ No newline at end of file
diff --git a/src/julia/JuliaStream.jl/src/oneAPIStream.jl b/src/julia/JuliaStream.jl/src/oneAPIStream.jl
new file mode 100644
index 0000000..83f100e
--- /dev/null
+++ b/src/julia/JuliaStream.jl/src/oneAPIStream.jl
@@ -0,0 +1,170 @@
+# NOTE(review): the next three imports (one of them duplicated) look auto-inserted
+# by tooling; confirm whether they are needed.
+using Base.Iterators: println
+using Base.Iterators: println
+using Printf: Iterators
+
+include("Stream.jl")
+using oneAPI
+
+const oneData = StreamData{T,oneArray{T}} where {T}
+const DotWGSize = 256::Int
+
+# List the devices of every Level Zero driver.
+function devices()::Vector{DeviceWithRepr}
+  all = map(oneL0.devices, oneL0.drivers()) |> Iterators.flatten |> Iterators.collect
+  map(dev -> (dev, repr("text/plain", dev), "oneAPI.jl"), all)
+end
+
+# Select the device, derive the work-group size from its compute properties
+# and allocate the three benchmark arrays. Returns `(data, groupsize)`.
+# Raises if `arraysize` is not a multiple of the chosen group size.
+function make_stream(
+  arraysize::Int,
+  scalar::T,
+  device::DeviceWithRepr,
+  silent::Bool,
+)::Tuple{oneData{T},Int} where {T}
+
+  oneAPI.allowscalar(false)
+  oneAPI.device!(device[1])
+
+  props = oneL0.compute_properties(oneAPI.device())
+  groupsize = min(props.maxTotalGroupSize, arraysize)
+
+  if arraysize % groupsize != 0
+    error("arraysize ($(arraysize)) must be divisible by $(groupsize)!")
+  end
+
+  if !silent
+    println("Using L0 device: $(repr("text/plain",device[1]))")
+    println("Kernel parameters : <<<$(arraysize),$(groupsize)>>>")
+  end
+  return (
+    oneData{T}(
+      oneArray{T}(undef, arraysize),
+      oneArray{T}(undef, arraysize),
+      oneArray{T}(undef, arraysize),
+      scalar,
+      arraysize,
+    ),
+    groupsize,
+  )
+end
+
+# Fill a, b and c with the three initial values.
+function init_arrays!(data::oneData{T}, _, init::Tuple{T,T,T}) where {T}
+  oneAPI.fill!(data.a, init[1])
+  oneAPI.fill!(data.b, init[2])
+  oneAPI.fill!(data.c, init[3])
+end
+
+# c[i] = a[i]
+function copy!(data::oneData{T}, groupsize::Int) where {T}
+  function kernel(a::AbstractArray{T}, c::AbstractArray{T})
+    i = get_global_id()
+    @inbounds c[i] = a[i]
+    return
+  end
+  @oneapi items = groupsize groups = data.size ÷ groupsize kernel( #
+    data.a,
+    data.c,
+  )
+  oneAPI.synchronize()
+end
+
+# b[i] = scalar * c[i]
+function mul!(data::oneData{T}, groupsize::Int) where {T}
+  function kernel(b::AbstractArray{T}, c::AbstractArray{T}, scalar::T)
+    i = get_global_id()
+    @inbounds b[i] = scalar * c[i]
+    return
+  end
+  @oneapi items = groupsize groups = data.size ÷ groupsize kernel( #
+    data.b,
+    data.c,
+    data.scalar,
+  )
+  oneAPI.synchronize()
+end
+
+# c[i] = a[i] + b[i]
+function add!(data::oneData{T}, groupsize::Int) where {T}
+  function kernel(a::AbstractArray{T}, b::AbstractArray{T}, c::AbstractArray{T})
+    i = get_global_id()
+    @inbounds c[i] = a[i] + b[i]
+    return
+  end
+  @oneapi items = groupsize groups = data.size ÷ groupsize kernel( #
+    data.a,
+    data.b,
+    data.c,
+  )
+  oneAPI.synchronize()
+end
+
+# a[i] = b[i] + scalar * c[i]
+function triad!(data::oneData{T}, groupsize::Int) where {T}
+  function kernel(a::AbstractArray{T}, b::AbstractArray{T}, c::AbstractArray{T}, scalar::T)
+    i = get_global_id()
+    @inbounds a[i] = b[i] + (scalar * c[i])
+    return
+  end
+  @oneapi items = groupsize groups = data.size ÷ groupsize kernel( #
+    data.a,
+    data.b,
+    data.c,
+    data.scalar,
+  )
+  oneAPI.synchronize()
+end
+
+# a[i] += b[i] + scalar * c[i]
+function nstream!(data::oneData{T}, groupsize::Int) where {T}
+  function kernel(a::AbstractArray{T}, b::AbstractArray{T}, c::AbstractArray{T}, scalar::T)
+    i = get_global_id()
+    @inbounds a[i] += b[i] + scalar * c[i]
+    return
+  end
+  @oneapi items = groupsize groups = data.size ÷ groupsize kernel( #
+    data.a,
+    data.b,
+    data.c,
+    data.scalar,
+  )
+  oneAPI.synchronize()
+end
+
+# Dot product of a and b: each work-group reduces its products in local memory
+# and writes one partial sum; the partial sums are summed on return.
+function dot(data::oneData{T}, groupsize::Int) where {T}
+  function kernel(a::AbstractArray{T}, b::AbstractArray{T}, size::Int, partial::AbstractArray{T})
+    wg_sum = @LocalMemory(T, (DotWGSize,))
+    li = get_local_id()
+    # zero(T) keeps the accumulator in the element type instead of a Float64 literal
+    @inbounds wg_sum[li] = zero(T)
+
+    # do dot first
+    i = get_global_id()
+    while i <= size
+      @inbounds wg_sum[li] += a[i] * b[i]
+      i += get_global_size()
+    end
+
+    # then tree reduction
+    offset = get_local_size() ÷ 2
+    while offset > 0
+      barrier()
+      if li <= offset
+        @inbounds wg_sum[li] += wg_sum[li+offset]
+      end
+      offset ÷= 2
+    end
+
+    if li == 1
+      @inbounds partial[get_group_id()] = wg_sum[li]
+    end
+
+    return
+  end
+  # NOTE(review): wg_sum holds DotWGSize elements while the launch uses
+  # items = groupsize and groups = DotWGSize, and partial_sum holds groupsize
+  # elements; these only agree when groupsize == DotWGSize. Confirm whether
+  # items and groups are swapped here.
+  partial_sum = oneArray{T}(undef, groupsize)
+  @oneapi items = groupsize groups = DotWGSize kernel(
+    data.a,
+    data.b,
+    data.size,
+    partial_sum,
+  )
+  oneAPI.synchronize()
+  return sum(partial_sum)
+end
+
+# Rebuild the stream data as a VectorData{T} from the device arrays so it can be validated.
+function read_data(data::oneData{T}, _)::VectorData{T} where {T}
+  return VectorData{T}(data.a, data.b, data.c, data.scalar, data.size)
+end
+
+main()
\ No newline at end of file
diff --git a/src/julia/JuliaStream.jl/update_all.sh b/src/julia/JuliaStream.jl/update_all.sh
new file mode 100755
index 0000000..ad6c2ee
--- /dev/null
+++ b/src/julia/JuliaStream.jl/update_all.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+# shellcheck disable=SC2034 disable=SC2153
+
+for BACKEND in "."
"AMDGPU" "CUDA" "oneAPI" "Threaded" "KernelAbstractions" +do + julia --project="$BACKEND" -e 'import Pkg; Pkg.resolve(); Pkg.instantiate(); Pkg.update(); Pkg.gc();' +done \ No newline at end of file diff --git a/KokkosStream.cpp b/src/kokkos/KokkosStream.cpp similarity index 100% rename from KokkosStream.cpp rename to src/kokkos/KokkosStream.cpp diff --git a/KokkosStream.hpp b/src/kokkos/KokkosStream.hpp similarity index 100% rename from KokkosStream.hpp rename to src/kokkos/KokkosStream.hpp diff --git a/KOKKOS.cmake b/src/kokkos/model.cmake similarity index 100% rename from KOKKOS.cmake rename to src/kokkos/model.cmake diff --git a/legacy/HCStream.cpp b/src/legacy/HCStream.cpp similarity index 100% rename from legacy/HCStream.cpp rename to src/legacy/HCStream.cpp diff --git a/legacy/HCStream.h b/src/legacy/HCStream.h similarity index 100% rename from legacy/HCStream.h rename to src/legacy/HCStream.h diff --git a/main.cpp b/src/main.cpp similarity index 98% rename from main.cpp rename to src/main.cpp index de301ce..5a01b74 100644 --- a/main.cpp +++ b/src/main.cpp @@ -27,6 +27,8 @@ #include "STD20Stream.hpp" #elif defined(TBB) #include "TBBStream.hpp" +#elif defined(THRUST) +#include "ThrustStream.h" #elif defined(HIP) #include "HIPStream.h" #elif defined(HC) @@ -41,6 +43,8 @@ #include "ACCStream.h" #elif defined(SYCL) #include "SYCLStream.h" +#elif defined(SYCL2020) +#include "SYCLStream2020.h" #elif defined(OMP) #include "OMPStream.h" #endif @@ -272,11 +276,15 @@ void run() // Use the C++20 implementation stream = new TBBStream(ARRAY_SIZE, deviceIndex); +#elif defined(THRUST) + // Use the Thrust implementation + stream = new ThrustStream(ARRAY_SIZE, deviceIndex); + #elif defined(ACC) // Use the OpenACC implementation stream = new ACCStream(ARRAY_SIZE, deviceIndex); -#elif defined(SYCL) +#elif defined(SYCL) || defined(SYCL2020) // Use the SYCL implementation stream = new SYCLStream(ARRAY_SIZE, deviceIndex); diff --git a/CL/cl2.hpp b/src/ocl/CL/cl2.hpp similarity 
index 100% rename from CL/cl2.hpp rename to src/ocl/CL/cl2.hpp diff --git a/OCLStream.cpp b/src/ocl/OCLStream.cpp similarity index 100% rename from OCLStream.cpp rename to src/ocl/OCLStream.cpp diff --git a/OCLStream.h b/src/ocl/OCLStream.h similarity index 100% rename from OCLStream.h rename to src/ocl/OCLStream.h diff --git a/OCL.cmake b/src/ocl/model.cmake similarity index 74% rename from OCL.cmake rename to src/ocl/model.cmake index 2be3981..acefe71 100644 --- a/OCL.cmake +++ b/src/ocl/model.cmake @@ -9,8 +9,7 @@ register_flag_optional(OpenCL_LIBRARY macro(setup) - # don't point to the CL dir as the imports already have the CL prefix - set(OpenCL_INCLUDE_DIR "${CMAKE_SOURCE_DIR}") + setup_opencl_header_includes() find_package(OpenCL REQUIRED) register_link_library(OpenCL::OpenCL) endmacro() diff --git a/OMPStream.cpp b/src/omp/OMPStream.cpp similarity index 100% rename from OMPStream.cpp rename to src/omp/OMPStream.cpp diff --git a/OMPStream.h b/src/omp/OMPStream.h similarity index 100% rename from OMPStream.h rename to src/omp/OMPStream.h diff --git a/OMP.cmake b/src/omp/model.cmake similarity index 100% rename from OMP.cmake rename to src/omp/model.cmake diff --git a/RAJAStream.cpp b/src/raja/RAJAStream.cpp similarity index 100% rename from RAJAStream.cpp rename to src/raja/RAJAStream.cpp diff --git a/RAJAStream.hpp b/src/raja/RAJAStream.hpp similarity index 100% rename from RAJAStream.hpp rename to src/raja/RAJAStream.hpp diff --git a/RAJA.cmake b/src/raja/model.cmake similarity index 100% rename from RAJA.cmake rename to src/raja/model.cmake diff --git a/src/rust/rust-stream/.cargo/config.toml b/src/rust/rust-stream/.cargo/config.toml new file mode 100644 index 0000000..d5135e9 --- /dev/null +++ b/src/rust/rust-stream/.cargo/config.toml @@ -0,0 +1,2 @@ +[build] +rustflags = ["-C", "target-cpu=native"] \ No newline at end of file diff --git a/src/rust/rust-stream/.gitignore b/src/rust/rust-stream/.gitignore new file mode 100644 index 0000000..3a8cabc --- 
/dev/null +++ b/src/rust/rust-stream/.gitignore @@ -0,0 +1,2 @@ +/target +.idea diff --git a/src/rust/rust-stream/Cargo.lock b/src/rust/rust-stream/Cargo.lock new file mode 100644 index 0000000..5f225f0 --- /dev/null +++ b/src/rust/rust-stream/Cargo.lock @@ -0,0 +1,636 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "ansi_term" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee49baf6cb617b853aa8d93bf420db2383fab46d314482ca2803b40d5fde979b" +dependencies = [ + "winapi 0.3.9", +] + +[[package]] +name = "atty" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" +dependencies = [ + "hermit-abi", + "libc", + "winapi 0.3.9", +] + +[[package]] +name = "autocfg" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cdb031dd78e28731d87d56cc8ffef4a8f36ca26c38fe2de700543e627f8a464a" + +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "clap" +version = "2.33.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37e58ac78573c40708d45522f0d80fa2f01cc4f9b4e2bf749807255454312002" +dependencies = [ + "ansi_term", + "atty", + "bitflags", + "strsim", + "textwrap", + "unicode-width", + "vec_map", +] + +[[package]] +name = "colour" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a27e4532f26f510c24bb8477d963c0c3ef27e293c3b2c507cccb0536d493201a" +dependencies = [ + 
"crossterm", +] + +[[package]] +name = "core_affinity" +version = "0.5.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f8a03115cc34fb0d7c321dd154a3914b3ca082ccc5c11d91bf7117dbbe7171f" +dependencies = [ + "kernel32-sys", + "libc", + "num_cpus", + "winapi 0.2.8", +] + +[[package]] +name = "crossbeam" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ae5588f6b3c3cb05239e90bd110f257254aecd01e4635400391aeae07497845" +dependencies = [ + "cfg-if", + "crossbeam-channel", + "crossbeam-deque", + "crossbeam-epoch", + "crossbeam-queue", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-channel" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06ed27e177f16d65f0f0c22a213e17c696ace5dd64b14258b52f9417ccb52db4" +dependencies = [ + "cfg-if", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-deque" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6455c0ca19f0d2fbf751b908d5c55c1f5cbc65e03c4225427254b46890bdde1e" +dependencies = [ + "cfg-if", + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ec02e091aa634e2c3ada4a392989e7c3116673ef0ac5b72232439094d73b7fd" +dependencies = [ + "cfg-if", + "crossbeam-utils", + "lazy_static", + "memoffset", + "scopeguard", +] + +[[package]] +name = "crossbeam-queue" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b10ddc024425c88c2ad148c1b0fd53f4c6d38db9697c9f1588381212fa657c9" +dependencies = [ + "cfg-if", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d82cfc11ce7f2c3faef78d8a684447b40d503d9681acebed6cb728d45940c4db" +dependencies = [ + "cfg-if", + "lazy_static", 
+] + +[[package]] +name = "crossterm" +version = "0.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c36c10130df424b2f3552fcc2ddcd9b28a27b1e54b358b45874f88d1ca6888c" +dependencies = [ + "bitflags", + "crossterm_winapi", + "lazy_static", + "libc", + "mio", + "parking_lot", + "signal-hook", + "winapi 0.3.9", +] + +[[package]] +name = "crossterm_winapi" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0da8964ace4d3e4a044fd027919b2237000b24315a37c916f61809f1ff2140b9" +dependencies = [ + "winapi 0.3.9", +] + +[[package]] +name = "either" +version = "1.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457" + +[[package]] +name = "heck" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d621efb26863f0e9924c6ac577e8275e5e6b77455db64ffa6c65c904e9e132c" +dependencies = [ + "unicode-segmentation", +] + +[[package]] +name = "hermit-abi" +version = "0.1.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33" +dependencies = [ + "libc", +] + +[[package]] +name = "instant" +version = "0.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a5bbe824c507c5da5956355e86a746d82e0e1464f65d862cc5e71da70e94b2c" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "kernel32-sys" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7507624b29483431c0ba2d82aece8ca6cdba9382bff4ddd0f7490560c056098d" +dependencies = [ + "winapi 0.2.8", + "winapi-build", +] + +[[package]] +name = "lazy_static" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" + +[[package]] +name = "libc" +version = 
"0.2.108" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8521a1b57e76b1ec69af7599e75e38e7b7fad6610f037db8c79b127201b5d119" + +[[package]] +name = "lock_api" +version = "0.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712a4d093c9976e24e7dbca41db895dabcbac38eb5f4045393d17a95bdfb1109" +dependencies = [ + "scopeguard", +] + +[[package]] +name = "log" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51b9bbe6c47d51fc3e1a9b945965946b4c44142ab8792c50835a980d362c2710" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "memoffset" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59accc507f1338036a0477ef61afdae33cde60840f4dfe481319ce3ad116ddf9" +dependencies = [ + "autocfg", +] + +[[package]] +name = "mio" +version = "0.7.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8067b404fe97c70829f082dec8bcf4f71225d7eaea1d8645349cb76fa06205cc" +dependencies = [ + "libc", + "log", + "miow", + "ntapi", + "winapi 0.3.9", +] + +[[package]] +name = "miow" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9f1c5b025cda876f66ef43a113f91ebc9f4ccef34843000e0adf6ebbab84e21" +dependencies = [ + "winapi 0.3.9", +] + +[[package]] +name = "ntapi" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f6bb902e437b6d86e03cce10a7e2af662292c5dfef23b65899ea3ac9354ad44" +dependencies = [ + "winapi 0.3.9", +] + +[[package]] +name = "num-traits" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a64b1ec5cda2586e284722486d802acf1f7dbdc623e2bfc57e65ca1cd099290" +dependencies = [ + "autocfg", +] + +[[package]] +name = "num_cpus" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"05499f3756671c15885fee9034446956fff3f243d6077b91e5767df161f766b3" +dependencies = [ + "hermit-abi", + "libc", +] + +[[package]] +name = "parking_lot" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7d17b78036a60663b797adeaee46f5c9dfebb86948d1255007a1d6be0271ff99" +dependencies = [ + "instant", + "lock_api", + "parking_lot_core", +] + +[[package]] +name = "parking_lot_core" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d76e8e1493bcac0d2766c42737f34458f1c8c50c0d23bcb24ea953affb273216" +dependencies = [ + "cfg-if", + "instant", + "libc", + "redox_syscall", + "smallvec", + "winapi 0.3.9", +] + +[[package]] +name = "pest" +version = "2.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "10f4872ae94d7b90ae48754df22fd42ad52ce740b8f370b03da4835417403e53" +dependencies = [ + "ucd-trie", +] + +[[package]] +name = "proc-macro-error" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c" +dependencies = [ + "proc-macro-error-attr", + "proc-macro2", + "quote", + "syn", + "version_check", +] + +[[package]] +name = "proc-macro-error-attr" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869" +dependencies = [ + "proc-macro2", + "quote", + "version_check", +] + +[[package]] +name = "proc-macro2" +version = "1.0.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba508cc11742c0dc5c1659771673afbab7a0efab23aa17e854cbab0837ed0b43" +dependencies = [ + "unicode-xid", +] + +[[package]] +name = "quote" +version = "1.0.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38bc8cc6a5f2e3655e0899c1b848643b2562f853f114bfec7be120678e3ace05" +dependencies = [ + "proc-macro2", +] + 
+[[package]] +name = "rayon" +version = "1.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c06aca804d41dbc8ba42dfd964f0d01334eceb64314b9ecf7c5fad5188a06d90" +dependencies = [ + "autocfg", + "crossbeam-deque", + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d78120e2c850279833f1dd3582f730c4ab53ed95aeaaaa862a2a5c71b1656d8e" +dependencies = [ + "crossbeam-channel", + "crossbeam-deque", + "crossbeam-utils", + "lazy_static", + "num_cpus", +] + +[[package]] +name = "redox_syscall" +version = "0.2.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8383f39639269cde97d255a32bdb68c047337295414940c68bdd30c2e13203ff" +dependencies = [ + "bitflags", +] + +[[package]] +name = "rstest" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "041bb0202c14f6a158bbbf086afb03d0c6e975c2dec7d4912f8061ed44f290af" +dependencies = [ + "cfg-if", + "proc-macro2", + "quote", + "rustc_version", + "syn", +] + +[[package]] +name = "rust-stream" +version = "3.4.0" +dependencies = [ + "colour", + "core_affinity", + "crossbeam", + "libc", + "num-traits", + "num_cpus", + "rayon", + "rstest", + "rustversion", + "structopt", + "tabular", +] + +[[package]] +name = "rustc_version" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0dfe2087c51c460008730de8b57e6a320782fbfb312e1f4d520e6c6fae155ee" +dependencies = [ + "semver", +] + +[[package]] +name = "rustversion" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61b3909d758bb75c79f23d4736fac9433868679d3ad2ea7a61e3c25cfda9a088" + +[[package]] +name = "scopeguard" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" + 
+[[package]] +name = "semver" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f301af10236f6df4160f7c3f04eec6dbc70ace82d23326abad5edee88801c6b6" +dependencies = [ + "semver-parser", +] + +[[package]] +name = "semver-parser" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "00b0bef5b7f9e0df16536d3961cfb6e84331c065b4066afb39768d0e319411f7" +dependencies = [ + "pest", +] + +[[package]] +name = "signal-hook" +version = "0.1.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e31d442c16f047a671b5a71e2161d6e68814012b7f5379d269ebd915fac2729" +dependencies = [ + "libc", + "mio", + "signal-hook-registry", +] + +[[package]] +name = "signal-hook-registry" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e51e73328dc4ac0c7ccbda3a494dfa03df1de2f46018127f60c693f2648455b0" +dependencies = [ + "libc", +] + +[[package]] +name = "smallvec" +version = "1.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ecab6c735a6bb4139c0caafd0cc3635748bbb3acf4550e8138122099251f309" + +[[package]] +name = "strsim" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a" + +[[package]] +name = "structopt" +version = "0.3.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40b9788f4202aa75c240ecc9c15c65185e6a39ccdeb0fd5d008b98825464c87c" +dependencies = [ + "clap", + "lazy_static", + "structopt-derive", +] + +[[package]] +name = "structopt-derive" +version = "0.4.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dcb5ae327f9cc13b68763b5749770cb9e048a99bd9dfdfa58d0cf05d5f64afe0" +dependencies = [ + "heck", + "proc-macro-error", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "syn" +version = "1.0.82" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "8daf5dd0bb60cbd4137b1b587d2fc0ae729bc07cf01cd70b36a1ed5ade3b9d59" +dependencies = [ + "proc-macro2", + "quote", + "unicode-xid", +] + +[[package]] +name = "tabular" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7e35bee02dcefe64a74065b6b869d241eab1a02fea0d65e6074ce4e51894c3b" +dependencies = [ + "unicode-width", +] + +[[package]] +name = "textwrap" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060" +dependencies = [ + "unicode-width", +] + +[[package]] +name = "ucd-trie" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56dee185309b50d1f11bfedef0fe6d036842e3fb77413abef29f8f8d1c5d4c1c" + +[[package]] +name = "unicode-segmentation" +version = "1.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8895849a949e7845e06bd6dc1aa51731a103c42707010a5b591c0038fb73385b" + +[[package]] +name = "unicode-width" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3ed742d4ea2bd1176e236172c8429aaf54486e7ac098db29ffe6529e0ce50973" + +[[package]] +name = "unicode-xid" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ccb82d61f80a663efe1f787a51b16b5a51e3314d6ac365b08639f52387b33f3" + +[[package]] +name = "vec_map" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191" + +[[package]] +name = "version_check" +version = "0.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fecdca9a5291cc2b8dcf7dc02453fee791a280f3743cb0905f8822ae463b3fe" + +[[package]] +name = "winapi" +version = "0.2.8" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "167dc9d6949a9b857f3451275e911c3f44255842c1f7a76f33c55103a909087a" + +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-build" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d315eee3b34aca4797b2da6b13ed88266e6d612562a0c46390af8299fc699bc" + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" diff --git a/src/rust/rust-stream/Cargo.toml b/src/rust/rust-stream/Cargo.toml new file mode 100644 index 0000000..f0365a6 --- /dev/null +++ b/src/rust/rust-stream/Cargo.toml @@ -0,0 +1,34 @@ +[package] +name = "rust-stream" +version = "3.4.0" +authors = ["Wei-Chen Lin "] +edition = "2018" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +num-traits = "0.2.14" +structopt = "0.3.13" +tabular = "0.1.4" +rayon = "1.5.1" +crossbeam = "0.8.1" +num_cpus = "1.13.0" +rustversion = "1.0" +libc = "0.2.97" +core_affinity = "0.5.10" +colour = "0.6.0" + +[dev-dependencies] +rstest = "0.10.0" + +[build-dependencies] +rustversion = "1.0" + +[profile.dev] +opt-level = 2 +overflow-checks = true + + +[profile.release] +opt-level = 3 +lto = "thin" # fully enabling this (i.e true) negatively affects performance as tested on both AMD and Intel diff --git 
a/src/rust/rust-stream/README.md b/src/rust/rust-stream/README.md new file mode 100644 index 0000000..6696de5 --- /dev/null +++ b/src/rust/rust-stream/README.md @@ -0,0 +1,78 @@ +rust-stream +=========== + +This is an implementation of BabelStream in Rust. + +Currently, we support five CPU threading APIs as devices: + +* Plain - basic single-threaded `for` version, see [plain_stream.rs](src/plain_stream.rs) +* [Rayon](https://github.com/rayon-rs/rayon) - Parallel with high level API, + see [rayon_stream.rs](src/rayon_stream.rs) +* [Crossbeam](https://github.com/crossbeam-rs/crossbeam) - Parallel with partitions per thread, + see [crossbeam_stream.rs](src/crossbeam_stream.rs) +* Arc - Parallel with `Vec` per thread (static partitions) wrapped in `Mutex` contained in `Arc`s, + see [arc_stream.rs](src/arc_stream.rs) +* Unsafe - Parallel with unsafe pointer per thread (static partitions) to `Vec`, + see [unsafe_stream.rs](src/unsafe_stream.rs) + +In addition, this implementation also supports the following extra flags: +**** +``` +--init Initialise each benchmark array at allocation time on the main thread +--malloc Use libc malloc instead of the Rust's allocator for benchmark array allocation +--pin Pin threads to distinct cores, this has NO effect in Rayon devices +``` + +Max thread count is controlled by the environment variable `BABELSTREAM_NUM_THREADS`, which is honoured by all devices (avoid setting `RAYON_NUM_THREADS`; the implementation ignores it and issues a warning if it is set). + +There is an ongoing investigation on potential performance issues under NUMA situations. As part of +the experiment, this implementation makes use of the +provisional [Allocator traits](https://github.com/rust-lang/rust/issues/32838) which requires rust +nightly. We hope a NUMA-aware allocator will be available once the allocator API reaches rust +stable. 
+ +### Build & Run + +Prerequisites: + +* [Rust toolchain](https://www.rust-lang.org/tools/install) + +Once the toolchain is installed, enable the nightly channel: + +```shell +> rustup install nightly +> rustup default nightly # optional, this sets `+nightly` automatically for cargo calls later +``` + +With `cargo` on path, compile and run the benchmark with: + +```shell +> cd rust-stream/ +> cargo +nightly build --release # or simply `cargo build --release` if nightly channel is the default +> ./target/release/rust-stream --help +rust-stream 3.4.0 + +USAGE: + rust-stream [FLAGS] [OPTIONS] + +FLAGS: + --csv Output as csv table + --float Use floats (rather than doubles) + -h, --help Prints help information + --init Initialise each benchmark array at allocation time on the main thread + --list List available devices + --malloc Use libc malloc instead of the Rust's allocator for benchmark array allocation + --mibibytes Use MiB=2^20 for bandwidth calculation (default MB=10^6) + --nstream-only Only run nstream + --pin Pin threads to distinct cores, this has NO effect in Rayon devices + --triad-only Only run triad + -V, --version Prints version information + +OPTIONS: + -s, --arraysize Use elements in the array [default: 33554432] + --device Select device at [default: 0] + -n, --numtimes Run the test times (NUM >= 2) [default: 100] +``` + + + \ No newline at end of file diff --git a/src/rust/rust-stream/rustfmt.toml b/src/rust/rust-stream/rustfmt.toml new file mode 100644 index 0000000..aa2f0e9 --- /dev/null +++ b/src/rust/rust-stream/rustfmt.toml @@ -0,0 +1,68 @@ +max_width = 100 +hard_tabs = false +tab_spaces = 2 +newline_style = "Auto" +use_small_heuristics = "Max" +indent_style = "Block" +wrap_comments = false +format_code_in_doc_comments = false +comment_width = 80 +normalize_comments = false +normalize_doc_attributes = false +license_template_path = "" +format_strings = false +format_macro_matchers = false +format_macro_bodies = true +empty_item_single_line = true 
+struct_lit_single_line = true +fn_single_line = true +where_single_line = true +imports_indent = "Block" +imports_layout = "Mixed" +imports_granularity = "Preserve" +group_imports = "Preserve" +reorder_imports = true +reorder_modules = true +reorder_impl_items = false +type_punctuation_density = "Wide" +space_before_colon = false +space_after_colon = true +spaces_around_ranges = false +binop_separator = "Front" +remove_nested_parens = true +combine_control_expr = true +overflow_delimited_expr = false +struct_field_align_threshold = 0 +enum_discrim_align_threshold = 0 +match_arm_blocks = true +match_arm_leading_pipes = "Never" +force_multiline_blocks = false +fn_args_layout = "Compressed" +brace_style = "PreferSameLine" +control_brace_style = "AlwaysSameLine" +trailing_semicolon = true +trailing_comma = "Vertical" +match_block_trailing_comma = false +blank_lines_upper_bound = 1 +blank_lines_lower_bound = 0 +edition = "2015" +version = "One" +inline_attribute_width = 0 +merge_derives = true +use_try_shorthand = false +use_field_init_shorthand = false +force_explicit_abi = true +condense_wildcard_suffixes = false +color = "Auto" +required_version = "1.4.38" +unstable_features = false +disable_all_formatting = false +skip_children = false +hide_parse_errors = false +error_on_line_overflow = false +error_on_unformatted = false +report_todo = "Never" +report_fixme = "Never" +ignore = [] +emit_mode = "Files" +make_backup = false diff --git a/src/rust/rust-stream/src/arc_stream.rs b/src/rust/rust-stream/src/arc_stream.rs new file mode 100644 index 0000000..006f73a --- /dev/null +++ b/src/rust/rust-stream/src/arc_stream.rs @@ -0,0 +1,254 @@ +use std::iter::Sum; +use std::sync::{Arc, Mutex}; + +use self::core_affinity::CoreId; +use crate::stream::{AllocatorType, ArrayType, RustStream, StreamData}; + +struct ArcHeapData { + a_chunks: Vec>>>, + b_chunks: Vec>>>, + c_chunks: Vec>>>, +} + +pub struct ArcDevice { + pub(crate) ncore: usize, + pub(crate) pin: bool, + pub(crate) 
core_ids: Vec, + data: ArcHeapData, +} + +impl ArcDevice { + pub fn new(ncore: usize, pin: bool, alloc: A) -> Self { + let mut core_ids = match core_affinity::get_core_ids() { + Some(xs) => xs, + None => { + colour::e_red_ln!("Cannot enumerate cores, pinning will not work if enabled"); + (0..ncore).map(|i| CoreId { id: i }).collect() + } + }; + core_ids.resize(ncore, core_ids[0]); + + let lift = + || (0..ncore).map(|_| return Arc::new(Mutex::new(Vec::new_in(alloc)))).collect::>(); + let data = ArcHeapData { a_chunks: lift(), b_chunks: lift(), c_chunks: lift() }; + + ArcDevice { ncore, pin, core_ids, data } + } + + pub fn ref_a(&self, t: usize) -> Arc>> { self.data.a_chunks[t].clone() } + + pub fn ref_b(&self, t: usize) -> Arc>> { self.data.b_chunks[t].clone() } + + pub fn ref_c(&self, t: usize) -> Arc>> { self.data.c_chunks[t].clone() } + + // size of partition `t` when `len` elements are split into `ncore` chunks of ceil(len / ncore); + // ends are clamped to `len`, so trailing chunks may be short or empty and sizes sum to `len` + fn chunk_size(&self, len: usize, t: usize) -> usize { + assert!(t < self.ncore); + let chunk = (len as f64 / self.ncore as f64).ceil() as usize; + let start = (t * chunk).min(len); + // clamping avoids the usize underflow of `len - t * chunk` (e.g. len = 5, ncore = 4) + let end = (start + chunk).min(len); + end - start + } +} + +extern crate core_affinity; + +// Arc+Mutex threaded version, it should be semantically equal to the single threaded version +impl + RustStream for StreamData, A> +{ + fn init_arrays(&mut self) { + let init = self.init; + let pin = self.device.pin; + (0..self.device.ncore) + .map(&|t| { + let ref_a = self.device.ref_a(t); + let ref_b = self.device.ref_b(t); + let ref_c = self.device.ref_c(t); + let core = self.device.core_ids[t]; + let n = self.device.chunk_size(self.size, t); + std::thread::spawn(move || { + if pin { + core_affinity::set_for_current(core); + } + ref_a.lock().unwrap().resize(n, init.0); + ref_b.lock().unwrap().resize(n, init.1); + ref_c.lock().unwrap().resize(n, init.2); + }) + }) + .collect::>() + .into_iter() + .for_each(|t| t.join().unwrap()); + } + fn read_arrays(&mut self) { + let range = 
self.size; + let unlift = |drain: &mut Vec, source: &Vec>>>| { + let xs = + source.into_iter().flat_map(|x| x.lock().unwrap().clone().into_iter()).collect::>(); + for i in 0..range { + drain[i] = xs[i]; + } + }; + unlift(&mut self.a, &self.device.data.a_chunks); + unlift(&mut self.b, &self.device.data.b_chunks); + unlift(&mut self.c, &self.device.data.c_chunks); + } + + fn copy(&mut self) { + let pin = self.device.pin; + (0..self.device.ncore) + .map(move |t| { + let ref_a = self.device.ref_a(t); + let ref_c = self.device.ref_c(t); + let core = self.device.core_ids[t]; + let n = self.device.chunk_size(self.size, t); + std::thread::spawn(move || { + if pin { + core_affinity::set_for_current(core); + } + let a = ref_a.lock().unwrap(); + let mut c = ref_c.lock().unwrap(); + for i in 0..n { + c[i] = a[i]; + } + }) + }) + .collect::>() + .into_iter() + .for_each(|t| t.join().unwrap()); + } + + fn mul(&mut self) { + let scalar = self.scalar; + let pin = self.device.pin; + (0..self.device.ncore) + .map(move |t| { + let ref_b = self.device.ref_b(t); + let ref_c = self.device.ref_c(t); + let core = self.device.core_ids[t]; + let n = self.device.chunk_size(self.size, t); + std::thread::spawn(move || { + if pin { + core_affinity::set_for_current(core); + } + let mut b = ref_b.lock().unwrap(); + let c = ref_c.lock().unwrap(); + for i in 0..n { + b[i] = scalar * c[i]; + } + }) + }) + .collect::>() + .into_iter() + .for_each(|t| t.join().unwrap()); + } + + fn add(&mut self) { + let pin = self.device.pin; + (0..self.device.ncore) + .map(&|t| { + let ref_a = self.device.ref_a(t); + let ref_b = self.device.ref_b(t); + let ref_c = self.device.ref_c(t); + let core = self.device.core_ids[t]; + let n = self.device.chunk_size(self.size, t); + std::thread::spawn(move || { + if pin { + core_affinity::set_for_current(core); + } + let a = ref_a.lock().unwrap(); + let b = ref_b.lock().unwrap(); + let mut c = ref_c.lock().unwrap(); + for i in 0..n { + c[i] = a[i] + b[i]; + } + }) + }) + 
.collect::>() + .into_iter() + .for_each(|t| t.join().unwrap()); + } + + fn triad(&mut self) { + let scalar = self.scalar; + let pin = self.device.pin; + (0..self.device.ncore) + .map(&|t| { + let ref_a = self.device.ref_a(t); + let ref_b = self.device.ref_b(t); + let ref_c = self.device.ref_c(t); + let core = self.device.core_ids[t]; + let n = self.device.chunk_size(self.size, t); + std::thread::spawn(move || { + if pin { + core_affinity::set_for_current(core); + } + let mut a = ref_a.lock().unwrap(); + let b = ref_b.lock().unwrap(); + let c = ref_c.lock().unwrap(); + for i in 0..n { + a[i] = b[i] + scalar * c[i] + } + }) + }) + .collect::>() + .into_iter() + .for_each(|t| t.join().unwrap()); + } + + fn nstream(&mut self) { + let scalar = self.scalar; + let pin = self.device.pin; + (0..self.device.ncore) + .map(&|t| { + let ref_a = self.device.ref_a(t); + let ref_b = self.device.ref_b(t); + let ref_c = self.device.ref_c(t); + let core = self.device.core_ids[t]; + let n = self.device.chunk_size(self.size, t); + std::thread::spawn(move || { + if pin { + core_affinity::set_for_current(core); + } + let mut a = ref_a.lock().unwrap(); + let b = ref_b.lock().unwrap(); + let c = ref_c.lock().unwrap(); + for i in 0..n { + a[i] += b[i] + scalar * c[i] + } + }) + }) + .collect::>() + .into_iter() + .for_each(|t| t.join().unwrap()); + } + + fn dot(&mut self) -> T { + let pin = self.device.pin; + (0..self.device.ncore) + .map(&|t| { + let ref_a = self.device.ref_a(t); + let ref_b = self.device.ref_b(t); + let core = self.device.core_ids[t]; + let n = self.device.chunk_size(self.size, t); + std::thread::spawn(move || { + if pin { + core_affinity::set_for_current(core); + } + let a = ref_a.lock().unwrap(); + let b = ref_b.lock().unwrap(); + let mut p = T::default(); + for i in 0..n { + p += a[i] * b[i]; + } + p + }) + }) + .collect::>() + .into_iter() + .map(|t| t.join().unwrap()) + .sum() + } +} diff --git a/src/rust/rust-stream/src/crossbeam_stream.rs 
b/src/rust/rust-stream/src/crossbeam_stream.rs new file mode 100644 index 0000000..44358ae --- /dev/null +++ b/src/rust/rust-stream/src/crossbeam_stream.rs @@ -0,0 +1,221 @@ +use std::iter::Sum; +use std::slice::{Chunks, ChunksMut}; + +use crossbeam::thread; + +use self::core_affinity::CoreId; +use crate::stream::{AllocatorType, ArrayType, RustStream, StreamData}; + +pub struct CrossbeamDevice { + pub(crate) ncore: usize, + pub(crate) pin: bool, + pub(crate) core_ids: Vec, +} + +impl CrossbeamDevice { + pub fn new(ncore: usize, pin: bool) -> Self { + let mut core_ids = match core_affinity::get_core_ids() { + Some(xs) => xs, + None => { + colour::e_red_ln!("Cannot enumerate cores, pinning will not work if enabled"); + (0..ncore).map(|i| CoreId { id: i }).collect() + } + }; + core_ids.resize(ncore, core_ids[0]); + CrossbeamDevice { ncore, pin, core_ids } + } +} + +impl CrossbeamDevice { + // divide the length by the number of cores, the last core gets less work if it does not divide + fn chunk_size(&self, len: usize) -> usize { (len as f64 / self.ncore as f64).ceil() as usize } + + // make a mutable chunk from the vec + fn mk_mut_chunks<'a, T, A: AllocatorType>(&self, xs: &'a mut Vec) -> ChunksMut<'a, T> { + let len = xs.len(); + xs.chunks_mut(self.chunk_size(len)) + } + + // make a immutable chunk from the vec + fn mk_chunks<'a, T, A: AllocatorType>(&self, xs: &'a mut Vec) -> Chunks<'a, T> { + xs.chunks(self.chunk_size(xs.len())) + } +} + +extern crate core_affinity; + +// Crossbeam threaded version, it should be semantically equal to the single threaded version +impl RustStream + for StreamData +{ + fn init_arrays(&mut self) { + thread::scope(|s| { + let init = self.init; + let pin = self.device.pin; + for (t, ((a, b), c)) in self.device.core_ids.iter().zip( + self + .device + .mk_mut_chunks(&mut self.a) + .zip(self.device.mk_mut_chunks(&mut self.b)) + .zip(self.device.mk_mut_chunks(&mut self.c)), + ) { + s.spawn(move |_| { + if pin { + 
core_affinity::set_for_current(*t); + } + for x in a.into_iter() { + *x = init.0; + } + for x in b.into_iter() { + *x = init.1; + } + for x in c.into_iter() { + *x = init.2; + } + }); + } + }) + .unwrap() + } + + fn copy(&mut self) { + thread::scope(|s| { + let pin = self.device.pin; + for (t, (c, a)) in self + .device + .core_ids + .iter() + .zip(self.device.mk_mut_chunks(&mut self.c).zip(self.device.mk_chunks(&mut self.a))) + { + s.spawn(move |_| { + if pin { + core_affinity::set_for_current(*t); + } + for i in 0..c.len() { + c[i] = a[i]; + } + }); + } + }) + .unwrap() + } + + fn mul(&mut self) { + thread::scope(|s| { + let pin = self.device.pin; + let scalar = self.scalar; + for (t, (b, c)) in self + .device + .core_ids + .iter() + .zip(self.device.mk_mut_chunks(&mut self.b).zip(self.device.mk_chunks(&mut self.c))) + { + s.spawn(move |_| { + if pin { + core_affinity::set_for_current(*t); + } + for i in 0..b.len() { + b[i] = scalar * c[i]; + } + }); + } + }) + .unwrap() + } + + fn add(&mut self) { + thread::scope(|s| { + let pin = self.device.pin; + for (t, (c, (a, b))) in (&mut self.device.core_ids.iter()).zip( + self + .device + .mk_mut_chunks(&mut self.c) + .zip(self.device.mk_chunks(&mut self.a).zip(self.device.mk_chunks(&mut self.b))), + ) { + s.spawn(move |_| { + if pin { + core_affinity::set_for_current(*t); + } + for i in 0..c.len() { + c[i] = a[i] + b[i]; + } + }); + } + }) + .unwrap() + } + + fn triad(&mut self) { + thread::scope(|s| { + let pin = self.device.pin; + let scalar = self.scalar; + for (t, (a, (b, c))) in self.device.core_ids.iter().zip( + self + .device + .mk_mut_chunks(&mut self.a) + .zip(self.device.mk_chunks(&mut self.b).zip(self.device.mk_chunks(&mut self.c))), + ) { + s.spawn(move |_| { + if pin { + core_affinity::set_for_current(*t); + } + for i in 0..a.len() { + a[i] = b[i] + scalar * c[i] + } + }); + } + }) + .unwrap() + } + + fn nstream(&mut self) { + thread::scope(|s| { + let pin = self.device.pin; + let scalar = self.scalar; + 
for (t, (a, (b, c))) in self.device.core_ids.iter().zip( + self + .device + .mk_mut_chunks(&mut self.a) + .zip(self.device.mk_chunks(&mut self.b).zip(self.device.mk_chunks(&mut self.c))), + ) { + s.spawn(move |_| { + if pin { + core_affinity::set_for_current(*t); + } + for i in 0..a.len() { + a[i] += b[i] + scalar * c[i] + } + }); + } + }) + .unwrap() + } + + fn dot(&mut self) -> T { + let mut partial_sum = vec![T::zero(); self.device.ncore]; + thread::scope(|s| { + let pin = self.device.pin; + let a = &self.a; + let b = &self.b; + let chunk_indices = |i: usize| { + let chunk_size = self.device.chunk_size(self.size); + let start = i * chunk_size; + start..((start + chunk_size).min(self.size)) + }; + for (t, (n, acc)) in self.device.core_ids.iter().zip(partial_sum.iter_mut().enumerate()) { + s.spawn(move |_| { + if pin { + core_affinity::set_for_current(*t); + } + let mut p = T::zero(); + for i in chunk_indices(n) { + p += a[i] * b[i]; + } + *acc = p; + }); + } + }) + .unwrap(); + partial_sum.into_iter().sum() + } +} diff --git a/src/rust/rust-stream/src/lib.rs b/src/rust/rust-stream/src/lib.rs new file mode 100644 index 0000000..3ac72c3 --- /dev/null +++ b/src/rust/rust-stream/src/lib.rs @@ -0,0 +1,435 @@ +#![feature(allocator_api)] +#![feature(vec_into_raw_parts)] + +use std::alloc::System; +use std::env; +use std::fmt::{Debug, Display}; +use std::iter::Sum; +use std::mem::size_of; +use std::time::Duration; + +use num_traits::abs; +use structopt::StructOpt; +use tabular::{Row, Table}; + +use crate::arc_stream::ArcDevice; +use crate::crossbeam_stream::CrossbeamDevice; +use crate::plain_stream::SerialDevice; +use crate::rayon_stream::RayonDevice; +use crate::stream::{AllocatorType, ArrayType, RustStream, StreamData}; +use crate::unsafe_stream::UnsafeDevice; + +mod arc_stream; +mod crossbeam_stream; +mod plain_stream; +mod rayon_stream; +mod stream; +mod unsafe_stream; + +#[derive(Debug, StructOpt)] +struct Options { + /// List available devices + #[structopt(long)] 
+ list: bool, + /// Select device at + #[structopt(long, default_value = "0")] + device: usize, + /// Run the test times (NUM >= 2) + #[structopt(long, short = "n", default_value = "100")] + numtimes: usize, + /// Use elements in the array + #[structopt(long, short = "s", default_value = "33554432")] + arraysize: usize, + /// Use floats (rather than doubles) + #[structopt(long)] + float: bool, + /// Only run triad + #[structopt(long)] + triad_only: bool, + /// Only run nstream + #[structopt(long)] + nstream_only: bool, + /// Output as csv table + #[structopt(long)] + csv: bool, + /// Use MiB=2^20 for bandwidth calculation (default MB=10^6) + #[structopt(long)] + mibibytes: bool, + /// Use libc malloc instead of the Rust's allocator for benchmark array allocation + #[structopt(name = "malloc", long)] + malloc: bool, + /// Initialise each benchmark array at allocation time on the main thread + #[structopt(name = "init", long)] + init: bool, + /// Pin threads to distinct cores, this has NO effect in Rayon devices + #[structopt(long)] + pin: bool, +} + +#[derive(PartialEq)] +enum Benchmark { + All, + Triad, + NStream, +} + +fn check_solution, D, A: AllocatorType>( + benchmark: Benchmark, numtimes: usize, vec: &StreamData, dot_sum: Option, +) -> bool { + let (mut gold_a, mut gold_b, mut gold_c) = vec.init; + for _ in 0..numtimes { + match benchmark { + Benchmark::All => { + gold_c = gold_a; + gold_b = vec.scalar * gold_c; + gold_c = gold_a + gold_b; + gold_a = gold_b + vec.scalar * gold_c; + } + Benchmark::Triad => { + gold_a = gold_b + vec.scalar * gold_c; + } + Benchmark::NStream => { + gold_a += gold_b + vec.scalar * gold_c; + } + }; + } + let tolerance = T::epsilon().into() * 100.0f64; + let validate_xs = |name: &str, xs: &Vec, from: T| { + let error = (xs.iter().map(|x| abs(*x - from)).sum::()).into() / xs.len() as f64; + let fail = error > tolerance; + if fail { + eprintln!("Validation failed on {}[]. 
Average error {} ", name, error); + } + !fail + }; + let a_ok = validate_xs("a", &vec.a, gold_a); + let b_ok = validate_xs("b", &vec.b, gold_b); + let c_ok = validate_xs("c", &vec.c, gold_c); + let dot_ok = dot_sum.map_or(true, |sum| { + let gold_sum = (gold_a * gold_b).into() * vec.size as f64; + let error = abs((sum.into() - gold_sum) / gold_sum); + let fail = error > 1.0e-8; + if fail { + eprintln!( + "Validation failed on sum. Error {} \nSum was {} but should be {}", + error, sum, gold_sum + ); + } + !fail + }); + + a_ok && b_ok && c_ok && dot_ok +} + +fn run_cpu + Display, D, A: AllocatorType>( + option: &Options, mut stream: StreamData, +) -> bool +where StreamData: RustStream { + let benchmark = match (option.nstream_only, option.triad_only) { + (true, false) => Benchmark::NStream, + (false, true) => Benchmark::Triad, + (false, false) => Benchmark::All, + (true, true) => { + panic!("Both triad and nstream are enabled, pick one or omit both to run all benchmarks") + } + }; + + let array_bytes = option.arraysize * size_of::(); + let total_bytes = array_bytes * 3; + let (mega_scale, mega_suffix, giga_scale, giga_suffix) = if !option.mibibytes { + (1.0e-6, "MB", 1.0e-9, "GB") + } else { + (2f64.powi(-20), "MiB", 2f64.powi(-30), "GiB") + }; + + if !option.csv { + println!( + "Running {} {} times", + match benchmark { + Benchmark::All => "kernels", + Benchmark::Triad => "triad", + Benchmark::NStream => "nstream", + }, + option.numtimes + ); + + if benchmark == Benchmark::Triad { + println!("Number of elements: {}", option.arraysize); + } + + println!("Precision: {}", if option.float { "float" } else { "double" }); + println!( + "Array size: {:.1} {} (={:.1} {})", + mega_scale * array_bytes as f64, + mega_suffix, + giga_scale * array_bytes as f64, + giga_suffix + ); + println!( + "Total size: {:.1} {} (={:.1} {})", + mega_scale * total_bytes as f64, + mega_suffix, + giga_scale * total_bytes as f64, + giga_suffix + ); + } + + stream.init_arrays(); + + let tabulate = 
|xs: &Vec, name: &str, t_size: usize| -> Vec<(&str, String)> { + let tail = &xs[1..]; // tail only + + // do stats + let max = tail.iter().max().map(|d| d.as_secs_f64()); + let min = tail.iter().min().map(|d| d.as_secs_f64()); + match (min, max) { + (Some(min), Some(max)) => { + let avg: f64 = tail.iter().map(|d| d.as_secs_f64()).sum::() / tail.len() as f64; + let mbps = mega_scale * (t_size as f64) / min; + if option.csv { + vec![ + ("function", name.to_string()), + ("num_times", option.numtimes.to_string()), + ("n_elements", option.arraysize.to_string()), + ("sizeof", t_size.to_string()), + ( + if option.mibibytes { "max_mibytes_per_sec" } else { "max_mbytes_per_sec" }, + mbps.to_string(), + ), + ("min_runtime", min.to_string()), + ("max_runtime", max.to_string()), + ("avg_runtime", avg.to_string()), + ] + } else { + vec![ + ("Function", name.to_string()), + (if option.mibibytes { "MiBytes/sec" } else { "MBytes/sec" }, format!("{:.3}", mbps)), + ("Min (sec)", format!("{:.5}", min)), + ("Max", format!("{:.5}", max)), + ("Average", format!("{:.5}", avg)), + ] + } + } + (_, _) => panic!("No min/max element for {}(size={})", name, t_size), + } + }; + + let tabulate_all = |xs: Vec>| { + match xs.as_slice() { + [head, ..] 
=> { + if option.csv { + println!("{}", head.iter().map(|(col, _)| *col).collect::>().join(",")); + for kvs in xs { + println!("{}", kvs.iter().map(|(_, val)| val.clone()).collect::>().join(",")); + } + } else { + let mut table = Table::new(&vec!["{:<}"; head.len()].join(" ")); + table.add_row(head.iter().fold(Row::new(), |row, (col, _)| row.with_cell(col))); + for kvs in xs { + table.add_row(kvs.iter().fold(Row::new(), |row, (_, val)| row.with_cell(val))); + } + print!("{}", table); + } + } + _ => panic!("Empty tabulation"), + }; + }; + + let solutions_correct = match benchmark { + Benchmark::All => { + let (results, sum) = stream.run_all(option.numtimes); + stream.read_arrays(); + let correct = check_solution(benchmark, option.numtimes, &stream, Some(sum)); + tabulate_all(vec![ + tabulate(&results.copy, "Copy", 2 * array_bytes), + tabulate(&results.mul, "Mul", 2 * array_bytes), + tabulate(&results.add, "Add", 3 * array_bytes), + tabulate(&results.triad, "Triad", 3 * array_bytes), + tabulate(&results.dot, "Dot", 2 * array_bytes), + ]); + correct + } + Benchmark::NStream => { + let results = stream.run_nstream(option.numtimes); + stream.read_arrays(); + let correct = check_solution(benchmark, option.numtimes, &stream, None); + tabulate_all(vec![tabulate(&results, "Nstream", 4 * array_bytes)]); + correct + } + Benchmark::Triad => { + let results = stream.run_triad(option.numtimes); + stream.read_arrays(); + let correct = check_solution(benchmark, option.numtimes, &stream, None); + let total_bytes = 3 * array_bytes * option.numtimes; + let bandwidth = giga_scale * (total_bytes as f64 / results.as_secs_f64()); + println!("Runtime (seconds): {:.5}", results.as_secs_f64()); + println!("Bandwidth ({}/s): {:.3} ", giga_suffix, bandwidth); + correct + } + }; + stream.clean_up(); + solutions_correct +} + +const VERSION: Option<&'static str> = option_env!("CARGO_PKG_VERSION"); + +static START_A: f32 = 0.1; +static START_B: f32 = 0.2; +static START_C: f32 = 0.0; +static 
START_SCALAR: f32 = 0.4; + +static FLOAT_INIT_SCALAR: f32 = START_SCALAR; +static FLOAT_INIT: (f32, f32, f32) = (START_A, START_B, START_C); + +static DOUBLE_INIT_SCALAR: f64 = START_SCALAR as f64; +static DOUBLE_INIT: (f64, f64, f64) = (START_A as f64, START_B as f64, START_C as f64); + +pub fn run(args: &Vec) -> bool { + let opt: Options = Options::from_iter(args); + + if opt.numtimes < 2 { + panic!("numtimes must be >= 2") + } + + let alloc = System; + let alloc_name = if opt.malloc { "libc-malloc" } else { "rust-system" }; + + fn mk_data( + opt: &Options, init: (T, T, T), scalar: T, dev: D, alloc: A, + ) -> StreamData { + StreamData::new_in(opt.arraysize, scalar, init, dev, alloc, opt.malloc, opt.init) + } + + let num_thread_key = "BABELSTREAM_NUM_THREADS"; + let max_ncores = num_cpus::get(); + let ncores = match env::var(num_thread_key) { + Ok(v) => match v.parse::() { + Err(bad) => { + colour::e_yellow_ln!( + "Cannot parse {} (reason: {}), defaulting to {}", + bad, + num_thread_key, + max_ncores + ); + max_ncores + } + Ok(n) if n <= 0 || n > max_ncores as i64 => { + println!("{} out of bound ({}), defaulting to {}", num_thread_key, n, max_ncores); + max_ncores + } + Ok(n) => n as usize, + }, + Err(_) => { + println!("{} not set, defaulting to max ({})", num_thread_key, max_ncores); + max_ncores + } + }; + + let rayon_device = &|| { + let rayon_num_thread_key = "RAYON_NUM_THREADS"; + if env::var(rayon_num_thread_key).is_ok() { + colour::e_yellow_ln!("{} is ignored, set {} instead", rayon_num_thread_key, num_thread_key) + } + let dev = RayonDevice { + pool: rayon::ThreadPoolBuilder::default().num_threads(ncores).build().unwrap(), + }; + if !opt.csv { + println!("Using {} thread(s), alloc={}", dev.pool.current_num_threads(), alloc_name); + if opt.pin { + colour::e_yellow_ln!("Pinning threads have no effect on Rayon!") + } + } + if opt.float { + run_cpu(&opt, mk_data(&opt, FLOAT_INIT, FLOAT_INIT_SCALAR, dev, alloc)) + } else { + run_cpu(&opt, mk_data(&opt, 
DOUBLE_INIT, DOUBLE_INIT_SCALAR, dev, alloc)) + } + }; + + let arc_device = &|| { + if !opt.csv { + println!("Using {} thread, pin={}, alloc={}", ncores, opt.pin, alloc_name); + } + if opt.float { + let dev = ArcDevice::::new(ncores, opt.pin, alloc); + run_cpu(&opt, mk_data(&opt, FLOAT_INIT, FLOAT_INIT_SCALAR, dev, alloc)) + } else { + let dev = ArcDevice::::new(ncores, opt.pin, alloc); + run_cpu(&opt, mk_data(&opt, DOUBLE_INIT, DOUBLE_INIT_SCALAR, dev, alloc)) + } + }; + + let unsafe_device = &|| { + if !opt.csv { + println!("Using {} thread, pin={}, alloc={}", ncores, opt.pin, alloc_name); + } + if opt.float { + let dev = UnsafeDevice::::new(ncores, opt.pin); + run_cpu(&opt, mk_data(&opt, FLOAT_INIT, FLOAT_INIT_SCALAR, dev, alloc)) + } else { + let dev = UnsafeDevice::::new(ncores, opt.pin); + run_cpu(&opt, mk_data(&opt, DOUBLE_INIT, DOUBLE_INIT_SCALAR, dev, alloc)) + } + }; + + let crossbeam_device = &|| { + let dev = CrossbeamDevice::new(ncores, opt.pin); + if !opt.csv { + println!("Using {} thread(s), pin={}, alloc={}", ncores, opt.pin, alloc_name) + } + if opt.float { + run_cpu(&opt, mk_data(&opt, FLOAT_INIT, FLOAT_INIT_SCALAR, dev, alloc)) + } else { + run_cpu(&opt, mk_data(&opt, DOUBLE_INIT, DOUBLE_INIT_SCALAR, dev, alloc)) + } + }; + + let st_device = &|| { + let dev = SerialDevice { pin: opt.pin }; + if !opt.csv { + println!("Using 1 thread, pin={}, alloc={}", opt.pin, alloc_name); + } + if opt.float { + run_cpu(&opt, mk_data(&opt, FLOAT_INIT, FLOAT_INIT_SCALAR, dev, alloc)) + } else { + run_cpu(&opt, mk_data(&opt, DOUBLE_INIT, DOUBLE_INIT_SCALAR, dev, alloc)) + } + }; + + let devices: Vec<(String, &'_ dyn Fn() -> bool)> = vec![ + ("CPU (Single threaded)".to_string(), st_device), + ("CPU (Rayon)".to_string(), rayon_device), + (format!("CPU (Arc, pinning={})", opt.pin), arc_device), + (format!("CPU (Unsafe, pinning={})", opt.pin), unsafe_device), + (format!("CPU (Crossbeam, pinning={})", opt.pin), crossbeam_device), + ]; + + if opt.list { + 
devices.iter().enumerate().for_each(|(i, (name, _))| { + println!("[{}] {}", i, name); + }); + true + } else { + match devices.get(opt.device) { + Some((name, run)) => { + if !&opt.csv { + println!( + "BabelStream\n\ + Version: {}\n\ + Implementation: Rust; {}", + VERSION.unwrap_or("unknown"), + name + ); + if opt.init { + println!("Initialising arrays on main thread"); + } + } + run() + } + None => { + eprintln!("Device index {} not available", opt.device); + false + } + } + } +} diff --git a/src/rust/rust-stream/src/main.rs b/src/rust/rust-stream/src/main.rs new file mode 100644 index 0000000..8c99087 --- /dev/null +++ b/src/rust/rust-stream/src/main.rs @@ -0,0 +1,5 @@ +fn main() { + if !rust_stream::run(&std::env::args().collect::>()) { + std::process::exit(1); + } +} diff --git a/src/rust/rust-stream/src/plain_stream.rs b/src/rust/rust-stream/src/plain_stream.rs new file mode 100644 index 0000000..135a7bc --- /dev/null +++ b/src/rust/rust-stream/src/plain_stream.rs @@ -0,0 +1,61 @@ +use crate::stream::{AllocatorType, ArrayType, RustStream, StreamData}; +use core_affinity::CoreId; + +pub struct SerialDevice { + pub(crate) pin: bool, +} + +// single threaded version +impl RustStream for StreamData { + fn init_arrays(&mut self) { + if self.device.pin { + core_affinity::set_for_current( + match core_affinity::get_core_ids().as_ref().map(|x| x.first()) { + Some(Some(x)) => *x, + _ => CoreId { id: 0 }, + }, + ); + } + self.a.fill(self.init.0); + self.b.fill(self.init.1); + self.c.fill(self.init.2); + } + + fn copy(&mut self) { + for i in 0..self.size { + self.c[i] = self.a[i]; + } + } + + fn mul(&mut self) { + for i in 0..self.size { + self.b[i] = self.scalar * self.c[i]; + } + } + + fn add(&mut self) { + for i in 0..self.size { + self.c[i] = self.a[i] + self.b[i]; + } + } + + fn triad(&mut self) { + for i in 0..self.size { + self.a[i] = self.b[i] + self.scalar * self.c[i]; + } + } + + fn nstream(&mut self) { + for i in 0..self.size { + self.a[i] += self.b[i] + 
self.scalar * self.c[i]; + } + } + + fn dot(&mut self) -> T { + let mut sum = T::default(); + for i in 0..self.size { + sum += self.a[i] * self.b[i]; + } + sum + } +} diff --git a/src/rust/rust-stream/src/rayon_stream.rs b/src/rust/rust-stream/src/rayon_stream.rs new file mode 100644 index 0000000..d25d115 --- /dev/null +++ b/src/rust/rust-stream/src/rayon_stream.rs @@ -0,0 +1,77 @@ +use std::iter::Sum; + +use rayon::prelude::*; +use rayon::ThreadPool; + +use crate::stream::{AllocatorType, ArrayType, RustStream, StreamData}; + +pub struct RayonDevice { + pub(crate) pool: ThreadPool, +} + +// Rayon version, it should be semantically equal to the single threaded version +impl RustStream + for StreamData +{ + fn init_arrays(&mut self) { + let init = self.init; + self.a.par_iter_mut().for_each(|v| *v = init.0); + self.b.par_iter_mut().for_each(|v| *v = init.1); + self.c.par_iter_mut().for_each(|v| *v = init.2); + } + + fn copy(&mut self) { + let a = &self.a; + let c = &mut self.c; + self.device.pool.install(|| { + (*c).par_iter_mut().enumerate().for_each(|(i, c)| *c = a[i]); + }); + } + + fn mul(&mut self) { + let scalar = self.scalar; + let c = &self.c; + let b = &mut self.b; + self + .device + .pool + .install(|| (*b).par_iter_mut().enumerate().for_each(|(i, b)| *b = scalar * c[i])); + } + + fn add(&mut self) { + let a = &self.a; + let b = &self.b; + let c = &mut self.c; + self.device.pool.install(|| (*c).par_iter_mut().enumerate().for_each(|(i, c)| *c = a[i] + b[i])) + } + + fn triad(&mut self) { + let scalar = self.scalar; + let a = &mut self.a; + let b = &self.b; + let c = &self.c; + self + .device + .pool + .install(|| (*a).par_iter_mut().enumerate().for_each(|(i, a)| *a = b[i] + scalar * c[i])) + } + + fn nstream(&mut self) { + let scalar = self.scalar; + let a = &mut self.a; + let b = &self.b; + let c = &self.c; + self + .device + .pool + .install(|| (*a).par_iter_mut().enumerate().for_each(|(i, a)| *a += b[i] + scalar * c[i])) + } + + fn dot(&mut self) -> T { 
+ let a = &self.a; + let b = &self.b; + self.device.pool.install(|| { + (0..self.size).into_par_iter().fold(|| T::default(), |acc, i| acc + a[i] * b[i]).sum::() + }) + } +} diff --git a/src/rust/rust-stream/src/stream.rs b/src/rust/rust-stream/src/stream.rs new file mode 100644 index 0000000..560c6f1 --- /dev/null +++ b/src/rust/rust-stream/src/stream.rs @@ -0,0 +1,167 @@ +use num_traits::real::Real; +use num_traits::{NumAssign, Signed}; +use std::alloc::Allocator; +use std::fmt::Debug; +use std::time::{Duration, Instant}; + +pub trait AllocatorType: Allocator + Copy + Clone + Default + Debug {} +impl AllocatorType for T {} + +pub struct StreamData { + pub device: D, + pub size: usize, + pub scalar: T, + pub init: (T, T, T), + pub a: Vec, + pub b: Vec, + pub c: Vec, + pub needs_dealloc: bool, +} + +#[inline(always)] +fn timed(f: F) -> Duration { + let start = Instant::now(); + f(); + start.elapsed() +} + +#[inline(always)] +fn timed_mut T>(f: &mut F) -> (Duration, T) { + let start = Instant::now(); + let x = f(); + (start.elapsed(), x) +} + +pub struct AllTiming { + pub copy: T, + pub mul: T, + pub add: T, + pub triad: T, + pub dot: T, +} + +pub trait ArrayType: Real + NumAssign + Signed + Default + Debug {} +impl ArrayType for T {} + +impl StreamData { + pub fn new_in( + size: usize, + scalar: T, + init: (T, T, T), + device: D, + allocator: A, + malloc: bool, // + initialise: bool, // + ) -> StreamData { + let mk_vec = || { + if malloc { + extern crate libc; + use std::mem; + unsafe { + // we do the typical C malloc with a NULL check here + let bytes = mem::size_of::() * size; + let ptr = libc::malloc(bytes as libc::size_t) as *mut T; + if ptr.is_null() { + panic!( + "Cannot allocate {} bytes in `sizeof(T) * size` (T = {}, size = {})", + bytes, + mem::size_of::(), + size + ); + } + let mut xs = Vec::from_raw_parts_in(ptr, size, size, allocator); + if initialise { + xs.fill(T::default()); + } + xs + } + } else { + if initialise { + let mut xs = 
Vec::new_in(allocator); + xs.resize(size, T::default()); + xs + } else { + // try not to touch the vec after allocation + let mut xs = Vec::with_capacity_in(size, allocator); + unsafe { + xs.set_len(size); + } + xs + } + } + }; + + StreamData { + device, + size, + scalar, + init, + a: mk_vec(), + b: mk_vec(), + c: mk_vec(), + needs_dealloc: malloc, + } + } + pub fn clean_up(self) { + if self.needs_dealloc { + unsafe { + extern crate libc; + let free_ts = move |xs: Vec| { + // make sure we don't call dealloc for vec anymore + // XXX it's important we don't free xs.as_mut_ptr() here and use xs.into_raw_parts_with_alloc() + // as that function handles drops semantic for us + // if we free the the raw ptr directly, the compiler will still drop the vec and then segfault + let (ptr, _, _, _) = xs.into_raw_parts_with_alloc(); + libc::free(ptr as *mut libc::c_void); + }; + free_ts(self.a); + free_ts(self.b); + free_ts(self.c); + } + } + } +} + +pub trait RustStream { + fn init_arrays(&mut self); + fn read_arrays(&mut self) {} // default to no-op as most impl. 
doesn't need this + fn copy(&mut self); + fn mul(&mut self); + fn add(&mut self); + fn triad(&mut self); + fn nstream(&mut self); + fn dot(&mut self) -> T; + + fn run_all(&mut self, n: usize) -> (AllTiming>, T) { + let mut timings: AllTiming> = AllTiming { + copy: vec![Duration::default(); n], + mul: vec![Duration::default(); n], + add: vec![Duration::default(); n], + triad: vec![Duration::default(); n], + dot: vec![Duration::default(); n], + }; + let mut last_sum = T::default(); + for i in 0..n { + timings.copy[i] = timed(|| self.copy()); + timings.mul[i] = timed(|| self.mul()); + timings.add[i] = timed(|| self.add()); + timings.triad[i] = timed(|| self.triad()); + let (dot, sum) = timed_mut(&mut || self.dot()); + timings.dot[i] = dot; + last_sum = sum; + } + (timings, last_sum) + } + + fn run_triad(&mut self, n: usize) -> Duration { + timed(|| { + for _ in 0..n { + self.triad(); + } + }) + } + + fn run_nstream(&mut self, n: usize) -> Vec { + (0..n).map(|_| timed(|| self.nstream())).collect::>() + } +} diff --git a/src/rust/rust-stream/src/unsafe_stream.rs b/src/rust/rust-stream/src/unsafe_stream.rs new file mode 100644 index 0000000..968cc4e --- /dev/null +++ b/src/rust/rust-stream/src/unsafe_stream.rs @@ -0,0 +1,266 @@ +extern crate core_affinity; + +use std::alloc::Allocator; +use std::iter::Sum; +use std::ops::Range; + +use crate::stream::{AllocatorType, ArrayType, RustStream, StreamData}; + +use self::core_affinity::CoreId; + +#[derive(Debug, Copy, Clone)] +struct UnsafeData(*mut T, usize); + +impl UnsafeData { + fn empty() -> UnsafeData { UnsafeData(([] as [T; 0]).as_mut_ptr(), 0) } + fn new(xs: &mut Vec) -> UnsafeData { + UnsafeData(xs.as_mut_ptr(), xs.len()) + } + + fn get_slice(&self) -> &mut [T] { unsafe { std::slice::from_raw_parts_mut(self.0, self.1) } } +} + +unsafe impl Send for UnsafeData {} +unsafe impl Sync for UnsafeData {} + +#[derive(Debug, Copy, Clone)] +struct UnsafeRefs { + a: UnsafeData, + b: UnsafeData, + c: UnsafeData, +} + +unsafe impl 
Send for UnsafeRefs {} +unsafe impl Sync for UnsafeRefs {} + +pub struct UnsafeDevice { + pub(crate) ncore: usize, + pub(crate) pin: bool, + pub(crate) core_ids: Vec, + data: UnsafeRefs, +} + +impl UnsafeDevice { + pub fn new(ncore: usize, pin: bool) -> Self { + let mut core_ids = match core_affinity::get_core_ids() { + Some(xs) => xs, + None => { + colour::e_red_ln!("Cannot enumerate cores, pinning will not work if enabled"); + (0..ncore).map(|i| CoreId { id: i }).collect() + } + }; + core_ids.resize(ncore, core_ids[0]); + + UnsafeDevice { + ncore, + pin, + core_ids, + data: UnsafeRefs { a: UnsafeData::empty(), b: UnsafeData::empty(), c: UnsafeData::empty() }, + } + } + + fn thread_ranges(&self, len: usize) -> Vec<(usize, Range)> { + let chunk = (len as f64 / self.ncore as f64).ceil() as usize; + (0..self.ncore) + .map(|t| { + (t, if t == self.ncore - 1 { (t * chunk)..len } else { (t * chunk)..((t + 1) * chunk) }) + }) + .collect::>() + } +} + +// Unsafe threaded version, it should be semantically equal to the single threaded version +impl RustStream + for StreamData, A> +{ + fn init_arrays(&mut self) { + self.device.data.a = UnsafeData::new(&mut self.a); + self.device.data.b = UnsafeData::new(&mut self.b); + self.device.data.c = UnsafeData::new(&mut self.c); + let init = self.init; + let pin = self.device.pin; + let data = self.device.data; + self + .device + .thread_ranges(self.size) + .into_iter() + .map(|(t, r)| { + let core = self.device.core_ids[t]; + std::thread::spawn(move || { + if pin { + core_affinity::set_for_current(core); + } + let a = data.a.get_slice(); + let b = data.b.get_slice(); + let c = data.c.get_slice(); + for i in r { + a[i] = init.0; + b[i] = init.1; + c[i] = init.2; + } + }) + }) + .collect::>() + .into_iter() + .for_each(|t| t.join().unwrap()); + } + + fn copy(&mut self) { + let pin = self.device.pin; + let data = self.device.data; + self + .device + .thread_ranges(self.size) + .into_iter() + .map(|(t, r)| { + let core = 
self.device.core_ids[t]; + std::thread::spawn(move || { + if pin { + core_affinity::set_for_current(core); + } + let a = data.a.get_slice(); + let c = data.c.get_slice(); + for i in r { + c[i] = a[i]; + } + }) + }) + .collect::>() + .into_iter() + .for_each(|t| t.join().unwrap()); + } + + fn mul(&mut self) { + let scalar = self.scalar; + let pin = self.device.pin; + let data = self.device.data; + self + .device + .thread_ranges(self.size) + .into_iter() + .map(|(t, r)| { + let core = self.device.core_ids[t]; + std::thread::spawn(move || { + if pin { + core_affinity::set_for_current(core); + } + let b = data.b.get_slice(); + let c = data.c.get_slice(); + for i in r { + b[i] = scalar * c[i]; + } + }) + }) + .collect::>() + .into_iter() + .for_each(|t| t.join().unwrap()); + } + + fn add(&mut self) { + let pin = self.device.pin; + let data = self.device.data; + self + .device + .thread_ranges(self.size) + .into_iter() + .map(|(t, r)| { + let core = self.device.core_ids[t]; + std::thread::spawn(move || { + if pin { + core_affinity::set_for_current(core); + } + let a = data.a.get_slice(); + let b = data.b.get_slice(); + let c = data.c.get_slice(); + for i in r { + c[i] = a[i] + b[i]; + } + }) + }) + .collect::>() + .into_iter() + .for_each(|t| t.join().unwrap()); + } + + fn triad(&mut self) { + let scalar = self.scalar; + let pin = self.device.pin; + let data = self.device.data; + self + .device + .thread_ranges(self.size) + .into_iter() + .map(|(t, r)| { + let core = self.device.core_ids[t]; + std::thread::spawn(move || { + if pin { + core_affinity::set_for_current(core); + } + let a = data.a.get_slice(); + let b = data.b.get_slice(); + let c = data.c.get_slice(); + for i in r { + a[i] = b[i] + scalar * c[i] + } + }) + }) + .collect::>() + .into_iter() + .for_each(|t| t.join().unwrap()); + } + + fn nstream(&mut self) { + let scalar = self.scalar; + let pin = self.device.pin; + let data = self.device.data; + self + .device + .thread_ranges(self.size) + .into_iter() + 
.map(|(t, r)| { + let core = self.device.core_ids[t]; + std::thread::spawn(move || { + if pin { + core_affinity::set_for_current(core); + } + let a = data.a.get_slice(); + let b = data.b.get_slice(); + let c = data.c.get_slice(); + for i in r { + a[i] += b[i] + scalar * c[i] + } + }) + }) + .collect::>() + .into_iter() + .for_each(|t| t.join().unwrap()); + } + + fn dot(&mut self) -> T { + let pin = self.device.pin; + let data = self.device.data; + self + .device + .thread_ranges(self.size) + .into_iter() + .map(|(t, r)| { + let core = self.device.core_ids[t]; + std::thread::spawn(move || { + if pin { + core_affinity::set_for_current(core); + } + let a = data.a.get_slice(); + let b = data.b.get_slice(); + let mut p = T::default(); + for i in r { + p += a[i] * b[i]; + } + p + }) + }) + .collect::>() + .into_iter() + .map(|t| t.join().unwrap()) + .sum() + } +} diff --git a/src/rust/rust-stream/tests/integration_test.rs b/src/rust/rust-stream/tests/integration_test.rs new file mode 100644 index 0000000..8031a79 --- /dev/null +++ b/src/rust/rust-stream/tests/integration_test.rs @@ -0,0 +1,17 @@ +use rstest::rstest; + +#[rstest] +fn test_main( + #[values(0, 1, 2, 3, 4)] device: usize, // + #[values("", "--pin")] pin: &str, // + #[values("", "--malloc")] malloc: &str, // + #[values("", "--init")] init: &str, // + #[values("", "--triad-only", "--nstream-only")] option: &str, // +) { + let line = format!( + "rust-stream --arraysize 2048 --device {} {} {} {} {}", + device, pin, malloc, init, option + ); + let args = line.split_whitespace().map(|s| s.to_string()).collect::>(); + assert!(rust_stream::run(&args)); +} diff --git a/src/scala/scala-stream/.bsp/sbt.json b/src/scala/scala-stream/.bsp/sbt.json new file mode 100644 index 0000000..2e1edb1 --- /dev/null +++ b/src/scala/scala-stream/.bsp/sbt.json @@ -0,0 +1 @@ 
+{"name":"sbt","version":"1.5.2","bspVersion":"2.0.0-M5","languages":["scala"],"argv":["/usr/lib/jvm/java-11-openjdk-11.0.11.0.9-2.fc33.x86_64/bin/java","-Xms100m","-Xmx100m","-classpath","/home/tom/.local/share/JetBrains/Toolbox/apps/IDEA-U/ch-0/211.7142.45.plugins/Scala/launcher/sbt-launch.jar","xsbt.boot.Boot","-bsp","--sbt-launch-jar=/home/tom/.local/share/JetBrains/Toolbox/apps/IDEA-U/ch-0/211.7142.45.plugins/Scala/launcher/sbt-launch.jar"]} \ No newline at end of file diff --git a/src/scala/scala-stream/.gitignore b/src/scala/scala-stream/.gitignore new file mode 100644 index 0000000..2f7896d --- /dev/null +++ b/src/scala/scala-stream/.gitignore @@ -0,0 +1 @@ +target/ diff --git a/src/scala/scala-stream/.jvmopts b/src/scala/scala-stream/.jvmopts new file mode 100644 index 0000000..c1ef295 --- /dev/null +++ b/src/scala/scala-stream/.jvmopts @@ -0,0 +1,2 @@ +-Xmx4096m +-Xss4m \ No newline at end of file diff --git a/src/scala/scala-stream/.scalafmt.conf b/src/scala/scala-stream/.scalafmt.conf new file mode 100644 index 0000000..8c7d0c8 --- /dev/null +++ b/src/scala/scala-stream/.scalafmt.conf @@ -0,0 +1,34 @@ +version = "3.0.0-RC2" +runner.dialect = scala3 + +style = defaultWithAlign + +maxColumn = 100 + +align.preset = more + +rewrite.rules = [ + AvoidInfix + RedundantBraces + RedundantParens + AsciiSortImports + PreferCurlyFors +] + +rewrite.neverInfix.excludeFilters = [until + to + by + eq + ne + "should.*" + "contain.*" + "must.*" + in + be + taggedAs + thrownBy + synchronized + have + when + size + theSameElementsAs] \ No newline at end of file diff --git a/src/scala/scala-stream/README.md b/src/scala/scala-stream/README.md new file mode 100644 index 0000000..bf0e3f4 --- /dev/null +++ b/src/scala/scala-stream/README.md @@ -0,0 +1,102 @@ +ScalaStream +=========== + +This is an implementation of BabelStream +in [Scala 3](https://docs.scala-lang.org/scala3/new-in-scala3.html) on the JVM. In theory, this +implementation also covers Java. 
Scala and Java, like any other programming language, have their own +ecosystem of library-supported parallel programming frameworks; we currently implement the +following: + +* Parallel streams (introduced in Java 8) - `src/main/scala/scalastream/J8SStream.scala` +* [Scala Parallel Collections](https://github.com/scala/scala-parallel-collections) + - `src/main/scala/scalastream/ParStream.scala` + +As the benchmark is relatively simple, we also implement some baselines: + +* Single threaded Scala `for` (i.e. `foreach` sugar) - `src/main/scala/scalastream/PlainStream.scala` +* Manual parallelism with Java executors - `src/main/scala/scalastream/ThreadedStream.scala` + +### Performance considerations + +As Scala 3 defaults to Scala 2.13's standard library, we roll our own `Fractional` typeclass with +liberal use of inlining and specialisation. This is motivated by 2.13 stdlib's lack of +specialisation for primitive types on the default `Fractional` and `Numeric` typeclasses. + +The use of [Spire](https://github.com/typelevel/spire) to mitigate this was attempted; however, due +to its use of Scala 2 macros, it currently doesn't compile with Scala 3. + +### Build & Run + +Prerequisites + +* JDK >= 8 on any of its supported platforms; known working implementations: + - OpenJDK + distributions ([Amazon Corretto](https://docs.aws.amazon.com/corretto/latest/corretto-11-ug/downloads-list.html) + , [Azul](https://www.azul.com/downloads/?version=java-11-lts&package=jdk) + , [AdoptOpenJDK](https://adoptopenjdk.net/), etc) + - Oracle Graal CE/EE 8+ + +To run the benchmark, first create a binary: + +```shell +> ./sbt assembly +``` + +The binary will be located at `./target/scala-3.0.0/scala-stream.jar`. 
Run it with: + +```shell +> java -version +openjdk version "11.0.11" 2021-04-20 +OpenJDK Runtime Environment 18.9 (build 11.0.11+9) +OpenJDK 64-Bit Server VM 18.9 (build 11.0.11+9, mixed mode, sharing) +> java -jar target/scala-3.0.0/scala-stream.jar --help + +``` + +For best results, benchmark with the following JVM flags: + +``` +-XX:-UseOnStackReplacement # disable OSR, not useful for this benchmark as we are measuring peak performance +-XX:-TieredCompilation # disable C1, go straight to C2 +-XX:ReservedCodeCacheSize=512m # don't flush compiled code out of cache at any point +``` + +Worked example: + +```shell +> java -XX:-UseOnStackReplacement -XX:-TieredCompilation -XX:ReservedCodeCacheSize=512m -jar target/scala-3.0.0/scala-stream.jar + +BabelStream +Version: 3.4.0 +Implementation: Scala Parallel Collections; Scala (Java 11.0.11; Red Hat, Inc.; home=/usr/lib/jvm/java-11-openjdk-11.0.11.0.9-2.fc33.x86_64) +Running kernels 100 times +Precision: double +Array size: 268.4 MB (=0.3 GB) +Total size: 805.3 MB (=0.8 GB) +Function MBytes/sec Min (sec) Max Average +Copy 4087.077 0.13136 0.24896 0.15480 +Mul 2934.709 0.18294 0.28706 0.21627 +Add 3016.342 0.26698 0.39835 0.31119 +Triad 3016.496 0.26697 0.37612 0.31040 +Dot 2216.096 0.24226 0.41235 0.28264 + +``` + +### Graal Native Image + +The port has partial support for Graal Native Image; to generate one, run: + +```shell +> ./sbt nativeImage +``` + +The ELF binary will be located at `./target/native-image/scala-stream`; relocation should work on +the same architecture the binary is built on. + +There's an ongoing bug with Scala 3's use of `lazy val`s where the program crashes at declaration +site. Currently, Scala Parallel Collections uses this feature internally, so selecting this device +will crash at runtime. + +The bug originates from the use of `Unsafe` in `lazy val` for thread safety guarantees. 
It seems +that Graal only supports limited uses of this JVM implementation detail and Scala 3 happens to be on +the unsupported side. \ No newline at end of file diff --git a/src/scala/scala-stream/build.sbt b/src/scala/scala-stream/build.sbt new file mode 100644 index 0000000..4194acb --- /dev/null +++ b/src/scala/scala-stream/build.sbt @@ -0,0 +1,29 @@ +lazy val mainCls = Some("scalastream.App") + +lazy val root = (project in file(".")) + .enablePlugins(NativeImagePlugin) + .settings( + scalaVersion := "3.0.0", + version := "3.4.0", + organization := "uk.ac.bristol.uob-hpc", + organizationName := "University of Bristol", + Compile / mainClass := mainCls, + assembly / mainClass := mainCls, + scalacOptions ~= filterConsoleScalacOptions, + assembly / assemblyJarName := "scala-stream.jar", + nativeImageOptions := Seq( + "--no-fallback", + "-H:ReflectionConfigurationFiles=../../reflect-config.json" + ), + nativeImageVersion := "21.1.0", + (Global / excludeLintKeys) += nativeImageVersion, + name := "scala-stream", + libraryDependencies ++= Seq( + // Lazy val implementation in Scala 3 triggers an exception in nativeImage, use 2_13 for arg parsing for now otherwise we can't get to the benchmarking part + ("com.github.scopt" %% "scopt" % "4.0.1").cross(CrossVersion.for3Use2_13), + // par also uses lazy val at some point, so it doesn't work in nativeImage + "org.scala-lang.modules" %% "scala-parallel-collections" % "1.0.3", + "net.openhft" % "affinity" % "3.21ea1", + "org.slf4j" % "slf4j-simple" % "1.7.30" // for affinity + ) + ) diff --git a/src/scala/scala-stream/project/build.properties b/src/scala/scala-stream/project/build.properties new file mode 100644 index 0000000..19479ba --- /dev/null +++ b/src/scala/scala-stream/project/build.properties @@ -0,0 +1 @@ +sbt.version=1.5.2 diff --git a/src/scala/scala-stream/project/plugins.sbt b/src/scala/scala-stream/project/plugins.sbt new file mode 100644 index 0000000..2c82902 --- /dev/null +++ 
b/src/scala/scala-stream/project/plugins.sbt @@ -0,0 +1,6 @@ +addSbtPlugin("com.timushev.sbt" % "sbt-updates" % "0.5.3") +addSbtPlugin("io.github.davidgregory084" % "sbt-tpolecat" % "0.1.17") +addSbtPlugin("org.scalameta" % "sbt-native-image" % "0.3.0") +addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.15.0") +addSbtPlugin("ch.epfl.scala" % "sbt-scalafix" % "0.9.27") +addSbtPlugin("org.scalameta" % "sbt-scalafmt" % "2.4.2") diff --git a/src/scala/scala-stream/reflect-config.json b/src/scala/scala-stream/reflect-config.json new file mode 100644 index 0000000..9e8b089 --- /dev/null +++ b/src/scala/scala-stream/reflect-config.json @@ -0,0 +1,11 @@ +[ + { + "name": "sun.misc.Unsafe", + "fields": [ + { + "name": "theUnsafe", + "allowUnsafeAccess": true + } + ] + } +] \ No newline at end of file diff --git a/src/scala/scala-stream/sbt b/src/scala/scala-stream/sbt new file mode 100755 index 0000000..efdfda6 --- /dev/null +++ b/src/scala/scala-stream/sbt @@ -0,0 +1,3 @@ +#!/usr/bin/env bash + +./sbt-dist/bin/sbt "$@" \ No newline at end of file diff --git a/src/scala/scala-stream/sbt-dist/bin/java9-rt-export.jar b/src/scala/scala-stream/sbt-dist/bin/java9-rt-export.jar new file mode 100644 index 0000000..cbabfb0 Binary files /dev/null and b/src/scala/scala-stream/sbt-dist/bin/java9-rt-export.jar differ diff --git a/src/scala/scala-stream/sbt-dist/bin/sbt b/src/scala/scala-stream/sbt-dist/bin/sbt new file mode 100755 index 0000000..cca77be --- /dev/null +++ b/src/scala/scala-stream/sbt-dist/bin/sbt @@ -0,0 +1,177 @@ +#!/usr/bin/env bash + + +### ------------------------------- ### +### Helper methods for BASH scripts ### +### ------------------------------- ### + +realpath () { +( + TARGET_FILE="$1" + FIX_CYGPATH="$2" + + cd "$(dirname "$TARGET_FILE")" + TARGET_FILE=$(basename "$TARGET_FILE") + + COUNT=0 + while [ -L "$TARGET_FILE" -a $COUNT -lt 100 ] + do + TARGET_FILE=$(readlink "$TARGET_FILE") + cd "$(dirname "$TARGET_FILE")" + TARGET_FILE=$(basename "$TARGET_FILE") + 
COUNT=$(($COUNT + 1)) + done + + # make sure we grab the actual windows path, instead of cygwin's path. + if [[ "x$FIX_CYGPATH" != "x" ]]; then + echo "$(cygwinpath "$(pwd -P)/$TARGET_FILE")" + else + echo "$(pwd -P)/$TARGET_FILE" + fi +) +} + + +# Uses uname to detect if we're in the odd cygwin environment. +is_cygwin() { + local os=$(uname -s) + case "$os" in + CYGWIN*) return 0 ;; + MINGW*) return 0 ;; + MSYS*) return 0 ;; + *) return 1 ;; + esac +} + +# TODO - Use nicer bash-isms here. +CYGWIN_FLAG=$(if is_cygwin; then echo true; else echo false; fi) + + +# This can fix cygwin style /cygdrive paths so we get the +# windows style paths. +cygwinpath() { + local file="$1" + if [[ "$CYGWIN_FLAG" == "true" ]]; then + echo $(cygpath -w $file) + else + echo $file + fi +} + +. "$(dirname "$(realpath "$0")")/sbt-launch-lib.bash" + + +declare -r noshare_opts="-Dsbt.global.base=project/.sbtboot -Dsbt.boot.directory=project/.boot -Dsbt.ivy.home=project/.ivy" +declare -r sbt_opts_file=".sbtopts" +declare -r etc_sbt_opts_file="/etc/sbt/sbtopts" +declare -r dist_sbt_opts_file="${sbt_home}/conf/sbtopts" +declare -r win_sbt_opts_file="${sbt_home}/conf/sbtconfig.txt" + +usage() { + cat < path to global settings/plugins directory (default: ~/.sbt) + -sbt-boot path to shared boot directory (default: ~/.sbt/boot in 0.11 series) + -ivy path to local Ivy repository (default: ~/.ivy2) + -mem set memory options (default: $sbt_default_mem, which is $(get_mem_opts)) + -no-share use all local caches; no sharing + -no-global uses global caches, but does not use global ~/.sbt directory. + -jvm-debug Turn on JVM debugging, open at the given port. 
+ -batch Disable interactive mode + + # sbt version (default: from project/build.properties if present, else latest release) + -sbt-version use the specified version of sbt + -sbt-jar use the specified jar as the sbt launcher + -sbt-rc use an RC version of sbt + -sbt-snapshot use a snapshot version of sbt + + # java version (default: java from PATH, currently $(java -version 2>&1 | grep version)) + -java-home alternate JAVA_HOME + + # jvm options and output control + JAVA_OPTS environment variable, if unset uses "$java_opts" + .jvmopts if this file exists in the current directory, its contents + are appended to JAVA_OPTS + SBT_OPTS environment variable, if unset uses "$default_sbt_opts" + .sbtopts if this file exists in the current directory, its contents + are prepended to the runner args + /etc/sbt/sbtopts if this file exists, it is prepended to the runner args + -Dkey=val pass -Dkey=val directly to the java runtime + -J-X pass option -X directly to the java runtime + (-J is stripped) + -S-X add -X to sbt's scalacOptions (-S is stripped) + +In the case of duplicated or conflicting options, the order above +shows precedence: JAVA_OPTS lowest, command line options highest. 
+EOM +} + + + +process_my_args () { + while [[ $# -gt 0 ]]; do + case "$1" in + -no-colors) addJava "-Dsbt.log.noformat=true" && shift ;; + -no-share) addJava "$noshare_opts" && shift ;; + -no-global) addJava "-Dsbt.global.base=$(pwd)/project/.sbtboot" && shift ;; + -sbt-boot) require_arg path "$1" "$2" && addJava "-Dsbt.boot.directory=$2" && shift 2 ;; + -sbt-dir) require_arg path "$1" "$2" && addJava "-Dsbt.global.base=$2" && shift 2 ;; + -debug-inc) addJava "-Dxsbt.inc.debug=true" && shift ;; + -batch) exec + link=$(expr "$ls" : '.*-> \(.*\)$') + if expr "$link" : '/.*' > /dev/null; then + SCRIPT="$link" + else + SCRIPT=$(dirname "$SCRIPT")/"$link" + fi +done +declare -r sbt_bin_dir="$(dirname "$SCRIPT")" +declare -r sbt_home="$(dirname "$sbt_bin_dir")" + +echoerr () { + echo 1>&2 "$@" +} +vlog () { + [[ $verbose || $debug ]] && echoerr "$@" +} +dlog () { + [[ $debug ]] && echoerr "$@" +} + +jar_file () { + echo "$(cygwinpath "${sbt_home}/bin/sbt-launch.jar")" +} + +acquire_sbt_jar () { + sbt_jar="$(jar_file)" + + if [[ ! -f "$sbt_jar" ]]; then + echoerr "Could not find launcher jar: $sbt_jar" + exit 2 + fi +} + +rt_export_file () { + echo "${sbt_bin_dir}/java9-rt-export.jar" +} + +execRunner () { + # print the arguments one to a line, quoting any containing spaces + [[ $verbose || $debug ]] && echo "# Executing command line:" && { + for arg; do + if printf "%s\n" "$arg" | grep -q ' '; then + printf "\"%s\"\n" "$arg" + else + printf "%s\n" "$arg" + fi + done + echo "" + } + + # THis used to be exec, but we loose the ability to re-hook stty then + # for cygwin... Maybe we should flag the feature here... 
+ "$@" +} + +addJava () { + dlog "[addJava] arg = '$1'" + java_args=( "${java_args[@]}" "$1" ) +} +addSbt () { + dlog "[addSbt] arg = '$1'" + sbt_commands=( "${sbt_commands[@]}" "$1" ) +} +addResidual () { + dlog "[residual] arg = '$1'" + residual_args=( "${residual_args[@]}" "$1" ) +} +addDebugger () { + addJava "-agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=$1" +} + +get_mem_opts () { + # if we detect any of these settings in ${JAVA_OPTS} or ${JAVA_TOOL_OPTIONS} we need to NOT output our settings. + # The reason is the Xms/Xmx, if they don't line up, cause errors. + if [[ "${JAVA_OPTS}" == *-Xmx* ]] || [[ "${JAVA_OPTS}" == *-Xms* ]] || [[ "${JAVA_OPTS}" == *-XX:MaxPermSize* ]] || [[ "${JAVA_OPTS}" == *-XX:MaxMetaspaceSize* ]] || [[ "${JAVA_OPTS}" == *-XX:ReservedCodeCacheSize* ]]; then + echo "" + elif [[ "${JAVA_TOOL_OPTIONS}" == *-Xmx* ]] || [[ "${JAVA_TOOL_OPTIONS}" == *-Xms* ]] || [[ "${JAVA_TOOL_OPTIONS}" == *-XX:MaxPermSize* ]] || [[ "${JAVA_TOOL_OPTIONS}" == *-XX:MaxMetaspaceSize* ]] || [[ "${JAVA_TOOL_OPTIONS}" == *-XX:ReservedCodeCacheSize* ]]; then + echo "" + elif [[ "${SBT_OPTS}" == *-Xmx* ]] || [[ "${SBT_OPTS}" == *-Xms* ]] || [[ "${SBT_OPTS}" == *-XX:MaxPermSize* ]] || [[ "${SBT_OPTS}" == *-XX:MaxMetaspaceSize* ]] || [[ "${SBT_OPTS}" == *-XX:ReservedCodeCacheSize* ]]; then + echo "" + else + # a ham-fisted attempt to move some memory settings in concert + # so they need not be messed around with individually. 
+ local mem=${1:-$sbt_default_mem} + local codecache=$(( $mem / 8 )) + (( $codecache > 128 )) || codecache=128 + (( $codecache < 512 )) || codecache=512 + local class_metadata_size=$(( $codecache * 2 )) + if [[ -z $java_version ]]; then + java_version=$(jdk_version) + fi + local class_metadata_opt=$((( $java_version < 8 )) && echo "MaxPermSize" || echo "MaxMetaspaceSize") + + local arg_xms=$([[ "${java_args[@]}" == *-Xms* ]] && echo "" || echo "-Xms${mem}m") + local arg_xmx=$([[ "${java_args[@]}" == *-Xmx* ]] && echo "" || echo "-Xmx${mem}m") + local arg_rccs=$([[ "${java_args[@]}" == *-XX:ReservedCodeCacheSize* ]] && echo "" || echo "-XX:ReservedCodeCacheSize=${codecache}m") + local arg_meta=$([[ "${java_args[@]}" == *-XX:${class_metadata_opt}* && ! (( $java_version < 8 )) ]] && echo "" || echo "-XX:${class_metadata_opt}=${class_metadata_size}m") + + echo "${arg_xms} ${arg_xmx} ${arg_rccs} ${arg_meta}" + fi +} + +get_gc_opts () { + local older_than_9=$(( $java_version < 9 )) + + if [[ "$older_than_9" == "1" ]]; then + # don't need to worry about gc + echo "" + elif [[ "${JAVA_OPTS}" =~ Use.*GC ]] || [[ "${JAVA_TOOL_OPTIONS}" =~ Use.*GC ]] || [[ "${SBT_OPTS}" =~ Use.*GC ]] ; then + # GC arg has been passed in - don't change + echo "" + else + # Java 9+ so revert to old + echo "-XX:+UseParallelGC" + fi +} + +require_arg () { + local type="$1" + local opt="$2" + local arg="$3" + if [[ -z "$arg" ]] || [[ "${arg:0:1}" == "-" ]]; then + echo "$opt requires <$type> argument" + exit 1 + fi +} + +is_function_defined() { + declare -f "$1" > /dev/null +} + +# parses JDK version from the -version output line. 
+# 8 for 1.8.0_nn, 9 for 9-ea etc, and "no_java" for undetected +jdk_version() { + local result + local lines=$("$java_cmd" -Xms32M -Xmx32M -version 2>&1 | tr '\r' '\n') + local IFS=$'\n' + for line in $lines; do + if [[ (-z $result) && ($line = *"version \""*) ]] + then + local ver=$(echo $line | sed -e 's/.*version "\(.*\)"\(.*\)/\1/; 1q') + # on macOS sed doesn't support '?' + if [[ $ver = "1."* ]] + then + result=$(echo $ver | sed -e 's/1\.\([0-9]*\)\(.*\)/\1/; 1q') + else + result=$(echo $ver | sed -e 's/\([0-9]*\)\(.*\)/\1/; 1q') + fi + fi + done + if [[ -z $result ]] + then + result=no_java + fi + echo "$result" +} + +process_args () { + while [[ $# -gt 0 ]]; do + case "$1" in + -h|-help) usage; exit 1 ;; + -v|-verbose) verbose=1 && shift ;; + -d|-debug) debug=1 && addSbt "-debug" && shift ;; + + -ivy) require_arg path "$1" "$2" && addJava "-Dsbt.ivy.home=$2" && shift 2 ;; + -mem) require_arg integer "$1" "$2" && sbt_mem="$2" && shift 2 ;; + -jvm-debug) require_arg port "$1" "$2" && addDebugger $2 && shift 2 ;; + -batch) exec /dev/null 2>&1 && { + mkdir -p "$target_preloaded" + rsync -a --ignore-existing "$source_preloaded" "$target_preloaded" + } + } + } +} + +# Detect that we have java installed. +checkJava() { + local required_version="$1" + # Now check to see if it's a good enough version + local good_enough="$(expr $java_version ">=" $required_version)" + if [[ "$java_version" == "" ]]; then + echo + echo "No Java Development Kit (JDK) installation was detected." + echo Please go to http://www.oracle.com/technetwork/java/javase/downloads/ and download. + echo + exit 1 + elif [[ "$good_enough" != "1" ]]; then + echo + echo "The Java Development Kit (JDK) installation you have is not up to date." + echo $script_name requires at least version $required_version+, you have + echo version $java_version + echo + echo Please go to http://www.oracle.com/technetwork/java/javase/downloads/ and download + echo a valid JDK and install before running $script_name. 
+ echo + exit 1 + fi +} + +copyRt() { + local at_least_9="$(expr $java_version ">=" 9)" + if [[ "$at_least_9" == "1" ]]; then + rtexport=$(rt_export_file) + # The grep for java9-rt-ext- matches the filename prefix printed in Export.java + java9_ext=$("$java_cmd" ${JAVA_OPTS} ${SBT_OPTS:-$default_sbt_opts} ${java_args[@]} \ + -jar "$rtexport" --rt-ext-dir | grep java9-rt-ext-) + java9_rt=$(echo "$java9_ext/rt.jar") + vlog "[copyRt] java9_rt = '$java9_rt'" + if [[ ! -f "$java9_rt" ]]; then + echo Copying runtime jar. + mkdir -p "$java9_ext" + execRunner "$java_cmd" \ + ${JAVA_OPTS} \ + ${SBT_OPTS:-$default_sbt_opts} \ + ${java_args[@]} \ + -jar "$rtexport" \ + "${java9_rt}" + fi + addJava "-Dscala.ext.dirs=${java9_ext}" + fi +} + +run() { + # process the combined args, then reset "$@" to the residuals + process_args "$@" + set -- "${residual_args[@]}" + argumentCount=$# + + # Copy preloaded repo to user's preloaded directory + syncPreloaded + + # no jar? download it. + [[ -f "$sbt_jar" ]] || acquire_sbt_jar "$sbt_version" || { + # still no jar? uh-oh. + echo "Download failed. Obtain the sbt-launch.jar manually and place it at $sbt_jar" + exit 1 + } + + # TODO - java check should be configurable... + checkJava "6" + + # Java 9 support + copyRt + + #If we're in cygwin, we should use the windows config, and terminal hacks + if [[ "$CYGWIN_FLAG" == "true" ]]; then + stty -icanon min 1 -echo > /dev/null 2>&1 + addJava "-Djline.terminal=jline.UnixTerminal" + addJava "-Dsbt.cygwin=true" + fi + + # run sbt + execRunner "$java_cmd" \ + $(get_mem_opts $sbt_mem) \ + $(get_gc_opts) \ + ${JAVA_OPTS} \ + ${SBT_OPTS:-$default_sbt_opts} \ + ${java_args[@]} \ + -jar "$sbt_jar" \ + "${sbt_commands[@]}" \ + "${residual_args[@]}" + + exit_code=$? + + # Clean up the terminal from cygwin hacks. 
+ if [[ "$CYGWIN_FLAG" == "true" ]]; then + stty icanon echo > /dev/null 2>&1 + fi + exit $exit_code +} diff --git a/src/scala/scala-stream/sbt-dist/bin/sbt-launch.jar b/src/scala/scala-stream/sbt-dist/bin/sbt-launch.jar new file mode 100644 index 0000000..26ab884 Binary files /dev/null and b/src/scala/scala-stream/sbt-dist/bin/sbt-launch.jar differ diff --git a/src/scala/scala-stream/sbt-dist/bin/sbt.bat b/src/scala/scala-stream/sbt-dist/bin/sbt.bat new file mode 100644 index 0000000..1827961 --- /dev/null +++ b/src/scala/scala-stream/sbt-dist/bin/sbt.bat @@ -0,0 +1,212 @@ +@REM SBT launcher script +@REM +@REM Environment: +@REM JAVA_HOME - location of a JDK home dir (mandatory) +@REM SBT_OPTS - JVM options (optional) +@REM Configuration: +@REM sbtconfig.txt found in the SBT_HOME. + +@REM ZOMG! We need delayed expansion to build up CFG_OPTS later +@setlocal enabledelayedexpansion + +@echo off +set SBT_HOME=%~dp0 +set SBT_ARGS= + +rem FIRST we load the config file of extra options. +set FN=%SBT_HOME%\..\conf\sbtconfig.txt +set CFG_OPTS= +FOR /F "tokens=* eol=# usebackq delims=" %%i IN ("%FN%") DO ( + set DO_NOT_REUSE_ME=%%i + rem ZOMG (Part #2) WE use !! here to delay the expansion of + rem CFG_OPTS, otherwise it remains "" for this loop. + set CFG_OPTS=!CFG_OPTS! !DO_NOT_REUSE_ME! +) + +rem poor man's jenv (which is not available on Windows) +IF DEFINED JAVA_HOMES ( + IF EXIST .java-version FOR /F %%A IN (.java-version) DO ( + SET JAVA_HOME=%JAVA_HOMES%\%%A + SET JDK_HOME=%JAVA_HOMES%\%%A + ) +) +rem must set PATH or wrong javac is used for java projects +IF DEFINED JAVA_HOME SET "PATH=%JAVA_HOME%\bin;%PATH%" + +rem users can set JAVA_OPTS via .jvmopts (sbt-extras style) +IF EXIST .jvmopts FOR /F %%A IN (.jvmopts) DO ( + SET _jvmopts_line=%%A + IF NOT "!_jvmopts_line:~0,1!"=="#" ( + SET JAVA_OPTS=%%A !JAVA_OPTS! 
+ ) +) +rem We use the value of the JAVACMD environment variable if defined +set _JAVACMD=%JAVACMD% + +if "%_JAVACMD%"=="" ( + if not "%JAVA_HOME%"=="" ( + if exist "%JAVA_HOME%\bin\java.exe" set "_JAVACMD=%JAVA_HOME%\bin\java.exe" + ) +) + +if "%_JAVACMD%"=="" set _JAVACMD=java + +rem We use the value of the JAVA_OPTS environment variable if defined, rather than the config. +set _JAVA_OPTS=%JAVA_OPTS% +if "%_JAVA_OPTS%"=="" set _JAVA_OPTS=%CFG_OPTS% + +set INIT_SBT_VERSION=1.2.8 + +:args_loop +if "%~1" == "" goto args_end + +if "%~1" == "-jvm-debug" ( + set JVM_DEBUG=true + set /a JVM_DEBUG_PORT=5005 2>nul >nul +) else if "!JVM_DEBUG!" == "true" ( + set /a JVM_DEBUG_PORT=%1 2>nul >nul + if not "%~1" == "!JVM_DEBUG_PORT!" ( + set SBT_ARGS=!SBT_ARGS! %1 + ) +) else if /I "%~1" == "new" ( + set sbt_new=true + set SBT_ARGS=!SBT_ARGS! %1 +) else ( + set SBT_ARGS=!SBT_ARGS! %1 +) + +shift +goto args_loop +:args_end + +rem Confirm a user's intent if the current directory does not look like an sbt +rem top-level directory and the "new" command was not given. +if not exist build.sbt ( + if not exist project\ ( + if not defined sbt_new ( + echo [warn] Neither build.sbt nor a 'project' directory in the current directory: %CD% + setlocal +:confirm + echo c^) continue + echo q^) quit + + set /P reply=?^ + if /I "!reply!" == "c" ( + goto confirm_end + ) else if /I "!reply!" == "q" ( + exit /B 1 + ) + + goto confirm +:confirm_end + endlocal + ) + ) +) + +call :process + +call :checkjava + +call :copyrt + +if defined JVM_DEBUG_PORT ( + set _JAVA_OPTS=!_JAVA_OPTS! -agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=!JVM_DEBUG_PORT! 
+) + +call :sync_preloaded + +call :run %SBT_ARGS% + +if ERRORLEVEL 1 goto error +goto end + +:run + +"%_JAVACMD%" %_JAVA_OPTS% %SBT_OPTS% -cp "%SBT_HOME%sbt-launch.jar" xsbt.boot.Boot %* +goto :eof + +:process +rem Parses x out of 1.x; for example 8 out of java version 1.8.0_xx +rem Otherwise, parses the major version; 9 out of java version 9-ea +set JAVA_VERSION=0 +for /f "tokens=3" %%g in ('"%_JAVACMD%" -Xms32M -Xmx32M -version 2^>^&1 ^| findstr /i version') do ( + set JAVA_VERSION=%%g +) +set JAVA_VERSION=%JAVA_VERSION:"=% +for /f "delims=.-_ tokens=1-2" %%v in ("%JAVA_VERSION%") do ( + if /I "%%v" EQU "1" ( + set JAVA_VERSION=%%w + ) else ( + set JAVA_VERSION=%%v + ) +) +exit /B 0 + +:checkjava +set required_version=6 +if /I %JAVA_VERSION% GEQ %required_version% ( + exit /B 0 +) +echo. +echo The Java Development Kit (JDK) installation you have is not up to date. +echo sbt requires at least version %required_version%+, you have +echo version %JAVA_VERSION% +echo. +echo Please go to http://www.oracle.com/technetwork/java/javase/downloads/ and download +echo a valid JDK and install before running sbt. +echo. +exit /B 1 + +:copyrt +if /I %JAVA_VERSION% GEQ 9 ( + set rtexport=!SBT_HOME!java9-rt-export.jar + + "%_JAVACMD%" %_JAVA_OPTS% %SBT_OPTS% -jar "!rtexport!" --rt-ext-dir > "%TEMP%.\rtext.txt" + set /p java9_ext= < "%TEMP%.\rtext.txt" + set java9_rt=!java9_ext!\rt.jar + + if not exist "!java9_rt!" ( + mkdir "!java9_ext!" + "%_JAVACMD%" %_JAVA_OPTS% %SBT_OPTS% -jar "!rtexport!" "!java9_rt!" + ) + set _JAVA_OPTS=!_JAVA_OPTS! -Dscala.ext.dirs="!java9_ext!" + + rem check to see if a GC has been set in the opts + echo !_JAVA_OPTS! | findstr /r "Use.*GC" >nul + if ERRORLEVEL 1 ( + rem don't have a GC set - revert to old GC + set _JAVA_OPTS=!_JAVA_OPTS! 
-XX:+UseParallelGC + ) +) +exit /B 0 + +:sync_preloaded +if "%INIT_SBT_VERSION%"=="" ( + rem FIXME: better %INIT_SBT_VERSION% detection + FOR /F "tokens=* USEBACKQ" %%F IN (`dir /b "%SBT_HOME%\..\lib\local-preloaded\org.scala-sbt\sbt" /B`) DO ( + SET INIT_SBT_VERSION=%%F + ) +) +set PRELOAD_SBT_JAR="%UserProfile%\.sbt\preloaded\org.scala-sbt\sbt\%INIT_SBT_VERSION%\jars\sbt.jar" +if /I %JAVA_VERSION% GEQ 8 ( + where robocopy >nul 2>nul + if %ERRORLEVEL% equ 0 ( + REM echo %PRELOAD_SBT_JAR% + if not exist %PRELOAD_SBT_JAR% ( + if exist "%SBT_HOME%\..\lib\local-preloaded\" ( + echo "about to robocopy" + robocopy "%SBT_HOME%\..\lib\local-preloaded" "%UserProfile%\.sbt\preloaded" /E + ) + ) + ) +) +exit /B 0 + +:error +@endlocal +exit /B 1 + +:end +@endlocal +exit /B 0 diff --git a/src/scala/scala-stream/sbt-dist/conf/sbtconfig.txt b/src/scala/scala-stream/sbt-dist/conf/sbtconfig.txt new file mode 100644 index 0000000..a4da43e --- /dev/null +++ b/src/scala/scala-stream/sbt-dist/conf/sbtconfig.txt @@ -0,0 +1,14 @@ +# Set the java args to high + +-Xmx512M + +-XX:MaxPermSize=256m + +-XX:ReservedCodeCacheSize=128m + + + +# Set the extra SBT options + +-Dsbt.log.format=true + diff --git a/src/scala/scala-stream/sbt-dist/conf/sbtopts b/src/scala/scala-stream/sbt-dist/conf/sbtopts new file mode 100644 index 0000000..f018465 --- /dev/null +++ b/src/scala/scala-stream/sbt-dist/conf/sbtopts @@ -0,0 +1,49 @@ +# ------------------------------------------------ # +# The SBT Configuration file. # +# ------------------------------------------------ # + + +# Disable ANSI color codes +# +#-no-colors + +# Starts sbt even if the current directory contains no sbt project. 
+# +-sbt-create + +# Path to global settings/plugins directory (default: ~/.sbt) +# +#-sbt-dir /etc/sbt + +# Path to shared boot directory (default: ~/.sbt/boot in 0.11 series) +# +#-sbt-boot ~/.sbt/boot + +# Path to local Ivy repository (default: ~/.ivy2) +# +#-ivy ~/.ivy2 + +# set memory options +# +#-mem + +# Use local caches for projects, no sharing. +# +#-no-share + +# Put SBT in offline mode. +# +#-offline + +# Sets the SBT version to use. +#-sbt-version 0.11.3 + +# Scala version (default: latest release) +# +#-scala-home +#-scala-version + +# java version (default: java from PATH, currently $(java -version |& grep version)) +# +#-java-home + diff --git a/src/scala/scala-stream/src/main/scala/scalastream/J8SStream.scala b/src/scala/scala-stream/src/main/scala/scalastream/J8SStream.scala new file mode 100644 index 0000000..ba509a5 --- /dev/null +++ b/src/scala/scala-stream/src/main/scala/scalastream/J8SStream.scala @@ -0,0 +1,44 @@ +package scalastream + +import scalastream.App.{Config, Data} + +import scala.collection.immutable.ArraySeq +import scala.reflect.{ClassTag, classTag} + +class J8SStream[@specialized(Float, Double) A: Fractional: ClassTag](val config: Config[A]) + extends ScalaStream[A]: + + private var a: Array[A] = _ + private var b: Array[A] = _ + private var c: Array[A] = _ + private val scalar: A = config.scalar + + inline private def stream = + java.util.stream.IntStream.range(0, config.options.arraysize).parallel() + + override inline def initArrays(): Unit = + a = Array.ofDim(config.options.arraysize) + b = Array.ofDim(config.options.arraysize) + c = Array.ofDim(config.options.arraysize) + stream.forEach { i => + a(i) = config.init._1 + b(i) = config.init._2 + c(i) = config.init._3 + } + + override inline def copy(): Unit = stream.forEach(i => c(i) = a(i)) + override inline def mul(): Unit = stream.forEach(i => b(i) = scalar * c(i)) + override inline def add(): Unit = stream.forEach(i => c(i) = a(i) + b(i)) + override inline def triad(): 
Unit = stream.forEach(i => a(i) = b(i) + scalar * c(i)) + override inline def nstream(): Unit = stream.forEach(i => a(i) = a(i) + b(i) + scalar * c(i)) // accumulates into a, as validate() models for Benchmark.NStream + override inline def dot(): A = + // horrible special-case for double, there isn't a mapToFloat so we give up on that + val cls = classTag[A].runtimeClass + if java.lang.Double.TYPE == cls then + stream + .mapToDouble(i => (a(i) * b(i)).asInstanceOf[Double]) + .reduce(0, (l: Double, r: Double) => l + r) + .asInstanceOf[A] + else stream.mapToObj[A](i => a(i) * b(i)).reduce(0.fractional, (l: A, r: A) => l + r) + + override inline def data(): Data[A] = Data(a.to(ArraySeq), b.to(ArraySeq), c.to(ArraySeq)) diff --git a/src/scala/scala-stream/src/main/scala/scalastream/ParStream.scala b/src/scala/scala-stream/src/main/scala/scalastream/ParStream.scala new file mode 100644 index 0000000..bb146a2 --- /dev/null +++ b/src/scala/scala-stream/src/main/scala/scalastream/ParStream.scala @@ -0,0 +1,36 @@ +package scalastream + +import scalastream.App.{Config, Data} + +import scala.collection.immutable.ArraySeq +import scala.collection.parallel.CollectionConverters._ +import scala.reflect.ClassTag +class ParStream[@specialized(Float, Double) A: Fractional: ClassTag](val config: Config[A]) + extends ScalaStream[A]: + + private var a: Array[A] = _ + private var b: Array[A] = _ + private var c: Array[A] = _ + private val scalar: A = config.scalar + + inline private def indices = (0 until config.options.arraysize).par + + override inline def initArrays(): Unit = + a = Array.ofDim(config.options.arraysize) + b = Array.ofDim(config.options.arraysize) + c = Array.ofDim(config.options.arraysize) + + for i <- indices do + a(i) = config.init._1 + b(i) = config.init._2 + c(i) = config.init._3 + + override inline def copy(): Unit = for i <- indices do c(i) = a(i) + override inline def mul(): Unit = for i <- indices do b(i) = scalar * c(i) + override inline def add(): Unit = for i <- indices do c(i) = a(i) + b(i) + override inline def triad(): Unit = 
for i <- indices do a(i) = b(i) + scalar * c(i) + override inline def nstream(): Unit = for i <- indices do a(i) = a(i) + b(i) + scalar * c(i) // accumulates into a, as validate() models for Benchmark.NStream + override inline def dot(): A = + indices.aggregate[A](0.fractional)((acc, i) => acc + (a(i) * b(i)), _ + _) + + override inline def data(): Data[A] = Data(a.to(ArraySeq), b.to(ArraySeq), c.to(ArraySeq)) diff --git a/src/scala/scala-stream/src/main/scala/scalastream/PlainStream.scala b/src/scala/scala-stream/src/main/scala/scalastream/PlainStream.scala new file mode 100644 index 0000000..2b42571 --- /dev/null +++ b/src/scala/scala-stream/src/main/scala/scalastream/PlainStream.scala @@ -0,0 +1,31 @@ +package scalastream + +import scalastream.App.{Config, Data} + +import scala.collection.immutable.ArraySeq +import scala.reflect.ClassTag +class PlainStream[@specialized(Float, Double) A: Fractional: ClassTag](val config: Config[A]) + extends ScalaStream[A]: + + private var a: Array[A] = _ + private var b: Array[A] = _ + private var c: Array[A] = _ + private val scalar: A = config.scalar + + override inline def initArrays(): Unit = + a = Array.fill(config.options.arraysize)(config.init._1) + b = Array.fill(config.options.arraysize)(config.init._2) + c = Array.fill(config.options.arraysize)(config.init._3) + + private inline def indices = 0 until config.options.arraysize + + override inline def copy(): Unit = for i <- indices do c(i) = a(i) + override inline def mul(): Unit = for i <- indices do b(i) = scalar * c(i) + override inline def add(): Unit = for i <- indices do c(i) = a(i) + b(i) + override inline def triad(): Unit = for i <- indices do a(i) = b(i) + (scalar * c(i)) + override inline def nstream(): Unit = for i <- indices do a(i) = a(i) + b(i) + scalar * c(i) // serial form of the same accumulate-into-a kernel + override inline def dot(): A = + var acc: A = 0.fractional + for i <- indices do acc = acc + (a(i) * b(i)) + acc + override inline def data(): Data[A] = Data(a.to(ArraySeq), b.to(ArraySeq), c.to(ArraySeq)) diff --git 
a/src/scala/scala-stream/src/main/scala/scalastream/ScalaStream.scala b/src/scala/scala-stream/src/main/scala/scalastream/ScalaStream.scala new file mode 100644 index 0000000..4ed90e4 --- /dev/null +++ b/src/scala/scala-stream/src/main/scala/scalastream/ScalaStream.scala @@ -0,0 +1,369 @@ +package scalastream +import scalastream.App.{Config, Data, Timings} + +import java.util.concurrent.TimeUnit +import scala.collection.immutable.ArraySeq +import scala.collection.mutable.ArrayBuffer +import scala.concurrent.duration.{Duration, FiniteDuration, SECONDS} +import scala.math.{Pi, pow} +import scala.reflect.ClassTag +import scopt.OParser + +transparent trait ScalaStream[@specialized(Float, Double) A]: + + def config: Config[A] + + def initArrays(): Unit + def copy(): Unit + def mul(): Unit + def add(): Unit + def triad(): Unit + def nstream(): Unit + def dot(): A + + transparent inline def timed[R](f: => R): (FiniteDuration, R) = + val start = System.nanoTime() + val r = f + val end = System.nanoTime() + FiniteDuration(end - start, TimeUnit.NANOSECONDS) -> r + + inline def runAll(times: Int)(using Fractional[A]): (Timings[Vector[FiniteDuration]], A) = + val copy = ArrayBuffer.fill[FiniteDuration](times)(Duration.Zero) + val mul = ArrayBuffer.fill[FiniteDuration](times)(Duration.Zero) + val add = ArrayBuffer.fill[FiniteDuration](times)(Duration.Zero) + val triad = ArrayBuffer.fill[FiniteDuration](times)(Duration.Zero) + val dot = ArrayBuffer.fill[FiniteDuration](times)(Duration.Zero) + + var lastSum: A = 0.fractional + + for i <- 0 until times do + copy(i) = timed(this.copy())._1 + mul(i) = timed(this.mul())._1 + add(i) = timed(this.add())._1 + triad(i) = timed(this.triad())._1 + val (dot_, sum) = timed(this.dot()) + dot(i) = dot_ + lastSum = sum + val s = lastSum + + ( + Timings( + copy = copy.toVector, + mul = mul.toVector, + add = add.toVector, + triad = triad.toVector, + dot = dot.toVector + ), + s + ) + + def runTriad(times: Int): FiniteDuration = timed(for _ <- 0 
until times do triad())._1 + def runNStream(times: Int): Vector[FiniteDuration] = Vector.fill(times)(timed(nstream())._1) + + def data(): Data[A] + + +trait Fractional[@specialized(Double, Float) A]: + def toFractional(f: Float): A + def toFractional(f: Double): A + def compare(x: A, y: A): Int + def add(x: A, y: A): A + def sub(x: A, y: A): A + def mul(x: A, y: A): A + def div(x: A, y: A): A + def abs(x: A): A + extension (x: Float) inline def fractional = toFractional(x) + extension (x: Double) inline def fractional = toFractional(x) + extension (x: Int) inline def fractional = toFractional(x.toFloat) + extension (x: Long) inline def fractional = toFractional(x.toDouble) + extension (x: A) + inline def +(y: A) = add(x, y) + inline def -(y: A) = sub(x, y) + inline def *(y: A) = mul(x, y) + inline def /(y: A) = div(x, y) + inline def >(y: A) = compare(x, y) > 0 + inline def <(y: A) = compare(x, y) < 0 + inline def abs_ = abs(x) +end Fractional + +given FloatFractional: Fractional[Float] with + inline def toFractional(f: Float): Float = f + inline def toFractional(f: Double): Float = f.toFloat + inline def compare(x: Float, y: Float): Int = x.compare(y) + inline def add(x: Float, y: Float): Float = x + y + inline def sub(x: Float, y: Float): Float = x - y + inline def mul(x: Float, y: Float): Float = x * y + inline def div(x: Float, y: Float): Float = x / y + inline def abs(x: Float): Float = math.abs(x) + +given DoubleFractional: Fractional[Double] with + inline def toFractional(f: Float): Double = f.toDouble + inline def toFractional(f: Double): Double = f + inline def compare(x: Double, y: Double): Int = x.compare(y) + inline def add(x: Double, y: Double): Double = x + y + inline def sub(x: Double, y: Double): Double = x - y + inline def mul(x: Double, y: Double): Double = x * y + inline def div(x: Double, y: Double): Double = x / y + inline def abs(x: Double): Double = math.abs(x) + +object App: + + final val Version: String = "3.4.0" + + case class 
Config[@specialized(Double, Float) A]( + options: Options, + benchmark: Benchmark, + typeSize: Int, + ulp: A, + scalar: A, + init: (A, A, A) + ) + + case class Timings[A](copy: A, mul: A, add: A, triad: A, dot: A) + case class Data[A](@specialized(Double, Float) a: ArraySeq[A], b: ArraySeq[A], c: ArraySeq[A]) + + case class Options( + list: Boolean = false, + device: Int = 0, + numtimes: Int = 100, + arraysize: Int = 33554432, + float: Boolean = false, + triad_only: Boolean = false, + nstream_only: Boolean = false, + csv: Boolean = false, + mibibytes: Boolean = false + ) + + object Options: + val Default = Options() + val builder = OParser.builder[Options] + val parser1 = + import builder._ + OParser.sequence( + programName("scala-stream"), + head("ScalaStream", s"$Version"), + opt[Unit]('l', "list").text("List available devices").action((_, x) => x.copy(list = true)), + opt[Int]('d', "device") + .text(s"Select device at , defaults to ${Default.device}") + .action((v, x) => x.copy(device = v)), + opt[Int]('n', "numtimes") + .text(s"Run the test times (NUM >= 2), defaults to ${Default.numtimes}") + .validate { + case n if n >= 2 => success + case n => failure(s"$n < 2") // only values below 2 reach this branch + } + .action((n, x) => x.copy(numtimes = n)), + opt[Int]('a', "arraysize") + .text(s"Use elements in the array, defaults to ${Default.arraysize}") + .action((v, x) => x.copy(arraysize = v)), + opt[Unit]('f', "float") + .text("Use floats (rather than doubles)") + .action((_, x) => x.copy(float = true)), + opt[Unit]('t', "triad_only") + .text("Only run triad") + .action((_, x) => x.copy(triad_only = true)), + opt[Unit]("nstream_only") // long form only: the short flag n is already registered for numtimes above + .text("Only run nstream") + .action((_, x) => x.copy(nstream_only = true)), + opt[Unit]('c', "csv").text("Output as csv table").action((_, x) => x.copy(csv = true)), + opt[Unit]('m', "mibibytes") + .text("Use MiB=2^20 for bandwidth calculation (default MB=10^6)") + .action((_, x) => x.copy(mibibytes = true)), + help('h', "help").text("prints this usage text") + ) 
+ + enum Benchmark: + case All, NStream, Triad + + implicit class RichDuration(private val d: Duration) extends AnyVal: + def seconds: Double = d.toUnit(SECONDS) + + def validate[A: Fractional](vec: Data[A], config: Config[A], dotSum: Option[A] = None): Unit = + + var (goldA, goldB, goldC) = config.init + for _ <- 0 until config.options.numtimes do + config.benchmark match + case Benchmark.All => + goldC = goldA + goldB = config.scalar * goldC + goldC = goldA + goldB + goldA = goldB + config.scalar * goldC + case Benchmark.Triad => + goldA = goldB + config.scalar * goldC + case Benchmark.NStream => + goldA += goldB + config.scalar * goldC + + val tolerance = config.ulp * (100.fractional) + def validateXs(name: String, xs: Seq[A], from: A): Unit = + val error = xs.map(x => (x - from).abs_).fold(0.fractional)(_ + _) / xs.size.fractional + if error > tolerance then + Console.err.println(s"Validation failed on $name. Average error $error ") + + validateXs("a", vec.a, goldA) + validateXs("b", vec.b, goldB) + validateXs("c", vec.c, goldC) + + dotSum.foreach { sum => + val goldSum = (goldA * goldB) * (config.options.arraysize).fractional + val error = ((sum - goldSum) / goldSum).abs_ + if error > 1.fractional / 100000000.fractional then + Console.err.println( + s"Validation failed on sum. 
Error $error \nSum was $sum but should be $goldSum" + ) + } + + inline def run[A: Fractional: ClassTag]( + name: String, + config: Config[A], + mkStream: Config[A] => ScalaStream[A] + ): Unit = + + val opt = config.options + + val arrayBytes = opt.arraysize * config.typeSize + val totalBytes = arrayBytes * 3 + val (megaScale, megaSuffix, gigaScale, gigaSuffix) = + if !opt.mibibytes then (1.0e-6, "MB", 1.0e-9, "GB") + else (pow(2.0, -20), "MiB", pow(2.0, -30), "GiB") + + if !opt.csv then + + val vendor = System.getProperty("java.vendor") + val ver = System.getProperty("java.version") + val home = System.getProperty("java.home") + println( + s"""BabelStream + |Version: $Version + |Implementation: $name; Scala (Java $ver; $vendor; home=$home)""".stripMargin + ) + + println(s"Running ${config.benchmark match { + case Benchmark.All => "kernels" + case Benchmark.Triad => "triad" + case Benchmark.NStream => "nstream" + }} ${opt.numtimes} times") + + if config.benchmark == Benchmark.Triad then println(s"Number of elements: ${opt.arraysize}") + + println(s"Precision: ${if opt.float then "float" else "double"}") + println( + f"Array size: ${megaScale * arrayBytes}%.1f $megaSuffix (=${gigaScale * arrayBytes}%.1f $gigaSuffix)" + ) + println( + f"Total size: ${megaScale * totalBytes}%.1f $megaSuffix (=${gigaScale * totalBytes}%.1f $gigaSuffix)" + ) + + def mkRow(xs: Vector[FiniteDuration], name: String, totalBytes: Int) = + val tail = xs.tail + (tail.minOption.map(_.seconds), tail.maxOption.map(_.seconds)) match + case (Some(min), Some(max)) => + val avg = (tail.foldLeft(Duration.Zero)(_ + _) / tail.size.toDouble).seconds + val mbps = megaScale * totalBytes.toDouble / min + if opt.csv then + Vector( + "function" -> name, + "num_times" -> opt.numtimes.toString, + "n_elements" -> opt.arraysize.toString, + "sizeof" -> totalBytes.toString, + s"max_m${if opt.mibibytes then "i" else ""}bytes_per_sec" -> mbps.toString, + "min_runtime" -> min.toString, + "max_runtime" -> max.toString, 
+ "avg_runtime" -> avg.toString + ) + else + Vector( + "Function" -> name, + s"M${if opt.mibibytes then "i" else ""}Bytes/sec" -> f"$mbps%.3f", + "Min (sec)" -> f"$min%.5f", + "Max" -> f"$max%.5f", + "Average" -> f"$avg%.5f" + ) + case (_, _) => sys.error(s"No min/max element for $name(size=$totalBytes)") + + def tabulate(rows: Vector[(String, String)]*): Unit = rows.toList match + case Nil => sys.error(s"Empty tabulation") + case header :: _ => + val padding = if opt.csv then 0 else 12 + val sep = if opt.csv then "," else "" + println(header.map(_._1.padTo(padding, ' ')).mkString(sep)) + println(rows.map(_.map(_._2.padTo(padding, ' ')).mkString(sep)).mkString("\n")) + + val stream = mkStream(config) + stream.initArrays() + config.benchmark match + case Benchmark.All => + val (results, sum) = stream.runAll(opt.numtimes) + validate(stream.data(), config, Some(sum)) + tabulate( + mkRow(results.copy, "Copy", 2 * arrayBytes), + mkRow(results.mul, "Mul", 2 * arrayBytes), + mkRow(results.add, "Add", 3 * arrayBytes), + mkRow(results.triad, "Triad", 3 * arrayBytes), + mkRow(results.dot, "Dot", 2 * arrayBytes) + ) + case Benchmark.NStream => + val result = stream.runNStream(opt.numtimes) + validate(stream.data(), config) + tabulate(mkRow(result, "Nstream", 4 * arrayBytes)) + case Benchmark.Triad => + val results = stream.runTriad(opt.numtimes) + val totalBytes = 3L * arrayBytes * opt.numtimes // Long arithmetic: with the default arraysize and numtimes this product exceeds Int.MaxValue + val bandwidth = gigaScale * (totalBytes / results.seconds) // giga scale, so the figure agrees with the gigaSuffix label printed below + println(f"Runtime (seconds): ${results.seconds}%.5f") + println(f"Bandwidth ($gigaSuffix/s): $bandwidth%.3f ") + + inline def devices[A: Fractional: ClassTag]: Vector[(String, Config[A] => ScalaStream[A])] = + Vector( + "Scala Parallel Collections" -> (ParStream(_)), + "Java 8 Stream" -> (J8SStream(_)), + "Threaded" -> (ThreadStream(_)), + "Serial" -> (PlainStream(_)) + ) + + inline def runWith[A: Fractional: ClassTag](i: Int, config: Config[A]): Unit = + devices[A].lift(i) match + case None => println(s"Device index out of 
bounds: $i") + case Some((name, mkStream)) => run(name, config, mkStream) + + def main(args: Array[String]): Unit = + + def handleOpt(opt: Options) = + val benchmark = (opt.nstream_only, opt.triad_only) match + case (true, false) => Benchmark.NStream + case (false, true) => Benchmark.Triad + case (false, false) => Benchmark.All + case (true, true) => + throw new RuntimeException( + "Both triad and nstream are enabled, pick one or omit both to run all benchmarks" + ) + + if opt.list then + devices[Float].zipWithIndex.foreach { case ((name, _), i) => println(s"$i: $name") } + else if opt.float then + runWith( + opt.device, + Config( + options = opt, + benchmark = benchmark, + typeSize = 4, // 32bit + ulp = math.ulp(Float.MaxValue), + scalar = 0.4f, + init = (0.1f, 0.2f, 0.0f) + ) + ) + else + runWith( + opt.device, + Config( + options = opt, + benchmark = benchmark, + typeSize = 8, + ulp = math.ulp(Double.MaxValue), + scalar = 0.4, // 64bit + init = (0.1, 0.2, 0.0) + ) + ) + + OParser.parse(Options.parser1, args, Options.Default) match + case Some(config) => handleOpt(config) + case _ => sys.exit(1) diff --git a/src/scala/scala-stream/src/main/scala/scalastream/ThreadStream.scala b/src/scala/scala-stream/src/main/scala/scalastream/ThreadStream.scala new file mode 100644 index 0000000..969a71f --- /dev/null +++ b/src/scala/scala-stream/src/main/scala/scalastream/ThreadStream.scala @@ -0,0 +1,68 @@ +package scalastream + +import net.openhft.affinity.{AffinityStrategies, AffinityThreadFactory} +import scalastream.App.{Config, Data} + +import java.util.concurrent.{Callable, Executors} +import scala.collection.immutable.ArraySeq +import scala.reflect.ClassTag +object ThreadStream {} +class ThreadStream[@specialized(Float, Double) A: Fractional: ClassTag](val config: Config[A]) + extends ScalaStream[A]: + + private var a: Array[A] = _ + private var b: Array[A] = _ + private var c: Array[A] = _ + private val scalar: A = config.scalar + + private val chunks: Int = 
sys.runtime.availableProcessors() + + private val pool = Executors.newFixedThreadPool( + chunks, + new AffinityThreadFactory("scala-stream", true, AffinityStrategies.DIFFERENT_CORE) + ) + + private val indices = (0 until config.options.arraysize) + .grouped(config.options.arraysize / chunks) + .toSeq + + private inline def forEachAll[C](c: => C)(f: (C, Int) => Unit): Seq[C] = + import scala.jdk.CollectionConverters._ + val xs = pool + .invokeAll( + indices.map { r => + { () => + val ctx = c + r.foreach(f(ctx, _)) + ctx + }: Callable[C] + }.asJavaCollection + ) + .asScala + .map(_.get()) + .toSeq + xs + + override inline def initArrays(): Unit = + a = Array.ofDim(config.options.arraysize) + b = Array.ofDim(config.options.arraysize) + c = Array.ofDim(config.options.arraysize) + forEachAll(()) { (_, i) => + a(i) = config.init._1 + b(i) = config.init._2 + c(i) = config.init._3 + } + () + + class Box(var value: A) + override inline def copy(): Unit = { forEachAll(())((_, i) => c(i) = a(i)); () } + override inline def mul(): Unit = { forEachAll(())((_, i) => b(i) = scalar * c(i)); () } + override inline def add(): Unit = { forEachAll(())((_, i) => c(i) = a(i) + b(i)); () } + override inline def triad(): Unit = { forEachAll(())((_, i) => a(i) = b(i) + scalar * c(i)); () } + override inline def nstream(): Unit = { forEachAll(())((_, i) => a(i) = b(i) * scalar * c(i)); () } + + override inline def dot(): A = + forEachAll(Box(0.fractional))((acc, i) => acc.value = acc.value + (a(i) * b(i))) + .map(_.value) + .fold(0.fractional)(_ + _) + override inline def data(): Data[A] = Data(a.to(ArraySeq), b.to(ArraySeq), c.to(ArraySeq)) diff --git a/STDStream.cpp b/src/std/STDStream.cpp similarity index 100% rename from STDStream.cpp rename to src/std/STDStream.cpp diff --git a/STDStream.h b/src/std/STDStream.h similarity index 100% rename from STDStream.h rename to src/std/STDStream.h diff --git a/STD.cmake b/src/std/model.cmake similarity index 100% rename from STD.cmake rename to 
src/std/model.cmake diff --git a/STD20Stream.cpp b/src/std20/STD20Stream.cpp similarity index 100% rename from STD20Stream.cpp rename to src/std20/STD20Stream.cpp diff --git a/STD20Stream.hpp b/src/std20/STD20Stream.hpp similarity index 100% rename from STD20Stream.hpp rename to src/std20/STD20Stream.hpp diff --git a/STD20.cmake b/src/std20/model.cmake similarity index 100% rename from STD20.cmake rename to src/std20/model.cmake diff --git a/SYCLStream.cpp b/src/sycl/SYCLStream.cpp similarity index 100% rename from SYCLStream.cpp rename to src/sycl/SYCLStream.cpp diff --git a/SYCLStream.h b/src/sycl/SYCLStream.h similarity index 99% rename from SYCLStream.h rename to src/sycl/SYCLStream.h index f312009..d3fa18d 100644 --- a/SYCLStream.h +++ b/src/sycl/SYCLStream.h @@ -10,7 +10,6 @@ #include #include "Stream.h" - #include "CL/sycl.hpp" #define IMPLEMENTATION_STRING "SYCL" diff --git a/SYCL.cmake b/src/sycl/model.cmake similarity index 96% rename from SYCL.cmake rename to src/sycl/model.cmake index c35f435..e7b5a1c 100644 --- a/SYCL.cmake +++ b/src/sycl/model.cmake @@ -47,8 +47,7 @@ macro(setup) list(APPEND CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/cmake/Modules) set(ComputeCpp_DIR ${SYCL_COMPILER_DIR}) - # don't point to the CL dir as the imports already have the CL prefix - set(OpenCL_INCLUDE_DIR "${CMAKE_SOURCE_DIR}/CL") + setup_opencl_header_includes() register_definitions(CL_TARGET_OPENCL_VERSION=220 _GLIBCXX_USE_CXX11_ABI=0) # ComputeCpp needs OpenCL diff --git a/src/sycl2020/SYCLStream2020.cpp b/src/sycl2020/SYCLStream2020.cpp new file mode 100644 index 0000000..17a5ab5 --- /dev/null +++ b/src/sycl2020/SYCLStream2020.cpp @@ -0,0 +1,284 @@ + +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#include "SYCLStream2020.h" + +#include + +// Cache list of devices +bool cached = false; +std::vector devices; +void getDeviceList(void); 
+ +template +SYCLStream::SYCLStream(const size_t ARRAY_SIZE, const int device_index) +: array_size {ARRAY_SIZE}, + d_a {ARRAY_SIZE}, + d_b {ARRAY_SIZE}, + d_c {ARRAY_SIZE}, + d_sum {1} +{ + if (!cached) + getDeviceList(); + + if (device_index >= devices.size()) + throw std::runtime_error("Invalid device index"); + + sycl::device dev = devices[device_index]; + + // Print out device information + std::cout << "Using SYCL device " << getDeviceName(device_index) << std::endl; + std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl; + + // Check device can support FP64 if needed + if (sizeof(T) == sizeof(double)) + { + if (!dev.has(sycl::aspect::fp64)) + { + throw std::runtime_error("Device does not support double precision, please use --float"); + } + } + + queue = std::make_unique(dev, sycl::async_handler{[&](sycl::exception_list l) + { + bool error = false; + for(auto e: l) + { + try + { + std::rethrow_exception(e); + } + catch (sycl::exception e) + { + std::cout << e.what(); + error = true; + } + } + if(error) + { + throw std::runtime_error("SYCL errors detected"); + } + }}); + + // No longer need list of devices + devices.clear(); + cached = true; + + +} + + +template +void SYCLStream::copy() +{ + queue->submit([&](sycl::handler &cgh) + { + sycl::accessor ka {d_a, cgh, sycl::read_only}; + sycl::accessor kc {d_c, cgh, sycl::write_only}; + cgh.parallel_for(sycl::range<1>{array_size}, [=](sycl::id<1> idx) + { + kc[idx] = ka[idx]; + }); + }); + queue->wait(); +} + +template +void SYCLStream::mul() +{ + const T scalar = startScalar; + queue->submit([&](sycl::handler &cgh) + { + sycl::accessor kb {d_b, cgh, sycl::write_only}; + sycl::accessor kc {d_c, cgh, sycl::read_only}; + cgh.parallel_for(sycl::range<1>{array_size}, [=](sycl::id<1> idx) + { + kb[idx] = scalar * kc[idx]; + }); + }); + queue->wait(); +} + +template +void SYCLStream::add() +{ + queue->submit([&](sycl::handler &cgh) + { + sycl::accessor ka {d_a, cgh, sycl::read_only}; + sycl::accessor kb 
{d_b, cgh, sycl::read_only}; + sycl::accessor kc {d_c, cgh, sycl::write_only}; + cgh.parallel_for(sycl::range<1>{array_size}, [=](sycl::id<1> idx) + { + kc[idx] = ka[idx] + kb[idx]; + }); + }); + queue->wait(); +} + +template +void SYCLStream::triad() +{ + const T scalar = startScalar; + queue->submit([&](sycl::handler &cgh) + { + sycl::accessor ka {d_a, cgh, sycl::write_only}; + sycl::accessor kb {d_b, cgh, sycl::read_only}; + sycl::accessor kc {d_c, cgh, sycl::read_only}; + cgh.parallel_for(sycl::range<1>{array_size}, [=](sycl::id<1> idx) + { + ka[idx] = kb[idx] + scalar * kc[idx]; + }); + }); + queue->wait(); +} + +template +void SYCLStream::nstream() +{ + const T scalar = startScalar; + + queue->submit([&](sycl::handler &cgh) + { + sycl::accessor ka {d_a, cgh}; + sycl::accessor kb {d_b, cgh, sycl::read_only}; + sycl::accessor kc {d_c, cgh, sycl::read_only}; + cgh.parallel_for(sycl::range<1>{array_size}, [=](sycl::id<1> idx) + { + ka[idx] += kb[idx] + scalar * kc[idx]; + }); + }); + queue->wait(); +} + +template +T SYCLStream::dot() +{ + + queue->submit([&](sycl::handler &cgh) + { + sycl::accessor ka {d_a, cgh, sycl::read_only}; + sycl::accessor kb {d_b, cgh, sycl::read_only}; + + cgh.parallel_for(sycl::range<1>{array_size}, + // Reduction object, to perform summation - initialises the result to zero + sycl::reduction(d_sum, cgh, std::plus(), sycl::property::reduction::initialize_to_identity{}), + [=](sycl::id<1> idx, auto& sum) + { + sum += ka[idx] * kb[idx]; + }); + + }); + + // Get access on the host, and return a copy of the data (single number) + // This will block until the result is available, so no need to wait on the queue. 
+ sycl::host_accessor result {d_sum, sycl::read_only}; + return result[0]; + +} + +template +void SYCLStream::init_arrays(T initA, T initB, T initC) +{ + queue->submit([&](sycl::handler &cgh) + { + sycl::accessor ka {d_a, cgh, sycl::write_only, sycl::no_init}; + sycl::accessor kb {d_b, cgh, sycl::write_only, sycl::no_init}; + sycl::accessor kc {d_c, cgh, sycl::write_only, sycl::no_init}; + + cgh.parallel_for(sycl::range<1>{array_size}, [=](sycl::id<1> idx) + { + ka[idx] = initA; + kb[idx] = initB; + kc[idx] = initC; + }); + }); + + queue->wait(); +} + +template +void SYCLStream::read_arrays(std::vector& a, std::vector& b, std::vector& c) +{ + sycl::host_accessor _a {d_a, sycl::read_only}; + sycl::host_accessor _b {d_b, sycl::read_only}; + sycl::host_accessor _c {d_c, sycl::read_only}; + for (int i = 0; i < array_size; i++) + { + a[i] = _a[i]; + b[i] = _b[i]; + c[i] = _c[i]; + } +} + +void getDeviceList(void) +{ + // Ask SYCL runtime for all devices in system + devices = sycl::device::get_devices(); + cached = true; +} + +void listDevices(void) +{ + getDeviceList(); + + // Print device names + if (devices.size() == 0) + { + std::cerr << "No devices found." 
<< std::endl; + } + else + { + std::cout << std::endl; + std::cout << "Devices:" << std::endl; + for (int i = 0; i < devices.size(); i++) + { + std::cout << i << ": " << getDeviceName(i) << std::endl; + } + std::cout << std::endl; + } +} + +std::string getDeviceName(const int device) +{ + if (!cached) + getDeviceList(); + + std::string name; + + if (device < devices.size()) + { + name = devices[device].get_info(); + } + else + { + throw std::runtime_error("Error asking for name for non-existant device"); + } + + return name; +} + +std::string getDeviceDriver(const int device) +{ + if (!cached) + getDeviceList(); + + std::string driver; + + if (device < devices.size()) + { + driver = devices[device].get_info(); + } + else + { + throw std::runtime_error("Error asking for driver for non-existant device"); + } + + return driver; +} + +template class SYCLStream; +template class SYCLStream; diff --git a/src/sycl2020/SYCLStream2020.h b/src/sycl2020/SYCLStream2020.h new file mode 100644 index 0000000..7481d16 --- /dev/null +++ b/src/sycl2020/SYCLStream2020.h @@ -0,0 +1,54 @@ + +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#pragma once + +#include +#include + +#include "Stream.h" + +#include + +#define IMPLEMENTATION_STRING "SYCL 2020" + +template +class SYCLStream : public Stream +{ + protected: + // Size of arrays + size_t array_size; + + // SYCL objects + // Queue is a pointer because we allow device selection + std::unique_ptr queue; + + // Buffers + sycl::buffer d_a; + sycl::buffer d_b; + sycl::buffer d_c; + sycl::buffer d_sum; + + public: + + SYCLStream(const size_t, const int); + ~SYCLStream() = default; + + virtual void copy() override; + virtual void add() override; + virtual void mul() override; + virtual void triad() override; + virtual void nstream() override; + virtual T dot() override; + + virtual void init_arrays(T 
initA, T initB, T initC) override; + virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; + +}; + +// Populate the devices list +void getDeviceList(void); diff --git a/src/sycl2020/model.cmake b/src/sycl2020/model.cmake new file mode 100644 index 0000000..e7b5a1c --- /dev/null +++ b/src/sycl2020/model.cmake @@ -0,0 +1,86 @@ + +register_flag_optional(CMAKE_CXX_COMPILER + "Any CXX compiler that is supported by CMake detection, this is used for host compilation when required by the SYCL compiler" + "c++") + +register_flag_required(SYCL_COMPILER + "Compile using the specified SYCL compiler implementation + Supported values are + ONEAPI-DPCPP - dpc++ that is part of an oneAPI Base Toolkit distribution (https://software.intel.com/content/www/us/en/develop/tools/oneapi/base-toolkit.html) + DPCPP - dpc++ as a standalone compiler (https://github.com/intel/llvm) + HIPSYCL - hipSYCL compiler (https://github.com/illuhad/hipSYCL) + COMPUTECPP - ComputeCpp compiler (https://developer.codeplay.com/products/computecpp/ce/home)") + +register_flag_optional(SYCL_COMPILER_DIR + "Absolute path to the selected SYCL compiler directory, most are packaged differently so set the path according to `SYCL_COMPILER`: + ONEAPI-DPCPP - not required but `dpcpp` must be on PATH, load oneAPI as per documentation (i.e `source /opt/intel/oneapi/setvars.sh` first) + HIPSYCL|DPCPP|COMPUTECPP - set to the root of the binary distribution that contains at least `bin/`, `include/`, and `lib/`" + "") + +register_flag_optional(OpenCL_LIBRARY + "[ComputeCpp only] Path to OpenCL library, usually called libOpenCL.so" + "${OpenCL_LIBRARY}") + +macro(setup) + set(CMAKE_CXX_STANDARD 17) + + + if (${SYCL_COMPILER} STREQUAL "HIPSYCL") + + + set(hipSYCL_DIR ${SYCL_COMPILER_DIR}/lib/cmake/hipSYCL) + + if (NOT EXISTS "${hipSYCL_DIR}") + message(WARNING "Falling back to hipSYCL < 0.9.0 CMake structure") + set(hipSYCL_DIR ${SYCL_COMPILER_DIR}/lib/cmake) + endif () + if (NOT EXISTS 
"${hipSYCL_DIR}") + message(FATAL_ERROR "Can't find the appropriate CMake definitions for hipSYCL") + endif () + + # register_definitions(_GLIBCXX_USE_CXX11_ABI=0) + find_package(hipSYCL CONFIG REQUIRED) + message(STATUS "ok") + + elseif (${SYCL_COMPILER} STREQUAL "COMPUTECPP") + + list(APPEND CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/cmake/Modules) + set(ComputeCpp_DIR ${SYCL_COMPILER_DIR}) + + setup_opencl_header_includes() + + register_definitions(CL_TARGET_OPENCL_VERSION=220 _GLIBCXX_USE_CXX11_ABI=0) + # ComputeCpp needs OpenCL + find_package(ComputeCpp REQUIRED) + + # this must come after FindComputeCpp (!) + set(COMPUTECPP_USER_FLAGS -O3 -no-serial-memop) + + elseif (${SYCL_COMPILER} STREQUAL "DPCPP") + set(CMAKE_CXX_COMPILER ${SYCL_COMPILER_DIR}/bin/clang++) + include_directories(${SYCL_COMPILER_DIR}/include/sycl) + register_definitions(CL_TARGET_OPENCL_VERSION=220) + register_append_cxx_flags(ANY -fsycl) + register_append_link_flags(-fsycl) + elseif (${SYCL_COMPILER} STREQUAL "ONEAPI-DPCPP") + set(CMAKE_CXX_COMPILER dpcpp) + register_definitions(CL_TARGET_OPENCL_VERSION=220) + else () + message(FATAL_ERROR "SYCL_COMPILER=${SYCL_COMPILER} is unsupported") + endif () + +endmacro() + + +macro(setup_target NAME) + if ( + (${SYCL_COMPILER} STREQUAL "COMPUTECPP") OR + (${SYCL_COMPILER} STREQUAL "HIPSYCL")) + # so ComputeCpp and hipSYCL has this weird (and bad) CMake usage where they append their + # own custom integration header flags AFTER the target has been specified + # hence this macro here + add_sycl_to_target( + TARGET ${NAME} + SOURCES ${IMPL_SOURCES}) + endif () +endmacro() diff --git a/TBBStream.cpp b/src/tbb/TBBStream.cpp similarity index 100% rename from TBBStream.cpp rename to src/tbb/TBBStream.cpp diff --git a/TBBStream.hpp b/src/tbb/TBBStream.hpp similarity index 100% rename from TBBStream.hpp rename to src/tbb/TBBStream.hpp diff --git a/TBB.cmake b/src/tbb/model.cmake similarity index 100% rename from TBB.cmake rename to src/tbb/model.cmake diff --git 
a/src/thrust/ThrustStream.cu b/src/thrust/ThrustStream.cu new file mode 100644 index 0000000..3a57ab0 --- /dev/null +++ b/src/thrust/ThrustStream.cu @@ -0,0 +1,235 @@ +// Copyright (c) 2020 Tom Deakin +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#include "ThrustStream.h" +#include +#include +#include +#include + +static inline void synchronise() +{ +// rocThrust doesn't synchronise between thrust calls +#if defined(THRUST_DEVICE_SYSTEM_HIP) && THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_HIP + hipDeviceSynchronize(); +#endif +} + +template +ThrustStream::ThrustStream(const int ARRAY_SIZE, int device) + : array_size{ARRAY_SIZE}, a(array_size), b(array_size), c(array_size) { + std::cout << "Using CUDA device: " << getDeviceName(device) << std::endl; + std::cout << "Driver: " << getDeviceDriver(device) << std::endl; + std::cout << "Thrust version: " << THRUST_VERSION << std::endl; + +#if THRUST_DEVICE_SYSTEM == 0 + // as per Thrust docs, 0 is reserved for undefined backend + std::cout << "Thrust backend: undefined" << std::endl; +#elif THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA + std::cout << "Thrust backend: CUDA" << std::endl; +#elif THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_OMP + std::cout << "Thrust backend: OMP" << std::endl; +#elif THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_TBB + std::cout << "Thrust backend: TBB" << std::endl; +#elif THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CPP + std::cout << "Thrust backend: CPP" << std::endl; +#elif THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_TBB + std::cout << "Thrust backend: TBB" << std::endl; +#else + +#if defined(THRUST_DEVICE_SYSTEM_HIP) && THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_HIP + std::cout << "Thrust backend: HIP" << std::endl; +#else + std::cout << "Thrust backend: " << THRUST_DEVICE_SYSTEM << "(unknown)" << std::endl; +#endif + +#endif + +} + +template +void ThrustStream::init_arrays(T initA, T initB, T 
initC) +{ + thrust::fill(a.begin(), a.end(), initA); + thrust::fill(b.begin(), b.end(), initB); + thrust::fill(c.begin(), c.end(), initC); + synchronise(); +} + +template +void ThrustStream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) +{ + thrust::copy(a.begin(), a.end(), h_a.begin()); + thrust::copy(b.begin(), b.end(), h_b.begin()); + thrust::copy(c.begin(), c.end(), h_c.begin()); +} + +template +void ThrustStream::copy() +{ + thrust::copy(a.begin(), a.end(),c.begin()); + synchronise(); +} + +template +void ThrustStream::mul() +{ + const T scalar = startScalar; + thrust::transform( + c.begin(), + c.end(), + b.begin(), + [=] __device__ __host__ (const T &ci){ + return ci * scalar; + } + ); + synchronise(); +} + +template +void ThrustStream::add() +{ + thrust::transform( + thrust::make_zip_iterator(thrust::make_tuple(a.begin(), b.begin())), + thrust::make_zip_iterator(thrust::make_tuple(a.end(), b.end())), + c.begin(), + thrust::make_zip_function( + [] __device__ __host__ (const T& ai, const T& bi){ + return ai + bi; + }) + ); + synchronise(); +} + +template +void ThrustStream::triad() +{ + const T scalar = startScalar; + thrust::transform( + thrust::make_zip_iterator(thrust::make_tuple(b.begin(), c.begin())), + thrust::make_zip_iterator(thrust::make_tuple(b.end(), c.end())), + a.begin(), + thrust::make_zip_function( + [=] __device__ __host__ (const T& bi, const T& ci){ + return bi + scalar * ci; + }) + ); + synchronise(); +} + +template +void ThrustStream::nstream() +{ + const T scalar = startScalar; + thrust::transform( + thrust::make_zip_iterator(thrust::make_tuple(a.begin(), b.begin(), c.begin())), + thrust::make_zip_iterator(thrust::make_tuple(a.end(), b.end(), c.end())), + a.begin(), + thrust::make_zip_function( + [=] __device__ __host__ (const T& ai, const T& bi, const T& ci){ + return ai + bi + scalar * ci; + }) + ); + synchronise(); +} + +template +T ThrustStream::dot() +{ + return thrust::inner_product(a.begin(), a.end(), b.begin(), 
T{}); +} + +#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA || \ + (defined(THRUST_DEVICE_SYSTEM_HIP) && THRUST_DEVICE_SYSTEM_HIP == THRUST_DEVICE_SYSTEM) + +#ifdef __NVCC__ +#define IMPL_FN__(fn) cuda ## fn +#define IMPL_TYPE__(tpe) cuda ## tpe +#elif defined(__HIP_PLATFORM_HCC__) +#define IMPL_FN__(fn) hip ## fn +#define IMPL_TYPE__(tpe) hip ## tpe ## _t +#else +# error Unsupported compiler for Thrust +#endif + +void check_error(void) +{ + IMPL_FN__(Error_t) err = IMPL_FN__(GetLastError()); + if (err != IMPL_FN__(Success)) + { + std::cerr << "Error: " << IMPL_FN__(GetErrorString(err)) << std::endl; + exit(err); + } +} + +void listDevices(void) +{ + // Get number of devices + int count; + IMPL_FN__(GetDeviceCount(&count)); + check_error(); + + // Print device names + if (count == 0) + { + std::cerr << "No devices found." << std::endl; + } + else + { + std::cout << std::endl; + std::cout << "Devices:" << std::endl; + for (int i = 0; i < count; i++) + { + std::cout << i << ": " << getDeviceName(i) << std::endl; + } + std::cout << std::endl; + } +} + +std::string getDeviceName(const int device) +{ + IMPL_TYPE__(DeviceProp) props = {}; + IMPL_FN__(GetDeviceProperties(&props, device)); + check_error(); + return std::string(props.name); +} + + +std::string getDeviceDriver(const int device) +{ + IMPL_FN__(SetDevice(device)); + check_error(); + int driver; + IMPL_FN__(DriverGetVersion(&driver)); + check_error(); + return std::to_string(driver); +} + +#undef IMPL_FN__ +#undef IMPL_TPE__ + +#else + +void listDevices(void) +{ + std::cout << "0: CPU" << std::endl; +} + +std::string getDeviceName(const int) +{ + return std::string("(device name unavailable)"); +} + +std::string getDeviceDriver(const int) +{ + return std::string("(device driver unavailable)"); +} + +#endif + +template class ThrustStream; +template class ThrustStream; + diff --git a/src/thrust/ThrustStream.h b/src/thrust/ThrustStream.h new file mode 100644 index 0000000..f87ace7 --- /dev/null +++ 
b/src/thrust/ThrustStream.h @@ -0,0 +1,43 @@ +// Copyright (c) 2020 Tom Deakin +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#pragma once + +#include +#include +#include + +#include "Stream.h" + +#define IMPLEMENTATION_STRING "Thrust" + +template +class ThrustStream : public Stream +{ + protected: + // Size of arrays + int array_size; + + thrust::device_vector a; + thrust::device_vector b; + thrust::device_vector c; + + public: + ThrustStream(const int, int); + ~ThrustStream() = default; + + virtual void copy() override; + virtual void add() override; + virtual void mul() override; + virtual void triad() override; + virtual void nstream() override; + virtual T dot() override; + + virtual void init_arrays(T initA, T initB, T initC) override; + virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; + +}; + diff --git a/src/thrust/model.cmake b/src/thrust/model.cmake new file mode 100644 index 0000000..0c286c2 --- /dev/null +++ b/src/thrust/model.cmake @@ -0,0 +1,91 @@ + +register_flag_optional(THRUST_IMPL + "Which Thrust implementation to use, supported options include: + - CUDA (via https://github.com/NVIDIA/thrust) + - ROCM (via https://github.com/ROCmSoftwarePlatform/rocThrust) + " + "CUDA") + +register_flag_optional(SDK_DIR + "Path to the selected Thrust implementation (e.g `/opt/nvidia/hpc_sdk/Linux_x86_64/21.9/cuda/include` for NVHPC, `/opt/rocm` for ROCm)" + "") + +register_flag_optional(BACKEND + "[THRUST_IMPL==CUDA] CUDA's Thrust implementation supports the following backends: + - CUDA + - OMP + - TBB + " + "CUDA") + +register_flag_optional(CMAKE_CUDA_COMPILER + "[THRUST_IMPL==CUDA] Path to the CUDA nvcc compiler" + "") + +# XXX we may want to drop this eventually and use CMAKE_CUDA_ARCHITECTURES directly +register_flag_optional(CUDA_ARCH + "[THRUST_IMPL==CUDA] Nvidia architecture, will be passed in via `-arch=` (e.g `sm_70`) for nvcc" + "") + 
+register_flag_optional(CUDA_EXTRA_FLAGS + "[THRUST_IMPL==CUDA] Additional CUDA flags passed to nvcc, this is appended after `CUDA_ARCH`" + "") + + +macro(setup) + set(CMAKE_CXX_STANDARD 14) + + if (${THRUST_IMPL} STREQUAL "CUDA") + + # see CUDA.cmake, we're only adding a few Thrust related libraries here + + if (POLICY CMP0104) + cmake_policy(SET CMP0104 OLD) + endif () + + # add -forward-unknown-to-host-compiler for compatibility reasons + set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "--expt-extended-lambda -forward-unknown-to-host-compiler -arch=${CUDA_ARCH}" ${CUDA_EXTRA_FLAGS}) + enable_language(CUDA) + # CMake defaults to -O2 for CUDA at Release, let's wipe that and use the global RELEASE_FLAG + # appended later + wipe_gcc_style_optimisation_flags(CMAKE_CUDA_FLAGS_${BUILD_TYPE}) + + message(STATUS "NVCC flags: ${CMAKE_CUDA_FLAGS} ${CMAKE_CUDA_FLAGS_${BUILD_TYPE}}") + + + if (SDK_DIR) + find_package(CUB REQUIRED CONFIG PATHS ${SDK_DIR}/cub) + find_package(Thrust REQUIRED CONFIG PATHS ${SDK_DIR}/thrust) + else () + find_package(CUB REQUIRED CONFIG) + find_package(Thrust REQUIRED CONFIG) + endif () + + message(STATUS "Using Thrust backend: ${BACKEND}") + + # this creates the interface that we can link to + thrust_create_target(Thrust HOST CPP DEVICE ${BACKEND}) + + register_link_library(Thrust) + elseif (${THRUST_IMPL} STREQUAL "ROCM") + if (SDK_DIR) + find_package(rocprim REQUIRED CONFIG PATHS ${SDK_DIR}/rocprim) + find_package(rocthrust REQUIRED CONFIG PATHS ${SDK_DIR}/rocthrust) + else () + find_package(rocprim REQUIRED CONFIG) + find_package(rocthrust REQUIRED CONFIG) + endif () + + # for HIP we treat *.cu files as CXX otherwise CMake doesn't compile them + set_source_files_properties(${IMPL_SOURCES} PROPERTIES LANGUAGE CXX) + + register_link_library(roc::rocthrust) + else () + message(FATAL_ERROR "Unsupported THRUST_IMPL provided: ${THRUST_IMPL}") + endif () + + +endmacro() + + + \ No newline at end of file