Merge branch 'main' into std-next

Tom Lin 2021-12-15 20:18:09 +00:00
commit 881c0cc706
172 changed files with 11423 additions and 8522 deletions


@@ -1,98 +1,170 @@
name: CI
on: [push, pull_request]
on:
push:
pull_request:
workflow_dispatch:
inputs:
debug_enabled:
description: 'Run the build with tmate debugging enabled (https://github.com/marketplace/actions/debugging-with-tmate)'
required: false
default: false
jobs:
test:
test-rust:
runs-on: ubuntu-18.04
defaults:
run:
working-directory: ./src/rust/rust-stream
steps:
- uses: actions/checkout@v2
- name: Setup project
run: rustup install nightly
- name: Compile project
run: cargo +nightly build --release
- name: Test project
run: cargo +nightly test --release
- name: Test run project
run: ./target/release/rust-stream --arraysize 2048
test-java:
runs-on: ubuntu-18.04
defaults:
run:
working-directory: ./src/java/java-stream
steps:
- uses: actions/checkout@v2
- name: Test build project
run: ./mvnw clean package
- name: Test run
if: ${{ ! cancelled() }}
run: java -jar target/java-stream.jar --arraysize 2048
test-julia:
runs-on: ubuntu-18.04
defaults:
run:
working-directory: ./src/julia/JuliaStream.jl
steps:
- uses: actions/checkout@v2
- name: Setup project
run: julia --project -e 'import Pkg; Pkg.instantiate()'
- name: Test run PlainStream.jl
if: ${{ ! cancelled() }}
run: julia --project src/PlainStream.jl --arraysize 2048
- name: Test run ThreadedStream.jl
if: ${{ ! cancelled() }}
run: julia --threads 2 --project src/ThreadedStream.jl --arraysize 2048
- name: Test run DistributedStream.jl (no flag)
if: ${{ ! cancelled() }}
run: julia --project src/DistributedStream.jl --arraysize 2048
- name: Test run DistributedStream.jl (-p 2)
if: ${{ ! cancelled() }}
run: julia -p 2 --project src/DistributedStream.jl --arraysize 2048
- name: Test run CUDAStream.jl
if: ${{ ! cancelled() }}
run: julia --project src/CUDAStream.jl --list
- name: Test run AMDGPUStream.jl
if: ${{ ! cancelled() }}
run: julia --project src/AMDGPUStream.jl --list
test-cpp:
runs-on: ubuntu-18.04
steps:
- uses: actions/checkout@v2
- name: Cache compiler
if: ${{ !env.ACT }}
id: prepare-compilers
uses: actions/cache@v2
with:
path: compilers
key: ${{ runner.os }}-${{ hashFiles('ci-prepare-bionic.sh') }}
path: ./compilers
key: ${{ runner.os }}-${{ hashFiles('./src/ci-prepare-bionic.sh') }}
- name: Prepare compilers
if: steps.prepare-compilers.outputs.cache-hit != 'true'
run: source ./ci-prepare-bionic.sh ./compilers SETUP true || true
run: source ./src/ci-prepare-bionic.sh ./compilers SETUP true || true
- name: Setup test environment
run: source ./ci-prepare-bionic.sh ./compilers VARS false || true
run: source ./src/ci-prepare-bionic.sh ./compilers VARS false || true
# Enable tmate debugging of manually-triggered workflows if the input option was provided
- name: Setup tmate session
uses: mxschmitt/action-tmate@v3
if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.debug_enabled }}
- name: Test compile gcc @ CMake 3.13
if: ${{ ! cancelled() }}
run: ./ci-test-compile.sh ./build gcc all ${{ env.CMAKE_3_13_BIN }}
run: ./src/ci-test-compile.sh ./build gcc all ${{ env.CMAKE_3_13_BIN }}
- name: Test compile clang @ CMake 3.13
if: ${{ ! cancelled() }}
run: ./ci-test-compile.sh ./build clang all ${{ env.CMAKE_3_13_BIN }}
run: ./src/ci-test-compile.sh ./build clang all ${{ env.CMAKE_3_13_BIN }}
- name: Test compile nvhpc @ CMake 3.13
if: ${{ ! cancelled() }}
run: ./ci-test-compile.sh ./build nvhpc all ${{ env.CMAKE_3_13_BIN }}
run: ./src/ci-test-compile.sh ./build nvhpc all ${{ env.CMAKE_3_13_BIN }}
- name: Test compile aocc @ CMake 3.13
if: ${{ ! cancelled() }}
run: ./ci-test-compile.sh ./build aocc all ${{ env.CMAKE_3_13_BIN }}
run: ./src/ci-test-compile.sh ./build aocc all ${{ env.CMAKE_3_13_BIN }}
- name: Test compile aomp @ CMake 3.13
if: ${{ ! cancelled() }}
run: ./ci-test-compile.sh ./build aomp all ${{ env.CMAKE_3_13_BIN }}
run: ./src/ci-test-compile.sh ./build aomp all ${{ env.CMAKE_3_13_BIN }}
- name: Test compile hip @ CMake 3.13
if: ${{ ! cancelled() }}
run: ./ci-test-compile.sh ./build hip all ${{ env.CMAKE_3_13_BIN }}
run: ./src/ci-test-compile.sh ./build hip all ${{ env.CMAKE_3_13_BIN }}
- name: Test compile dpcpp @ CMake 3.13
if: ${{ ! cancelled() }}
run: ./ci-test-compile.sh ./build dpcpp all ${{ env.CMAKE_3_13_BIN }}
run: ./src/ci-test-compile.sh ./build dpcpp all ${{ env.CMAKE_3_13_BIN }}
- name: Test compile hipsycl @ CMake 3.13
if: ${{ ! cancelled() }}
run: ./ci-test-compile.sh ./build hipsycl all ${{ env.CMAKE_3_13_BIN }}
run: ./src/ci-test-compile.sh ./build hipsycl all ${{ env.CMAKE_3_13_BIN }}
- name: Test compile gcc @ CMake 3.15
if: ${{ ! cancelled() }}
run: ./ci-test-compile.sh ./build gcc all ${{ env.CMAKE_3_15_BIN }}
run: ./src/ci-test-compile.sh ./build gcc all ${{ env.CMAKE_3_15_BIN }}
- name: Test compile clang @ CMake 3.15
if: ${{ ! cancelled() }}
run: ./ci-test-compile.sh ./build clang all ${{ env.CMAKE_3_15_BIN }}
run: ./src/ci-test-compile.sh ./build clang all ${{ env.CMAKE_3_15_BIN }}
- name: Test compile nvhpc @ CMake 3.15
if: ${{ ! cancelled() }}
run: ./ci-test-compile.sh ./build nvhpc all ${{ env.CMAKE_3_15_BIN }}
run: ./src/ci-test-compile.sh ./build nvhpc all ${{ env.CMAKE_3_15_BIN }}
- name: Test compile aocc @ CMake 3.15
if: ${{ ! cancelled() }}
run: ./ci-test-compile.sh ./build aocc all ${{ env.CMAKE_3_15_BIN }}
run: ./src/ci-test-compile.sh ./build aocc all ${{ env.CMAKE_3_15_BIN }}
- name: Test compile aomp @ CMake 3.15
if: ${{ ! cancelled() }}
run: ./ci-test-compile.sh ./build aomp all ${{ env.CMAKE_3_15_BIN }}
run: ./src/ci-test-compile.sh ./build aomp all ${{ env.CMAKE_3_15_BIN }}
- name: Test compile hip @ CMake 3.15
if: ${{ ! cancelled() }}
run: ./ci-test-compile.sh ./build hip all ${{ env.CMAKE_3_15_BIN }}
run: ./src/ci-test-compile.sh ./build hip all ${{ env.CMAKE_3_15_BIN }}
- name: Test compile dpcpp @ CMake 3.15
if: ${{ ! cancelled() }}
run: ./ci-test-compile.sh ./build dpcpp all ${{ env.CMAKE_3_15_BIN }}
run: ./src/ci-test-compile.sh ./build dpcpp all ${{ env.CMAKE_3_15_BIN }}
- name: Test compile hipsycl @ CMake 3.15
if: ${{ ! cancelled() }}
run: ./ci-test-compile.sh ./build hipsycl all ${{ env.CMAKE_3_15_BIN }}
run: ./src/ci-test-compile.sh ./build hipsycl all ${{ env.CMAKE_3_15_BIN }}
- name: Test compile gcc @ CMake 3.18
if: ${{ ! cancelled() }}
run: ./ci-test-compile.sh ./build gcc all ${{ env.CMAKE_3_18_BIN }}
run: ./src/ci-test-compile.sh ./build gcc all ${{ env.CMAKE_3_18_BIN }}
- name: Test compile clang @ CMake 3.18
if: ${{ ! cancelled() }}
run: ./ci-test-compile.sh ./build clang all ${{ env.CMAKE_3_18_BIN }}
run: ./src/ci-test-compile.sh ./build clang all ${{ env.CMAKE_3_18_BIN }}
- name: Test compile nvhpc @ CMake 3.18
if: ${{ ! cancelled() }}
run: ./ci-test-compile.sh ./build nvhpc all ${{ env.CMAKE_3_18_BIN }}
run: ./src/ci-test-compile.sh ./build nvhpc all ${{ env.CMAKE_3_18_BIN }}
- name: Test compile aocc @ CMake 3.18
if: ${{ ! cancelled() }}
run: ./ci-test-compile.sh ./build aocc all ${{ env.CMAKE_3_18_BIN }}
run: ./src/ci-test-compile.sh ./build aocc all ${{ env.CMAKE_3_18_BIN }}
- name: Test compile aomp @ CMake 3.18
if: ${{ ! cancelled() }}
run: ./ci-test-compile.sh ./build aomp all ${{ env.CMAKE_3_18_BIN }}
run: ./src/ci-test-compile.sh ./build aomp all ${{ env.CMAKE_3_18_BIN }}
- name: Test compile hip @ CMake 3.18
if: ${{ ! cancelled() }}
run: ./ci-test-compile.sh ./build hip all ${{ env.CMAKE_3_18_BIN }}
run: ./src/ci-test-compile.sh ./build hip all ${{ env.CMAKE_3_18_BIN }}
- name: Test compile dpcpp @ CMake 3.18
if: ${{ ! cancelled() }}
run: ./ci-test-compile.sh ./build dpcpp all ${{ env.CMAKE_3_18_BIN }}
run: ./src/ci-test-compile.sh ./build dpcpp all ${{ env.CMAKE_3_18_BIN }}
- name: Test compile hipsycl @ CMake 3.18
if: ${{ ! cancelled() }}
run: ./ci-test-compile.sh ./build hipsycl all ${{ env.CMAKE_3_18_BIN }}
run: ./src/ci-test-compile.sh ./build hipsycl all ${{ env.CMAKE_3_18_BIN }}

.gitignore (2 changed lines)

@@ -28,4 +28,4 @@ cmake-build-*/
CMakeFiles/
.idea/
.vscode/
.directory
.directory


@@ -14,7 +14,7 @@ All notable changes to this project will be documented in this file.
- Added nstream kernel from PRK with associated command line option.
- CMake build system added for all models.
- SYCL device check for FP64 support.
- New implementation using TBB.
- New implementations: TBB, Thrust, Julia, Scala, Java.
- Compiler options for Fujitsu added to OpenMP.
### Changed
@@ -33,12 +33,14 @@ All notable changes to this project will be documented in this file.
- Normalise sum result by expected value to help false negative errors.
- HC version deprecated and moved to a legacy directory.
- Update RAJA to v0.13.0 (w/ code changes as this is a source incompatible update).
- Update SYCL version to SYCL 2020.
### Removed
- Pre-building of kernels in SYCL version to ensure compatibility with SYCL 1.2.1.
Pre-building kernels is also not required, and shows no overhead as the first iteration is not timed.
- OpenACC Cray compiler flags.
- Build support for Kokkos 2.x (No code changes made).
- All Makefiles; build system will now use CMake exclusively.
## [v3.4] - 2019-04-10

CL/cl.h (1902 changed lines): file diff suppressed because it is too large.

CL/cl_d3d10.h

@@ -1,117 +0,0 @@
/*******************************************************************************
* Copyright (c) 2008-2020 The Khronos Group Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
#ifndef __OPENCL_CL_D3D10_H
#define __OPENCL_CL_D3D10_H
#include <d3d10.h>
#include <CL/cl.h>
#include <CL/cl_platform.h>
#ifdef __cplusplus
extern "C" {
#endif
/******************************************************************************
* cl_khr_d3d10_sharing */
#define cl_khr_d3d10_sharing 1
typedef cl_uint cl_d3d10_device_source_khr;
typedef cl_uint cl_d3d10_device_set_khr;
/******************************************************************************/
/* Error Codes */
#define CL_INVALID_D3D10_DEVICE_KHR -1002
#define CL_INVALID_D3D10_RESOURCE_KHR -1003
#define CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR -1004
#define CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR -1005
/* cl_d3d10_device_source_nv */
#define CL_D3D10_DEVICE_KHR 0x4010
#define CL_D3D10_DXGI_ADAPTER_KHR 0x4011
/* cl_d3d10_device_set_nv */
#define CL_PREFERRED_DEVICES_FOR_D3D10_KHR 0x4012
#define CL_ALL_DEVICES_FOR_D3D10_KHR 0x4013
/* cl_context_info */
#define CL_CONTEXT_D3D10_DEVICE_KHR 0x4014
#define CL_CONTEXT_D3D10_PREFER_SHARED_RESOURCES_KHR 0x402C
/* cl_mem_info */
#define CL_MEM_D3D10_RESOURCE_KHR 0x4015
/* cl_image_info */
#define CL_IMAGE_D3D10_SUBRESOURCE_KHR 0x4016
/* cl_command_type */
#define CL_COMMAND_ACQUIRE_D3D10_OBJECTS_KHR 0x4017
#define CL_COMMAND_RELEASE_D3D10_OBJECTS_KHR 0x4018
/******************************************************************************/
typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromD3D10KHR_fn)(
cl_platform_id platform,
cl_d3d10_device_source_khr d3d_device_source,
void * d3d_object,
cl_d3d10_device_set_khr d3d_device_set,
cl_uint num_entries,
cl_device_id * devices,
cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10BufferKHR_fn)(
cl_context context,
cl_mem_flags flags,
ID3D10Buffer * resource,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10Texture2DKHR_fn)(
cl_context context,
cl_mem_flags flags,
ID3D10Texture2D * resource,
UINT subresource,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10Texture3DKHR_fn)(
cl_context context,
cl_mem_flags flags,
ID3D10Texture3D * resource,
UINT subresource,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D10ObjectsKHR_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D10ObjectsKHR_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_0;
#ifdef __cplusplus
}
#endif
#endif /* __OPENCL_CL_D3D10_H */
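For illustration only: this header declares _fn function-pointer typedefs rather than exported functions, so an application would normally resolve the entry point by name at run time. A minimal sketch, assuming an OpenCL 1.2 platform that advertises cl_khr_d3d10_sharing (lookup_d3d10_entry is a hypothetical helper, not part of this header):

#include <CL/cl.h>
#include <CL/cl_d3d10.h>

/* Sketch: resolve the cl_khr_d3d10_sharing device-query entry point at run time. */
static clGetDeviceIDsFromD3D10KHR_fn lookup_d3d10_entry(cl_platform_id platform)
{
    return (clGetDeviceIDsFromD3D10KHR_fn)
        clGetExtensionFunctionAddressForPlatform(platform, "clGetDeviceIDsFromD3D10KHR");
}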

CL/cl_d3d11.h

@@ -1,117 +0,0 @@
/*******************************************************************************
* Copyright (c) 2008-2020 The Khronos Group Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
#ifndef __OPENCL_CL_D3D11_H
#define __OPENCL_CL_D3D11_H
#include <d3d11.h>
#include <CL/cl.h>
#include <CL/cl_platform.h>
#ifdef __cplusplus
extern "C" {
#endif
/******************************************************************************
* cl_khr_d3d11_sharing */
#define cl_khr_d3d11_sharing 1
typedef cl_uint cl_d3d11_device_source_khr;
typedef cl_uint cl_d3d11_device_set_khr;
/******************************************************************************/
/* Error Codes */
#define CL_INVALID_D3D11_DEVICE_KHR -1006
#define CL_INVALID_D3D11_RESOURCE_KHR -1007
#define CL_D3D11_RESOURCE_ALREADY_ACQUIRED_KHR -1008
#define CL_D3D11_RESOURCE_NOT_ACQUIRED_KHR -1009
/* cl_d3d11_device_source */
#define CL_D3D11_DEVICE_KHR 0x4019
#define CL_D3D11_DXGI_ADAPTER_KHR 0x401A
/* cl_d3d11_device_set */
#define CL_PREFERRED_DEVICES_FOR_D3D11_KHR 0x401B
#define CL_ALL_DEVICES_FOR_D3D11_KHR 0x401C
/* cl_context_info */
#define CL_CONTEXT_D3D11_DEVICE_KHR 0x401D
#define CL_CONTEXT_D3D11_PREFER_SHARED_RESOURCES_KHR 0x402D
/* cl_mem_info */
#define CL_MEM_D3D11_RESOURCE_KHR 0x401E
/* cl_image_info */
#define CL_IMAGE_D3D11_SUBRESOURCE_KHR 0x401F
/* cl_command_type */
#define CL_COMMAND_ACQUIRE_D3D11_OBJECTS_KHR 0x4020
#define CL_COMMAND_RELEASE_D3D11_OBJECTS_KHR 0x4021
/******************************************************************************/
typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromD3D11KHR_fn)(
cl_platform_id platform,
cl_d3d11_device_source_khr d3d_device_source,
void * d3d_object,
cl_d3d11_device_set_khr d3d_device_set,
cl_uint num_entries,
cl_device_id * devices,
cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11BufferKHR_fn)(
cl_context context,
cl_mem_flags flags,
ID3D11Buffer * resource,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11Texture2DKHR_fn)(
cl_context context,
cl_mem_flags flags,
ID3D11Texture2D * resource,
UINT subresource,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11Texture3DKHR_fn)(
cl_context context,
cl_mem_flags flags,
ID3D11Texture3D * resource,
UINT subresource,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D11ObjectsKHR_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D11ObjectsKHR_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_2;
#ifdef __cplusplus
}
#endif
#endif /* __OPENCL_CL_D3D11_H */

CL/cl_dx9_media_sharing.h

@@ -1,118 +0,0 @@
/*******************************************************************************
* Copyright (c) 2008-2020 The Khronos Group Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
#ifndef __OPENCL_CL_DX9_MEDIA_SHARING_H
#define __OPENCL_CL_DX9_MEDIA_SHARING_H
#include <CL/cl.h>
#include <CL/cl_platform.h>
#ifdef __cplusplus
extern "C" {
#endif
/******************************************************************************/
/* cl_khr_dx9_media_sharing */
#define cl_khr_dx9_media_sharing 1
typedef cl_uint cl_dx9_media_adapter_type_khr;
typedef cl_uint cl_dx9_media_adapter_set_khr;
#if defined(_WIN32)
#include <d3d9.h>
typedef struct _cl_dx9_surface_info_khr
{
IDirect3DSurface9 *resource;
HANDLE shared_handle;
} cl_dx9_surface_info_khr;
#endif
/******************************************************************************/
/* Error Codes */
#define CL_INVALID_DX9_MEDIA_ADAPTER_KHR -1010
#define CL_INVALID_DX9_MEDIA_SURFACE_KHR -1011
#define CL_DX9_MEDIA_SURFACE_ALREADY_ACQUIRED_KHR -1012
#define CL_DX9_MEDIA_SURFACE_NOT_ACQUIRED_KHR -1013
/* cl_media_adapter_type_khr */
#define CL_ADAPTER_D3D9_KHR 0x2020
#define CL_ADAPTER_D3D9EX_KHR 0x2021
#define CL_ADAPTER_DXVA_KHR 0x2022
/* cl_media_adapter_set_khr */
#define CL_PREFERRED_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR 0x2023
#define CL_ALL_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR 0x2024
/* cl_context_info */
#define CL_CONTEXT_ADAPTER_D3D9_KHR 0x2025
#define CL_CONTEXT_ADAPTER_D3D9EX_KHR 0x2026
#define CL_CONTEXT_ADAPTER_DXVA_KHR 0x2027
/* cl_mem_info */
#define CL_MEM_DX9_MEDIA_ADAPTER_TYPE_KHR 0x2028
#define CL_MEM_DX9_MEDIA_SURFACE_INFO_KHR 0x2029
/* cl_image_info */
#define CL_IMAGE_DX9_MEDIA_PLANE_KHR 0x202A
/* cl_command_type */
#define CL_COMMAND_ACQUIRE_DX9_MEDIA_SURFACES_KHR 0x202B
#define CL_COMMAND_RELEASE_DX9_MEDIA_SURFACES_KHR 0x202C
/******************************************************************************/
typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromDX9MediaAdapterKHR_fn)(
cl_platform_id platform,
cl_uint num_media_adapters,
cl_dx9_media_adapter_type_khr * media_adapter_type,
void * media_adapters,
cl_dx9_media_adapter_set_khr media_adapter_set,
cl_uint num_entries,
cl_device_id * devices,
cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromDX9MediaSurfaceKHR_fn)(
cl_context context,
cl_mem_flags flags,
cl_dx9_media_adapter_type_khr adapter_type,
void * surface_info,
cl_uint plane,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireDX9MediaSurfacesKHR_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseDX9MediaSurfacesKHR_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_2;
#ifdef __cplusplus
}
#endif
#endif /* __OPENCL_CL_DX9_MEDIA_SHARING_H */

CL/cl_dx9_media_sharing_intel.h

@@ -1,170 +0,0 @@
/*******************************************************************************
* Copyright (c) 2008-2020 The Khronos Group Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
/*****************************************************************************\
Copyright (c) 2013-2019 Intel Corporation All Rights Reserved.
THESE MATERIALS ARE PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THESE
MATERIALS, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
File Name: cl_dx9_media_sharing_intel.h
Abstract:
Notes:
\*****************************************************************************/
#ifndef __OPENCL_CL_DX9_MEDIA_SHARING_INTEL_H
#define __OPENCL_CL_DX9_MEDIA_SHARING_INTEL_H
#include <CL/cl.h>
#include <CL/cl_platform.h>
#include <d3d9.h>
#include <dxvahd.h>
#include <wtypes.h>
#include <d3d9types.h>
#ifdef __cplusplus
extern "C" {
#endif
/***************************************
* cl_intel_dx9_media_sharing extension *
****************************************/
#define cl_intel_dx9_media_sharing 1
typedef cl_uint cl_dx9_device_source_intel;
typedef cl_uint cl_dx9_device_set_intel;
/* error codes */
#define CL_INVALID_DX9_DEVICE_INTEL -1010
#define CL_INVALID_DX9_RESOURCE_INTEL -1011
#define CL_DX9_RESOURCE_ALREADY_ACQUIRED_INTEL -1012
#define CL_DX9_RESOURCE_NOT_ACQUIRED_INTEL -1013
/* cl_dx9_device_source_intel */
#define CL_D3D9_DEVICE_INTEL 0x4022
#define CL_D3D9EX_DEVICE_INTEL 0x4070
#define CL_DXVA_DEVICE_INTEL 0x4071
/* cl_dx9_device_set_intel */
#define CL_PREFERRED_DEVICES_FOR_DX9_INTEL 0x4024
#define CL_ALL_DEVICES_FOR_DX9_INTEL 0x4025
/* cl_context_info */
#define CL_CONTEXT_D3D9_DEVICE_INTEL 0x4026
#define CL_CONTEXT_D3D9EX_DEVICE_INTEL 0x4072
#define CL_CONTEXT_DXVA_DEVICE_INTEL 0x4073
/* cl_mem_info */
#define CL_MEM_DX9_RESOURCE_INTEL 0x4027
#define CL_MEM_DX9_SHARED_HANDLE_INTEL 0x4074
/* cl_image_info */
#define CL_IMAGE_DX9_PLANE_INTEL 0x4075
/* cl_command_type */
#define CL_COMMAND_ACQUIRE_DX9_OBJECTS_INTEL 0x402A
#define CL_COMMAND_RELEASE_DX9_OBJECTS_INTEL 0x402B
/******************************************************************************/
extern CL_API_ENTRY cl_int CL_API_CALL
clGetDeviceIDsFromDX9INTEL(
cl_platform_id platform,
cl_dx9_device_source_intel dx9_device_source,
void* dx9_object,
cl_dx9_device_set_intel dx9_device_set,
cl_uint num_entries,
cl_device_id* devices,
cl_uint* num_devices) CL_EXT_SUFFIX__VERSION_1_1;
typedef CL_API_ENTRY cl_int (CL_API_CALL* clGetDeviceIDsFromDX9INTEL_fn)(
cl_platform_id platform,
cl_dx9_device_source_intel dx9_device_source,
void* dx9_object,
cl_dx9_device_set_intel dx9_device_set,
cl_uint num_entries,
cl_device_id* devices,
cl_uint* num_devices) CL_EXT_SUFFIX__VERSION_1_1;
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateFromDX9MediaSurfaceINTEL(
cl_context context,
cl_mem_flags flags,
IDirect3DSurface9* resource,
HANDLE sharedHandle,
UINT plane,
cl_int* errcode_ret) CL_EXT_SUFFIX__VERSION_1_1;
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromDX9MediaSurfaceINTEL_fn)(
cl_context context,
cl_mem_flags flags,
IDirect3DSurface9* resource,
HANDLE sharedHandle,
UINT plane,
cl_int* errcode_ret) CL_EXT_SUFFIX__VERSION_1_1;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueAcquireDX9ObjectsINTEL(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem* mem_objects,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event) CL_EXT_SUFFIX__VERSION_1_1;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireDX9ObjectsINTEL_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem* mem_objects,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event) CL_EXT_SUFFIX__VERSION_1_1;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueReleaseDX9ObjectsINTEL(
cl_command_queue command_queue,
cl_uint num_objects,
cl_mem* mem_objects,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event) CL_EXT_SUFFIX__VERSION_1_1;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseDX9ObjectsINTEL_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
cl_mem* mem_objects,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event) CL_EXT_SUFFIX__VERSION_1_1;
#ifdef __cplusplus
}
#endif
#endif /* __OPENCL_CL_DX9_MEDIA_SHARING_INTEL_H */

CL/cl_egl.h

@@ -1,120 +0,0 @@
/*******************************************************************************
* Copyright (c) 2008-2020 The Khronos Group Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
#ifndef __OPENCL_CL_EGL_H
#define __OPENCL_CL_EGL_H
#include <CL/cl.h>
#ifdef __cplusplus
extern "C" {
#endif
/* Command type for events created with clEnqueueAcquireEGLObjectsKHR */
#define CL_COMMAND_EGL_FENCE_SYNC_OBJECT_KHR 0x202F
#define CL_COMMAND_ACQUIRE_EGL_OBJECTS_KHR 0x202D
#define CL_COMMAND_RELEASE_EGL_OBJECTS_KHR 0x202E
/* Error type for clCreateFromEGLImageKHR */
#define CL_INVALID_EGL_OBJECT_KHR -1093
#define CL_EGL_RESOURCE_NOT_ACQUIRED_KHR -1092
/* CLeglImageKHR is an opaque handle to an EGLImage */
typedef void* CLeglImageKHR;
/* CLeglDisplayKHR is an opaque handle to an EGLDisplay */
typedef void* CLeglDisplayKHR;
/* CLeglSyncKHR is an opaque handle to an EGLSync object */
typedef void* CLeglSyncKHR;
/* properties passed to clCreateFromEGLImageKHR */
typedef intptr_t cl_egl_image_properties_khr;
#define cl_khr_egl_image 1
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateFromEGLImageKHR(cl_context context,
CLeglDisplayKHR egldisplay,
CLeglImageKHR eglimage,
cl_mem_flags flags,
const cl_egl_image_properties_khr * properties,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromEGLImageKHR_fn)(
cl_context context,
CLeglDisplayKHR egldisplay,
CLeglImageKHR eglimage,
cl_mem_flags flags,
const cl_egl_image_properties_khr * properties,
cl_int * errcode_ret);
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueAcquireEGLObjectsKHR(cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireEGLObjectsKHR_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event);
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueReleaseEGLObjectsKHR(cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseEGLObjectsKHR_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event);
#define cl_khr_egl_event 1
extern CL_API_ENTRY cl_event CL_API_CALL
clCreateEventFromEGLSyncKHR(cl_context context,
CLeglSyncKHR sync,
CLeglDisplayKHR display,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_event (CL_API_CALL *clCreateEventFromEGLSyncKHR_fn)(
cl_context context,
CLeglSyncKHR sync,
CLeglDisplayKHR display,
cl_int * errcode_ret);
#ifdef __cplusplus
}
#endif
#endif /* __OPENCL_CL_EGL_H */

CL/cl_ext.h

@@ -1,841 +0,0 @@
/*******************************************************************************
* Copyright (c) 2008-2020 The Khronos Group Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
/* cl_ext.h contains OpenCL extensions which don't have external */
/* (OpenGL, D3D) dependencies. */
#ifndef __CL_EXT_H
#define __CL_EXT_H
#ifdef __cplusplus
extern "C" {
#endif
#include <CL/cl.h>
/* cl_khr_fp64 extension - no extension #define since it has no functions */
/* CL_DEVICE_DOUBLE_FP_CONFIG is defined in CL.h for OpenCL >= 120 */
#if CL_TARGET_OPENCL_VERSION <= 110
#define CL_DEVICE_DOUBLE_FP_CONFIG 0x1032
#endif
/* cl_khr_fp16 extension - no extension #define since it has no functions */
#define CL_DEVICE_HALF_FP_CONFIG 0x1033
/* Memory object destruction
*
* Apple extension for use to manage externally allocated buffers used with cl_mem objects with CL_MEM_USE_HOST_PTR
*
* Registers a user callback function that will be called when the memory object is deleted and its resources
* freed. Each call to clSetMemObjectCallbackFn registers the specified user callback function on a callback
* stack associated with memobj. The registered user callback functions are called in the reverse order in
* which they were registered. The user callback functions are called and then the memory object is deleted
* and its resources freed. This provides a mechanism for the application (and libraries) using memobj to be
* notified when the memory referenced by host_ptr, specified when the memory object is created and used as
* the storage bits for the memory object, can be reused or freed.
*
* The application may not call CL api's with the cl_mem object passed to the pfn_notify.
*
* Please check for the "cl_APPLE_SetMemObjectDestructor" extension using clGetDeviceInfo(CL_DEVICE_EXTENSIONS)
* before using.
*/
#define cl_APPLE_SetMemObjectDestructor 1
cl_int CL_API_ENTRY clSetMemObjectDestructorAPPLE( cl_mem memobj,
void (* pfn_notify)(cl_mem memobj, void * user_data),
void * user_data) CL_EXT_SUFFIX__VERSION_1_0;
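For illustration, a minimal sketch of registering such a destructor callback, assuming the buffer was created with CL_MEM_USE_HOST_PTR over malloc'd storage and the device reports cl_APPLE_SetMemObjectDestructor (watch_buffer and release_host_storage are hypothetical names):

#include <stdlib.h>
#include <CL/cl.h>
#include <CL/cl_ext.h>

/* Called just before the cl_mem is destroyed; callbacks run in reverse
 * registration order, and the cl_mem must not be passed back into CL API calls. */
static void release_host_storage(cl_mem memobj, void *user_data)
{
    (void)memobj;
    free(user_data); /* user_data is the host_ptr that backs the buffer */
}

static cl_int watch_buffer(cl_mem buffer, void *host_ptr)
{
    return clSetMemObjectDestructorAPPLE(buffer, release_host_storage, host_ptr);
}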
/* Context Logging Functions
*
* The next three convenience functions are intended to be used as the pfn_notify parameter to clCreateContext().
* Please check for the "cl_APPLE_ContextLoggingFunctions" extension using clGetDeviceInfo(CL_DEVICE_EXTENSIONS)
* before using.
*
* clLogMessagesToSystemLog forwards on all log messages to the Apple System Logger
*/
#define cl_APPLE_ContextLoggingFunctions 1
extern void CL_API_ENTRY clLogMessagesToSystemLogAPPLE( const char * errstr,
const void * private_info,
size_t cb,
void * user_data) CL_EXT_SUFFIX__VERSION_1_0;
/* clLogMessagesToStdout sends all log messages to the file descriptor stdout */
extern void CL_API_ENTRY clLogMessagesToStdoutAPPLE( const char * errstr,
const void * private_info,
size_t cb,
void * user_data) CL_EXT_SUFFIX__VERSION_1_0;
/* clLogMessagesToStderr sends all log messages to the file descriptor stderr */
extern void CL_API_ENTRY clLogMessagesToStderrAPPLE( const char * errstr,
const void * private_info,
size_t cb,
void * user_data) CL_EXT_SUFFIX__VERSION_1_0;
/************************
* cl_khr_icd extension *
************************/
#define cl_khr_icd 1
/* cl_platform_info */
#define CL_PLATFORM_ICD_SUFFIX_KHR 0x0920
/* Additional Error Codes */
#define CL_PLATFORM_NOT_FOUND_KHR -1001
extern CL_API_ENTRY cl_int CL_API_CALL
clIcdGetPlatformIDsKHR(cl_uint num_entries,
cl_platform_id * platforms,
cl_uint * num_platforms);
typedef CL_API_ENTRY cl_int
(CL_API_CALL *clIcdGetPlatformIDsKHR_fn)(cl_uint num_entries,
cl_platform_id * platforms,
cl_uint * num_platforms);
/*******************************
* cl_khr_il_program extension *
*******************************/
#define cl_khr_il_program 1
/* New property to clGetDeviceInfo for retrieving supported intermediate
* languages
*/
#define CL_DEVICE_IL_VERSION_KHR 0x105B
/* New property to clGetProgramInfo for retrieving the IL of a
* program
*/
#define CL_PROGRAM_IL_KHR 0x1169
extern CL_API_ENTRY cl_program CL_API_CALL
clCreateProgramWithILKHR(cl_context context,
const void * il,
size_t length,
cl_int * errcode_ret);
typedef CL_API_ENTRY cl_program
(CL_API_CALL *clCreateProgramWithILKHR_fn)(cl_context context,
const void * il,
size_t length,
cl_int * errcode_ret) CL_EXT_SUFFIX__VERSION_1_2;
/* Extension: cl_khr_image2d_from_buffer
*
* This extension allows a 2D image to be created from a cl_mem buffer without
* a copy. The type associated with a 2D image created from a buffer in an
* OpenCL program is image2d_t. Both the sampler and sampler-less read_image
* built-in functions are supported for 2D images and 2D images created from
* a buffer. Similarly, the write_image built-ins are also supported for 2D
* images created from a buffer.
*
* When the 2D image from buffer is created, the client must specify the
* width, height, image format (i.e. channel order and channel data type)
* and optionally the row pitch.
*
* The pitch specified must be a multiple of
* CL_DEVICE_IMAGE_PITCH_ALIGNMENT_KHR pixels.
* The base address of the buffer must be aligned to
* CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT_KHR pixels.
*/
#define CL_DEVICE_IMAGE_PITCH_ALIGNMENT_KHR 0x104A
#define CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT_KHR 0x104B
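For illustration, a minimal sketch of the buffer-backed image creation described above, assuming an RGBA8 format, a read-write buffer, and a row pitch/base address already validated against the two alignment queries (image2d_from_buffer is a hypothetical helper):

#include <CL/cl.h>
#include <CL/cl_ext.h>

/* Sketch: wrap an existing cl_mem buffer as an image2d_t with no copy. */
static cl_mem image2d_from_buffer(cl_context ctx, cl_mem buffer,
                                  size_t width, size_t height,
                                  size_t row_pitch_bytes, cl_int *err)
{
    cl_image_format fmt  = { CL_RGBA, CL_UNSIGNED_INT8 }; /* channel order + data type */
    cl_image_desc   desc = { 0 };
    desc.image_type      = CL_MEM_OBJECT_IMAGE2D;
    desc.image_width     = width;
    desc.image_height    = height;
    desc.image_row_pitch = row_pitch_bytes; /* optional; 0 lets the runtime choose */
    desc.buffer          = buffer;          /* the backing buffer object */
    return clCreateImage(ctx, CL_MEM_READ_WRITE, &fmt, &desc, NULL, err);
}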
/**************************************
* cl_khr_initialize_memory extension *
**************************************/
#define CL_CONTEXT_MEMORY_INITIALIZE_KHR 0x2030
/**************************************
* cl_khr_terminate_context extension *
**************************************/
#define CL_CONTEXT_TERMINATED_KHR -1121
#define CL_DEVICE_TERMINATE_CAPABILITY_KHR 0x2031
#define CL_CONTEXT_TERMINATE_KHR 0x2032
#define cl_khr_terminate_context 1
extern CL_API_ENTRY cl_int CL_API_CALL
clTerminateContextKHR(cl_context context) CL_EXT_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_int
(CL_API_CALL *clTerminateContextKHR_fn)(cl_context context) CL_EXT_SUFFIX__VERSION_1_2;
/*
* Extension: cl_khr_spir
*
* This extension adds support to create an OpenCL program object from a
* Standard Portable Intermediate Representation (SPIR) instance
*/
#define CL_DEVICE_SPIR_VERSIONS 0x40E0
#define CL_PROGRAM_BINARY_TYPE_INTERMEDIATE 0x40E1
/*****************************************
* cl_khr_create_command_queue extension *
*****************************************/
#define cl_khr_create_command_queue 1
typedef cl_bitfield cl_queue_properties_khr;
extern CL_API_ENTRY cl_command_queue CL_API_CALL
clCreateCommandQueueWithPropertiesKHR(cl_context context,
cl_device_id device,
const cl_queue_properties_khr* properties,
cl_int* errcode_ret) CL_EXT_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_command_queue
(CL_API_CALL *clCreateCommandQueueWithPropertiesKHR_fn)(cl_context context,
cl_device_id device,
const cl_queue_properties_khr* properties,
cl_int* errcode_ret) CL_EXT_SUFFIX__VERSION_1_2;
/******************************************
* cl_nv_device_attribute_query extension *
******************************************/
/* cl_nv_device_attribute_query extension - no extension #define since it has no functions */
#define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV 0x4000
#define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV 0x4001
#define CL_DEVICE_REGISTERS_PER_BLOCK_NV 0x4002
#define CL_DEVICE_WARP_SIZE_NV 0x4003
#define CL_DEVICE_GPU_OVERLAP_NV 0x4004
#define CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV 0x4005
#define CL_DEVICE_INTEGRATED_MEMORY_NV 0x4006
/*********************************
* cl_amd_device_attribute_query *
*********************************/
#define CL_DEVICE_PROFILING_TIMER_OFFSET_AMD 0x4036
#define CL_DEVICE_TOPOLOGY_AMD 0x4037
#define CL_DEVICE_BOARD_NAME_AMD 0x4038
#define CL_DEVICE_GLOBAL_FREE_MEMORY_AMD 0x4039
#define CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD 0x4040
#define CL_DEVICE_SIMD_WIDTH_AMD 0x4041
#define CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD 0x4042
#define CL_DEVICE_WAVEFRONT_WIDTH_AMD 0x4043
#define CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD 0x4044
#define CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD 0x4045
#define CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD 0x4046
#define CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD 0x4047
#define CL_DEVICE_LOCAL_MEM_BANKS_AMD 0x4048
#define CL_DEVICE_THREAD_TRACE_SUPPORTED_AMD 0x4049
#define CL_DEVICE_GFXIP_MAJOR_AMD 0x404A
#define CL_DEVICE_GFXIP_MINOR_AMD 0x404B
#define CL_DEVICE_AVAILABLE_ASYNC_QUEUES_AMD 0x404C
#define CL_DEVICE_PREFERRED_WORK_GROUP_SIZE_AMD 0x4030
#define CL_DEVICE_MAX_WORK_GROUP_SIZE_AMD 0x4031
#define CL_DEVICE_PREFERRED_CONSTANT_BUFFER_SIZE_AMD 0x4033
#define CL_DEVICE_PCIE_ID_AMD 0x4034
/*********************************
* cl_arm_printf extension
*********************************/
#define CL_PRINTF_CALLBACK_ARM 0x40B0
#define CL_PRINTF_BUFFERSIZE_ARM 0x40B1
/***********************************
* cl_ext_device_fission extension
***********************************/
#define cl_ext_device_fission 1
extern CL_API_ENTRY cl_int CL_API_CALL
clReleaseDeviceEXT(cl_device_id device) CL_EXT_SUFFIX__VERSION_1_1;
typedef CL_API_ENTRY cl_int
(CL_API_CALL *clReleaseDeviceEXT_fn)(cl_device_id device) CL_EXT_SUFFIX__VERSION_1_1;
extern CL_API_ENTRY cl_int CL_API_CALL
clRetainDeviceEXT(cl_device_id device) CL_EXT_SUFFIX__VERSION_1_1;
typedef CL_API_ENTRY cl_int
(CL_API_CALL *clRetainDeviceEXT_fn)(cl_device_id device) CL_EXT_SUFFIX__VERSION_1_1;
typedef cl_ulong cl_device_partition_property_ext;
extern CL_API_ENTRY cl_int CL_API_CALL
clCreateSubDevicesEXT(cl_device_id in_device,
const cl_device_partition_property_ext * properties,
cl_uint num_entries,
cl_device_id * out_devices,
cl_uint * num_devices) CL_EXT_SUFFIX__VERSION_1_1;
typedef CL_API_ENTRY cl_int
(CL_API_CALL * clCreateSubDevicesEXT_fn)(cl_device_id in_device,
const cl_device_partition_property_ext * properties,
cl_uint num_entries,
cl_device_id * out_devices,
cl_uint * num_devices) CL_EXT_SUFFIX__VERSION_1_1;
/* cl_device_partition_property_ext */
#define CL_DEVICE_PARTITION_EQUALLY_EXT 0x4050
#define CL_DEVICE_PARTITION_BY_COUNTS_EXT 0x4051
#define CL_DEVICE_PARTITION_BY_NAMES_EXT 0x4052
#define CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN_EXT 0x4053
/* clDeviceGetInfo selectors */
#define CL_DEVICE_PARENT_DEVICE_EXT 0x4054
#define CL_DEVICE_PARTITION_TYPES_EXT 0x4055
#define CL_DEVICE_AFFINITY_DOMAINS_EXT 0x4056
#define CL_DEVICE_REFERENCE_COUNT_EXT 0x4057
#define CL_DEVICE_PARTITION_STYLE_EXT 0x4058
/* error codes */
#define CL_DEVICE_PARTITION_FAILED_EXT -1057
#define CL_INVALID_PARTITION_COUNT_EXT -1058
#define CL_INVALID_PARTITION_NAME_EXT -1059
/* CL_AFFINITY_DOMAINs */
#define CL_AFFINITY_DOMAIN_L1_CACHE_EXT 0x1
#define CL_AFFINITY_DOMAIN_L2_CACHE_EXT 0x2
#define CL_AFFINITY_DOMAIN_L3_CACHE_EXT 0x3
#define CL_AFFINITY_DOMAIN_L4_CACHE_EXT 0x4
#define CL_AFFINITY_DOMAIN_NUMA_EXT 0x10
#define CL_AFFINITY_DOMAIN_NEXT_FISSIONABLE_EXT 0x100
/* cl_device_partition_property_ext list terminators */
#define CL_PROPERTIES_LIST_END_EXT ((cl_device_partition_property_ext) 0)
#define CL_PARTITION_BY_COUNTS_LIST_END_EXT ((cl_device_partition_property_ext) 0)
#define CL_PARTITION_BY_NAMES_LIST_END_EXT ((cl_device_partition_property_ext) 0 - 1)
/***********************************
* cl_ext_migrate_memobject extension definitions
***********************************/
#define cl_ext_migrate_memobject 1
typedef cl_bitfield cl_mem_migration_flags_ext;
#define CL_MIGRATE_MEM_OBJECT_HOST_EXT 0x1
#define CL_COMMAND_MIGRATE_MEM_OBJECT_EXT 0x4040
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueMigrateMemObjectEXT(cl_command_queue command_queue,
cl_uint num_mem_objects,
const cl_mem * mem_objects,
cl_mem_migration_flags_ext flags,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event);
typedef CL_API_ENTRY cl_int
(CL_API_CALL *clEnqueueMigrateMemObjectEXT_fn)(cl_command_queue command_queue,
cl_uint num_mem_objects,
const cl_mem * mem_objects,
cl_mem_migration_flags_ext flags,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event);
/*********************************
* cl_qcom_ext_host_ptr extension
*********************************/
#define cl_qcom_ext_host_ptr 1
#define CL_MEM_EXT_HOST_PTR_QCOM (1 << 29)
#define CL_DEVICE_EXT_MEM_PADDING_IN_BYTES_QCOM 0x40A0
#define CL_DEVICE_PAGE_SIZE_QCOM 0x40A1
#define CL_IMAGE_ROW_ALIGNMENT_QCOM 0x40A2
#define CL_IMAGE_SLICE_ALIGNMENT_QCOM 0x40A3
#define CL_MEM_HOST_UNCACHED_QCOM 0x40A4
#define CL_MEM_HOST_WRITEBACK_QCOM 0x40A5
#define CL_MEM_HOST_WRITETHROUGH_QCOM 0x40A6
#define CL_MEM_HOST_WRITE_COMBINING_QCOM 0x40A7
typedef cl_uint cl_image_pitch_info_qcom;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetDeviceImageInfoQCOM(cl_device_id device,
size_t image_width,
size_t image_height,
const cl_image_format *image_format,
cl_image_pitch_info_qcom param_name,
size_t param_value_size,
void *param_value,
size_t *param_value_size_ret);
typedef struct _cl_mem_ext_host_ptr
{
/* Type of external memory allocation. */
/* Legal values will be defined in layered extensions. */
cl_uint allocation_type;
/* Host cache policy for this external memory allocation. */
cl_uint host_cache_policy;
} cl_mem_ext_host_ptr;
/*******************************************
* cl_qcom_ext_host_ptr_iocoherent extension
********************************************/
/* Cache policy specifying io-coherence */
#define CL_MEM_HOST_IOCOHERENT_QCOM 0x40A9
/*********************************
* cl_qcom_ion_host_ptr extension
*********************************/
#define CL_MEM_ION_HOST_PTR_QCOM 0x40A8
typedef struct _cl_mem_ion_host_ptr
{
/* Type of external memory allocation. */
/* Must be CL_MEM_ION_HOST_PTR_QCOM for ION allocations. */
cl_mem_ext_host_ptr ext_host_ptr;
/* ION file descriptor */
int ion_filedesc;
/* Host pointer to the ION allocated memory */
void* ion_hostptr;
} cl_mem_ion_host_ptr;
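For illustration, a minimal sketch of wrapping an ION allocation as a cl_mem, assuming the usual cl_qcom_ion_host_ptr pattern of passing this descriptor as the host_ptr argument of clCreateBuffer together with CL_MEM_USE_HOST_PTR | CL_MEM_EXT_HOST_PTR_QCOM (buffer_from_ion is a hypothetical helper):

#include <CL/cl.h>
#include <CL/cl_ext.h>

static cl_mem buffer_from_ion(cl_context ctx, int ion_fd, void *ion_hostptr,
                              size_t size, cl_int *err)
{
    cl_mem_ion_host_ptr ion_mem = { 0 };
    ion_mem.ext_host_ptr.allocation_type   = CL_MEM_ION_HOST_PTR_QCOM;
    ion_mem.ext_host_ptr.host_cache_policy = CL_MEM_HOST_UNCACHED_QCOM;
    ion_mem.ion_filedesc = ion_fd;      /* ION file descriptor */
    ion_mem.ion_hostptr  = ion_hostptr; /* host mapping of the ION allocation */
    return clCreateBuffer(ctx, CL_MEM_USE_HOST_PTR | CL_MEM_EXT_HOST_PTR_QCOM,
                          size, &ion_mem, err);
}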
/*********************************
* cl_qcom_android_native_buffer_host_ptr extension
*********************************/
#define CL_MEM_ANDROID_NATIVE_BUFFER_HOST_PTR_QCOM 0x40C6
typedef struct _cl_mem_android_native_buffer_host_ptr
{
/* Type of external memory allocation. */
/* Must be CL_MEM_ANDROID_NATIVE_BUFFER_HOST_PTR_QCOM for Android native buffers. */
cl_mem_ext_host_ptr ext_host_ptr;
/* Virtual pointer to the android native buffer */
void* anb_ptr;
} cl_mem_android_native_buffer_host_ptr;
/******************************************
* cl_img_yuv_image extension *
******************************************/
/* Image formats used in clCreateImage */
#define CL_NV21_IMG 0x40D0
#define CL_YV12_IMG 0x40D1
/******************************************
* cl_img_cached_allocations extension *
******************************************/
/* Flag values used by clCreateBuffer */
#define CL_MEM_USE_UNCACHED_CPU_MEMORY_IMG (1 << 26)
#define CL_MEM_USE_CACHED_CPU_MEMORY_IMG (1 << 27)
/******************************************
* cl_img_use_gralloc_ptr extension *
******************************************/
#define cl_img_use_gralloc_ptr 1
/* Flag values used by clCreateBuffer */
#define CL_MEM_USE_GRALLOC_PTR_IMG (1 << 28)
/* To be used by clGetEventInfo: */
#define CL_COMMAND_ACQUIRE_GRALLOC_OBJECTS_IMG 0x40D2
#define CL_COMMAND_RELEASE_GRALLOC_OBJECTS_IMG 0x40D3
/* Error code from clEnqueueReleaseGrallocObjectsIMG */
#define CL_GRALLOC_RESOURCE_NOT_ACQUIRED_IMG 0x40D4
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueAcquireGrallocObjectsIMG(cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_EXT_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueReleaseGrallocObjectsIMG(cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_EXT_SUFFIX__VERSION_1_2;
/*********************************
* cl_khr_subgroups extension
*********************************/
#define cl_khr_subgroups 1
#if !defined(CL_VERSION_2_1)
/* For OpenCL 2.1 and newer, cl_kernel_sub_group_info is declared in CL.h.
In hindsight, there should have been a khr suffix on this type for
the extension, but keeping it un-suffixed to maintain backwards
compatibility. */
typedef cl_uint cl_kernel_sub_group_info;
#endif
/* cl_kernel_sub_group_info */
#define CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE_KHR 0x2033
#define CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE_KHR 0x2034
extern CL_API_ENTRY cl_int CL_API_CALL
clGetKernelSubGroupInfoKHR(cl_kernel in_kernel,
cl_device_id in_device,
cl_kernel_sub_group_info param_name,
size_t input_value_size,
const void * input_value,
size_t param_value_size,
void * param_value,
size_t * param_value_size_ret) CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED;
typedef CL_API_ENTRY cl_int
(CL_API_CALL * clGetKernelSubGroupInfoKHR_fn)(cl_kernel in_kernel,
cl_device_id in_device,
cl_kernel_sub_group_info param_name,
size_t input_value_size,
const void * input_value,
size_t param_value_size,
void * param_value,
size_t * param_value_size_ret) CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED;
/*********************************
* cl_khr_mipmap_image extension
*********************************/
/* cl_sampler_properties */
#define CL_SAMPLER_MIP_FILTER_MODE_KHR 0x1155
#define CL_SAMPLER_LOD_MIN_KHR 0x1156
#define CL_SAMPLER_LOD_MAX_KHR 0x1157
/*********************************
* cl_khr_priority_hints extension
*********************************/
/* This extension define is for backwards compatibility.
It shouldn't be required since this extension has no new functions. */
#define cl_khr_priority_hints 1
typedef cl_uint cl_queue_priority_khr;
/* cl_command_queue_properties */
#define CL_QUEUE_PRIORITY_KHR 0x1096
/* cl_queue_priority_khr */
#define CL_QUEUE_PRIORITY_HIGH_KHR (1<<0)
#define CL_QUEUE_PRIORITY_MED_KHR (1<<1)
#define CL_QUEUE_PRIORITY_LOW_KHR (1<<2)
/*********************************
* cl_khr_throttle_hints extension
*********************************/
/* This extension define is for backwards compatibility.
It shouldn't be required since this extension has no new functions. */
#define cl_khr_throttle_hints 1
typedef cl_uint cl_queue_throttle_khr;
/* cl_command_queue_properties */
#define CL_QUEUE_THROTTLE_KHR 0x1097
/* cl_queue_throttle_khr */
#define CL_QUEUE_THROTTLE_HIGH_KHR (1<<0)
#define CL_QUEUE_THROTTLE_MED_KHR (1<<1)
#define CL_QUEUE_THROTTLE_LOW_KHR (1<<2)
/*********************************
* cl_khr_subgroup_named_barrier
*********************************/
/* This extension define is for backwards compatibility.
It shouldn't be required since this extension has no new functions. */
#define cl_khr_subgroup_named_barrier 1
/* cl_device_info */
#define CL_DEVICE_MAX_NAMED_BARRIER_COUNT_KHR 0x2035
/*********************************
* cl_khr_extended_versioning
*********************************/
#define cl_khr_extended_versioning 1
#define CL_VERSION_MAJOR_BITS_KHR (10)
#define CL_VERSION_MINOR_BITS_KHR (10)
#define CL_VERSION_PATCH_BITS_KHR (12)
#define CL_VERSION_MAJOR_MASK_KHR ((1 << CL_VERSION_MAJOR_BITS_KHR) - 1)
#define CL_VERSION_MINOR_MASK_KHR ((1 << CL_VERSION_MINOR_BITS_KHR) - 1)
#define CL_VERSION_PATCH_MASK_KHR ((1 << CL_VERSION_PATCH_BITS_KHR) - 1)
#define CL_VERSION_MAJOR_KHR(version) ((version) >> (CL_VERSION_MINOR_BITS_KHR + CL_VERSION_PATCH_BITS_KHR))
#define CL_VERSION_MINOR_KHR(version) (((version) >> CL_VERSION_PATCH_BITS_KHR) & CL_VERSION_MINOR_MASK_KHR)
#define CL_VERSION_PATCH_KHR(version) ((version) & CL_VERSION_PATCH_MASK_KHR)
#define CL_MAKE_VERSION_KHR(major, minor, patch) \
((((major) & CL_VERSION_MAJOR_MASK_KHR) << (CL_VERSION_MINOR_BITS_KHR + CL_VERSION_PATCH_BITS_KHR)) | \
(((minor) & CL_VERSION_MINOR_MASK_KHR) << CL_VERSION_PATCH_BITS_KHR) | \
((patch) & CL_VERSION_PATCH_MASK_KHR))
typedef cl_uint cl_version_khr;
#define CL_NAME_VERSION_MAX_NAME_SIZE_KHR 64
typedef struct _cl_name_version_khr
{
cl_version_khr version;
char name[CL_NAME_VERSION_MAX_NAME_SIZE_KHR];
} cl_name_version_khr;
/* cl_platform_info */
#define CL_PLATFORM_NUMERIC_VERSION_KHR 0x0906
#define CL_PLATFORM_EXTENSIONS_WITH_VERSION_KHR 0x0907
/* cl_device_info */
#define CL_DEVICE_NUMERIC_VERSION_KHR 0x105E
#define CL_DEVICE_OPENCL_C_NUMERIC_VERSION_KHR 0x105F
#define CL_DEVICE_EXTENSIONS_WITH_VERSION_KHR 0x1060
#define CL_DEVICE_ILS_WITH_VERSION_KHR 0x1061
#define CL_DEVICE_BUILT_IN_KERNELS_WITH_VERSION_KHR 0x1062
/*********************************
* cl_khr_device_uuid extension
*********************************/
#define cl_khr_device_uuid 1
#define CL_UUID_SIZE_KHR 16
#define CL_LUID_SIZE_KHR 8
#define CL_DEVICE_UUID_KHR 0x106A
#define CL_DRIVER_UUID_KHR 0x106B
#define CL_DEVICE_LUID_VALID_KHR 0x106C
#define CL_DEVICE_LUID_KHR 0x106D
#define CL_DEVICE_NODE_MASK_KHR 0x106E
/**********************************
* cl_arm_import_memory extension *
**********************************/
#define cl_arm_import_memory 1
typedef intptr_t cl_import_properties_arm;
/* Default and valid property names for cl_arm_import_memory */
#define CL_IMPORT_TYPE_ARM 0x40B2
/* Host process memory type default value for CL_IMPORT_TYPE_ARM property */
#define CL_IMPORT_TYPE_HOST_ARM 0x40B3
/* DMA BUF memory type value for CL_IMPORT_TYPE_ARM property */
#define CL_IMPORT_TYPE_DMA_BUF_ARM 0x40B4
/* Protected memory property */
#define CL_IMPORT_TYPE_PROTECTED_ARM 0x40B5
/* Android hardware buffer type value for CL_IMPORT_TYPE_ARM property */
#define CL_IMPORT_TYPE_ANDROID_HARDWARE_BUFFER_ARM 0x41E2
/* Data consistency with host property */
#define CL_IMPORT_DMA_BUF_DATA_CONSISTENCY_WITH_HOST_ARM 0x41E3
/* Import memory size value to indicate a size for the whole buffer */
#define CL_IMPORT_MEMORY_WHOLE_ALLOCATION_ARM SIZE_MAX
/* This extension adds a new function that allows for direct memory import into
* OpenCL via the clImportMemoryARM function.
*
* Memory imported through this interface will be mapped into the device's page
* tables directly, providing zero copy access. It will never fall back to copy
* operations and aliased buffers.
*
* Types of memory supported for import are specified as additional extension
* strings.
*
* This extension produces cl_mem allocations which are compatible with all other
* users of cl_mem in the standard API.
*
* This extension maps pages with the same properties as the normal buffer creation
* function clCreateBuffer.
*/
extern CL_API_ENTRY cl_mem CL_API_CALL
clImportMemoryARM( cl_context context,
cl_mem_flags flags,
const cl_import_properties_arm *properties,
void *memory,
size_t size,
cl_int *errcode_ret) CL_EXT_SUFFIX__VERSION_1_0;
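For illustration, a minimal sketch of the zero-copy host-memory import described above, assuming the device also reports the matching host-import extension string (import_host_memory is a hypothetical helper):

#include <CL/cl.h>
#include <CL/cl_ext.h>

static cl_mem import_host_memory(cl_context ctx, void *ptr, size_t size, cl_int *err)
{
    const cl_import_properties_arm props[] = {
        CL_IMPORT_TYPE_ARM, CL_IMPORT_TYPE_HOST_ARM, /* import plain host process memory */
        0                                            /* property list terminator */
    };
    return clImportMemoryARM(ctx, CL_MEM_READ_WRITE, props, ptr, size, err);
}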
/******************************************
* cl_arm_shared_virtual_memory extension *
******************************************/
#define cl_arm_shared_virtual_memory 1
/* Used by clGetDeviceInfo */
#define CL_DEVICE_SVM_CAPABILITIES_ARM 0x40B6
/* Used by clGetMemObjectInfo */
#define CL_MEM_USES_SVM_POINTER_ARM 0x40B7
/* Used by clSetKernelExecInfoARM: */
#define CL_KERNEL_EXEC_INFO_SVM_PTRS_ARM 0x40B8
#define CL_KERNEL_EXEC_INFO_SVM_FINE_GRAIN_SYSTEM_ARM 0x40B9
/* To be used by clGetEventInfo: */
#define CL_COMMAND_SVM_FREE_ARM 0x40BA
#define CL_COMMAND_SVM_MEMCPY_ARM 0x40BB
#define CL_COMMAND_SVM_MEMFILL_ARM 0x40BC
#define CL_COMMAND_SVM_MAP_ARM 0x40BD
#define CL_COMMAND_SVM_UNMAP_ARM 0x40BE
/* Flag values returned by clGetDeviceInfo with CL_DEVICE_SVM_CAPABILITIES_ARM as the param_name. */
#define CL_DEVICE_SVM_COARSE_GRAIN_BUFFER_ARM (1 << 0)
#define CL_DEVICE_SVM_FINE_GRAIN_BUFFER_ARM (1 << 1)
#define CL_DEVICE_SVM_FINE_GRAIN_SYSTEM_ARM (1 << 2)
#define CL_DEVICE_SVM_ATOMICS_ARM (1 << 3)
/* Flag values used by clSVMAllocARM: */
#define CL_MEM_SVM_FINE_GRAIN_BUFFER_ARM (1 << 10)
#define CL_MEM_SVM_ATOMICS_ARM (1 << 11)
typedef cl_bitfield cl_svm_mem_flags_arm;
typedef cl_uint cl_kernel_exec_info_arm;
typedef cl_bitfield cl_device_svm_capabilities_arm;
extern CL_API_ENTRY void * CL_API_CALL
clSVMAllocARM(cl_context context,
cl_svm_mem_flags_arm flags,
size_t size,
cl_uint alignment) CL_EXT_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY void CL_API_CALL
clSVMFreeARM(cl_context context,
void * svm_pointer) CL_EXT_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueSVMFreeARM(cl_command_queue command_queue,
cl_uint num_svm_pointers,
void * svm_pointers[],
void (CL_CALLBACK * pfn_free_func)(cl_command_queue queue,
cl_uint num_svm_pointers,
void * svm_pointers[],
void * user_data),
void * user_data,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_EXT_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueSVMMemcpyARM(cl_command_queue command_queue,
cl_bool blocking_copy,
void * dst_ptr,
const void * src_ptr,
size_t size,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_EXT_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueSVMMemFillARM(cl_command_queue command_queue,
void * svm_ptr,
const void * pattern,
size_t pattern_size,
size_t size,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_EXT_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueSVMMapARM(cl_command_queue command_queue,
cl_bool blocking_map,
cl_map_flags flags,
void * svm_ptr,
size_t size,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_EXT_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueSVMUnmapARM(cl_command_queue command_queue,
void * svm_ptr,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_EXT_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY cl_int CL_API_CALL
clSetKernelArgSVMPointerARM(cl_kernel kernel,
cl_uint arg_index,
const void * arg_value) CL_EXT_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY cl_int CL_API_CALL
clSetKernelExecInfoARM(cl_kernel kernel,
cl_kernel_exec_info_arm param_name,
size_t param_value_size,
const void * param_value) CL_EXT_SUFFIX__VERSION_1_2;
/********************************
* cl_arm_get_core_id extension *
********************************/
#ifdef CL_VERSION_1_2
#define cl_arm_get_core_id 1
/* Device info property for bitfield of cores present */
#define CL_DEVICE_COMPUTE_UNITS_BITFIELD_ARM 0x40BF
#endif /* CL_VERSION_1_2 */
/*********************************
* cl_arm_job_slot_selection
*********************************/
#define cl_arm_job_slot_selection 1
/* cl_device_info */
#define CL_DEVICE_JOB_SLOTS_ARM 0x41E0
/* cl_command_queue_properties */
#define CL_QUEUE_JOB_SLOT_ARM 0x41E1
#ifdef __cplusplus
}
#endif
#endif /* __CL_EXT_H */
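For illustration, a small worked example of the cl_khr_extended_versioning packing defined in this header (10/10/12-bit major/minor/patch split):

#include <assert.h>
#include <CL/cl.h>
#include <CL/cl_ext.h>

int main(void)
{
    cl_version_khr v = CL_MAKE_VERSION_KHR(3, 0, 0); /* pack 3.0.0 */
    assert(CL_VERSION_MAJOR_KHR(v) == 3);
    assert(CL_VERSION_MINOR_KHR(v) == 0);
    assert(CL_VERSION_PATCH_KHR(v) == 0);
    /* A value reported for e.g. CL_PLATFORM_NUMERIC_VERSION_KHR decomposes the same way. */
    return 0;
}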

CL/cl_ext_intel.h

@@ -1,682 +0,0 @@
/*******************************************************************************
* Copyright (c) 2008-2020 The Khronos Group Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
******************************************************************************/
/*****************************************************************************\
Copyright (c) 2013-2020 Intel Corporation All Rights Reserved.
THESE MATERIALS ARE PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THESE
MATERIALS, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
File Name: cl_ext_intel.h
Abstract:
Notes:
\*****************************************************************************/
#ifndef __CL_EXT_INTEL_H
#define __CL_EXT_INTEL_H
#include <CL/cl.h>
#include <CL/cl_platform.h>
#ifdef __cplusplus
extern "C" {
#endif
/***************************************
* cl_intel_thread_local_exec extension *
****************************************/
#define cl_intel_thread_local_exec 1
#define CL_QUEUE_THREAD_LOCAL_EXEC_ENABLE_INTEL (((cl_bitfield)1) << 31)
/***********************************************
* cl_intel_device_partition_by_names extension *
************************************************/
#define cl_intel_device_partition_by_names 1
#define CL_DEVICE_PARTITION_BY_NAMES_INTEL 0x4052
#define CL_PARTITION_BY_NAMES_LIST_END_INTEL -1
/************************************************
* cl_intel_accelerator extension *
* cl_intel_motion_estimation extension *
* cl_intel_advanced_motion_estimation extension *
*************************************************/
#define cl_intel_accelerator 1
#define cl_intel_motion_estimation 1
#define cl_intel_advanced_motion_estimation 1
typedef struct _cl_accelerator_intel* cl_accelerator_intel;
typedef cl_uint cl_accelerator_type_intel;
typedef cl_uint cl_accelerator_info_intel;
typedef struct _cl_motion_estimation_desc_intel {
cl_uint mb_block_type;
cl_uint subpixel_mode;
cl_uint sad_adjust_mode;
cl_uint search_path_type;
} cl_motion_estimation_desc_intel;
/* error codes */
#define CL_INVALID_ACCELERATOR_INTEL -1094
#define CL_INVALID_ACCELERATOR_TYPE_INTEL -1095
#define CL_INVALID_ACCELERATOR_DESCRIPTOR_INTEL -1096
#define CL_ACCELERATOR_TYPE_NOT_SUPPORTED_INTEL -1097
/* cl_accelerator_type_intel */
#define CL_ACCELERATOR_TYPE_MOTION_ESTIMATION_INTEL 0x0
/* cl_accelerator_info_intel */
#define CL_ACCELERATOR_DESCRIPTOR_INTEL 0x4090
#define CL_ACCELERATOR_REFERENCE_COUNT_INTEL 0x4091
#define CL_ACCELERATOR_CONTEXT_INTEL 0x4092
#define CL_ACCELERATOR_TYPE_INTEL 0x4093
/* cl_motion_detect_desc_intel flags */
#define CL_ME_MB_TYPE_16x16_INTEL 0x0
#define CL_ME_MB_TYPE_8x8_INTEL 0x1
#define CL_ME_MB_TYPE_4x4_INTEL 0x2
#define CL_ME_SUBPIXEL_MODE_INTEGER_INTEL 0x0
#define CL_ME_SUBPIXEL_MODE_HPEL_INTEL 0x1
#define CL_ME_SUBPIXEL_MODE_QPEL_INTEL 0x2
#define CL_ME_SAD_ADJUST_MODE_NONE_INTEL 0x0
#define CL_ME_SAD_ADJUST_MODE_HAAR_INTEL 0x1
#define CL_ME_SEARCH_PATH_RADIUS_2_2_INTEL 0x0
#define CL_ME_SEARCH_PATH_RADIUS_4_4_INTEL 0x1
#define CL_ME_SEARCH_PATH_RADIUS_16_12_INTEL 0x5
#define CL_ME_SKIP_BLOCK_TYPE_16x16_INTEL 0x0
#define CL_ME_CHROMA_INTRA_PREDICT_ENABLED_INTEL 0x1
#define CL_ME_LUMA_INTRA_PREDICT_ENABLED_INTEL 0x2
#define CL_ME_SKIP_BLOCK_TYPE_8x8_INTEL 0x4
#define CL_ME_FORWARD_INPUT_MODE_INTEL 0x1
#define CL_ME_BACKWARD_INPUT_MODE_INTEL 0x2
#define CL_ME_BIDIRECTION_INPUT_MODE_INTEL 0x3
#define CL_ME_BIDIR_WEIGHT_QUARTER_INTEL 16
#define CL_ME_BIDIR_WEIGHT_THIRD_INTEL 21
#define CL_ME_BIDIR_WEIGHT_HALF_INTEL 32
#define CL_ME_BIDIR_WEIGHT_TWO_THIRD_INTEL 43
#define CL_ME_BIDIR_WEIGHT_THREE_QUARTER_INTEL 48
#define CL_ME_COST_PENALTY_NONE_INTEL 0x0
#define CL_ME_COST_PENALTY_LOW_INTEL 0x1
#define CL_ME_COST_PENALTY_NORMAL_INTEL 0x2
#define CL_ME_COST_PENALTY_HIGH_INTEL 0x3
#define CL_ME_COST_PRECISION_QPEL_INTEL 0x0
#define CL_ME_COST_PRECISION_HPEL_INTEL 0x1
#define CL_ME_COST_PRECISION_PEL_INTEL 0x2
#define CL_ME_COST_PRECISION_DPEL_INTEL 0x3
#define CL_ME_LUMA_PREDICTOR_MODE_VERTICAL_INTEL 0x0
#define CL_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1
#define CL_ME_LUMA_PREDICTOR_MODE_DC_INTEL 0x2
#define CL_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_LEFT_INTEL 0x3
#define CL_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_RIGHT_INTEL 0x4
#define CL_ME_LUMA_PREDICTOR_MODE_PLANE_INTEL 0x4
#define CL_ME_LUMA_PREDICTOR_MODE_VERTICAL_RIGHT_INTEL 0x5
#define CL_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_DOWN_INTEL 0x6
#define CL_ME_LUMA_PREDICTOR_MODE_VERTICAL_LEFT_INTEL 0x7
#define CL_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_UP_INTEL 0x8
#define CL_ME_CHROMA_PREDICTOR_MODE_DC_INTEL 0x0
#define CL_ME_CHROMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1
#define CL_ME_CHROMA_PREDICTOR_MODE_VERTICAL_INTEL 0x2
#define CL_ME_CHROMA_PREDICTOR_MODE_PLANE_INTEL 0x3
/* cl_device_info */
#define CL_DEVICE_ME_VERSION_INTEL 0x407E
#define CL_ME_VERSION_LEGACY_INTEL 0x0
#define CL_ME_VERSION_ADVANCED_VER_1_INTEL 0x1
#define CL_ME_VERSION_ADVANCED_VER_2_INTEL 0x2
extern CL_API_ENTRY cl_accelerator_intel CL_API_CALL
clCreateAcceleratorINTEL(
cl_context context,
cl_accelerator_type_intel accelerator_type,
size_t descriptor_size,
const void* descriptor,
cl_int* errcode_ret) CL_EXT_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_accelerator_intel (CL_API_CALL *clCreateAcceleratorINTEL_fn)(
cl_context context,
cl_accelerator_type_intel accelerator_type,
size_t descriptor_size,
const void* descriptor,
cl_int* errcode_ret) CL_EXT_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetAcceleratorInfoINTEL(
cl_accelerator_intel accelerator,
cl_accelerator_info_intel param_name,
size_t param_value_size,
void* param_value,
size_t* param_value_size_ret) CL_EXT_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetAcceleratorInfoINTEL_fn)(
cl_accelerator_intel accelerator,
cl_accelerator_info_intel param_name,
size_t param_value_size,
void* param_value,
size_t* param_value_size_ret) CL_EXT_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY cl_int CL_API_CALL
clRetainAcceleratorINTEL(
cl_accelerator_intel accelerator) CL_EXT_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clRetainAcceleratorINTEL_fn)(
cl_accelerator_intel accelerator) CL_EXT_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY cl_int CL_API_CALL
clReleaseAcceleratorINTEL(
cl_accelerator_intel accelerator) CL_EXT_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clReleaseAcceleratorINTEL_fn)(
cl_accelerator_intel accelerator) CL_EXT_SUFFIX__VERSION_1_2;
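/* Editorial illustration, not part of the original header: a hypothetical
 * sketch of creating and releasing a motion-estimation accelerator with the
 * entry points declared above. A valid context is assumed and error handling
 * is omitted. */
static void example_create_me_accelerator(cl_context ctx)
{
    cl_motion_estimation_desc_intel desc;
    desc.mb_block_type    = CL_ME_MB_TYPE_16x16_INTEL;
    desc.subpixel_mode    = CL_ME_SUBPIXEL_MODE_INTEGER_INTEL;
    desc.sad_adjust_mode  = CL_ME_SAD_ADJUST_MODE_NONE_INTEL;
    desc.search_path_type = CL_ME_SEARCH_PATH_RADIUS_16_12_INTEL;
    cl_int err = CL_SUCCESS;
    cl_accelerator_intel accel = clCreateAcceleratorINTEL(
        ctx, CL_ACCELERATOR_TYPE_MOTION_ESTIMATION_INTEL, sizeof(desc), &desc, &err);
    /* ... enqueue the vendor's built-in motion-estimation kernels here ... */
    clReleaseAcceleratorINTEL(accel);
}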
/******************************************
* cl_intel_simultaneous_sharing extension *
*******************************************/
#define cl_intel_simultaneous_sharing 1
#define CL_DEVICE_SIMULTANEOUS_INTEROPS_INTEL 0x4104
#define CL_DEVICE_NUM_SIMULTANEOUS_INTEROPS_INTEL 0x4105
/***********************************
* cl_intel_egl_image_yuv extension *
************************************/
#define cl_intel_egl_image_yuv 1
#define CL_EGL_YUV_PLANE_INTEL 0x4107
/********************************
* cl_intel_packed_yuv extension *
*********************************/
#define cl_intel_packed_yuv 1
#define CL_YUYV_INTEL 0x4076
#define CL_UYVY_INTEL 0x4077
#define CL_YVYU_INTEL 0x4078
#define CL_VYUY_INTEL 0x4079
/********************************************
* cl_intel_required_subgroup_size extension *
*********************************************/
#define cl_intel_required_subgroup_size 1
#define CL_DEVICE_SUB_GROUP_SIZES_INTEL 0x4108
#define CL_KERNEL_SPILL_MEM_SIZE_INTEL 0x4109
#define CL_KERNEL_COMPILE_SUB_GROUP_SIZE_INTEL 0x410A
/****************************************
* cl_intel_driver_diagnostics extension *
*****************************************/
#define cl_intel_driver_diagnostics 1
typedef cl_uint cl_diagnostics_verbose_level;
#define CL_CONTEXT_SHOW_DIAGNOSTICS_INTEL 0x4106
#define CL_CONTEXT_DIAGNOSTICS_LEVEL_ALL_INTEL ( 0xff )
#define CL_CONTEXT_DIAGNOSTICS_LEVEL_GOOD_INTEL ( 1 )
#define CL_CONTEXT_DIAGNOSTICS_LEVEL_BAD_INTEL ( 1 << 1 )
#define CL_CONTEXT_DIAGNOSTICS_LEVEL_NEUTRAL_INTEL ( 1 << 2 )
/********************************
* cl_intel_planar_yuv extension *
*********************************/
#define CL_NV12_INTEL 0x410E
#define CL_MEM_NO_ACCESS_INTEL ( 1 << 24 )
#define CL_MEM_ACCESS_FLAGS_UNRESTRICTED_INTEL ( 1 << 25 )
#define CL_DEVICE_PLANAR_YUV_MAX_WIDTH_INTEL 0x417E
#define CL_DEVICE_PLANAR_YUV_MAX_HEIGHT_INTEL 0x417F
/*******************************************************
* cl_intel_device_side_avc_motion_estimation extension *
********************************************************/
#define CL_DEVICE_AVC_ME_VERSION_INTEL 0x410B
#define CL_DEVICE_AVC_ME_SUPPORTS_TEXTURE_SAMPLER_USE_INTEL 0x410C
#define CL_DEVICE_AVC_ME_SUPPORTS_PREEMPTION_INTEL 0x410D
#define CL_AVC_ME_VERSION_0_INTEL 0x0 /* No support. */
#define CL_AVC_ME_VERSION_1_INTEL 0x1 /* First supported version. */
#define CL_AVC_ME_MAJOR_16x16_INTEL 0x0
#define CL_AVC_ME_MAJOR_16x8_INTEL 0x1
#define CL_AVC_ME_MAJOR_8x16_INTEL 0x2
#define CL_AVC_ME_MAJOR_8x8_INTEL 0x3
#define CL_AVC_ME_MINOR_8x8_INTEL 0x0
#define CL_AVC_ME_MINOR_8x4_INTEL 0x1
#define CL_AVC_ME_MINOR_4x8_INTEL 0x2
#define CL_AVC_ME_MINOR_4x4_INTEL 0x3
#define CL_AVC_ME_MAJOR_FORWARD_INTEL 0x0
#define CL_AVC_ME_MAJOR_BACKWARD_INTEL 0x1
#define CL_AVC_ME_MAJOR_BIDIRECTIONAL_INTEL 0x2
#define CL_AVC_ME_PARTITION_MASK_ALL_INTEL 0x0
#define CL_AVC_ME_PARTITION_MASK_16x16_INTEL 0x7E
#define CL_AVC_ME_PARTITION_MASK_16x8_INTEL 0x7D
#define CL_AVC_ME_PARTITION_MASK_8x16_INTEL 0x7B
#define CL_AVC_ME_PARTITION_MASK_8x8_INTEL 0x77
#define CL_AVC_ME_PARTITION_MASK_8x4_INTEL 0x6F
#define CL_AVC_ME_PARTITION_MASK_4x8_INTEL 0x5F
#define CL_AVC_ME_PARTITION_MASK_4x4_INTEL 0x3F
#define CL_AVC_ME_SEARCH_WINDOW_EXHAUSTIVE_INTEL 0x0
#define CL_AVC_ME_SEARCH_WINDOW_SMALL_INTEL 0x1
#define CL_AVC_ME_SEARCH_WINDOW_TINY_INTEL 0x2
#define CL_AVC_ME_SEARCH_WINDOW_EXTRA_TINY_INTEL 0x3
#define CL_AVC_ME_SEARCH_WINDOW_DIAMOND_INTEL 0x4
#define CL_AVC_ME_SEARCH_WINDOW_LARGE_DIAMOND_INTEL 0x5
#define CL_AVC_ME_SEARCH_WINDOW_RESERVED0_INTEL 0x6
#define CL_AVC_ME_SEARCH_WINDOW_RESERVED1_INTEL 0x7
#define CL_AVC_ME_SEARCH_WINDOW_CUSTOM_INTEL 0x8
#define CL_AVC_ME_SEARCH_WINDOW_16x12_RADIUS_INTEL 0x9
#define CL_AVC_ME_SEARCH_WINDOW_4x4_RADIUS_INTEL 0x2
#define CL_AVC_ME_SEARCH_WINDOW_2x2_RADIUS_INTEL 0xa
#define CL_AVC_ME_SAD_ADJUST_MODE_NONE_INTEL 0x0
#define CL_AVC_ME_SAD_ADJUST_MODE_HAAR_INTEL 0x2
#define CL_AVC_ME_SUBPIXEL_MODE_INTEGER_INTEL 0x0
#define CL_AVC_ME_SUBPIXEL_MODE_HPEL_INTEL 0x1
#define CL_AVC_ME_SUBPIXEL_MODE_QPEL_INTEL 0x3
#define CL_AVC_ME_COST_PRECISION_QPEL_INTEL 0x0
#define CL_AVC_ME_COST_PRECISION_HPEL_INTEL 0x1
#define CL_AVC_ME_COST_PRECISION_PEL_INTEL 0x2
#define CL_AVC_ME_COST_PRECISION_DPEL_INTEL 0x3
#define CL_AVC_ME_BIDIR_WEIGHT_QUARTER_INTEL 0x10
#define CL_AVC_ME_BIDIR_WEIGHT_THIRD_INTEL 0x15
#define CL_AVC_ME_BIDIR_WEIGHT_HALF_INTEL 0x20
#define CL_AVC_ME_BIDIR_WEIGHT_TWO_THIRD_INTEL 0x2B
#define CL_AVC_ME_BIDIR_WEIGHT_THREE_QUARTER_INTEL 0x30
#define CL_AVC_ME_BORDER_REACHED_LEFT_INTEL 0x0
#define CL_AVC_ME_BORDER_REACHED_RIGHT_INTEL 0x2
#define CL_AVC_ME_BORDER_REACHED_TOP_INTEL 0x4
#define CL_AVC_ME_BORDER_REACHED_BOTTOM_INTEL 0x8
#define CL_AVC_ME_SKIP_BLOCK_PARTITION_16x16_INTEL 0x0
#define CL_AVC_ME_SKIP_BLOCK_PARTITION_8x8_INTEL 0x4000
#define CL_AVC_ME_SKIP_BLOCK_16x16_FORWARD_ENABLE_INTEL ( 0x1 << 24 )
#define CL_AVC_ME_SKIP_BLOCK_16x16_BACKWARD_ENABLE_INTEL ( 0x2 << 24 )
#define CL_AVC_ME_SKIP_BLOCK_16x16_DUAL_ENABLE_INTEL ( 0x3 << 24 )
#define CL_AVC_ME_SKIP_BLOCK_8x8_FORWARD_ENABLE_INTEL ( 0x55 << 24 )
#define CL_AVC_ME_SKIP_BLOCK_8x8_BACKWARD_ENABLE_INTEL ( 0xAA << 24 )
#define CL_AVC_ME_SKIP_BLOCK_8x8_DUAL_ENABLE_INTEL ( 0xFF << 24 )
#define CL_AVC_ME_SKIP_BLOCK_8x8_0_FORWARD_ENABLE_INTEL ( 0x1 << 24 )
#define CL_AVC_ME_SKIP_BLOCK_8x8_0_BACKWARD_ENABLE_INTEL ( 0x2 << 24 )
#define CL_AVC_ME_SKIP_BLOCK_8x8_1_FORWARD_ENABLE_INTEL ( 0x1 << 26 )
#define CL_AVC_ME_SKIP_BLOCK_8x8_1_BACKWARD_ENABLE_INTEL ( 0x2 << 26 )
#define CL_AVC_ME_SKIP_BLOCK_8x8_2_FORWARD_ENABLE_INTEL ( 0x1 << 28 )
#define CL_AVC_ME_SKIP_BLOCK_8x8_2_BACKWARD_ENABLE_INTEL ( 0x2 << 28 )
#define CL_AVC_ME_SKIP_BLOCK_8x8_3_FORWARD_ENABLE_INTEL ( 0x1 << 30 )
#define CL_AVC_ME_SKIP_BLOCK_8x8_3_BACKWARD_ENABLE_INTEL ( 0x2 << 30 )
#define CL_AVC_ME_BLOCK_BASED_SKIP_4x4_INTEL 0x00
#define CL_AVC_ME_BLOCK_BASED_SKIP_8x8_INTEL 0x80
#define CL_AVC_ME_INTRA_16x16_INTEL 0x0
#define CL_AVC_ME_INTRA_8x8_INTEL 0x1
#define CL_AVC_ME_INTRA_4x4_INTEL 0x2
#define CL_AVC_ME_INTRA_LUMA_PARTITION_MASK_16x16_INTEL 0x6
#define CL_AVC_ME_INTRA_LUMA_PARTITION_MASK_8x8_INTEL 0x5
#define CL_AVC_ME_INTRA_LUMA_PARTITION_MASK_4x4_INTEL 0x3
#define CL_AVC_ME_INTRA_NEIGHBOR_LEFT_MASK_ENABLE_INTEL 0x60
#define CL_AVC_ME_INTRA_NEIGHBOR_UPPER_MASK_ENABLE_INTEL 0x10
#define CL_AVC_ME_INTRA_NEIGHBOR_UPPER_RIGHT_MASK_ENABLE_INTEL 0x8
#define CL_AVC_ME_INTRA_NEIGHBOR_UPPER_LEFT_MASK_ENABLE_INTEL 0x4
#define CL_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_INTEL 0x0
#define CL_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1
#define CL_AVC_ME_LUMA_PREDICTOR_MODE_DC_INTEL 0x2
#define CL_AVC_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_LEFT_INTEL 0x3
#define CL_AVC_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_RIGHT_INTEL 0x4
#define CL_AVC_ME_LUMA_PREDICTOR_MODE_PLANE_INTEL 0x4
#define CL_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_RIGHT_INTEL 0x5
#define CL_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_DOWN_INTEL 0x6
#define CL_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_LEFT_INTEL 0x7
#define CL_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_UP_INTEL 0x8
#define CL_AVC_ME_CHROMA_PREDICTOR_MODE_DC_INTEL 0x0
#define CL_AVC_ME_CHROMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1
#define CL_AVC_ME_CHROMA_PREDICTOR_MODE_VERTICAL_INTEL 0x2
#define CL_AVC_ME_CHROMA_PREDICTOR_MODE_PLANE_INTEL 0x3
#define CL_AVC_ME_FRAME_FORWARD_INTEL 0x1
#define CL_AVC_ME_FRAME_BACKWARD_INTEL 0x2
#define CL_AVC_ME_FRAME_DUAL_INTEL 0x3
#define CL_AVC_ME_SLICE_TYPE_PRED_INTEL 0x0
#define CL_AVC_ME_SLICE_TYPE_BPRED_INTEL 0x1
#define CL_AVC_ME_SLICE_TYPE_INTRA_INTEL 0x2
#define CL_AVC_ME_INTERLACED_SCAN_TOP_FIELD_INTEL 0x0
#define CL_AVC_ME_INTERLACED_SCAN_BOTTOM_FIELD_INTEL 0x1
/*******************************************
* cl_intel_unified_shared_memory extension *
********************************************/
/* These APIs are in sync with Revision O of the cl_intel_unified_shared_memory spec! */
#define cl_intel_unified_shared_memory 1
/* cl_device_info */
#define CL_DEVICE_HOST_MEM_CAPABILITIES_INTEL 0x4190
#define CL_DEVICE_DEVICE_MEM_CAPABILITIES_INTEL 0x4191
#define CL_DEVICE_SINGLE_DEVICE_SHARED_MEM_CAPABILITIES_INTEL 0x4192
#define CL_DEVICE_CROSS_DEVICE_SHARED_MEM_CAPABILITIES_INTEL 0x4193
#define CL_DEVICE_SHARED_SYSTEM_MEM_CAPABILITIES_INTEL 0x4194
typedef cl_bitfield cl_device_unified_shared_memory_capabilities_intel;
/* cl_device_unified_shared_memory_capabilities_intel - bitfield */
#define CL_UNIFIED_SHARED_MEMORY_ACCESS_INTEL (1 << 0)
#define CL_UNIFIED_SHARED_MEMORY_ATOMIC_ACCESS_INTEL (1 << 1)
#define CL_UNIFIED_SHARED_MEMORY_CONCURRENT_ACCESS_INTEL (1 << 2)
#define CL_UNIFIED_SHARED_MEMORY_CONCURRENT_ATOMIC_ACCESS_INTEL (1 << 3)
typedef cl_bitfield cl_mem_properties_intel;
/* cl_mem_properties_intel */
#define CL_MEM_ALLOC_FLAGS_INTEL 0x4195
typedef cl_bitfield cl_mem_alloc_flags_intel;
/* cl_mem_alloc_flags_intel - bitfield */
#define CL_MEM_ALLOC_WRITE_COMBINED_INTEL (1 << 0)
typedef cl_uint cl_mem_info_intel;
/* cl_mem_alloc_info_intel */
#define CL_MEM_ALLOC_TYPE_INTEL 0x419A
#define CL_MEM_ALLOC_BASE_PTR_INTEL 0x419B
#define CL_MEM_ALLOC_SIZE_INTEL 0x419C
#define CL_MEM_ALLOC_DEVICE_INTEL 0x419D
/* Enum values 0x419E-0x419F are reserved for future queries. */
typedef cl_uint cl_unified_shared_memory_type_intel;
/* cl_unified_shared_memory_type_intel */
#define CL_MEM_TYPE_UNKNOWN_INTEL 0x4196
#define CL_MEM_TYPE_HOST_INTEL 0x4197
#define CL_MEM_TYPE_DEVICE_INTEL 0x4198
#define CL_MEM_TYPE_SHARED_INTEL 0x4199
typedef cl_uint cl_mem_advice_intel;
/* cl_mem_advice_intel */
/* Enum values 0x4208-0x420F are reserved for future memory advices. */
/* cl_kernel_exec_info */
#define CL_KERNEL_EXEC_INFO_INDIRECT_HOST_ACCESS_INTEL 0x4200
#define CL_KERNEL_EXEC_INFO_INDIRECT_DEVICE_ACCESS_INTEL 0x4201
#define CL_KERNEL_EXEC_INFO_INDIRECT_SHARED_ACCESS_INTEL 0x4202
#define CL_KERNEL_EXEC_INFO_USM_PTRS_INTEL 0x4203
/* cl_command_type */
#define CL_COMMAND_MEMFILL_INTEL 0x4204
#define CL_COMMAND_MEMCPY_INTEL 0x4205
#define CL_COMMAND_MIGRATEMEM_INTEL 0x4206
#define CL_COMMAND_MEMADVISE_INTEL 0x4207
extern CL_API_ENTRY void* CL_API_CALL
clHostMemAllocINTEL(
cl_context context,
const cl_mem_properties_intel* properties,
size_t size,
cl_uint alignment,
cl_int* errcode_ret);
typedef CL_API_ENTRY void* (CL_API_CALL *
clHostMemAllocINTEL_fn)(
cl_context context,
const cl_mem_properties_intel* properties,
size_t size,
cl_uint alignment,
cl_int* errcode_ret);
extern CL_API_ENTRY void* CL_API_CALL
clDeviceMemAllocINTEL(
cl_context context,
cl_device_id device,
const cl_mem_properties_intel* properties,
size_t size,
cl_uint alignment,
cl_int* errcode_ret);
typedef CL_API_ENTRY void* (CL_API_CALL *
clDeviceMemAllocINTEL_fn)(
cl_context context,
cl_device_id device,
const cl_mem_properties_intel* properties,
size_t size,
cl_uint alignment,
cl_int* errcode_ret);
extern CL_API_ENTRY void* CL_API_CALL
clSharedMemAllocINTEL(
cl_context context,
cl_device_id device,
const cl_mem_properties_intel* properties,
size_t size,
cl_uint alignment,
cl_int* errcode_ret);
typedef CL_API_ENTRY void* (CL_API_CALL *
clSharedMemAllocINTEL_fn)(
cl_context context,
cl_device_id device,
const cl_mem_properties_intel* properties,
size_t size,
cl_uint alignment,
cl_int* errcode_ret);
extern CL_API_ENTRY cl_int CL_API_CALL
clMemFreeINTEL(
cl_context context,
void* ptr);
typedef CL_API_ENTRY cl_int (CL_API_CALL *
clMemFreeINTEL_fn)(
cl_context context,
void* ptr);
extern CL_API_ENTRY cl_int CL_API_CALL
clGetMemAllocInfoINTEL(
cl_context context,
const void* ptr,
cl_mem_info_intel param_name,
size_t param_value_size,
void* param_value,
size_t* param_value_size_ret);
typedef CL_API_ENTRY cl_int (CL_API_CALL *
clGetMemAllocInfoINTEL_fn)(
cl_context context,
const void* ptr,
cl_mem_info_intel param_name,
size_t param_value_size,
void* param_value,
size_t* param_value_size_ret);
extern CL_API_ENTRY cl_int CL_API_CALL
clSetKernelArgMemPointerINTEL(
cl_kernel kernel,
cl_uint arg_index,
const void* arg_value);
typedef CL_API_ENTRY cl_int (CL_API_CALL *
clSetKernelArgMemPointerINTEL_fn)(
cl_kernel kernel,
cl_uint arg_index,
const void* arg_value);
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueMemsetINTEL( /* Deprecated */
cl_command_queue command_queue,
void* dst_ptr,
cl_int value,
size_t size,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event);
typedef CL_API_ENTRY cl_int (CL_API_CALL *
clEnqueueMemsetINTEL_fn)( /* Deprecated */
cl_command_queue command_queue,
void* dst_ptr,
cl_int value,
size_t size,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event);
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueMemFillINTEL(
cl_command_queue command_queue,
void* dst_ptr,
const void* pattern,
size_t pattern_size,
size_t size,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event);
typedef CL_API_ENTRY cl_int (CL_API_CALL *
clEnqueueMemFillINTEL_fn)(
cl_command_queue command_queue,
void* dst_ptr,
const void* pattern,
size_t pattern_size,
size_t size,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event);
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueMemcpyINTEL(
cl_command_queue command_queue,
cl_bool blocking,
void* dst_ptr,
const void* src_ptr,
size_t size,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event);
typedef CL_API_ENTRY cl_int (CL_API_CALL *
clEnqueueMemcpyINTEL_fn)(
cl_command_queue command_queue,
cl_bool blocking,
void* dst_ptr,
const void* src_ptr,
size_t size,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event);
#ifdef CL_VERSION_1_2
/* Because these APIs use cl_mem_migration_flags, they require
OpenCL 1.2: */
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueMigrateMemINTEL(
cl_command_queue command_queue,
const void* ptr,
size_t size,
cl_mem_migration_flags flags,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event);
typedef CL_API_ENTRY cl_int (CL_API_CALL *
clEnqueueMigrateMemINTEL_fn)(
cl_command_queue command_queue,
const void* ptr,
size_t size,
cl_mem_migration_flags flags,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event);
#endif
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueMemAdviseINTEL(
cl_command_queue command_queue,
const void* ptr,
size_t size,
cl_mem_advice_intel advice,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event);
typedef CL_API_ENTRY cl_int (CL_API_CALL *
clEnqueueMemAdviseINTEL_fn)(
cl_command_queue command_queue,
const void* ptr,
size_t size,
cl_mem_advice_intel advice,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event);
#ifdef __cplusplus
}
#endif
#endif /* __CL_EXT_INTEL_H */
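Editorial aside — not part of cl_ext_intel.h above: the unified shared memory entry points are designed to be used in a malloc/memcpy/free style, with USM pointers passed to kernels directly rather than as cl_mem objects. A minimal, hypothetical sketch, assuming a valid context, device, queue and kernel and omitting error checks:
static void example_usm_intel(cl_context ctx, cl_device_id dev,
                              cl_command_queue queue, cl_kernel kernel)
{
    const size_t bytes = 1024 * sizeof(float);
    cl_int err = CL_SUCCESS;
    /* Device allocation; NULL properties and alignment 0 take the defaults. */
    float *d_buf = (float *)clDeviceMemAllocINTEL(ctx, dev, NULL, bytes, 0, &err);
    /* Blocking copy of host data into the device allocation. */
    float h_buf[1024] = {0};
    clEnqueueMemcpyINTEL(queue, CL_TRUE, d_buf, h_buf, bytes, 0, NULL, NULL);
    /* USM pointers are set as kernel arguments by pointer value. */
    clSetKernelArgMemPointerINTEL(kernel, 0, d_buf);
    clMemFreeINTEL(ctx, d_buf);
}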

View File

@ -1,159 +0,0 @@
/*******************************************************************************
* Copyright (c) 2008-2020 The Khronos Group Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
#ifndef __OPENCL_CL_GL_H
#define __OPENCL_CL_GL_H
#include <CL/cl.h>
#ifdef __cplusplus
extern "C" {
#endif
typedef cl_uint cl_gl_object_type;
typedef cl_uint cl_gl_texture_info;
typedef cl_uint cl_gl_platform_info;
typedef struct __GLsync *cl_GLsync;
/* cl_gl_object_type = 0x2000 - 0x200F enum values are currently taken */
#define CL_GL_OBJECT_BUFFER 0x2000
#define CL_GL_OBJECT_TEXTURE2D 0x2001
#define CL_GL_OBJECT_TEXTURE3D 0x2002
#define CL_GL_OBJECT_RENDERBUFFER 0x2003
#ifdef CL_VERSION_1_2
#define CL_GL_OBJECT_TEXTURE2D_ARRAY 0x200E
#define CL_GL_OBJECT_TEXTURE1D 0x200F
#define CL_GL_OBJECT_TEXTURE1D_ARRAY 0x2010
#define CL_GL_OBJECT_TEXTURE_BUFFER 0x2011
#endif
/* cl_gl_texture_info */
#define CL_GL_TEXTURE_TARGET 0x2004
#define CL_GL_MIPMAP_LEVEL 0x2005
#ifdef CL_VERSION_1_2
#define CL_GL_NUM_SAMPLES 0x2012
#endif
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateFromGLBuffer(cl_context context,
cl_mem_flags flags,
cl_GLuint bufobj,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
#ifdef CL_VERSION_1_2
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateFromGLTexture(cl_context context,
cl_mem_flags flags,
cl_GLenum target,
cl_GLint miplevel,
cl_GLuint texture,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2;
#endif
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateFromGLRenderbuffer(cl_context context,
cl_mem_flags flags,
cl_GLuint renderbuffer,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetGLObjectInfo(cl_mem memobj,
cl_gl_object_type * gl_object_type,
cl_GLuint * gl_object_name) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetGLTextureInfo(cl_mem memobj,
cl_gl_texture_info param_name,
size_t param_value_size,
void * param_value,
size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueAcquireGLObjects(cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueReleaseGLObjects(cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_0;
/* Deprecated OpenCL 1.1 APIs */
extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL
clCreateFromGLTexture2D(cl_context context,
cl_mem_flags flags,
cl_GLenum target,
cl_GLint miplevel,
cl_GLuint texture,
cl_int * errcode_ret) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL
clCreateFromGLTexture3D(cl_context context,
cl_mem_flags flags,
cl_GLenum target,
cl_GLint miplevel,
cl_GLuint texture,
cl_int * errcode_ret) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
/* cl_khr_gl_sharing extension */
#define cl_khr_gl_sharing 1
typedef cl_uint cl_gl_context_info;
/* Additional Error Codes */
#define CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR -1000
/* cl_gl_context_info */
#define CL_CURRENT_DEVICE_FOR_GL_CONTEXT_KHR 0x2006
#define CL_DEVICES_FOR_GL_CONTEXT_KHR 0x2007
/* Additional cl_context_properties */
#define CL_GL_CONTEXT_KHR 0x2008
#define CL_EGL_DISPLAY_KHR 0x2009
#define CL_GLX_DISPLAY_KHR 0x200A
#define CL_WGL_HDC_KHR 0x200B
#define CL_CGL_SHAREGROUP_KHR 0x200C
extern CL_API_ENTRY cl_int CL_API_CALL
clGetGLContextInfoKHR(const cl_context_properties * properties,
cl_gl_context_info param_name,
size_t param_value_size,
void * param_value,
size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetGLContextInfoKHR_fn)(
const cl_context_properties * properties,
cl_gl_context_info param_name,
size_t param_value_size,
void * param_value,
size_t * param_value_size_ret);
#ifdef __cplusplus
}
#endif
#endif /* __OPENCL_CL_GL_H */
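Editorial illustration, not part of the original cl_gl.h: the usual GL sharing pattern with the functions above is to wrap an existing GL buffer object once, then acquire it before OpenCL work and release it afterwards. A hypothetical sketch, assuming a context created with GL sharing enabled and omitting error checks:
static void example_gl_buffer_sharing(cl_context ctx, cl_command_queue queue, cl_GLuint gl_buffer)
{
    cl_int err = CL_SUCCESS;
    /* Create a cl_mem view of an existing OpenGL buffer object. */
    cl_mem mem = clCreateFromGLBuffer(ctx, CL_MEM_READ_WRITE, gl_buffer, &err);
    /* GL must be finished with the object before OpenCL acquires it. */
    clEnqueueAcquireGLObjects(queue, 1, &mem, 0, NULL, NULL);
    /* ... enqueue kernels that read or write `mem` here ... */
    clEnqueueReleaseGLObjects(queue, 1, &mem, 0, NULL, NULL);
    clReleaseMemObject(mem);
}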

View File

@ -1,40 +0,0 @@
/*******************************************************************************
* Copyright (c) 2008-2020 The Khronos Group Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
#ifndef __OPENCL_CL_GL_EXT_H
#define __OPENCL_CL_GL_EXT_H
#ifdef __cplusplus
extern "C" {
#endif
#include <CL/cl_gl.h>
/*
* cl_khr_gl_event extension
*/
#define CL_COMMAND_GL_FENCE_SYNC_OBJECT_KHR 0x200D
extern CL_API_ENTRY cl_event CL_API_CALL
clCreateEventFromGLsyncKHR(cl_context context,
cl_GLsync cl_GLsync,
cl_int * errcode_ret) CL_EXT_SUFFIX__VERSION_1_1;
#ifdef __cplusplus
}
#endif
#endif /* __OPENCL_CL_GL_EXT_H */
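Again purely as an illustration (not in the original header): clCreateEventFromGLsyncKHR turns a GL fence sync into an ordinary OpenCL event, so GL completion can gate CL work through an event wait list. A hypothetical sketch, assuming the cl_GLsync was obtained on the GL side:
static void example_gl_sync_event(cl_context ctx, cl_command_queue queue,
                                  cl_mem shared_mem, cl_GLsync gl_sync)
{
    cl_int err = CL_SUCCESS;
    cl_event gl_done = clCreateEventFromGLsyncKHR(ctx, gl_sync, &err);
    /* Acquire the shared object only after the GL fence has signalled. */
    clEnqueueAcquireGLObjects(queue, 1, &shared_mem, 1, &gl_done, NULL);
    clReleaseEvent(gl_done);
}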

View File

@ -1,440 +0,0 @@
/*******************************************************************************
* Copyright (c) 2019-2020 The Khronos Group Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
/**
* This is a header-only utility library that provides OpenCL host code with
* routines for converting to/from cl_half values.
*
* Example usage:
*
* #include <CL/cl_half.h>
* ...
* cl_half h = cl_half_from_float(0.5f, CL_HALF_RTE);
* cl_float f = cl_half_to_float(h);
*/
#ifndef OPENCL_CL_HALF_H
#define OPENCL_CL_HALF_H
#include <CL/cl_platform.h>
#include <stdint.h>
#ifdef __cplusplus
extern "C" {
#endif
/**
* Rounding mode used when converting to cl_half.
*/
typedef enum
{
CL_HALF_RTE, // round to nearest even
CL_HALF_RTZ, // round towards zero
CL_HALF_RTP, // round towards positive infinity
CL_HALF_RTN, // round towards negative infinity
} cl_half_rounding_mode;
/* Private utility macros. */
#define CL_HALF_EXP_MASK 0x7C00
#define CL_HALF_MAX_FINITE_MAG 0x7BFF
/*
* Utility to deal with values that overflow when converting to half precision.
*/
static inline cl_half cl_half_handle_overflow(cl_half_rounding_mode rounding_mode,
uint16_t sign)
{
if (rounding_mode == CL_HALF_RTZ)
{
// Round overflow towards zero -> largest finite number (preserving sign)
return (sign << 15) | CL_HALF_MAX_FINITE_MAG;
}
else if (rounding_mode == CL_HALF_RTP && sign)
{
// Round negative overflow towards positive infinity -> most negative finite number
return (1 << 15) | CL_HALF_MAX_FINITE_MAG;
}
else if (rounding_mode == CL_HALF_RTN && !sign)
{
// Round positive overflow towards negative infinity -> largest finite number
return CL_HALF_MAX_FINITE_MAG;
}
// Overflow to infinity
return (sign << 15) | CL_HALF_EXP_MASK;
}
/*
* Utility to deal with values that underflow when converting to half precision.
*/
static inline cl_half cl_half_handle_underflow(cl_half_rounding_mode rounding_mode,
uint16_t sign)
{
if (rounding_mode == CL_HALF_RTP && !sign)
{
// Round underflow towards positive infinity -> smallest positive value
return (sign << 15) | 1;
}
else if (rounding_mode == CL_HALF_RTN && sign)
{
// Round underflow towards negative infinity -> largest negative value
return (sign << 15) | 1;
}
// Flush to zero
return (sign << 15);
}
/**
* Convert a cl_float to a cl_half.
*/
static inline cl_half cl_half_from_float(cl_float f, cl_half_rounding_mode rounding_mode)
{
// Type-punning to get direct access to underlying bits
union
{
cl_float f;
uint32_t i;
} f32;
f32.f = f;
// Extract sign bit
uint16_t sign = f32.i >> 31;
// Extract FP32 exponent and mantissa
uint32_t f_exp = (f32.i >> (CL_FLT_MANT_DIG - 1)) & 0xFF;
uint32_t f_mant = f32.i & ((1 << (CL_FLT_MANT_DIG - 1)) - 1);
// Remove FP32 exponent bias
int32_t exp = f_exp - CL_FLT_MAX_EXP + 1;
// Add FP16 exponent bias
uint16_t h_exp = exp + CL_HALF_MAX_EXP - 1;
// Position of the bit that will become the FP16 mantissa LSB
uint32_t lsb_pos = CL_FLT_MANT_DIG - CL_HALF_MANT_DIG;
// Check for NaN / infinity
if (f_exp == 0xFF)
{
if (f_mant)
{
// NaN -> propagate mantissa and silence it
uint16_t h_mant = f_mant >> lsb_pos;
h_mant |= 0x200;
return (sign << 15) | CL_HALF_EXP_MASK | h_mant;
}
else
{
// Infinity -> zero mantissa
return (sign << 15) | CL_HALF_EXP_MASK;
}
}
// Check for zero
if (!f_exp && !f_mant)
{
return (sign << 15);
}
// Check for overflow
if (exp >= CL_HALF_MAX_EXP)
{
return cl_half_handle_overflow(rounding_mode, sign);
}
// Check for underflow
if (exp < (CL_HALF_MIN_EXP - CL_HALF_MANT_DIG - 1))
{
return cl_half_handle_underflow(rounding_mode, sign);
}
// Check for value that will become denormal
if (exp < -14)
{
// Denormal -> include the implicit 1 from the FP32 mantissa
h_exp = 0;
f_mant |= 1 << (CL_FLT_MANT_DIG - 1);
// Mantissa shift amount depends on exponent
lsb_pos = -exp + (CL_FLT_MANT_DIG - 25);
}
// Generate FP16 mantissa by shifting FP32 mantissa
uint16_t h_mant = f_mant >> lsb_pos;
// Check whether we need to round
uint32_t halfway = 1 << (lsb_pos - 1);
uint32_t mask = (halfway << 1) - 1;
switch (rounding_mode)
{
case CL_HALF_RTE:
if ((f_mant & mask) > halfway)
{
// More than halfway -> round up
h_mant += 1;
}
else if ((f_mant & mask) == halfway)
{
// Exactly halfway -> round to nearest even
if (h_mant & 0x1)
h_mant += 1;
}
break;
case CL_HALF_RTZ:
// Mantissa has already been truncated -> do nothing
break;
case CL_HALF_RTP:
if ((f_mant & mask) && !sign)
{
// Round positive numbers up
h_mant += 1;
}
break;
case CL_HALF_RTN:
if ((f_mant & mask) && sign)
{
// Round negative numbers down
h_mant += 1;
}
break;
}
// Check for mantissa overflow
if (h_mant & 0x400)
{
h_exp += 1;
h_mant = 0;
}
return (sign << 15) | (h_exp << 10) | h_mant;
}
/**
* Convert a cl_double to a cl_half.
*/
static inline cl_half cl_half_from_double(cl_double d, cl_half_rounding_mode rounding_mode)
{
// Type-punning to get direct access to underlying bits
union
{
cl_double d;
uint64_t i;
} f64;
f64.d = d;
// Extract sign bit
uint16_t sign = f64.i >> 63;
// Extract FP64 exponent and mantissa
uint64_t d_exp = (f64.i >> (CL_DBL_MANT_DIG - 1)) & 0x7FF;
uint64_t d_mant = f64.i & (((uint64_t)1 << (CL_DBL_MANT_DIG - 1)) - 1);
// Remove FP64 exponent bias
int64_t exp = d_exp - CL_DBL_MAX_EXP + 1;
// Add FP16 exponent bias
uint16_t h_exp = (uint16_t)(exp + CL_HALF_MAX_EXP - 1);
// Position of the bit that will become the FP16 mantissa LSB
uint32_t lsb_pos = CL_DBL_MANT_DIG - CL_HALF_MANT_DIG;
// Check for NaN / infinity
if (d_exp == 0x7FF)
{
if (d_mant)
{
// NaN -> propagate mantissa and silence it
uint16_t h_mant = (uint16_t)(d_mant >> lsb_pos);
h_mant |= 0x200;
return (sign << 15) | CL_HALF_EXP_MASK | h_mant;
}
else
{
// Infinity -> zero mantissa
return (sign << 15) | CL_HALF_EXP_MASK;
}
}
// Check for zero
if (!d_exp && !d_mant)
{
return (sign << 15);
}
// Check for overflow
if (exp >= CL_HALF_MAX_EXP)
{
return cl_half_handle_overflow(rounding_mode, sign);
}
// Check for underflow
if (exp < (CL_HALF_MIN_EXP - CL_HALF_MANT_DIG - 1))
{
return cl_half_handle_underflow(rounding_mode, sign);
}
// Check for value that will become denormal
if (exp < -14)
{
// Include the implicit 1 from the FP64 mantissa
h_exp = 0;
d_mant |= (uint64_t)1 << (CL_DBL_MANT_DIG - 1);
// Mantissa shift amount depends on exponent
lsb_pos = (uint32_t)(-exp + (CL_DBL_MANT_DIG - 25));
}
// Generate FP16 mantissa by shifting FP64 mantissa
uint16_t h_mant = (uint16_t)(d_mant >> lsb_pos);
// Check whether we need to round
uint64_t halfway = (uint64_t)1 << (lsb_pos - 1);
uint64_t mask = (halfway << 1) - 1;
switch (rounding_mode)
{
case CL_HALF_RTE:
if ((d_mant & mask) > halfway)
{
// More than halfway -> round up
h_mant += 1;
}
else if ((d_mant & mask) == halfway)
{
// Exactly halfway -> round to nearest even
if (h_mant & 0x1)
h_mant += 1;
}
break;
case CL_HALF_RTZ:
// Mantissa has already been truncated -> do nothing
break;
case CL_HALF_RTP:
if ((d_mant & mask) && !sign)
{
// Round positive numbers up
h_mant += 1;
}
break;
case CL_HALF_RTN:
if ((d_mant & mask) && sign)
{
// Round negative numbers down
h_mant += 1;
}
break;
}
// Check for mantissa overflow
if (h_mant & 0x400)
{
h_exp += 1;
h_mant = 0;
}
return (sign << 15) | (h_exp << 10) | h_mant;
}
/**
* Convert a cl_half to a cl_float.
*/
static inline cl_float cl_half_to_float(cl_half h)
{
// Type-punning to get direct access to underlying bits
union
{
cl_float f;
uint32_t i;
} f32;
// Extract sign bit
uint16_t sign = h >> 15;
// Extract FP16 exponent and mantissa
uint16_t h_exp = (h >> (CL_HALF_MANT_DIG - 1)) & 0x1F;
uint16_t h_mant = h & 0x3FF;
// Remove FP16 exponent bias
int32_t exp = h_exp - CL_HALF_MAX_EXP + 1;
// Add FP32 exponent bias
uint32_t f_exp = exp + CL_FLT_MAX_EXP - 1;
// Check for NaN / infinity
if (h_exp == 0x1F)
{
if (h_mant)
{
// NaN -> propagate mantissa and silence it
uint32_t f_mant = h_mant << (CL_FLT_MANT_DIG - CL_HALF_MANT_DIG);
f_mant |= 0x400000;
f32.i = (sign << 31) | 0x7F800000 | f_mant;
return f32.f;
}
else
{
// Infinity -> zero mantissa
f32.i = (sign << 31) | 0x7F800000;
return f32.f;
}
}
// Check for zero / denormal
if (h_exp == 0)
{
if (h_mant == 0)
{
// Zero -> zero exponent
f_exp = 0;
}
else
{
// Denormal -> normalize it
// - Shift mantissa to make most-significant 1 implicit
// - Adjust exponent accordingly
uint32_t shift = 0;
while ((h_mant & 0x400) == 0)
{
h_mant <<= 1;
shift++;
}
h_mant &= 0x3FF;
f_exp -= shift - 1;
}
}
f32.i = (sign << 31) | (f_exp << 23) | (h_mant << 13);
return f32.f;
}
#undef CL_HALF_EXP_MASK
#undef CL_HALF_MAX_FINITE_MAG
#ifdef __cplusplus
}
#endif
#endif /* OPENCL_CL_HALF_H */
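A small editorial check, not part of the header above: 0.5f is exactly representable in half precision (bit pattern 0x3800), and an overflowing input shows how the rounding mode picks between infinity and the largest finite magnitude, per cl_half_handle_overflow:
#include <assert.h>
static void example_cl_half_roundtrip(void)
{
    /* 0.5f: sign 0, biased half exponent 14, mantissa 0 -> 0x3800. */
    cl_half h = cl_half_from_float(0.5f, CL_HALF_RTE);
    assert(h == 0x3800);
    assert(cl_half_to_float(h) == 0.5f);
    /* 1.0e6f overflows half precision (the largest finite half is 65504). */
    assert(cl_half_from_float(1.0e6f, CL_HALF_RTE) == 0x7C00);  /* +infinity  */
    assert(cl_half_from_float(1.0e6f, CL_HALF_RTZ) == 0x7BFF);  /* max finite */
}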

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -1,160 +0,0 @@
/*******************************************************************************
* Copyright (c) 2008-2020 The Khronos Group Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
/*****************************************************************************\
Copyright (c) 2013-2019 Intel Corporation All Rights Reserved.
THESE MATERIALS ARE PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THESE
MATERIALS, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
File Name: cl_va_api_media_sharing_intel.h
Abstract:
Notes:
\*****************************************************************************/
#ifndef __OPENCL_CL_VA_API_MEDIA_SHARING_INTEL_H
#define __OPENCL_CL_VA_API_MEDIA_SHARING_INTEL_H
#include <CL/cl.h>
#include <CL/cl_platform.h>
#include <va/va.h>
#ifdef __cplusplus
extern "C" {
#endif
/******************************************
* cl_intel_va_api_media_sharing extension *
*******************************************/
#define cl_intel_va_api_media_sharing 1
/* error codes */
#define CL_INVALID_VA_API_MEDIA_ADAPTER_INTEL -1098
#define CL_INVALID_VA_API_MEDIA_SURFACE_INTEL -1099
#define CL_VA_API_MEDIA_SURFACE_ALREADY_ACQUIRED_INTEL -1100
#define CL_VA_API_MEDIA_SURFACE_NOT_ACQUIRED_INTEL -1101
/* cl_va_api_device_source_intel */
#define CL_VA_API_DISPLAY_INTEL 0x4094
/* cl_va_api_device_set_intel */
#define CL_PREFERRED_DEVICES_FOR_VA_API_INTEL 0x4095
#define CL_ALL_DEVICES_FOR_VA_API_INTEL 0x4096
/* cl_context_info */
#define CL_CONTEXT_VA_API_DISPLAY_INTEL 0x4097
/* cl_mem_info */
#define CL_MEM_VA_API_MEDIA_SURFACE_INTEL 0x4098
/* cl_image_info */
#define CL_IMAGE_VA_API_PLANE_INTEL 0x4099
/* cl_command_type */
#define CL_COMMAND_ACQUIRE_VA_API_MEDIA_SURFACES_INTEL 0x409A
#define CL_COMMAND_RELEASE_VA_API_MEDIA_SURFACES_INTEL 0x409B
typedef cl_uint cl_va_api_device_source_intel;
typedef cl_uint cl_va_api_device_set_intel;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetDeviceIDsFromVA_APIMediaAdapterINTEL(
cl_platform_id platform,
cl_va_api_device_source_intel media_adapter_type,
void* media_adapter,
cl_va_api_device_set_intel media_adapter_set,
cl_uint num_entries,
cl_device_id* devices,
cl_uint* num_devices) CL_EXT_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_int (CL_API_CALL * clGetDeviceIDsFromVA_APIMediaAdapterINTEL_fn)(
cl_platform_id platform,
cl_va_api_device_source_intel media_adapter_type,
void* media_adapter,
cl_va_api_device_set_intel media_adapter_set,
cl_uint num_entries,
cl_device_id* devices,
cl_uint* num_devices) CL_EXT_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateFromVA_APIMediaSurfaceINTEL(
cl_context context,
cl_mem_flags flags,
VASurfaceID* surface,
cl_uint plane,
cl_int* errcode_ret) CL_EXT_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_mem (CL_API_CALL * clCreateFromVA_APIMediaSurfaceINTEL_fn)(
cl_context context,
cl_mem_flags flags,
VASurfaceID* surface,
cl_uint plane,
cl_int* errcode_ret) CL_EXT_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueAcquireVA_APIMediaSurfacesINTEL(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem* mem_objects,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event) CL_EXT_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireVA_APIMediaSurfacesINTEL_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem* mem_objects,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event) CL_EXT_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueReleaseVA_APIMediaSurfacesINTEL(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem* mem_objects,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event) CL_EXT_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseVA_APIMediaSurfacesINTEL_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem* mem_objects,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event) CL_EXT_SUFFIX__VERSION_1_2;
#ifdef __cplusplus
}
#endif
#endif /* __OPENCL_CL_VA_API_MEDIA_SHARING_INTEL_H */
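Editorial illustration only (not part of the original header): a hypothetical sketch of sharing a VA-API surface plane with the entry points above, assuming a VADisplay and VASurfaceID obtained through libva and omitting error handling:
static void example_va_surface_sharing(cl_platform_id platform, cl_context ctx,
                                       cl_command_queue queue, VADisplay va_display,
                                       VASurfaceID surface)
{
    cl_int err = CL_SUCCESS;
    cl_device_id device = NULL;
    cl_uint num_devices = 0;
    /* Find a device that can share surfaces with this VA-API display. */
    clGetDeviceIDsFromVA_APIMediaAdapterINTEL(
        platform, CL_VA_API_DISPLAY_INTEL, va_display,
        CL_PREFERRED_DEVICES_FOR_VA_API_INTEL, 1, &device, &num_devices);
    /* Wrap plane 0 of the surface as an OpenCL image. */
    cl_mem plane0 = clCreateFromVA_APIMediaSurfaceINTEL(ctx, CL_MEM_READ_ONLY, &surface, 0, &err);
    clEnqueueAcquireVA_APIMediaSurfacesINTEL(queue, 1, &plane0, 0, NULL, NULL);
    /* ... process the acquired surface ... */
    clEnqueueReleaseVA_APIMediaSurfacesINTEL(queue, 1, &plane0, 0, NULL, NULL);
    clReleaseMemObject(plane0);
}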

View File

@ -1,81 +0,0 @@
/*******************************************************************************
* Copyright (c) 2018-2020 The Khronos Group Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
#ifndef __CL_VERSION_H
#define __CL_VERSION_H
/* Detect which version to target */
#if !defined(CL_TARGET_OPENCL_VERSION)
#pragma message("cl_version.h: CL_TARGET_OPENCL_VERSION is not defined. Defaulting to 220 (OpenCL 2.2)")
#define CL_TARGET_OPENCL_VERSION 220
#endif
#if CL_TARGET_OPENCL_VERSION != 100 && \
CL_TARGET_OPENCL_VERSION != 110 && \
CL_TARGET_OPENCL_VERSION != 120 && \
CL_TARGET_OPENCL_VERSION != 200 && \
CL_TARGET_OPENCL_VERSION != 210 && \
CL_TARGET_OPENCL_VERSION != 220 && \
CL_TARGET_OPENCL_VERSION != 300
#pragma message("cl_version: CL_TARGET_OPENCL_VERSION is not a valid value (100, 110, 120, 200, 210, 220, 300). Defaulting to 220 (OpenCL 2.2)")
#undef CL_TARGET_OPENCL_VERSION
#define CL_TARGET_OPENCL_VERSION 220
#endif
/* OpenCL Version */
#if CL_TARGET_OPENCL_VERSION >= 300 && !defined(CL_VERSION_3_0)
#define CL_VERSION_3_0 1
#endif
#if CL_TARGET_OPENCL_VERSION >= 220 && !defined(CL_VERSION_2_2)
#define CL_VERSION_2_2 1
#endif
#if CL_TARGET_OPENCL_VERSION >= 210 && !defined(CL_VERSION_2_1)
#define CL_VERSION_2_1 1
#endif
#if CL_TARGET_OPENCL_VERSION >= 200 && !defined(CL_VERSION_2_0)
#define CL_VERSION_2_0 1
#endif
#if CL_TARGET_OPENCL_VERSION >= 120 && !defined(CL_VERSION_1_2)
#define CL_VERSION_1_2 1
#endif
#if CL_TARGET_OPENCL_VERSION >= 110 && !defined(CL_VERSION_1_1)
#define CL_VERSION_1_1 1
#endif
#if CL_TARGET_OPENCL_VERSION >= 100 && !defined(CL_VERSION_1_0)
#define CL_VERSION_1_0 1
#endif
/* Allow deprecated APIs for older OpenCL versions. */
#if CL_TARGET_OPENCL_VERSION <= 220 && !defined(CL_USE_DEPRECATED_OPENCL_2_2_APIS)
#define CL_USE_DEPRECATED_OPENCL_2_2_APIS
#endif
#if CL_TARGET_OPENCL_VERSION <= 210 && !defined(CL_USE_DEPRECATED_OPENCL_2_1_APIS)
#define CL_USE_DEPRECATED_OPENCL_2_1_APIS
#endif
#if CL_TARGET_OPENCL_VERSION <= 200 && !defined(CL_USE_DEPRECATED_OPENCL_2_0_APIS)
#define CL_USE_DEPRECATED_OPENCL_2_0_APIS
#endif
#if CL_TARGET_OPENCL_VERSION <= 120 && !defined(CL_USE_DEPRECATED_OPENCL_1_2_APIS)
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
#endif
#if CL_TARGET_OPENCL_VERSION <= 110 && !defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
#define CL_USE_DEPRECATED_OPENCL_1_1_APIS
#endif
#if CL_TARGET_OPENCL_VERSION <= 100 && !defined(CL_USE_DEPRECATED_OPENCL_1_0_APIS)
#define CL_USE_DEPRECATED_OPENCL_1_0_APIS
#endif
#endif /* __CL_VERSION_H */
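Not part of the header: the intended usage is simply to define CL_TARGET_OPENCL_VERSION before the first OpenCL include, which avoids the warning above and gates the matching CL_VERSION_* feature macros. A hypothetical translation unit targeting OpenCL 1.2:
/* Select the OpenCL API version before including any CL headers. */
#define CL_TARGET_OPENCL_VERSION 120
#include <CL/opencl.h>

int main(void)
{
#ifdef CL_VERSION_1_2
    /* OpenCL 1.2-gated declarations (e.g. clCreateFromGLTexture above) are visible here. */
#endif
    return 0;
}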

View File

@ -1,33 +0,0 @@
/*******************************************************************************
* Copyright (c) 2008-2020 The Khronos Group Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
#ifndef __OPENCL_H
#define __OPENCL_H
#ifdef __cplusplus
extern "C" {
#endif
#include <CL/cl.h>
#include <CL/cl_gl.h>
#include <CL/cl_gl_ext.h>
#include <CL/cl_ext.h>
#ifdef __cplusplus
}
#endif
#endif /* __OPENCL_H */

View File

@ -7,7 +7,21 @@ project(BabelStream VERSION 3.5 LANGUAGES CXX)
# some nicer defaults for standard C++
set(CMAKE_CXX_EXTENSIONS OFF)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
include(FetchContent)
FetchContent_Declare(
opencl_header
URL https://github.com/KhronosGroup/OpenCL-Headers/archive/refs/tags/v2021.06.30.zip
URL_HASH MD5=af7ab7918a6a11c60370c8651a9f0192
)
macro(setup_opencl_header_includes)
FetchContent_GetProperties(opencl_header)
if (NOT opencl_header_POPULATED)
FetchContent_Populate(opencl_header)
set(OpenCL_INCLUDE_DIR ${opencl_header_SOURCE_DIR})
endif ()
endmacro()
#set(MODEL SYCL)
#set(SYCL_COMPILER COMPUTECPP)
@ -35,7 +49,7 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON)
#set(CXX_EXTRA_FLAGS -O2)
#set(CMAKE_CXX_COMPILER /usr/lib/aomp/bin/clang++)
#set(MODEL OMP)
#set(MODEL omp)
##set(OFFLOAD "AMD:gfx803")
#set(OFFLOAD "NVIDIA:sm_35")
#set(CXX_EXTRA_FLAGS --cuda-path=/opt/cuda-10.2/)
@ -98,21 +112,23 @@ if ((DEFINED CXX_EXTRA_FLAGS) AND (NOT DEFINED CXX_EXTRA_LINK_FLAGS))
endif ()
# include our macros
include(register_models.cmake)
include(cmake/register_models.cmake)
# register out models <model_name> <preprocessor_def_name> <source files...>
register_model(OMP OMP OMPStream.cpp)
register_model(OCL OCL OCLStream.cpp)
register_model(STD STD STDStream.cpp)
register_model(STD20 STD20 STD20Stream.cpp)
register_model(HIP HIP HIPStream.cpp)
register_model(CUDA CUDA CUDAStream.cu)
register_model(KOKKOS KOKKOS KokkosStream.cpp)
register_model(SYCL SYCL SYCLStream.cpp)
register_model(ACC ACC ACCStream.cpp)
register_model(omp OMP OMPStream.cpp)
register_model(ocl OCL OCLStream.cpp)
register_model(std STD STDStream.cpp)
register_model(std20 STD20 STD20Stream.cpp)
register_model(hip HIP HIPStream.cpp)
register_model(cuda CUDA CUDAStream.cu)
register_model(kokkos KOKKOS KokkosStream.cpp)
register_model(sycl SYCL SYCLStream.cpp)
register_model(sycl2020 SYCL2020 SYCLStream2020.cpp)
register_model(acc ACC ACCStream.cpp)
# defining RAJA collides with the RAJA namespace so USE_RAJA
register_model(RAJA USE_RAJA RAJAStream.cpp)
register_model(TBB TBB TBBStream.cpp)
register_model(raja USE_RAJA RAJAStream.cpp)
register_model(tbb TBB TBBStream.cpp)
register_model(thrust THRUST ThrustStream.cu) # Thrust uses cu, even for rocThrust
set(USAGE ON CACHE BOOL "Whether to print all custom flags for the selected model")
@ -148,7 +164,7 @@ message(STATUS "Default ${CMAKE_BUILD_TYPE} flags are `${DEFAULT_${BUILD_TYPE}_F
# setup common build flag defaults if there are no overrides
if (NOT DEFINED ${BUILD_TYPE}_FLAGS)
set(ACTUAL_${BUILD_TYPE}_FLAGS ${DEFAULT_${BUILD_TYPE}_FLAGS})
elseif()
elseif ()
set(ACTUAL_${BUILD_TYPE}_FLAGS ${${BUILD_TYPE}_FLAGS})
endif ()
@ -170,7 +186,8 @@ message(STATUS "Executable : ${EXE_NAME}")
# below we have all the usual CMake target setup steps
add_executable(${EXE_NAME} ${IMPL_SOURCES} main.cpp)
include_directories(src)
add_executable(${EXE_NAME} ${IMPL_SOURCES} src/main.cpp)
target_link_libraries(${EXE_NAME} PUBLIC ${LINK_LIBRARIES})
target_compile_definitions(${EXE_NAME} PUBLIC ${IMPL_DEFINITIONS})
@ -185,9 +202,9 @@ target_link_options(${EXE_NAME} PUBLIC LINKER:${CXX_EXTRA_LINKER_FLAGS})
target_link_options(${EXE_NAME} PUBLIC ${LINK_FLAGS} ${CXX_EXTRA_LINK_FLAGS})
# some models require the target to be already specified so they can finish their setup here
# this only happens if the MODEL.cmake definition contains the `setup_target` macro
# this only happens if the model.cmake definition contains the `setup_target` macro
if (COMMAND setup_target)
setup_target(${EXE_NAME})
endif ()
install (TARGETS ${EXE_NAME} DESTINATION bin)
install(TARGETS ${EXE_NAME} DESTINATION bin)

View File

@ -1,40 +0,0 @@
CXXFLAGS=-O3
CUDA_CXX=nvcc
ifndef NVARCH
define nvarch_help
Set NVARCH to select sm_?? version.
Default: sm_60
endef
$(info $(nvarch_help))
NVARCH=sm_60
endif
ifndef MEM
define mem_help
Set MEM to select memory mode.
Available options:
DEFAULT - allocate host and device memory pointers.
MANAGED - use CUDA Managed Memory.
PAGEFAULT - shared memory, only host pointers allocated.
endef
$(info $(mem_help))
MEM=DEFAULT
endif
MEM_MANAGED= -DMANAGED
MEM_PAGEFAULT= -DPAGEFAULT
MEM_MODE = $(MEM_$(MEM))
cuda-stream: main.cpp CUDAStream.cu
$(CUDA_CXX) -std=c++11 $(CXXFLAGS) -arch=$(NVARCH) $(MEM_MODE) -DCUDA $^ $(EXTRA_FLAGS) -o $@
.PHONY: clean
clean:
rm -f cuda-stream

View File

@ -1,11 +0,0 @@
HIP_PATH?= /opt/rocm/hip
HIPCC=$(HIP_PATH)/bin/hipcc
hip-stream: main.cpp HIPStream.cpp
$(HIPCC) $(CXXFLAGS) -O3 -std=c++11 -DHIP $^ $(EXTRA_FLAGS) -o $@
.PHONY: clean
clean:
rm -f hip-stream

View File

@ -1,98 +0,0 @@
default: kokkos-stream
ifndef DEVICE
define device_help
Set DEVICE to change flags (defaulting to OpenMP).
Available devices are:
OpenMP, Serial, Pthreads, Cuda, HIP
endef
$(info $(device_help))
DEVICE="OpenMP"
endif
KOKKOS_DEVICES="$(DEVICE)"
ifndef ARCH
define arch_help
Set ARCH to change flags (defaulting to empty).
Available architectures are:
AMDAVX
ARMv80 ARMv81 ARMv8-ThunderX
BGQ Power7 Power8 Power9
WSM SNB HSW BDW SKX KNC KNL
Kepler30 Kepler32 Kepler35 Kepler37
Maxwell50 Maxwell52 Maxwell53
Pascal60 Pascal61
Volta70 Volta72
endef
$(info $(arch_help))
ARCH=""
endif
KOKKOS_ARCH="$(ARCH)"
ifndef COMPILER
define compiler_help
Set COMPILER to change flags (defaulting to GNU).
Available compilers are:
GNU INTEL CRAY PGI ARMCLANG HIPCC
Note: you may have to do `export CXX=/path/to/hipcc` in case Kokkos detects the wrong compiler
endef
$(info $(compiler_help))
COMPILER=GNU
endif
COMPILER_ARMCLANG = armclang++
COMPILER_HIPCC = hipcc
COMPILER_GNU = g++
COMPILER_INTEL = icpc -qopt-streaming-stores=always
COMPILER_CRAY = CC
COMPILER_PGI = pgc++
CXX = $(COMPILER_$(COMPILER))
ifndef TARGET
define target_help
Set TARGET to change to offload device. Defaulting to CPU.
Available targets are:
CPU (default)
GPU
endef
$(info $(target_help))
TARGET=CPU
endif
ifeq ($(TARGET), GPU)
ifneq ($(COMPILER), HIPCC)
CXX = $(NVCC_WRAPPER)
endif
endif
OBJ = main.o KokkosStream.o
CXXFLAGS = -O3
LINKFLAGS = # empty for now
ifeq ($(COMPILER), GNU)
ifeq ($(DEVICE), OpenMP)
CXXFLAGS += -fopenmp
LINKFLAGS += -fopenmp
endif
endif
include $(KOKKOS_PATH)/Makefile.kokkos
kokkos-stream: $(OBJ) $(KOKKOS_LINK_DEPENDS)
$(CXX) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -DKOKKOS -o $@
%.o: %.cpp $(KOKKOS_CPP_DEPENDS)
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -DKOKKOS -c $<
.PHONY: clean
clean:
rm -f kokkos-stream main.o KokkosStream.o Kokkos_*.o

View File

@ -1,58 +0,0 @@
ifndef COMPILER
define compiler_help
Set COMPILER to ensure correct flags are set.
Available compilers are:
PGI GNU
endef
$(info $(compiler_help))
endif
COMPILER_ = $(CXX)
COMPILER_PGI = pgc++
COMPILER_GNU = g++
FLAGS_ = -O3 -std=c++11
FLAGS_PGI = -std=c++11 -O3 -acc
ifeq ($(COMPILER), PGI)
define target_help
Set a TARGET to ensure PGI targets the correct offload device.
Available targets are:
SNB, IVB, HSW, SKL, KNL
PWR9, AMD
KEPLER, MAXWELL, PASCAL, VOLTA
HAWAII
endef
ifndef TARGET
$(error $(target_help))
endif
TARGET_FLAGS_SNB = -ta=multicore -tp=sandybridge
TARGET_FLAGS_IVB = -ta=multicore -tp=ivybridge
TARGET_FLAGS_HSW = -ta=multicore -tp=haswell
TARGET_FLAGS_SKL = -ta=multicore -tp=skylake
TARGET_FLAGS_KNL = -ta=multicore -tp=knl
TARGET_FLAGS_PWR9 = -ta=multicore -tp=pwr9
TARGET_FLAGS_AMD = -ta=multicore -tp=zen
TARGET_FLAGS_KEPLER = -ta=nvidia:cc35
TARGET_FLAGS_MAXWELL = -ta=nvidia:cc50
TARGET_FLAGS_PASCAL = -ta=nvidia:cc60
TARGET_FLAGS_VOLTA = -ta=nvidia:cc70
TARGET_FLAGS_HAWAII = -ta=radeon:hawaii
ifeq ($(TARGET_FLAGS_$(TARGET)),)
$(error $(target_help))
endif
FLAGS_PGI += $(TARGET_FLAGS_$(TARGET))
endif
FLAGS_GNU = -O3 -std=c++11 -Drestrict=__restrict -fopenacc
CXXFLAGS = $(FLAGS_$(COMPILER))
acc-stream: main.cpp ACCStream.cpp
$(COMPILER_$(COMPILER)) $(CXXFLAGS) -DACC $^ $(EXTRA_FLAGS) -o $@
.PHONY: clean
clean:
rm -f acc-stream main.o ACCStream.o

View File

@ -1,39 +0,0 @@
ifndef COMPILER
define compiler_help
Set COMPILER to change flags (defaulting to GNU).
Available compilers are:
GNU CLANG INTEL CRAY
endef
$(info $(compiler_help))
COMPILER=GNU
endif
COMPILER_GNU = g++
COMPILER_CLANG = clang++
COMPILER_INTEL = icpc
COMPILER_CRAY = CC
CXX = $(COMPILER_$(COMPILER))
FLAGS_ = -O3 -std=c++11
FLAGS_GNU = -O3 -std=c++11
FLAGS_CLANG = -O3 -std=c++11
FLAGS_INTEL = -O3 -std=c++11
FLAGS_CRAY = -O3 -hstd=c++11
CXXFLAGS=$(FLAGS_$(COMPILER))
PLATFORM = $(shell uname -s)
ifeq ($(PLATFORM), Darwin)
LIBS = -framework OpenCL
else
LIBS = -lOpenCL
endif
ocl-stream: main.cpp OCLStream.cpp
$(CXX) $(CXXFLAGS) -DOCL $^ $(EXTRA_FLAGS) $(LIBS) -o $@
.PHONY: clean
clean:
rm -f ocl-stream

View File

@ -1,103 +0,0 @@
ifndef COMPILER
define compiler_help
Set COMPILER to change flags (defaulting to GNU).
Available compilers are:
CLANG CRAY GNU GNU_PPC INTEL XL PGI
NEC ARMCLANG AOMP FUJITSU
Note: GCC on PPC requires -mcpu=native instead of -march=native so we have a special case for it
endef
$(info $(compiler_help))
COMPILER=GNU
endif
ifndef TARGET
define target_help
Set TARGET to change device (defaulting to CPU).
Available targets are:
CPU NVIDIA AMD INTEL_GPU
endef
$(info $(target_help))
TARGET=CPU
endif
ifeq ("$(COMPILER)", "CLANG")
ifdef TARGET
ifeq ("$(TARGET)", "NVIDIA")
ifndef NVARCH
define nvarch_help
Set NVARCH to select sm_?? version.
Default: sm_60
endef
$(info $(nvarch_help))
NVARCH=sm_60
endif
endif
endif
endif
COMPILER_ARMCLANG = armclang++
COMPILER_GNU = g++
COMPILER_GNU_PPC = g++
COMPILER_INTEL = icpc
COMPILER_CRAY = CC
COMPILER_CLANG = clang++
COMPILER_XL = xlc++
COMPILER_PGI = pgc++
COMPILER_NEC = /opt/nec/ve/bin/nc++
COMPILER_AOMP = clang++
COMPILER_FUJITSU=FCC
CXX = $(COMPILER_$(COMPILER))
FLAGS_GNU = -O3 -std=c++11 -march=native
FLAGS_GNU_PPC = -O3 -std=c++11 -mcpu=native
FLAGS_INTEL = -O3 -std=c++11
FLAGS_CRAY = -O3 -std=c++11
FLAGS_CLANG = -O3 -std=c++11
FLAGS_XL = -O5 -qarch=auto -qtune=auto -std=c++11
FLAGS_PGI = -O3 -std=c++11
FLAGS_NEC = -O4 -finline -std=c++11
FLAGS_ARMCLANG = -O3 -std=c++11
FLAGS_AOMP = -O3 -std=c++11
FLAGS_FUJITSU=-Kfast -std=c++11 -KA64FX -KSVE -KARMV8_3_A -Kzfill=100 -Kprefetch_sequential=soft -Kprefetch_line=8 -Kprefetch_line_L2=16
CXXFLAGS = $(FLAGS_$(COMPILER))
# OpenMP flags for CPUs
OMP_ARMCLANG_CPU = -fopenmp
OMP_GNU_CPU = -fopenmp
OMP_GNU_PPC_CPU = -fopenmp
OMP_INTEL_CPU = -qopenmp
OMP_CRAY_CPU = -fopenmp
OMP_CLANG_CPU = -fopenmp=libomp
OMP_XL_CPU = -qsmp=omp -qthreaded
OMP_PGI_CPU = -mp
OMP_NEC_CPU = -fopenmp
OMP_FUJITSU_CPU=-Kopenmp
# OpenMP flags for NVIDIA
OMP_CRAY_NVIDIA = -DOMP_TARGET_GPU
OMP_CLANG_NVIDIA = -DOMP_TARGET_GPU -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda -Xopenmp-target -march=$(NVARCH)
OMP_GNU_NVIDIA = -DOMP_TARGET_GPU -fopenmp -foffload=nvptx-none
OMP_GNU_AMD = -DOMP_TARGET_GPU -fopenmp -foffload=amdgcn-amdhsa
OMP_INTEL_CPU = -xHOST -qopt-streaming-stores=always -qopenmp
OMP_INTEL_INTEL_GPU = -DOMP_TARGET_GPU -qnextgen -fiopenmp -fopenmp-targets=spir64
OMP_AOMP_GPU = -DOMP_TARGET_GPU -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx906
ifndef OMP_$(COMPILER)_$(TARGET)
$(error Targeting $(TARGET) with $(COMPILER) not supported)
endif
OMP = $(OMP_$(COMPILER)_$(TARGET))
omp-stream: main.cpp OMPStream.cpp
$(CXX) $(CXXFLAGS) -DOMP $^ $(OMP) $(EXTRA_FLAGS) -o $@
.PHONY: clean
clean:
rm -f omp-stream

View File

@ -1,58 +0,0 @@
ifndef TARGET
define target_help
Set TARGET to change to offload device. Defaulting to CPU.
Available targets are:
CPU (default)
GPU
endef
$(info $(target_help))
TARGET=CPU
endif
ifeq ($(TARGET), CPU)
ifndef COMPILER
define compiler_help
Set COMPILER to change flags (defaulting to GNU).
Available compilers are:
INTEL GNU CRAY XL
endef
$(info $(compiler_help))
COMPILER=GNU
endif
CXX_INTEL = icpc
CXX_GNU = g++
CXX_CRAY = CC
CXX_XL = xlc++
CXXFLAGS_INTEL = -O3 -std=c++11 -qopenmp -xHost -qopt-streaming-stores=always
CXXFLAGS_GNU = -O3 -std=c++11 -fopenmp
CXXFLAGS_CRAY = -O3 -hstd=c++11
CXXFLAGS_XL = -O5 -std=c++11 -qarch=pwr8 -qtune=pwr8 -qsmp=omp -qthreaded
CXX = $(CXX_$(COMPILER))
CXXFLAGS = -DRAJA_TARGET_CPU $(CXXFLAGS_$(COMPILER))
else ifeq ($(TARGET), GPU)
CXX = nvcc
ifndef ARCH
define arch_help
Set ARCH to ensure correct GPU architecture.
Example:
ARCH=sm_35
endef
$(error $(arch_help))
endif
CXXFLAGS = --expt-extended-lambda -O3 -std=c++11 -x cu -Xcompiler -fopenmp -arch $(ARCH)
endif
raja-stream: main.cpp RAJAStream.cpp
$(CXX) $(CXXFLAGS) -DUSE_RAJA -I$(RAJA_PATH)/include $^ $(EXTRA_FLAGS) -L$(RAJA_PATH)/lib -lRAJA -o $@
.PHONY: clean
clean:
rm -f raja-stream

View File

@ -1,36 +0,0 @@
Android (outdated instructions)
------------------
Assuming you have a recent Android NDK available, you can use the
toolchain that it provides to build GPU-STREAM. You should first
use the NDK to generate a standalone toolchain:
# Select a directory to install the toolchain to
ANDROID_NATIVE_TOOLCHAIN=/path/to/toolchain
${NDK}/build/tools/make-standalone-toolchain.sh \
--platform=android-14 \
--toolchain=arm-linux-androideabi-4.8 \
--install-dir=${ANDROID_NATIVE_TOOLCHAIN}
Make sure that the OpenCL headers and library (libOpenCL.so) are
available in `${ANDROID_NATIVE_TOOLCHAIN}/sysroot/usr/`.
You should then be able to build GPU-STREAM:
make CXX=${ANDROID_NATIVE_TOOLCHAIN}/bin/arm-linux-androideabi-g++
Copy the executable and OpenCL kernels to the device:
adb push gpu-stream-ocl /data/local/tmp
adb push ocl-stream-kernels.cl /data/local/tmp
Run GPU-STREAM from an adb shell:
adb shell
cd /data/local/tmp
# Use float if device doesn't support double, and reduce array size
./gpu-stream-ocl --float -n 6 -s 10000000

174
README.md
View File

@ -1,31 +1,53 @@
BabelStream
==========
# BabelStream
<img src="https://github.com/UoB-HPC/BabelStream/blob/gh-pages/img/BabelStreamlogo.png?raw=true" alt="logo" height="300" align="right" />
[![CI](https://github.com/UoB-HPC/BabelStream/actions/workflows/main.yaml/badge.svg?branch=main)](https://github.com/UoB-HPC/BabelStream/actions/workflows/main.yaml)
Measure memory transfer rates to/from global device memory on GPUs.
This benchmark is similar in spirit to, and based on, the STREAM benchmark [1] for CPUs.
Unlike other GPU memory bandwidth benchmarks this does *not* include the PCIe transfer time.
There are multiple implementations of this benchmark in a variety of programming models.
Currently implemented are:
- OpenCL
- CUDA
- OpenACC
- OpenMP 3 and 4.5
- C++ Parallel STL
- Kokkos
- RAJA
- SYCL
- TBB
There are multiple implementations of this benchmark in a variety of [programming models](#models).
This code was previously called GPU-STREAM.
## Table of Contents
- [Programming Models](#programming-models)
- [How is this different to STREAM?](#how-is-this-different-to-stream)
- [Building](#building)
- [CMake](#cmake)
- [GNU Make (removed)](#gnu-make)
- [Results](#results)
- [Contributing](#contributing)
- [Citing](#citing)
- [Other BabelStream publications](#other-babelstream-publications)
How is this different to STREAM?
--------------------------------
## Programming Models
BabelStream is currently implemented in the following parallel programming models, listed in no particular order:
- OpenCL
- CUDA
- HIP
- OpenACC
- OpenMP 3 and 4.5
- C++ Parallel STL
- Kokkos
- RAJA
- SYCL and SYCL 2020
- TBB
- Thrust (via CUDA or HIP)
This project also contains implementations in alternative languages with different build systems:
* Julia - [JuliaStream.jl](./src/julia/JuliaStream.jl)
* Java - [java-stream](./src/java/java-stream)
* Scala - [scala-stream](./src/scala/scala-stream)
* Rust - [rust-stream](./src/rust/rust-stream)
## How is this different to STREAM?
BabelStream implements the four main kernels of the STREAM benchmark (along with a dot product), but by utilising different programming models it expands the platforms on which the code can run beyond CPUs.
@ -42,34 +64,46 @@ BabelStream therefore provides a measure of what memory bandwidth performance ca
BabelStream also includes the nstream kernel from the Parallel Research Kernels (PRK) project, available on [GitHub](https://github.com/ParRes/Kernels).
Details about PRK can be found in the following references:
> Van der Wijngaart, Rob F., and Timothy G. Mattson. The parallel research kernels. IEEE High Performance Extreme Computing Conference (HPEC). IEEE, 2014.
* Van der Wijngaart, Rob F., and Timothy G. Mattson. The parallel research kernels. IEEE High Performance Extreme Computing Conference (HPEC). IEEE, 2014.
> R. F. Van der Wijngaart, A. Kayi, J. R. Hammond, G. Jost, T. St. John, S. Sridharan, T. G. Mattson, J. Abercrombie, and J. Nelson. Comparing runtime systems with exascale ambitions using the Parallel Research Kernels. ISC 2016, [DOI: 10.1007/978-3-319-41321-1_17](https://doi.org/10.1007/978-3-319-41321-1_17).
* R. F. Van der Wijngaart, A. Kayi, J. R. Hammond, G. Jost, T. St. John, S. Sridharan, T. G. Mattson, J. Abercrombie, and J. Nelson. Comparing runtime systems with exascale ambitions using the Parallel Research Kernels. ISC 2016, [DOI: 10.1007/978-3-319-41321-1_17](https://doi.org/10.1007/978-3-319-41321-1_17).
> Jeff R. Hammond and Timothy G. Mattson. Evaluating data parallelism in C++ using the Parallel Research Kernels. IWOCL 2019, [DOI: 10.1145/3318170.3318192](https://doi.org/10.1145/3318170.3318192).
* Jeff R. Hammond and Timothy G. Mattson. Evaluating data parallelism in C++ using the Parallel Research Kernels. IWOCL 2019, [DOI: 10.1145/3318170.3318192](https://doi.org/10.1145/3318170.3318192).
Website
-------
[uob-hpc.github.io/BabelStream/](https://uob-hpc.github.io/BabelStream/)
Usage
-----
## Building
Drivers, compilers, and any other software applicable to whichever implementation you would like to build against are required.
### CMake
The project supports building with CMake >= 3.13.0, it can be installed without root via the [official script](https://cmake.org/download/).
As with any CMake project, first configure the project:
The project supports building with CMake >= 3.13.0, which can be installed without root via the [official script](https://cmake.org/download/).
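Before configuring, you can verify that the CMake on your `PATH` meets the minimum version:
```shell
$ cmake --version   # must report 3.13.0 or newer
```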
Each BabelStream implementation (programming model) is built as follows:
```shell
> cd babelstream
> cmake -Bbuild -H. -DMODEL=<model> <model specific flags prefixed with -D...> # configure the build, build type defaults to Release
> cmake --build build # compile it
> ./build/babelstream # executable available at ./build/
$ cd babelstream
# configure the build, build type defaults to Release
# The -DMODEL flag is required
$ cmake -Bbuild -H. -DMODEL=<model> <model specific flags prefixed with -D...>
# compile
$ cmake --build build
# run executables in ./build
$ ./build/<model>-stream
```
The `MODEL` option selects one implementation of BabelStream to build.
The source for each model's implementation is located in `./src/<model>`.
Currently available models are:
```
omp;ocl;std;std20;hip;cuda;kokkos;sycl;sycl2020;acc;raja;tbb;thrust
```
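For example, substituting a concrete model name (here, the OpenMP implementation):
```shell
$ cmake -Bbuild -H. -DMODEL=omp && cmake --build build   # produces ./build/omp-stream
```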
#### Overriding default flags
By default, we have defined a set of optimal flags for known HPC compilers.
These are assigned to `RELEASE_FLAGS`, and you can override them if required.
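For instance, to replace the default release flags and append an extra compiler flag (the values below are placeholders, not recommendations):
```shell
$ cmake -Bbuild -H. -DMODEL=omp -DRELEASE_FLAGS="-O2" -DCXX_EXTRA_FLAGS="-march=native"
```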
@ -77,7 +111,7 @@ To find out what flag each model supports or requires, simply configure while on
For example:
```shell
> cd babelstream
> cmake -Bbuild -H. -DMODEL=OCL
> cmake -Bbuild -H. -DMODEL=ocl
...
- Common Release flags are `-O3`, set RELEASE_FLAGS to override
-- CXX_EXTRA_FLAGS:
@ -91,90 +125,60 @@ For example:
Use this for linking extra libraries (e.g `-lmylib`, or simply `mylib`)
-- CXX_EXTRA_LINKER_FLAGS:
Append to linker flags (i.e GCC's `-Wl` or equivalent)
-- Available models: OMP;OCL;STD;STD20;HIP;CUDA;KOKKOS;SYCL;ACC;RAJA;TBB
-- Selected model : OCL
-- Available models: omp;ocl;std;std20;hip;cuda;kokkos;sycl;acc;raja;tbb
-- Selected model : ocl
-- Supported flags:
CMAKE_CXX_COMPILER (optional, default=c++): Any CXX compiler that is supported by CMake detection
OpenCL_LIBRARY (optional, default=): Path to OpenCL library, usually called libOpenCL.so
...
```
Alternatively, refer to the [CI script](./ci-test-compile.sh), which test-compiles most of the models, and see which flags are used there.
*It is recommended that you delete the `build` directory when you change any of the build flags.*
Alternatively, refer to the [CI script](./src/ci-test-compile.sh), which test-compiles most of the models, and see which flags are used there.
*It is recommended that you delete the `build` directory when you change any of the build flags.*
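In practice, that means removing the old cache before reconfiguring with different flags:
```shell
$ rm -rf build
$ cmake -Bbuild -H. -DMODEL=omp -DCXX_EXTRA_FLAGS="-Ofast"   # placeholder flag values
```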
### GNU Make
We have supplied a series of Makefiles, one for each programming model, to assist with building.
The Makefiles contain common build options, and should be simple to customise for your needs too.
Support for Make has been removed from 4.0 onwards.
However, as the build process only involves a few source files, the required compile commands can be extracted from the CI output.
General usage is `make -f <Model>.make`
Common compiler flags and names can be set by passing a `COMPILER` option to Make, e.g. `make COMPILER=GNU`.
Some models allow specifying a CPU or GPU style target, and this can be set by passing a `TARGET` option to Make, e.g. `make TARGET=GPU`.
<!-- TODO add CI snippet here -->
Pass in extra flags via the `EXTRA_FLAGS` option.
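As a rough guide only, a direct compile of the OpenMP model used to reduce to a single command similar to the one below (a sketch using the flags from the removed GNU OMP Makefile; the exact source paths and flags for the current layout should be taken from the CI output):
```shell
# Sketch based on the old OMP Makefile (GNU compiler, CPU target); adjust paths
# to the current src/<model> layout and flags to your compiler as needed.
g++ -O3 -std=c++11 -march=native -fopenmp -DOMP main.cpp OMPStream.cpp -o omp-stream
```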
## Results
The binaries are named in the form `<model>-stream`.
Sample results can be found in the `results` subdirectory.
Newer results are found in our [Performance Portability](https://github.com/UoB-HPC/performance-portability) repository.
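A typical invocation of such a binary (a sketch; `-s` sets the array size and `-n` the number of iterations, the values below being those used by the CI sanity check):
```shell
$ ./build/omp-stream -s 1048576 -n 10
```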
#### Building Kokkos for Make
Kokkos version >= 3 requires setting the `KOKKOS_PATH` flag to the *source* directory of a distribution.
For example:
```
cd
wget https://github.com/kokkos/kokkos/archive/3.1.01.tar.gz
tar -xvf 3.1.01.tar.gz # should end up with ~/kokkos-3.1.01
cd BabelStream
make -f Kokkos.make KOKKOS_PATH=~/kokkos-3.1.01
```
See make output for more information on supported flags.
#### Building RAJA for Make
We use the following command to build RAJA using the Intel Compiler.
```
cmake .. -DCMAKE_INSTALL_PREFIX=<prefix> -DCMAKE_C_COMPILER=icc -DCMAKE_CXX_COMPILER=icpc -DRAJA_PTR="RAJA_USE_RESTRICT_ALIGNED_PTR" -DCMAKE_BUILD_TYPE=ICCBuild -DRAJA_ENABLE_TESTS=Off
```
For building with CUDA support, we use the following command.
```
cmake .. -DCMAKE_INSTALL_PREFIX=<prefix> -DRAJA_PTR="RAJA_USE_RESTRICT_ALIGNED_PTR" -DRAJA_ENABLE_CUDA=1 -DRAJA_ENABLE_TESTS=Off
```
Results
-------
Sample results can be found in the `results` subdirectory. If you would like to submit updated results, please submit a Pull Request.
Contributing
------------
## Contributing
As of v4.0, the `main` branch of this repository will hold the latest released version.
The `develop` branch will contain unreleased features due for the next (major and/or minor) release of BabelStream.
Pull Requests should be made against the `develop` branch.
Citing
------
## Citing
Please cite BabelStream via this reference:
> Deakin T, Price J, Martineau M, McIntosh-Smith S. GPU-STREAM v2.0: Benchmarking the achievable memory bandwidth of many-core processors across diverse parallel programming models. 2016. Paper presented at P^3MA Workshop at ISC High Performance, Frankfurt, Germany. DOI: 10.1007/978-3-319-46079-6_34
Deakin T, Price J, Martineau M, McIntosh-Smith S. GPU-STREAM v2.0: Benchmarking the achievable memory bandwidth of many-core processors across diverse parallel programming models. 2016. Paper presented at P^3MA Workshop at ISC High Performance, Frankfurt, Germany. DOI: 10.1007/978-3-319-46079-6_34
**Other BabelStream publications:**
### Other BabelStream publications
> Deakin T, Price J, Martineau M, McIntosh-Smith S. Evaluating attainable memory bandwidth of parallel programming models via BabelStream. International Journal of Computational Science and Engineering. Special issue. Vol. 17, No. 3, pp. 247–262. 2018. DOI: 10.1504/IJCSE.2018.095847
* Deakin T, Price J, Martineau M, McIntosh-Smith S. Evaluating attainable memory bandwidth of parallel programming models via BabelStream. International Journal of Computational Science and Engineering. Special issue. Vol. 17, No. 3, pp. 247–262. 2018. DOI: 10.1504/IJCSE.2018.095847
> Deakin T, McIntosh-Smith S. GPU-STREAM: Benchmarking the achievable memory bandwidth of Graphics Processing Units. 2015. Poster session presented at IEEE/ACM SuperComputing, Austin, United States.
You can view the [Poster and Extended Abstract](http://sc15.supercomputing.org/sites/all/themes/SC15images/tech_poster/tech_poster_pages/post150.html).
* Deakin T, McIntosh-Smith S. GPU-STREAM: Benchmarking the achievable memory bandwidth of Graphics Processing Units. 2015. Poster session presented at IEEE/ACM SuperComputing, Austin, United States.
You can view the [Poster and Extended Abstract](http://sc15.supercomputing.org/sites/all/themes/SC15images/tech_poster/tech_poster_pages/post150.html).
> Deakin T, Price J, Martineau M, McIntosh-Smith S. GPU-STREAM: Now in 2D!. 2016. Poster session presented at IEEE/ACM SuperComputing, Salt Lake City, United States.
You can view the [Poster and Extended Abstract](http://sc16.supercomputing.org/sc-archive/tech_poster/tech_poster_pages/post139.html).
* Deakin T, Price J, Martineau M, McIntosh-Smith S. GPU-STREAM: Now in 2D!. 2016. Poster session presented at IEEE/ACM SuperComputing, Salt Lake City, United States.
You can view the [Poster and Extended Abstract](http://sc16.supercomputing.org/sc-archive/tech_poster/tech_poster_pages/post139.html).
> Raman K, Deakin T, Price J, McIntosh-Smith S. Improving achieved memory bandwidth from C++ codes on Intel Xeon Phi Processor (Knights Landing). IXPUG Spring Meeting, Cambridge, UK, 2017.
* Raman K, Deakin T, Price J, McIntosh-Smith S. Improving achieved memory bandwidth from C++ codes on Intel Xeon Phi Processor (Knights Landing). IXPUG Spring Meeting, Cambridge, UK, 2017.
> Deakin T, Price J, McIntosh-Smith S. Portable methods for measuring cache hierarchy performance. 2017. Poster sessions presented at IEEE/ACM SuperComputing, Denver, United States.
You can view the [Poster and Extended Abstract](http://sc17.supercomputing.org/SC17%20Archive/tech_poster/tech_poster_pages/post155.html)
* Deakin T, Price J, McIntosh-Smith S. Portable methods for measuring cache hierarchy performance. 2017. Poster sessions presented at IEEE/ACM SuperComputing, Denver, United States.
You can view the [Poster and Extended Abstract](http://sc17.supercomputing.org/SC17%20Archive/tech_poster/tech_poster_pages/post155.html)
[1]: McCalpin, John D., 1995: "Memory Bandwidth and Machine Balance in Current High Performance Computers", IEEE Computer Society Technical Committee on Computer Architecture (TCCA) Newsletter, December 1995.

View File

@ -1,14 +0,0 @@
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# For full license terms please see the LICENSE file distributed with this
# source code
CXXFLAGS=-O3 -std=c++17 -stdpar -DSTD
STD_CXX=nvc++
std-stream: main.cpp STDStream.cpp
$(STD_CXX) $(CXXFLAGS) $^ $(EXTRA_FLAGS) -o $@
.PHONY: clean
clean:
rm -f std-stream

View File

@ -1,26 +0,0 @@
ifndef COMPILER
define compiler_help
Set COMPILER to change flags (defaulting to GNU).
Available compilers are:
GNU
endef
$(info $(compiler_help))
COMPILER=GNU
endif
COMPILER_GNU = g++
CXX = $(COMPILER_$(COMPILER))
FLAGS_GNU = -O3 -std=c++2a -march=native
CXXFLAGS = $(FLAGS_$(COMPILER))
std20-stream: main.cpp STD20Stream.cpp
$(CXX) -DSTD20 $(CXXFLAGS) $^ $(EXTRA_FLAGS) -o $@
.PHONY: clean
clean:
rm -f std20-stream

View File

@ -1,81 +0,0 @@
ifndef COMPILER
define compiler_help
Set COMPILER to change flags (defaulting to GNU).
Available compilers are:
HIPSYCL, DPCPP, COMPUTECPP
For HIPSYCL and COMPUTECPP, SYCL_SDK_DIR must be specified, the directory should contain [/lib, /bin, ...]
For DPCPP, the compiler must be on path
endef
$(info $(compiler_help))
COMPILER=HIPSYCL
endif
ifndef TARGET
define target_help
Set TARGET to change device (defaulting to CPU).
Available targets are:
CPU AMD NVIDIA
endef
$(info $(target_help))
TARGET=CPU
endif
ifndef ARCH
define arch_help
Set ARCH to change device (defaulting to "").
(GPU *only*) Available targets for HIPSYCL are:
For CUDA, the architecture has the form sm_XX, e.g. sm_60 for Pascal.
For ROCm, the architecture has the form gfxYYY, e.g. gfx900 for Vega 10, gfx906 for Vega 20.
endef
ifeq ($(COMPILER), HIPSYCL)
ifneq ($(TARGET), CPU)
$(info $(arch_help))
ARCH=
endif
endif
endif
SYCL_COMPUTECPP_SYCLFLAGS = $(shell $(SYCL_SDK_DIR)/bin/computecpp_info --dump-device-compiler-flags) -no-serial-memop -sycl-driver
SYCL_COMPUTECPP_SYCLFLAGS_CPU = $(SYCL_COMPUTECPP_SYCLFLAGS)
SYCL_COMPUTECPP_SYCLFLAGS_AMD = $(SYCL_COMPUTECPP_SYCLFLAGS)
SYCL_COMPUTECPP_SYCLFLAGS_NVIDIA = $(SYCL_COMPUTECPP_SYCLFLAGS) -sycl-target ptx64
SYCL_COMPUTECPP_SYCLCXX = $(SYCL_SDK_DIR)/bin/compute++
SYCL_COMPUTECPP_FLAGS = -O3 -std=c++17
SYCL_COMPUTECPP_LINK_FLAGS = -Wl,-rpath=$(SYCL_SDK_DIR)/lib/ $(SYCL_SDK_DIR)/lib/libComputeCpp.so -lOpenCL
SYCL_COMPUTECPP_INCLUDE = -I$(SYCL_SDK_DIR)/include
SYCL_HIPSYCL_SYCLFLAGS_CPU = --hipsycl-platform=cpu
SYCL_HIPSYCL_SYCLFLAGS_AMD = --hipsycl-platform=rocm --hipsycl-gpu-arch=$(ARCH)
SYCL_HIPSYCL_SYCLFLAGS_NVIDIA = --hipsycl-platform=cuda --hipsycl-gpu-arch=$(ARCH)
SYCL_HIPSYCL_SYCLCXX = $(SYCL_SDK_DIR)/bin/syclcc
SYCL_HIPSYCL_FLAGS = -O3 --std=c++17
SYCL_HIPSYCL_LINK_FLAGS = -L$(SYCL_SDK_DIR)/lib -Wl,-rpath,$(SYCL_SDK_DIR)/lib
SYCL_HIPSYCL_INCLUDE =
SYCL_DPCPP_SYCLFLAGS_NVIDIA = -fsycl -fsycl-targets=nvptx64-nvidia-cuda-sycldevice -fsycl-unnamed-lambda
SYCL_DPCPP_SYCLCXX = dpcpp
SYCL_DPCPP_FLAGS = -O3 --std=c++17
SYCL_DPCPP_LINK_FLAGS =
SYCL_DPCPP_INCLUDE =
SYCL_SYCLFLAGS = $(SYCL_$(COMPILER)_SYCLFLAGS_$(TARGET))
SYCL_SYCLCXX = $(SYCL_$(COMPILER)_SYCLCXX)
SYCL_FLAGS = $(SYCL_$(COMPILER)_FLAGS)
SYCL_LINK_FLAGS = $(SYCL_$(COMPILER)_LINK_FLAGS)
SYCL_INCLUDE = $(SYCL_$(COMPILER)_INCLUDE)
# only ComputeCpp generates .sycl files which is a bit odd to deal with so we opted to compile everything together
sycl-stream: main.cpp SYCLStream.cpp
$(SYCL_SYCLCXX) $(SYCL_SYCLFLAGS) $(SYCL_FLAGS) $(SYCL_INCLUDE) -DSYCL $(EXTRA_FLAGS) $(SYCL_LINK_FLAGS) $^ -o $@
.PHONY: clean
clean:
rm -f sycl-stream

View File

@ -1,56 +0,0 @@
ifndef COMPILER
define compiler_help
Set COMPILER to change flags (defaulting to GNU).
Available compilers are:
GNU INTEL INTEL_LEGACY
endef
$(info $(compiler_help))
COMPILER=GNU
endif
CXX_GNU = g++
CXX_INTEL = icpx
CXX_INTEL_LEGACY = icpc
CXX = $(COMPILER_$(COMPILER))
CXXFLAGS_GNU = -march=native
CXXFLAGS_INTEL = -march=native
CXXFLAGS_INTEL_LEGACY = -qopt-streaming-stores=always
CXX = $(CXX_$(COMPILER))
CXXFLAGS = -std=c++11 -O3 $(CXXFLAGS_$(COMPILER))
ifndef PARTITIONER
define partitioner_help
Set PARTITIONER to select TBB's partitioner.
Partitioner specifies how a loop template should partition its work among threads.
Available options:
AUTO - Optimize range subdivision based on work-stealing events.
AFFINITY - Proportional splitting that optimizes for cache affinity.
STATIC - Distribute work uniformly with no additional load balancing.
SIMPLE - Recursively split its range until it cannot be further subdivided.
See https://spec.oneapi.com/versions/latest/elements/oneTBB/source/algorithms.html#partitioners
for more details.
endef
$(info $(partitioner_help))
PARTITIONER=AUTO
endif
PARTITIONER_MODE = -DPARTITIONER_$(PARTITIONER)
tbb-stream: main.cpp TBBStream.cpp
$(CXX) -DTBB $(PARTITIONER_MODE) $(CXXFLAGS) $^ $(EXTRA_FLAGS) -I$(TBB_DIR)/include -Wl,-rpath,$(TBB_DIR)/lib/intel64/gcc4.8 $(TBB_DIR)/lib/intel64/gcc4.8/libtbb.so -o $@
.PHONY: clean
clean:
rm -f tbb-stream

View File

@ -131,22 +131,24 @@ endfunction()
macro(register_model NAME PREPROCESSOR_NAME)
string(TOUPPER ${NAME} MODEL_UPPER)
list(APPEND REGISTERED_MODELS "${NAME}")
list(APPEND IMPL_${MODEL_UPPER}_SOURCES "${ARGN}")
string(TOUPPER ${NAME} MODEL_UPPER)
list(APPEND IMPL_${MODEL_UPPER}_SOURCES "src/${NAME}/${ARGN}")
list(APPEND IMPL_${MODEL_UPPER}_DEFINITIONS "${PREPROCESSOR_NAME}")
endmacro()
macro(load_model MODEL)
string(TOUPPER "${MODEL}" MODEL_UPPER)
if ("${MODEL_UPPER}" IN_LIST REGISTERED_MODELS)
set(MODEL_FILE ${CMAKE_CURRENT_SOURCE_DIR}/${MODEL_UPPER}.cmake)
if ("${MODEL}" IN_LIST REGISTERED_MODELS)
string(TOLOWER "${MODEL}" MODEL_LOWER)
set(MODEL_FILE ${CMAKE_CURRENT_SOURCE_DIR}/src/${MODEL_LOWER}/model.cmake)
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/src/${MODEL_LOWER})
if (NOT EXISTS ${MODEL_FILE})
message(FATAL_ERROR "${MODEL_FILE} not found, perhaps it needs to be implemented?")
endif ()
include(${MODEL_FILE})
string(TOUPPER "${MODEL}" MODEL_UPPER)
list(APPEND IMPL_SOURCES ${IMPL_${MODEL_UPPER}_SOURCES})
list(APPEND IMPL_DEFINITIONS ${IMPL_${MODEL_UPPER}_DEFINITIONS})

View File

@ -1,21 +0,0 @@
HCC = hcc
CXXFLAGS+=-O3 $(shell hcc-config --cxxflags)
LDFLAGS+=$(shell hcc-config --ldflags)
ifdef TBSIZE
CXXFLAGS+=-DVIRTUALTILESIZE=$(TBSIZE)
endif
ifdef NTILES
CXXFLAGS+=-DNTILES=$(TBSIZE)
endif
hc-stream: ../main.cpp HCStream.cpp
$(HCC) $(CXXFLAGS) -DHC $^ $(LDFLAGS) $(EXTRA_FLAGS) -o $@
.PHONY: clean
clean:
rm -f hc-stream

29
src/.gitignore vendored Normal file
View File

@ -0,0 +1,29 @@
**/cuda-stream
**/ocl-stream
**/omp-stream
**/acc-stream
**/raja-stream
**/kokkos-stream
**/std-stream
**/sycl-stream
**/hip-stream
**/*.o
**/*.bc
**/*.sycl
**/*.tar
**/*.gz
**/*.a
**/KokkosCore_Config_*
**/.DS_Store
build/
cmake-build-*/
CMakeFiles/
.idea/
.vscode/
.directory

View File

@ -135,17 +135,20 @@ setup_aocc() {
setup_nvhpc() {
echo "Preparing Nvidia HPC SDK"
local tarball="nvhpc.tar.gz"
# local url="http://localhost:8000/nvhpc_2021_215_Linux_x86_64_cuda_11.3.tar.gz"
local url="https://developer.download.nvidia.com/hpc-sdk/21.5/nvhpc_2021_215_Linux_x86_64_cuda_11.3.tar.gz"
# local url="http://localhost:8000/nvhpc_2021_219_Linux_x86_64_cuda_11.4.tar.gz"
local url="https://developer.download.nvidia.com/hpc-sdk/21.9/nvhpc_2021_219_Linux_x86_64_cuda_11.4.tar.gz"
get_and_untar "$tarball" "$url"
local sdk_dir="$PWD/nvhpc_2021_215_Linux_x86_64_cuda_11.3/install_components/Linux_x86_64/21.5"
local sdk_dir="$PWD/nvhpc_2021_219_Linux_x86_64_cuda_11.4/install_components/Linux_x86_64/21.9"
local bin_dir="$sdk_dir/compilers/bin"
"$bin_dir/makelocalrc" "$bin_dir" -x
export_var NVHPC_SDK_DIR "$sdk_dir"
export_var NVHPC_CUDA_DIR "$sdk_dir/cuda/11.4"
export_var NVHPC_NVCXX "$bin_dir/nvc++"
export_var NVHPC_NVCC "$sdk_dir/cuda/11.3/bin/nvcc"
export_var NVHPC_CUDA_DIR "$sdk_dir/cuda/11.3"
export_var NVHPC_NVCC "$sdk_dir/cuda/11.4/bin/nvcc"
echo "Installed CUDA versions:"
ls "$sdk_dir/cuda"
verify_bin_exists "$NVHPC_NVCXX"
@ -222,10 +225,7 @@ setup_tbb() {
setup_clang_gcc() {
echo "deb http://archive.ubuntu.com/ubuntu focal main universe" | sudo tee -a /etc/apt/sources.list
sudo apt-get update -qq
sudo apt-get install -y -qq gcc-10-offload-nvptx gcc-10-offload-amdgcn libtbb2 libtbb-dev g++-10
sudo apt-get install -y -qq gcc-10-offload-nvptx gcc-10-offload-amdgcn libtbb2 libtbb-dev g++-10 clang libomp-dev
export_var GCC_CXX "$(which g++-10)"
verify_bin_exists "$GCC_CXX"
@ -248,11 +248,9 @@ setup_clang_gcc() {
}
setup_rocm() {
wget -q -O - "https://repo.radeon.com/rocm/rocm.gpg.key" | sudo apt-key add -
echo 'deb [arch=amd64] https://repo.radeon.com/rocm/apt/debian/ xenial main' | sudo tee /etc/apt/sources.list.d/rocm.list
sudo apt-get update -qq
sudo apt-get install -y -qq rocm-dev
sudo apt-get install -y -qq rocm-dev rocthrust-dev
export_var ROCM_PATH "/opt/rocm"
export_var PATH "$ROCM_PATH/bin:$PATH" # ROCm needs this for many of their libraries' CMake build to work
export_var HIP_CXX "$ROCM_PATH/bin/hipcc"
verify_bin_exists "$HIP_CXX"
"$HIP_CXX" --version
@ -316,9 +314,21 @@ if [ "${GITHUB_ACTIONS:-false}" = true ]; then
echo "Running in GitHub Actions, defaulting to special export"
TERM=xterm
export TERM=xterm
# drop the lock in case we got one from a failed run
rm /var/lib/dpkg/lock-frontend || true
rm /var/cache/apt/archives/lock || true
wget -q -O - "https://repo.radeon.com/rocm/rocm.gpg.key" | sudo apt-key add -
echo "deb http://archive.ubuntu.com/ubuntu focal main universe" | sudo tee -a /etc/apt/sources.list
echo 'deb [arch=amd64] https://repo.radeon.com/rocm/apt/4.5 ubuntu main' | sudo tee /etc/apt/sources.list.d/rocm.list
sudo apt-get update -qq
sudo apt-get install -y -qq cmake
if [ "$SETUP" = true ]; then
echo "Deleting extra packages for space in 5 seconds..."
sleep 5
echo "Deleting extra packages for space in 2 seconds..."
sleep 2
echo "Starting apt-get remove:"
sudo apt-get remove -y azure-cli google-cloud-sdk hhvm google-chrome-stable firefox powershell mono-devel
sudo apt-get autoremove -y

View File

@ -57,7 +57,7 @@ run_build() {
local cmake_code=$?
"$CMAKE_BIN" --build "$build" -j "$(nproc)" &>>"$log"
"$CMAKE_BIN" --build "$build" --target install -j "$(nproc)" &>>"$log"
"$CMAKE_BIN" --build "$build" --target install -j "$(nproc)" &>>"$log"
local cmake_code=$?
set -e
@ -88,32 +88,30 @@ run_build() {
###
# KOKKOS_SRC="/home/tom/Downloads/kokkos-3.3.00"
# RAJA_SRC="/home/tom/Downloads/RAJA-v0.13.0"
# GCC_CXX="/usr/bin/g++"
#
# GCC_CXX="$(which g++-10)"
# CLANG_CXX="/usr/bin/clang++"
# NVSDK="/home/tom/Downloads/nvhpc_2021_212_Linux_x86_64_cuda_11.2/install_components/Linux_x86_64/21.2/"
# NVHPC_NVCXX="$NVSDK/compilers/bin/nvc++"
# NVHPC_NVCC="$NVSDK/cuda/11.2/bin/nvcc"
# NVHPC_CUDA_DIR="$NVSDK/cuda/11.2"
# "$NVSDK/compilers/bin/makelocalrc" "$NVSDK/compilers/bin/" -x
#
# NVHPC_SDK_DIR="/home/tom/Downloads/nvhpc_2021_219_Linux_x86_64_cuda_multi/install_components/Linux_x86_64/21.9/"
# NVHPC_NVCXX="$NVHPC_SDK_DIR/compilers/bin/nvc++"
# NVHPC_NVCC="$NVHPC_SDK_DIR/cuda/11.4/bin/nvcc"
# NVHPC_CUDA_DIR="$NVHPC_SDK_DIR/cuda/11.4"
# "$NVHPC_SDK_DIR/compilers/bin/makelocalrc" "$NVHPC_SDK_DIR/compilers/bin/" -x
#
# AOCC_CXX="/opt/AMD/aocc-compiler-2.3.0/bin/clang++"
# AOMP_CXX="/usr/lib/aomp/bin/clang++"
# OCL_LIB="/home/tom/Downloads/oclcpuexp-2020.11.11.0.04_rel/x64/libOpenCL.so"
#
# # AMD needs this rocm_path thing exported...
# export ROCM_PATH="/opt/rocm-4.0.0"
# HIP_CXX="/opt/rocm-4.0.0/bin/hipcc"
# COMPUTECPP_DIR="/home/tom/Desktop/computecpp_archive/ComputeCpp-CE-2.3.0-x86_64-linux-gnu"
# export ROCM_PATH="/opt/rocm-4.5.0"
# HIP_CXX="/opt/rocm-4.5.0/bin/hipcc"
# COMPUTECPP_DIR="/home/tom/Downloads/ComputeCpp-CE-2.7.0-x86_64-linux-gnu/"
# DPCPP_DIR="/home/tom/Downloads/dpcpp_compiler"
# HIPSYCL_DIR="/opt/hipsycl/cff515c/"
# ICPX_CXX="/opt/intel/oneapi/compiler/2021.1.2/linux/bin/icpx"
# ICPC_CXX="/opt/intel/oneapi/compiler/2021.1.2/linux/bin/intel64/icpc"
# TBB_LIB="/home/tom/Downloads/oneapi-tbb-2021.1.1/"
#
# ICPX_CXX="/opt/intel/oneapi/compiler/2021.4.0/linux/bin/icpx"
# ICPC_CXX="/opt/intel/oneapi/compiler/2021.4.0/linux/bin/intel64/icpc"
# TBB_LIB="/home/tom/Downloads/oneapi-tbb-2021.1.1/"
#
# GCC_STD_PAR_LIB="tbb"
# CLANG_STD_PAR_LIB="tbb"
# GCC_OMP_OFFLOAD_AMD=false
@ -124,102 +122,136 @@ run_build() {
AMD_ARCH="gfx_903"
NV_ARCH="sm_70"
NV_ARCH_CCXY="cuda11.3,cc80"
NV_ARCH_CCXY="cuda11.4,cc80"
build_gcc() {
local name="gcc_build"
local cxx="-DCMAKE_CXX_COMPILER=${GCC_CXX:?}"
run_build $name "${GCC_CXX:?}" OMP "$cxx"
run_build $name "${GCC_CXX:?}" omp "$cxx"
if [ "$MODEL" = "all" ] || [ "$MODEL" = "OMP" ]; then
# sanity check that it at least runs
echo "Sanity checking GCC OMP build..."
"./$BUILD_DIR/OMP_$name/omp-stream" -s 1048576 -n 10
echo "Sanity checking GCC omp build..."
"./$BUILD_DIR/omp_$name/omp-stream" -s 1048576 -n 10
fi
# some distributions like Ubuntu bionic implement std par with TBB, so conditionally link it here
run_build $name "${GCC_CXX:?}" STD "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}"
run_build $name "${GCC_CXX:?}" STD20 "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}"
run_build $name "${GCC_CXX:?}" std "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}"
run_build $name "${GCC_CXX:?}" std20 "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}"
run_build $name "${GCC_CXX:?}" TBB "$cxx -DONE_TBB_DIR=$TBB_LIB"
run_build $name "${GCC_CXX:?}" TBB "$cxx" # build TBB again with the system TBB
run_build $name "${GCC_CXX:?}" tbb "$cxx -DONE_TBB_DIR=$TBB_LIB"
run_build $name "${GCC_CXX:?}" tbb "$cxx" # build TBB again with the system TBB
if [ "${GCC_OMP_OFFLOAD_AMD:-false}" != "false" ]; then
run_build "amd_$name" "${GCC_CXX:?}" ACC "$cxx -DCXX_EXTRA_FLAGS=-foffload=amdgcn-amdhsa"
run_build "amd_$name" "${GCC_CXX:?}" OMP "$cxx -DOFFLOAD=AMD:$AMD_ARCH"
run_build "amd_$name" "${GCC_CXX:?}" acc "$cxx -DCXX_EXTRA_FLAGS=-foffload=amdgcn-amdhsa"
run_build "amd_$name" "${GCC_CXX:?}" omp "$cxx -DOFFLOAD=AMD:$AMD_ARCH"
fi
if [ "${GCC_OMP_OFFLOAD_NVIDIA:-false}" != "false" ]; then
run_build "nvidia_$name" "${GCC_CXX:?}" ACC "$cxx -DCXX_EXTRA_FLAGS=-foffload=nvptx-none"
run_build "nvidia_$name" "${GCC_CXX:?}" OMP "$cxx -DOFFLOAD=NVIDIA:$NV_ARCH"
run_build "nvidia_$name" "${GCC_CXX:?}" acc "$cxx -DCXX_EXTRA_FLAGS=-foffload=nvptx-none"
run_build "nvidia_$name" "${GCC_CXX:?}" omp "$cxx -DOFFLOAD=NVIDIA:$NV_ARCH"
fi
run_build $name "${GCC_CXX:?}" CUDA "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH"
run_build $name "${GCC_CXX:?}" CUDA "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DMEM=MANAGED"
run_build $name "${GCC_CXX:?}" CUDA "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DMEM=PAGEFAULT"
# run_build $name "${CC_CXX:?}" KOKKOS "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_CUDA=ON"
run_build "cuda_$name" "${GCC_CXX:?}" KOKKOS "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON"
run_build $name "${GCC_CXX:?}" OCL "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}"
run_build $name "${GCC_CXX:?}" RAJA "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?}"
run_build $name "${GCC_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH"
run_build $name "${GCC_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DMEM=MANAGED"
run_build $name "${GCC_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DMEM=PAGEFAULT"
# run_build $name "${CC_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_CUDA=ON"
run_build "cuda_$name" "${GCC_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON"
run_build $name "${GCC_CXX:?}" ocl "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}"
run_build $name "${GCC_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?}"
# FIXME fails due to https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100102
# FIXME we also got https://github.com/NVIDIA/nccl/issues/494
# run_build "cuda_$name" "${GCC_CXX:?}" RAJA "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?} \
# run_build "cuda_$name" "${GCC_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?} \
# -DENABLE_CUDA=ON \
# -DTARGET=NVIDIA \
# -DCUDA_TOOLKIT_ROOT_DIR=${NVHPC_CUDA_DIR:?} \
# -DCUDA_ARCH=$NV_ARCH"
# CMake >= 3.15 only due to Nvidia's Thrust CMake requirements
local current=$("$CMAKE_BIN" --version | head -n 1 | cut -d ' ' -f3)
local required="3.15.0"
if [ "$(printf '%s\n' "$required" "$current" | sort -V | head -n1)" = "$required" ]; then
run_build $name "${GCC_CXX:?}" thrust "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/include -DTHRUST_IMPL=CUDA -DBACKEND=CUDA"
run_build $name "${GCC_CXX:?}" thrust "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/include -DTHRUST_IMPL=CUDA -DBACKEND=OMP"
run_build $name "${GCC_CXX:?}" thrust "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/include -DTHRUST_IMPL=CUDA -DBACKEND=CPP"
# FIXME CUDA Thrust + TBB throws the following error:
# /usr/lib/gcc/x86_64-linux-gnu/9/include/avx512fintrin.h(9146): error: identifier "__builtin_ia32_rndscaless_round" is undefined
# /usr/lib/gcc/x86_64-linux-gnu/9/include/avx512fintrin.h(9155): error: identifier "__builtin_ia32_rndscalesd_round" is undefined
# /usr/lib/gcc/x86_64-linux-gnu/9/include/avx512fintrin.h(14797): error: identifier "__builtin_ia32_rndscaless_round" is undefined
# /usr/lib/gcc/x86_64-linux-gnu/9/include/avx512fintrin.h(14806): error: identifier "__builtin_ia32_rndscalesd_round" is undefined
# /usr/lib/gcc/x86_64-linux-gnu/9/include/avx512dqintrin.h(1365): error: identifier "__builtin_ia32_fpclassss" is undefined
# /usr/lib/gcc/x86_64-linux-gnu/9/include/avx512dqintrin.h(1372): error: identifier "__builtin_ia32_fpclasssd" is undefined
# run_build $name "${GCC_CXX:?}" THRUST "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/include -DTHRUST_IMPL=CUDA -DBACKEND=TBB"
else
echo "CMake version ${current} < ${required}, skipping Thrust models"
fi
}
build_clang() {
local name="clang_build"
local cxx="-DCMAKE_CXX_COMPILER=${CLANG_CXX:?}"
run_build $name "${CLANG_CXX:?}" OMP "$cxx"
run_build $name "${CLANG_CXX:?}" omp "$cxx"
if [ "${CLANG_OMP_OFFLOAD_AMD:-false}" != "false" ]; then
run_build "amd_$name" "${GCC_CXX:?}" OMP "$cxx -DOFFLOAD=AMD:$AMD_ARCH"
run_build "amd_$name" "${GCC_CXX:?}" omp "$cxx -DOFFLOAD=AMD:$AMD_ARCH"
fi
if [ "${CLANG_OMP_OFFLOAD_NVIDIA:-false}" != "false" ]; then
run_build "nvidia_$name" "${GCC_CXX:?}" OMP "$cxx -DOFFLOAD=NVIDIA:$NV_ARCH"
run_build "nvidia_$name" "${GCC_CXX:?}" omp "$cxx -DOFFLOAD=NVIDIA:$NV_ARCH"
fi
run_build $name "${CLANG_CXX:?}" CUDA "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH"
run_build $name "${CLANG_CXX:?}" CUDA "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DMEM=MANAGED"
run_build $name "${CLANG_CXX:?}" CUDA "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DMEM=PAGEFAULT"
run_build $name "${CLANG_CXX:?}" KOKKOS "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON"
run_build $name "${CLANG_CXX:?}" OCL "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}"
run_build $name "${CLANG_CXX:?}" STD "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}"
# run_build $name "${LANG_CXX:?}" STD20 "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" # not yet supported
run_build $name "${CLANG_CXX:?}" TBB "$cxx -DONE_TBB_DIR=$TBB_LIB"
run_build $name "${CLANG_CXX:?}" TBB "$cxx" # build TBB again with the system TBB
run_build $name "${CLANG_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH"
run_build $name "${CLANG_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DMEM=MANAGED"
run_build $name "${CLANG_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DMEM=PAGEFAULT"
run_build $name "${CLANG_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON"
run_build $name "${CLANG_CXX:?}" ocl "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}"
run_build $name "${CLANG_CXX:?}" std "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}"
# run_build $name "${LANG_CXX:?}" std20 "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" # not yet supported
run_build $name "${CLANG_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?}"
run_build $name "${CLANG_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH"
run_build $name "${CLANG_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DMEM=MANAGED"
run_build $name "${CLANG_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DMEM=PAGEFAULT"
run_build $name "${CLANG_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON"
run_build $name "${CLANG_CXX:?}" ocl "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}"
run_build $name "${CLANG_CXX:?}" std "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}"
# run_build $name "${LANG_CXX:?}" std20 "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" # not yet supported
run_build $name "${CLANG_CXX:?}" RAJA "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?}"
run_build $name "${CLANG_CXX:?}" tbb "$cxx -DONE_TBB_DIR=$TBB_LIB"
run_build $name "${CLANG_CXX:?}" tbb "$cxx" # build TBB again with the system TBB
run_build $name "${CLANG_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?}"
# no clang /w RAJA+cuda because it needs nvcc which needs gcc
}
build_nvhpc() {
local name="nvhpc_build"
local cxx="-DCMAKE_CXX_COMPILER=${NVHPC_NVCXX:?}"
run_build $name "${NVHPC_NVCXX:?}" STD "$cxx -DNVHPC_OFFLOAD=$NV_ARCH_CCXY"
run_build $name "${NVHPC_NVCXX:?}" ACC "$cxx -DTARGET_DEVICE=gpu -DTARGET_PROCESSOR=px -DCUDA_ARCH=$NV_ARCH_CCXY"
run_build $name "${NVHPC_NVCXX:?}" ACC "$cxx -DTARGET_DEVICE=multicore -DTARGET_PROCESSOR=zen"
run_build $name "${NVHPC_NVCXX:?}" std "$cxx -DNVHPC_OFFLOAD=$NV_ARCH_CCXY"
run_build $name "${NVHPC_NVCXX:?}" acc "$cxx -DTARGET_DEVICE=gpu -DTARGET_PROCESSOR=px -DCUDA_ARCH=$NV_ARCH_CCXY"
run_build $name "${NVHPC_NVCXX:?}" acc "$cxx -DTARGET_DEVICE=multicore -DTARGET_PROCESSOR=zen"
}
build_aocc() {
run_build aocc_build "${AOCC_CXX:?}" OMP "-DCMAKE_CXX_COMPILER=${AOCC_CXX:?}"
run_build aocc_build "${AOCC_CXX:?}" omp "-DCMAKE_CXX_COMPILER=${AOCC_CXX:?}"
}
build_aomp() {
run_build aomp_amd_build "${AOMP_CXX:?}" OMP "-DCMAKE_CXX_COMPILER=${AOMP_CXX:?} -DOFFLOAD=AMD:gfx906"
run_build aomp_amd_build "${AOMP_CXX:?}" omp "-DCMAKE_CXX_COMPILER=${AOMP_CXX:?} -DOFFLOAD=AMD:gfx906"
#run_build aomp_nvidia_build "-DCMAKE_CXX_COMPILER=${AOMP_CXX:?} -DOFFLOAD=NVIDIA:$NV_ARCH"
}
build_hip() {
run_build hip_build "${HIP_CXX:?}" HIP "-DCMAKE_CXX_COMPILER=${HIP_CXX:?}"
local name="hip_build"
run_build $name "${HIP_CXX:?}" hip "-DCMAKE_CXX_COMPILER=${HIP_CXX:?}"
run_build $name "${GCC_CXX:?}" thrust "-DCMAKE_CXX_COMPILER=${HIP_CXX:?} -DSDK_DIR=$ROCM_PATH -DTHRUST_IMPL=ROCM"
}
build_icpx() {
@ -227,7 +259,7 @@ build_icpx() {
set +u
source /opt/intel/oneapi/setvars.sh -force || true
set -u
run_build intel_build "${ICPX_CXX:?}" OMP "-DCMAKE_CXX_COMPILER=${ICPX_CXX:?} -DOFFLOAD=INTEL"
run_build intel_build "${ICPX_CXX:?}" omp "-DCMAKE_CXX_COMPILER=${ICPX_CXX:?} -DOFFLOAD=INTEL"
}
build_icpc() {
@ -237,31 +269,31 @@ build_icpc() {
set -u
local name="intel_build"
local cxx="-DCMAKE_CXX_COMPILER=${ICPC_CXX:?}"
run_build $name "${ICPC_CXX:?}" OMP "$cxx"
run_build $name "${ICPC_CXX:?}" OCL "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}"
run_build $name "${ICPC_CXX:?}" RAJA "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?}"
run_build $name "${ICPC_CXX:?}" KOKKOS "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON"
run_build $name "${ICPC_CXX:?}" omp "$cxx"
run_build $name "${ICPC_CXX:?}" ocl "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}"
run_build $name "${ICPC_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?}"
run_build $name "${ICPC_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON"
}
build_computecpp() {
run_build computecpp_build "compute++" SYCL "-DCMAKE_CXX_COMPILER=${GCC_CXX:?} \
run_build computecpp_build "compute++" sycl "-DCMAKE_CXX_COMPILER=${GCC_CXX:?} \
-DSYCL_COMPILER=COMPUTECPP \
-DSYCL_COMPILER_DIR=${COMPUTECPP_DIR:?} \
-DOpenCL_LIBRARY=${OCL_LIB:?}"
}
build_dpcpp() {
run_build intel_build "${DPCPP_DIR:?}" SYCL "-DCMAKE_CXX_COMPILER=${GCC_CXX:?} \
run_build intel_build "${DPCPP_DIR:?}" sycl "-DCMAKE_CXX_COMPILER=${GCC_CXX:?} \
-DSYCL_COMPILER=DPCPP \
-DSYCL_COMPILER_DIR=${DPCPP_DIR:?}"
# for oneAPI BaseKit:
# source /opt/intel/oneapi/setvars.sh -force
# run_build intel_build "dpcpp" SYCL "-DCMAKE_CXX_COMPILER=${GCC_CXX:?} -DSYCL_COMPILER=ONEAPI-DPCPP"
# run_build intel_build "dpcpp" sycl "-DCMAKE_CXX_COMPILER=${GCC_CXX:?} -DSYCL_COMPILER=ONEAPI-DPCPP"
}
build_hipsycl() {
run_build hipsycl_build "syclcc" SYCL "
run_build hipsycl_build "syclcc" sycl "
-DSYCL_COMPILER=HIPSYCL \
-DSYCL_COMPILER_DIR=${HIPSYCL_DIR:?}"
}

128
src/java/java-stream/.gitignore vendored Normal file
View File

@ -0,0 +1,128 @@
## File-based project format:
.idea
*.iws
*.iml
## Plugin-specific files:
# IntelliJ
/out/
# mpeltonen/sbt-idea plugin
.idea_modules/
# JIRA plugin
atlassian-ide-plugin.xml
# Crashlytics plugin (for Android Studio and IntelliJ)
com_crashlytics_export_strings.xml
crashlytics.properties
crashlytics-build.properties
fabric.properties
### VisualStudioCode template
.vscode/*
!.vscode/settings.json
!.vscode/tasks.json
!.vscode/launch.json
!.vscode/extensions.json
### Linux template
*~
# temporary files which can be created if a process still has a handle open of a deleted file
.fuse_hidden*
# KDE directory preferences
.directory
# Linux trash folder which might appear on any partition or disk
.Trash-*
# .nfs files are created when an open file is removed but is still being accessed
.nfs*
# Windows thumbnail cache files
Thumbs.db
ehthumbs.db
ehthumbs_vista.db
# Folder config file
Desktop.ini
# Recycle Bin used on file shares
$RECYCLE.BIN/
# Windows Installer files
*.cab
*.msi
*.msm
*.msp
# Windows shortcuts
*.lnk
### Maven template
target/
pom.xml.tag
pom.xml.releaseBackup
pom.xml.versionsBackup
pom.xml.next
release.properties
dependency-reduced-pom.xml
buildNumber.properties
.mvn/timing.properties
# Avoid ignoring Maven wrapper jar file (.jar files are usually ignored)
!/.mvn/wrapper/maven-wrapper.jar
### Java template
# Compiled class file
*.class
# Log file
*.log
# BlueJ files
*.ctxt
# Mobile Tools for Java (J2ME)
.mtj.tmp/
# Package Files #
*.jar
*.war
*.ear
*.zip
*.tar.gz
*.rar
# virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
hs_err_pid*
### macOS template
*.DS_Store
.AppleDouble
.LSOverride
# Icon must end with two \r
Icon
# Thumbnails
._*
# Files that might appear in the root of a volume
.DocumentRevisions-V100
.fseventsd
.Spotlight-V100
.TemporaryItems
.Trashes
.VolumeIcon.icns
.com.apple.timemachine.donotpresent
# Directories potentially created on remote AFP share
.AppleDB
.AppleDesktop
Network Trash Folder
Temporary Items
.apdisk
!.mvn/**/*
settings.xml

Binary file not shown.

View File

@ -0,0 +1 @@
distributionUrl=https://repo1.maven.org/maven2/org/apache/maven/apache-maven/3.5.0/apache-maven-3.5.0-bin.zip

View File

@ -0,0 +1,172 @@
java-stream
===========
This is an implementation of BabelStream in Java 8, which contains the following implementations:
* `jdk-plain` - Single threaded `for`
* `jdk-stream` - Threaded implementation using JDK8's parallel stream API
* `tornadovm` - A [TornadoVM](https://github.com/beehive-lab/TornadoVM) implementation for
PTX/OpenCL
* `aparapi` - An [Aparapi](https://git.qoto.org/aparapi/aparapi) implementation for OpenCL
### Build & Run
Prerequisites
* JDK >= 8
To run the benchmark, first create a binary:
```shell
> cd java-stream
> ./mvnw clean package
```
The binary will be located at `./target/java-stream.jar`. Run it with:
```shell
> java -version
openjdk version "11.0.11" 2021-04-20
OpenJDK Runtime Environment GraalVM CE 21.1.0 (build 11.0.11+8-jvmci-21.1-b05)
OpenJDK 64-Bit Server VM GraalVM CE 21.1.0 (build 11.0.11+8-jvmci-21.1-b05, mixed mode)
> java -jar target/java-stream.jar --help
```
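A specific implementation can be selected with the `--impl` flag (the implementation names match the list above; the array size below is illustrative):
```shell
> java -jar target/java-stream.jar --impl jdk-stream --arraysize 1048576
```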
For best results, benchmark with the following JVM flags:
```
-XX:-UseOnStackReplacement # disable OSR, not useful for this benchmark as we are measuring peak performance
-XX:-TieredCompilation # disable C1, go straight to C2
-XX:ReservedCodeCacheSize=512m # don't flush compiled code out of cache at any point
```
Worked example:
```shell
> java -XX:-UseOnStackReplacement -XX:-TieredCompilation -XX:ReservedCodeCacheSize=512m -jar target/java-stream.jar
BabelStream
Version: 3.4
Implementation: jdk-stream; (Java 11.0.11;Red Hat, Inc.; home=/usr/lib/jvm/java-11-openjdk-11.0.11.0.9-4.fc33.x86_64)
Running all 100 times
Precision: double
Array size: 268.4 MB (=0.3 GB)
Total size: 805.3 MB (=0.8 GB)
Function MBytes/sec Min (sec) Max Average
Copy 17145.538 0.03131 0.04779 0.03413
Mul 16759.092 0.03203 0.04752 0.03579
Add 19431.954 0.04144 0.05866 0.04503
Triad 19763.970 0.04075 0.05388 0.04510
Dot 26646.894 0.02015 0.03013 0.02259
```
If your OpenCL/CUDA installation is not at the default location, TornadoVM and Aparapi may fail to
detect your devices. In those cases, you may specify the library directly, for example:
```shell
> LD_PRELOAD=/opt/rocm-4.0.0/opencl/lib/libOpenCL.so.1.2 java -jar target/java-stream.jar ...
```
### Instructions for TornadoVM
The TornadoVM implementation requires you to run the binary with a patched JVM. Follow the
official [instructions](https://github.com/beehive-lab/TornadoVM/blob/master/assembly/src/docs/10_INSTALL_WITH_GRAALVM.md)
or use the following simplified instructions:
Prerequisites
* CMake >= 3.6
* GCC or clang/LLVM (GCC >= 5.5)
* Python >= 2.7
* Maven >= 3.6.3
* OpenCL headers >= 1.2 and/or CUDA SDK >= 9.0
First, get a copy of the TornadoVM source:
```shell
> cd
> git clone https://github.com/beehive-lab/TornadoVM tornadovm
```
Take note of the required GraalVM version
in `tornadovm/assembly/src/docs/10_INSTALL_WITH_GRAALVM.md`. We'll use `21.1.0` in this example.
Now, obtain a copy of GraalVM and make sure the version matches the one required by TornadoVM:
```shell
> wget https://github.com/graalvm/graalvm-ce-builds/releases/download/vm-21.1.0/graalvm-ce-java11-linux-amd64-21.1.0.tar.gz
> tar -xf graalvm-ce-java11-linux-amd64-21.1.0.tar.gz
```
Next, create `~/tornadovm/etc/sources.env` and populate the file with the following:
```shell
#!/bin/bash
export JAVA_HOME=<path to GraalVM 21.1.0 jdk>
export PATH=$PWD/bin/bin:$PATH
export TORNADO_SDK=$PWD/bin/sdk
export CMAKE_ROOT=/usr # path to CMake binary
```
Proceed to compile TornadoVM:
```shell
> cd ~/tornadovm
> . etc/sources.env
> make graal-jdk-11-plus BACKEND={ptx,opencl}
```
To test your build, source the environment file:
```shell
> source ~/tornadovm/etc/sources.env
> LD_PRELOAD=/opt/rocm-4.0.0/opencl/lib/libOpenCL.so.1.2 tornado --devices
Number of Tornado drivers: 1
Total number of OpenCL devices : 3
Tornado device=0:0
AMD Accelerated Parallel Processing -- gfx1012
Global Memory Size: 4.0 GB
Local Memory Size: 64.0 KB
Workgroup Dimensions: 3
Max WorkGroup Configuration: [1024, 1024, 1024]
Device OpenCL C version: OpenCL C 2.0
Tornado device=0:1
Portable Computing Language -- pthread-AMD Ryzen 9 3900X 12-Core Processor
Global Memory Size: 60.7 GB
Local Memory Size: 8.0 MB
Workgroup Dimensions: 3
Max WorkGroup Configuration: [4096, 4096, 4096]
Device OpenCL C version: OpenCL C 1.2 pocl
Tornado device=0:2
NVIDIA CUDA -- NVIDIA GeForce GT 710
Global Memory Size: 981.3 MB
Local Memory Size: 48.0 KB
Workgroup Dimensions: 3
Max WorkGroup Configuration: [1024, 1024, 64]
Device OpenCL C version: OpenCL C 1.2
```
You can now use TornadoVM to run java-stream:
```shell
> tornado -jar ~/java-stream/target/java-stream.jar --impl tornadovm --arraysize 65536
BabelStream
Version: 3.4
Implementation: tornadovm; (Java 11.0.11;GraalVM Community; home=~/graalvm-ce-java11-21.1.0)
Running all 100 times
Precision: double
Array size: 0.5 MB (=0.0 GB)
Total size: 1.6 MB (=0.0 GB)
Using TornadoVM device:
- Name : NVIDIA GeForce GT 710 CL_DEVICE_TYPE_GPU (available)
- Id : opencl-0-0
- Platform : NVIDIA CUDA
- Backend : OpenCL
Function MBytes/sec Min (sec) Max Average
Copy 8791.100 0.00012 0.00079 0.00015
Mul 8774.107 0.00012 0.00061 0.00014
Add 9903.313 0.00016 0.00030 0.00018
Triad 9861.031 0.00016 0.00030 0.00018
Dot 2799.465 0.00037 0.00056 0.00041
```

225
src/java/java-stream/mvnw vendored Executable file
View File

@ -0,0 +1,225 @@
#!/bin/sh
# ----------------------------------------------------------------------------
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# ----------------------------------------------------------------------------
# ----------------------------------------------------------------------------
# Maven2 Start Up Batch script
#
# Required ENV vars:
# ------------------
# JAVA_HOME - location of a JDK home dir
#
# Optional ENV vars
# -----------------
# M2_HOME - location of maven2's installed home dir
# MAVEN_OPTS - parameters passed to the Java VM when running Maven
# e.g. to debug Maven itself, use
# set MAVEN_OPTS=-Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=y,address=8000
# MAVEN_SKIP_RC - flag to disable loading of mavenrc files
# ----------------------------------------------------------------------------
if [ -z "$MAVEN_SKIP_RC" ] ; then
if [ -f /etc/mavenrc ] ; then
. /etc/mavenrc
fi
if [ -f "$HOME/.mavenrc" ] ; then
. "$HOME/.mavenrc"
fi
fi
# OS specific support. $var _must_ be set to either true or false.
cygwin=false;
darwin=false;
mingw=false
case "`uname`" in
CYGWIN*) cygwin=true ;;
MINGW*) mingw=true;;
Darwin*) darwin=true
# Use /usr/libexec/java_home if available, otherwise fall back to /Library/Java/Home
# See https://developer.apple.com/library/mac/qa/qa1170/_index.html
if [ -z "$JAVA_HOME" ]; then
if [ -x "/usr/libexec/java_home" ]; then
export JAVA_HOME="`/usr/libexec/java_home`"
else
export JAVA_HOME="/Library/Java/Home"
fi
fi
;;
esac
if [ -z "$JAVA_HOME" ] ; then
if [ -r /etc/gentoo-release ] ; then
JAVA_HOME=`java-config --jre-home`
fi
fi
if [ -z "$M2_HOME" ] ; then
## resolve links - $0 may be a link to maven's home
PRG="$0"
# need this for relative symlinks
while [ -h "$PRG" ] ; do
ls=`ls -ld "$PRG"`
link=`expr "$ls" : '.*-> \(.*\)$'`
if expr "$link" : '/.*' > /dev/null; then
PRG="$link"
else
PRG="`dirname "$PRG"`/$link"
fi
done
saveddir=`pwd`
M2_HOME=`dirname "$PRG"`/..
# make it fully qualified
M2_HOME=`cd "$M2_HOME" && pwd`
cd "$saveddir"
# echo Using m2 at $M2_HOME
fi
# For Cygwin, ensure paths are in UNIX format before anything is touched
if $cygwin ; then
[ -n "$M2_HOME" ] &&
M2_HOME=`cygpath --unix "$M2_HOME"`
[ -n "$JAVA_HOME" ] &&
JAVA_HOME=`cygpath --unix "$JAVA_HOME"`
[ -n "$CLASSPATH" ] &&
CLASSPATH=`cygpath --path --unix "$CLASSPATH"`
fi
# For Migwn, ensure paths are in UNIX format before anything is touched
if $mingw ; then
[ -n "$M2_HOME" ] &&
M2_HOME="`(cd "$M2_HOME"; pwd)`"
[ -n "$JAVA_HOME" ] &&
JAVA_HOME="`(cd "$JAVA_HOME"; pwd)`"
# TODO classpath?
fi
if [ -z "$JAVA_HOME" ]; then
javaExecutable="`which javac`"
if [ -n "$javaExecutable" ] && ! [ "`expr \"$javaExecutable\" : '\([^ ]*\)'`" = "no" ]; then
# readlink(1) is not available as standard on Solaris 10.
readLink=`which readlink`
if [ ! `expr "$readLink" : '\([^ ]*\)'` = "no" ]; then
if $darwin ; then
javaHome="`dirname \"$javaExecutable\"`"
javaExecutable="`cd \"$javaHome\" && pwd -P`/javac"
else
javaExecutable="`readlink -f \"$javaExecutable\"`"
fi
javaHome="`dirname \"$javaExecutable\"`"
javaHome=`expr "$javaHome" : '\(.*\)/bin'`
JAVA_HOME="$javaHome"
export JAVA_HOME
fi
fi
fi
if [ -z "$JAVACMD" ] ; then
if [ -n "$JAVA_HOME" ] ; then
if [ -x "$JAVA_HOME/jre/sh/java" ] ; then
# IBM's JDK on AIX uses strange locations for the executables
JAVACMD="$JAVA_HOME/jre/sh/java"
else
JAVACMD="$JAVA_HOME/bin/java"
fi
else
JAVACMD="`which java`"
fi
fi
if [ ! -x "$JAVACMD" ] ; then
echo "Error: JAVA_HOME is not defined correctly." >&2
echo " We cannot execute $JAVACMD" >&2
exit 1
fi
if [ -z "$JAVA_HOME" ] ; then
echo "Warning: JAVA_HOME environment variable is not set."
fi
CLASSWORLDS_LAUNCHER=org.codehaus.plexus.classworlds.launcher.Launcher
# traverses directory structure from process work directory to filesystem root
# first directory with .mvn subdirectory is considered project base directory
find_maven_basedir() {
if [ -z "$1" ]
then
echo "Path not specified to find_maven_basedir"
return 1
fi
basedir="$1"
wdir="$1"
while [ "$wdir" != '/' ] ; do
if [ -d "$wdir"/.mvn ] ; then
basedir=$wdir
break
fi
# workaround for JBEAP-8937 (on Solaris 10/Sparc)
if [ -d "${wdir}" ]; then
wdir=`cd "$wdir/.."; pwd`
fi
# end of workaround
done
echo "${basedir}"
}
# concatenates all lines of a file
concat_lines() {
if [ -f "$1" ]; then
echo "$(tr -s '\n' ' ' < "$1")"
fi
}
BASE_DIR=`find_maven_basedir "$(pwd)"`
if [ -z "$BASE_DIR" ]; then
exit 1;
fi
export MAVEN_PROJECTBASEDIR=${MAVEN_BASEDIR:-"$BASE_DIR"}
echo $MAVEN_PROJECTBASEDIR
MAVEN_OPTS="$(concat_lines "$MAVEN_PROJECTBASEDIR/.mvn/jvm.config") $MAVEN_OPTS"
# For Cygwin, switch paths to Windows format before running java
if $cygwin; then
[ -n "$M2_HOME" ] &&
M2_HOME=`cygpath --path --windows "$M2_HOME"`
[ -n "$JAVA_HOME" ] &&
JAVA_HOME=`cygpath --path --windows "$JAVA_HOME"`
[ -n "$CLASSPATH" ] &&
CLASSPATH=`cygpath --path --windows "$CLASSPATH"`
[ -n "$MAVEN_PROJECTBASEDIR" ] &&
MAVEN_PROJECTBASEDIR=`cygpath --path --windows "$MAVEN_PROJECTBASEDIR"`
fi
WRAPPER_LAUNCHER=org.apache.maven.wrapper.MavenWrapperMain
exec "$JAVACMD" \
$MAVEN_OPTS \
-classpath "$MAVEN_PROJECTBASEDIR/.mvn/wrapper/maven-wrapper.jar" \
"-Dmaven.home=${M2_HOME}" "-Dmaven.multiModuleProjectDirectory=${MAVEN_PROJECTBASEDIR}" \
${WRAPPER_LAUNCHER} $MAVEN_CONFIG "$@"

143
src/java/java-stream/mvnw.cmd vendored Normal file
View File

@ -0,0 +1,143 @@
@REM ----------------------------------------------------------------------------
@REM Licensed to the Apache Software Foundation (ASF) under one
@REM or more contributor license agreements. See the NOTICE file
@REM distributed with this work for additional information
@REM regarding copyright ownership. The ASF licenses this file
@REM to you under the Apache License, Version 2.0 (the
@REM "License"); you may not use this file except in compliance
@REM with the License. You may obtain a copy of the License at
@REM
@REM http://www.apache.org/licenses/LICENSE-2.0
@REM
@REM Unless required by applicable law or agreed to in writing,
@REM software distributed under the License is distributed on an
@REM "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@REM KIND, either express or implied. See the License for the
@REM specific language governing permissions and limitations
@REM under the License.
@REM ----------------------------------------------------------------------------
@REM ----------------------------------------------------------------------------
@REM Maven2 Start Up Batch script
@REM
@REM Required ENV vars:
@REM JAVA_HOME - location of a JDK home dir
@REM
@REM Optional ENV vars
@REM M2_HOME - location of maven2's installed home dir
@REM MAVEN_BATCH_ECHO - set to 'on' to enable the echoing of the batch commands
@REM MAVEN_BATCH_PAUSE - set to 'on' to wait for a key stroke before ending
@REM MAVEN_OPTS - parameters passed to the Java VM when running Maven
@REM e.g. to debug Maven itself, use
@REM set MAVEN_OPTS=-Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=y,address=8000
@REM MAVEN_SKIP_RC - flag to disable loading of mavenrc files
@REM ----------------------------------------------------------------------------
@REM Begin all REM lines with '@' in case MAVEN_BATCH_ECHO is 'on'
@echo off
@REM enable echoing my setting MAVEN_BATCH_ECHO to 'on'
@if "%MAVEN_BATCH_ECHO%" == "on" echo %MAVEN_BATCH_ECHO%
@REM set %HOME% to equivalent of $HOME
if "%HOME%" == "" (set "HOME=%HOMEDRIVE%%HOMEPATH%")
@REM Execute a user defined script before this one
if not "%MAVEN_SKIP_RC%" == "" goto skipRcPre
@REM check for pre script, once with legacy .bat ending and once with .cmd ending
if exist "%HOME%\mavenrc_pre.bat" call "%HOME%\mavenrc_pre.bat"
if exist "%HOME%\mavenrc_pre.cmd" call "%HOME%\mavenrc_pre.cmd"
:skipRcPre
@setlocal
set ERROR_CODE=0
@REM To isolate internal variables from possible post scripts, we use another setlocal
@setlocal
@REM ==== START VALIDATION ====
if not "%JAVA_HOME%" == "" goto OkJHome
echo.
echo Error: JAVA_HOME not found in your environment. >&2
echo Please set the JAVA_HOME variable in your environment to match the >&2
echo location of your Java installation. >&2
echo.
goto error
:OkJHome
if exist "%JAVA_HOME%\bin\java.exe" goto init
echo.
echo Error: JAVA_HOME is set to an invalid directory. >&2
echo JAVA_HOME = "%JAVA_HOME%" >&2
echo Please set the JAVA_HOME variable in your environment to match the >&2
echo location of your Java installation. >&2
echo.
goto error
@REM ==== END VALIDATION ====
:init
@REM Find the project base dir, i.e. the directory that contains the folder ".mvn".
@REM Fallback to current working directory if not found.
set MAVEN_PROJECTBASEDIR=%MAVEN_BASEDIR%
IF NOT "%MAVEN_PROJECTBASEDIR%"=="" goto endDetectBaseDir
set EXEC_DIR=%CD%
set WDIR=%EXEC_DIR%
:findBaseDir
IF EXIST "%WDIR%"\.mvn goto baseDirFound
cd ..
IF "%WDIR%"=="%CD%" goto baseDirNotFound
set WDIR=%CD%
goto findBaseDir
:baseDirFound
set MAVEN_PROJECTBASEDIR=%WDIR%
cd "%EXEC_DIR%"
goto endDetectBaseDir
:baseDirNotFound
set MAVEN_PROJECTBASEDIR=%EXEC_DIR%
cd "%EXEC_DIR%"
:endDetectBaseDir
IF NOT EXIST "%MAVEN_PROJECTBASEDIR%\.mvn\jvm.config" goto endReadAdditionalConfig
@setlocal EnableExtensions EnableDelayedExpansion
for /F "usebackq delims=" %%a in ("%MAVEN_PROJECTBASEDIR%\.mvn\jvm.config") do set JVM_CONFIG_MAVEN_PROPS=!JVM_CONFIG_MAVEN_PROPS! %%a
@endlocal & set JVM_CONFIG_MAVEN_PROPS=%JVM_CONFIG_MAVEN_PROPS%
:endReadAdditionalConfig
SET MAVEN_JAVA_EXE="%JAVA_HOME%\bin\java.exe"
set WRAPPER_JAR="%MAVEN_PROJECTBASEDIR%\.mvn\wrapper\maven-wrapper.jar"
set WRAPPER_LAUNCHER=org.apache.maven.wrapper.MavenWrapperMain
%MAVEN_JAVA_EXE% %JVM_CONFIG_MAVEN_PROPS% %MAVEN_OPTS% %MAVEN_DEBUG_OPTS% -classpath %WRAPPER_JAR% "-Dmaven.multiModuleProjectDirectory=%MAVEN_PROJECTBASEDIR%" %WRAPPER_LAUNCHER% %MAVEN_CONFIG% %*
if ERRORLEVEL 1 goto error
goto end
:error
set ERROR_CODE=1
:end
@endlocal & set ERROR_CODE=%ERROR_CODE%
if not "%MAVEN_SKIP_RC%" == "" goto skipRcPost
@REM check for post script, once with legacy .bat ending and once with .cmd ending
if exist "%HOME%\mavenrc_post.bat" call "%HOME%\mavenrc_post.bat"
if exist "%HOME%\mavenrc_post.cmd" call "%HOME%\mavenrc_post.cmd"
:skipRcPost
@REM pause the script if MAVEN_BATCH_PAUSE is set to 'on'
if "%MAVEN_BATCH_PAUSE%" == "on" pause
if "%MAVEN_TERMINATE_CMD%" == "on" exit %ERROR_CODE%
exit /B %ERROR_CODE%

View File

@ -0,0 +1,133 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<artifactId>java-stream</artifactId>
<groupId>javastream</groupId>
<version>3.4.0</version>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
<junit.version>5.7.2</junit.version>
</properties>
<repositories>
<repository>
<id>universityOfManchester-graal</id>
<url>https://raw.githubusercontent.com/beehive-lab/tornado/maven-tornadovm</url>
</repository>
</repositories>
<dependencies>
<dependency>
<groupId>com.beust</groupId>
<artifactId>jcommander</artifactId>
<version>1.81</version>
</dependency>
<dependency>
<groupId>tornado</groupId>
<artifactId>tornado-api</artifactId>
<version>0.9</version>
</dependency>
<dependency>
<groupId>com.aparapi</groupId>
<artifactId>aparapi</artifactId>
<version>2.0.0</version>
<exclusions>
<!-- don't pull in the entire Scala ecosystem! -->
<exclusion>
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter-engine</artifactId>
<version>${junit.version}</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter-params</artifactId>
<version>${junit.version}</version>
<scope>test</scope>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.8.1</version>
<configuration>
<target>1.8</target>
<source>1.8</source>
<compilerArgument>-Xlint:all</compilerArgument>
<showWarnings>true</showWarnings>
<showDeprecation>true</showDeprecation>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<version>3.0.0-M5</version>
</plugin>
<plugin>
<artifactId>maven-shade-plugin</artifactId>
<version>3.2.4</version>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
<configuration>
<transformers>
<transformer
implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
<mainClass>javastream.Main</mainClass>
</transformer>
</transformers>
<filters>
<filter>
<artifact>*:*</artifact>
<excludes>
<exclude>META-INF/*.MF</exclude>
</excludes>
</filter>
</filters>
<finalName>${project.artifactId}</finalName>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<groupId>com.coveo</groupId>
<artifactId>fmt-maven-plugin</artifactId>
<version>2.9.1</version>
<executions>
<execution>
<goals>
<goal>format</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>

View File

@ -0,0 +1,45 @@
package javastream;
/**
* This class represents our Fractional typeclass. Java's numeric types aren't unified under a common
* arithmetic interface, so we have to do insane things for parametric operations on fractional types.
*/
@SuppressWarnings("unchecked")
public final class FractionalMaths {
private FractionalMaths() {
throw new AssertionError();
}
public static <T extends Number> T from(Class<T> evidence, Number n) {
if (evidence == Double.TYPE || evidence == Double.class)
return (T) Double.valueOf(n.doubleValue());
else if (evidence == Float.TYPE || evidence == Float.class)
return (T) Float.valueOf(n.floatValue());
throw new IllegalArgumentException();
}
public static <T extends Number> T plus(T x, T y) {
if (x instanceof Double) return (T) Double.valueOf(x.doubleValue() + y.doubleValue());
else if (x instanceof Float) return (T) Float.valueOf(x.floatValue() + y.floatValue());
throw new IllegalArgumentException();
}
static <T extends Number> T minus(T x, T y) {
if (x instanceof Double) return (T) Double.valueOf(x.doubleValue() - y.doubleValue());
else if (x instanceof Float) return (T) Float.valueOf(x.floatValue() - y.floatValue());
throw new IllegalArgumentException();
}
public static <T extends Number> T times(T x, T y) {
if (x instanceof Double) return (T) Double.valueOf(x.doubleValue() * y.doubleValue());
else if (x instanceof Float) return (T) Float.valueOf(x.floatValue() * y.floatValue());
throw new IllegalArgumentException();
}
static <T extends Number> T divide(T x, T y) {
if (x instanceof Double) return (T) Double.valueOf(x.doubleValue() / y.doubleValue());
else if (x instanceof Float) return (T) Float.valueOf(x.floatValue() / y.floatValue());
throw new IllegalArgumentException();
}
}
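A minimal usage sketch of the FractionalMaths helpers above, showing how one generic routine can serve both Float and Double; the FractionalMathsExample class is hypothetical and illustrative only, not part of this change.
package javastream;
/** Hypothetical illustration of the FractionalMaths helpers; illustrative only. */
final class FractionalMathsExample {
  /** Generic dot product written once for any supported boxed fractional type. */
  static <T extends Number> T dot(Class<T> evidence, T[] a, T[] b) {
    T acc = FractionalMaths.from(evidence, 0); // typed zero obtained via the evidence class token
    for (int i = 0; i < a.length; i++) {
      acc = FractionalMaths.plus(acc, FractionalMaths.times(a[i], b[i]));
    }
    return acc;
  }
  public static void main(String[] args) {
    Float[] a = {1.f, 2.f, 3.f};
    Float[] b = {4.f, 5.f, 6.f};
    System.out.println(dot(Float.class, a, b)); // prints 32.0
    System.out.println(dot(Double.class, new Double[] {1.5}, new Double[] {2.0})); // prints 3.0
  }
}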

View File

@ -0,0 +1,172 @@
package javastream;
import java.time.Duration;
import java.util.AbstractMap;
import java.util.AbstractMap.SimpleImmutableEntry;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map.Entry;
import java.util.Objects;
import java.util.function.Function;
import java.util.function.Supplier;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import javastream.Main.Config;
public abstract class JavaStream<T> {
public static final class Data<T> {
final T[] a, b, c;
public Data(T[] a, T[] b, T[] c) {
this.a = Objects.requireNonNull(a);
this.b = Objects.requireNonNull(b);
this.c = Objects.requireNonNull(c);
}
}
static final class Timings<T> {
final List<T> copy = new ArrayList<>();
final List<T> mul = new ArrayList<>();
final List<T> add = new ArrayList<>();
final List<T> triad = new ArrayList<>();
final List<T> dot = new ArrayList<>();
}
protected final Config<T> config;
protected JavaStream(Config<T> config) {
this.config = config;
}
protected abstract List<String> listDevices();
protected abstract void initArrays();
protected abstract void copy();
protected abstract void mul();
protected abstract void add();
protected abstract void triad();
protected abstract void nstream();
protected abstract T dot();
protected abstract Data<T> data();
public static class EnumeratedStream<T> extends JavaStream<T> {
protected final JavaStream<T> actual;
private final Entry<String, Function<Config<T>, JavaStream<T>>>[] options;
@SafeVarargs
@SuppressWarnings("varargs")
public EnumeratedStream(
Config<T> config, Entry<String, Function<Config<T>, JavaStream<T>>>... options) {
super(config);
this.actual = options[config.options.device].getValue().apply(config);
this.options = options;
}
@Override
protected List<String> listDevices() {
return Arrays.stream(options).map(Entry::getKey).collect(Collectors.toList());
}
@Override
public void initArrays() {
actual.initArrays();
}
@Override
public void copy() {
actual.copy();
}
@Override
public void mul() {
actual.mul();
}
@Override
public void add() {
actual.add();
}
@Override
public void triad() {
actual.triad();
}
@Override
public void nstream() {
actual.nstream();
}
@Override
public T dot() {
return actual.dot();
}
@Override
public Data<T> data() {
return actual.data();
}
}
public static Double[] boxed(double[] xs) {
return Arrays.stream(xs).boxed().toArray(Double[]::new);
}
public static Float[] boxed(float[] xs) {
return IntStream.range(0, xs.length).mapToObj(i -> xs[i]).toArray(Float[]::new);
}
private static <T> AbstractMap.SimpleImmutableEntry<Duration, T> timed(Supplier<T> f) {
long start = System.nanoTime();
T r = f.get();
long end = System.nanoTime();
return new AbstractMap.SimpleImmutableEntry<>(Duration.ofNanos(end - start), r);
}
private static Duration timed(Runnable f) {
long start = System.nanoTime();
f.run();
long end = System.nanoTime();
return Duration.ofNanos(end - start);
}
final SimpleImmutableEntry<Timings<Duration>, T> runAll(int times) {
Timings<Duration> timings = new Timings<>();
T lastSum = null;
for (int i = 0; i < times; i++) {
timings.copy.add(timed(this::copy));
timings.mul.add(timed(this::mul));
timings.add.add(timed(this::add));
timings.triad.add(timed(this::triad));
SimpleImmutableEntry<Duration, T> dot = timed(this::dot);
timings.dot.add(dot.getKey());
lastSum = dot.getValue();
}
return new SimpleImmutableEntry<>(timings, lastSum);
}
final Duration runTriad(int times) {
return timed(
() -> {
for (int i = 0; i < times; i++) {
triad();
}
});
}
final List<Duration> runNStream(int times) {
return IntStream.range(0, times)
.mapToObj(i -> timed(this::nstream))
.collect(Collectors.toList());
}
}

View File

@ -0,0 +1,425 @@
package javastream;
import static javastream.FractionalMaths.divide;
import static javastream.FractionalMaths.from;
import static javastream.FractionalMaths.minus;
import static javastream.FractionalMaths.plus;
import static javastream.FractionalMaths.times;
import com.beust.jcommander.JCommander;
import com.beust.jcommander.Parameter;
import java.time.Duration;
import java.util.AbstractMap.SimpleImmutableEntry;
import java.util.Arrays;
import java.util.DoubleSummaryStatistics;
import java.util.List;
import java.util.Map.Entry;
import java.util.Objects;
import java.util.Optional;
import java.util.concurrent.TimeUnit;
import java.util.function.Function;
import java.util.stream.Collectors;
import javastream.JavaStream.Data;
import javastream.JavaStream.Timings;
import javastream.aparapi.AparapiStreams;
import javastream.jdk.JdkStreams;
import javastream.jdk.PlainStream;
import javastream.tornadovm.TornadoVMStreams;
public class Main {
enum Benchmark {
NSTREAM,
TRIAD,
ALL
}
public static class Options {
@Parameter(names = "--list", description = "List available devices for all implementations")
boolean list = false;
@Parameter(
names = "--device",
description = "Select device at <device>, see --list for options")
public int device = 0;
@Parameter(
names = "--impl",
description = "Select implementation at <impl>, see --list for options")
public String impl = "";
@Parameter(
names = {"--numtimes", "-n"},
description = "Run the test <numtimes> times (NUM >= 2)")
public int numtimes = 100;
@Parameter(
names = {"--arraysize", "-s"},
description = "Use <arraysize> elements in the array")
public int arraysize = 33554432;
@Parameter(names = "--float", description = "Use floats (rather than doubles)")
public boolean useFloat = false;
@Parameter(names = "--triad-only", description = "Only run triad")
public boolean triadOnly = false;
@Parameter(names = "--nstream-only", description = "Only run nstream")
public boolean nstreamOnly = false;
@Parameter(names = "--csv", description = "Output as csv table")
public boolean csv = false;
@Parameter(
names = "--mibibytes",
description = "Use MiB=2^20 for bandwidth calculation (default MB=10^6)")
public boolean mibibytes = false;
@Parameter(names = "--dot-tolerance", description = "Tolerance for dot kernel verification")
public double dotTolerance = 1.0e-8;
public boolean isVerboseBenchmark() {
return !list && !csv;
}
}
public static final class Config<T> {
public final Options options;
public final Benchmark benchmark;
public final int typeSize;
public final Class<T> evidence;
public final T ulp, scalar, initA, initB, initC;
public Config(
Options options,
Benchmark benchmark,
int typeSize,
Class<T> evidence,
T ulp,
T scalar,
T initA,
T initB,
T initC) {
this.options = Objects.requireNonNull(options);
this.benchmark = Objects.requireNonNull(benchmark);
this.typeSize = typeSize;
this.evidence = Objects.requireNonNull(evidence);
this.ulp = Objects.requireNonNull(ulp);
this.scalar = Objects.requireNonNull(scalar);
this.initA = Objects.requireNonNull(initA);
this.initB = Objects.requireNonNull(initB);
this.initC = Objects.requireNonNull(initC);
}
}
static final class Implementation {
final String name;
final Function<Config<Float>, JavaStream<Float>> makeFloat;
final Function<Config<Double>, JavaStream<Double>> makeDouble;
Implementation(
String name,
Function<Config<Float>, JavaStream<Float>> makeFloat,
Function<Config<Double>, JavaStream<Double>> makeDouble) {
this.name = Objects.requireNonNull(name);
this.makeFloat = Objects.requireNonNull(makeFloat);
this.makeDouble = Objects.requireNonNull(makeDouble);
}
}
static <T extends Number> boolean run(
String name, Config<T> config, Function<Config<T>, JavaStream<T>> mkStream) {
Options opt = config.options;
int arrayBytes = opt.arraysize * config.typeSize;
int totalBytes = arrayBytes * 3;
String megaSuffix = opt.mibibytes ? "MiB" : "MB";
String gigaSuffix = opt.mibibytes ? "GiB" : "GB";
double megaScale = opt.mibibytes ? Math.pow(2.0, -20) : 1.0e-6;
double gigaScale = opt.mibibytes ? Math.pow(2.0, -30) : 1.0e-9;
if (!opt.csv) {
String vendor = System.getProperty("java.vendor");
String ver = System.getProperty("java.version");
String home = System.getProperty("java.home");
System.out.println("BabelStream");
System.out.printf("Version: %s%n", VERSION);
System.out.printf(
"Implementation: %s (Java %s; %s; JAVA_HOME=%s)%n", name, ver, vendor, home);
final String benchmarkName;
switch (config.benchmark) {
case NSTREAM:
benchmarkName = "nstream";
break;
case TRIAD:
benchmarkName = "triad";
break;
case ALL:
benchmarkName = "all";
break;
default:
throw new AssertionError("Unexpected value: " + config.benchmark);
}
System.out.println("Running " + benchmarkName + " " + opt.numtimes + " times");
if (config.benchmark == Benchmark.TRIAD) {
System.out.println("Number of elements: " + opt.arraysize);
}
System.out.println("Precision: " + (opt.useFloat ? "float" : "double"));
System.out.printf(
"Array size: %.1f %s (=%.1f %s)%n",
(megaScale * arrayBytes), megaSuffix, (gigaScale * arrayBytes), gigaSuffix);
System.out.printf(
"Total size: %.1f %s (=%.1f %s)%n",
(megaScale * totalBytes), megaSuffix, (gigaScale * totalBytes), gigaSuffix);
}
JavaStream<T> stream = mkStream.apply(config);
stream.initArrays();
final boolean ok;
switch (config.benchmark) {
case ALL:
Entry<Timings<Duration>, T> results = stream.runAll(opt.numtimes);
ok = checkSolutions(stream.data(), config, Optional.of(results.getValue()));
Timings<Duration> timings = results.getKey();
tabulateCsv(
opt.csv,
mkCsvRow(timings.copy, "Copy", 2 * arrayBytes, megaScale, opt),
mkCsvRow(timings.mul, "Mul", 2 * arrayBytes, megaScale, opt),
mkCsvRow(timings.add, "Add", 3 * arrayBytes, megaScale, opt),
mkCsvRow(timings.triad, "Triad", 3 * arrayBytes, megaScale, opt),
mkCsvRow(timings.dot, "Dot", 2 * arrayBytes, megaScale, opt));
break;
case NSTREAM:
List<Duration> nstreamResults = stream.runNStream(opt.numtimes);
ok = checkSolutions(stream.data(), config, Optional.empty());
tabulateCsv(opt.csv, mkCsvRow(nstreamResults, "Nstream", 4 * arrayBytes, megaScale, opt));
break;
case TRIAD:
Duration triadResult = stream.runTriad(opt.numtimes);
ok = checkSolutions(stream.data(), config, Optional.empty());
int triadTotalBytes = 3 * arrayBytes * opt.numtimes;
double bandwidth = gigaScale * (triadTotalBytes / durationToSeconds(triadResult));
System.out.printf("Runtime (seconds): %.5f%n", durationToSeconds(triadResult));
System.out.printf("Bandwidth (%s/s): %.3f%n", gigaSuffix, bandwidth);
break;
default:
throw new AssertionError();
}
return ok;
}
private static <T extends Number> boolean checkWithinTolerance(
String name, T[] xs, T gold, T tolerance) {
// it's ok to default to double for error calculation
double error =
Arrays.stream(xs)
.mapToDouble(x -> Math.abs(minus(x, gold).doubleValue()))
.summaryStatistics()
.getAverage();
boolean failed = error > tolerance.doubleValue();
if (failed) {
System.err.printf("Validation failed on %s. Average error %s%n", name, error);
}
return !failed;
}
@SuppressWarnings("OptionalUsedAsFieldOrParameterType")
static <T extends Number> boolean checkSolutions(
Data<T> data, Config<T> config, Optional<T> dotSum) {
T goldA = config.initA;
T goldB = config.initB;
T goldC = config.initC;
for (int i = 0; i < config.options.numtimes; i++) {
switch (config.benchmark) {
case ALL:
goldC = goldA;
goldB = times(config.scalar, goldC);
goldC = plus(goldA, goldB);
goldA = plus(goldB, times(config.scalar, goldC));
break;
case TRIAD:
goldA = plus(goldB, times(config.scalar, goldC));
break;
case NSTREAM:
goldA = plus(goldA, plus(goldB, times(config.scalar, goldC)));
break;
}
}
T tolerance = times(config.ulp, from(config.evidence, 100));
boolean aValid = checkWithinTolerance("a", data.a, goldA, tolerance);
boolean bValid = checkWithinTolerance("b", data.b, goldB, tolerance);
boolean cValid = checkWithinTolerance("c", data.c, goldC, tolerance);
final T finalGoldA = goldA;
final T finalGoldB = goldB;
boolean sumValid =
dotSum
.map(
actual -> {
T goldSum =
times(
times(finalGoldA, finalGoldB),
from(config.evidence, config.options.arraysize));
double error = Math.abs(divide(minus(actual, goldSum), goldSum).doubleValue());
boolean failed = error > config.options.dotTolerance;
if (failed) {
System.err.printf(
"Validation failed on sum. Error %s \nSum was %s but should be %s%n",
error, actual, goldSum);
}
return !failed;
})
.orElse(true);
return aValid && bValid && cValid && sumValid;
}
private static double durationToSeconds(Duration d) {
return d.toNanos() / (double) TimeUnit.SECONDS.toNanos(1);
}
private static List<Entry<String, String>> mkCsvRow(
List<Duration> xs, String name, int totalBytes, double megaScale, Options opt) {
DoubleSummaryStatistics stats =
xs.stream().skip(1).mapToDouble(Main::durationToSeconds).summaryStatistics();
if (stats.getCount() <= 0) {
throw new IllegalArgumentException("No min/max for " + name + "(size=" + totalBytes + ")");
}
double mbps = megaScale * (double) totalBytes / stats.getMin();
return opt.csv
? Arrays.asList(
new SimpleImmutableEntry<>("function", name),
new SimpleImmutableEntry<>("num_times", opt.numtimes + ""),
new SimpleImmutableEntry<>("n_elements", opt.arraysize + ""),
new SimpleImmutableEntry<>("sizeof", totalBytes + ""),
new SimpleImmutableEntry<>(
"max_m" + (opt.mibibytes ? "i" : "") + "bytes_per_sec", mbps + ""),
new SimpleImmutableEntry<>("min_runtime", stats.getMin() + ""),
new SimpleImmutableEntry<>("max_runtime", stats.getMax() + ""),
new SimpleImmutableEntry<>("avg_runtime", stats.getAverage() + ""))
: Arrays.asList(
new SimpleImmutableEntry<>("Function", name),
new SimpleImmutableEntry<>(
"M" + (opt.mibibytes ? "i" : "") + "Bytes/sec", String.format("%.3f", mbps)),
new SimpleImmutableEntry<>("Min (sec)", String.format("%.5f", stats.getMin())),
new SimpleImmutableEntry<>("Max", String.format("%.5f", stats.getMax())),
new SimpleImmutableEntry<>("Average", String.format("%.5f", stats.getAverage())));
}
private static String padSpace(String s, int length) {
if (length == 0) return s;
return String.format("%1$-" + length + "s", s);
}
@SafeVarargs
@SuppressWarnings("varargs")
private static void tabulateCsv(boolean csv, List<Entry<String, String>>... rows) {
if (rows.length == 0) throw new IllegalArgumentException("Empty tabulation");
int padding = csv ? 0 : 12;
String sep = csv ? "," : "";
System.out.println(
rows[0].stream().map(x -> padSpace(x.getKey(), padding)).collect(Collectors.joining(sep)));
for (List<Entry<String, String>> row : rows) {
System.out.println(
row.stream().map(x -> padSpace(x.getValue(), padding)).collect(Collectors.joining(sep)));
}
}
private static final String VERSION = "3.4";
private static final float START_SCALAR = 0.4f;
private static final float START_A = 0.1f;
private static final float START_B = 0.2f;
private static final float START_C = 0.0f;
private static final List<Implementation> IMPLEMENTATIONS =
Arrays.asList(
new Implementation("jdk-stream", JdkStreams.FLOAT, JdkStreams.DOUBLE),
new Implementation("jdk-plain", PlainStream.FLOAT, PlainStream.DOUBLE),
new Implementation("tornadovm", TornadoVMStreams.FLOAT, TornadoVMStreams.DOUBLE),
new Implementation("aparapi", AparapiStreams.FLOAT, AparapiStreams.DOUBLE));
public static int run(String[] args) {
Options opt = new Options();
JCommander.newBuilder().addObject(opt).build().parse(args);
final Benchmark benchmark;
if (opt.nstreamOnly && opt.triadOnly)
throw new RuntimeException(
"Both triad and nstream are enabled, pick one or omit both to run all benchmarks");
else if (opt.nstreamOnly) benchmark = Benchmark.NSTREAM;
else if (opt.triadOnly) benchmark = Benchmark.TRIAD;
else benchmark = Benchmark.ALL;
final Config<Float> floatConfig =
new Config<>(
opt,
benchmark,
Float.BYTES,
Float.class, // XXX not Float.TYPE, we want the boxed one
Math.ulp(1.f),
START_SCALAR,
START_A,
START_B,
START_C);
final Config<Double> doubleConfig =
new Config<>(
opt,
benchmark,
Double.BYTES,
Double.class, // XXX not Double.TYPE, we want the boxed one
Math.ulp(1.d),
(double) START_SCALAR,
(double) START_A,
(double) START_B,
(double) START_C);
if (opt.list) {
System.out.println("Set implementation with --impl <IMPL> and device with --device <N>:");
for (Implementation entry : IMPLEMENTATIONS) {
System.out.println("Implementation: " + entry.name);
try {
List<String> devices = entry.makeDouble.apply(doubleConfig).listDevices();
for (int i = 0; i < devices.size(); i++) {
System.out.println("\t[" + i + "] " + devices.get(i));
}
} catch (Exception e) {
System.out.println("\t(Unsupported: " + e.getMessage() + ")");
}
}
return 0;
}
String implName = (opt.impl.isEmpty()) ? IMPLEMENTATIONS.get(0).name : opt.impl;
Implementation impl =
IMPLEMENTATIONS.stream()
.filter(x -> implName.compareToIgnoreCase(x.name) == 0)
.findFirst()
.orElseThrow(
() ->
new IllegalArgumentException("Implementation " + opt.impl + " does not exist"));
boolean ok =
opt.useFloat
? run(impl.name, floatConfig, impl.makeFloat)
: run(impl.name, doubleConfig, impl.makeDouble);
return ok ? 0 : 1;
}
public static void main(String[] args) {
System.exit(run(args));
}
}
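As a brief illustration of the Options flags documented above, the benchmark can also be driven programmatically through Main.run, which is how the smoke test later in this commit invokes it; the flags shown are the real ones defined in Options, while the MainExample wrapper class is hypothetical.
package javastream;
/** Hypothetical driver sketch; equivalent to passing the same flags on the command line. */
final class MainExample {
  public static void main(String[] args) {
    // Main.run parses the flags with JCommander and returns 0 only if verification passes.
    int exitCode =
        Main.run(new String[] {"--impl", "jdk-stream", "--arraysize", "2048", "--numtimes", "10"});
    System.exit(exitCode);
  }
}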

View File

@ -0,0 +1,129 @@
package javastream.aparapi;
import com.aparapi.device.Device;
import com.aparapi.device.Device.TYPE;
import com.aparapi.device.JavaDevice;
import com.aparapi.device.OpenCLDevice;
import com.aparapi.internal.kernel.KernelManager;
import java.util.Collection;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import javastream.JavaStream;
import javastream.Main.Config;
public final class AparapiStreams {
private AparapiStreams() {}
public static final Function<Config<Double>, JavaStream<Double>> DOUBLE =
config -> new Generic<>(config, SpecialisedDoubleKernel::new);
public static final Function<Config<Float>, JavaStream<Float>> FLOAT =
config -> new Generic<>(config, SpecialisedFloatKernel::new);
private static List<Device> enumerateDevices() {
// JavaDevice.SEQUENTIAL doesn't work when arraysize > 1, so we omit it entirely
Stream<JavaDevice> cpuDevices = Stream.of(JavaDevice.ALTERNATIVE_ALGORITHM);
Stream<OpenCLDevice> clDevices =
Stream.of(TYPE.values()).map(OpenCLDevice::listDevices).flatMap(Collection::stream);
return Stream.concat(clDevices, cpuDevices).collect(Collectors.toList());
}
private static String deviceName(Device device) {
return device.toString();
}
private static final class Generic<T extends Number> extends JavaStream<T> {
private final GenericAparapiStreamKernel<T> kernels;
Generic(Config<T> config, GenericAparapiStreamKernel.Factory<T> factory) {
super(config);
Device device = enumerateDevices().get(config.options.device);
final int numGroups;
final int workGroupSize;
if (device instanceof JavaDevice) {
numGroups = Runtime.getRuntime().availableProcessors();
workGroupSize =
config.typeSize * 2; // closest thing to CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE
} else if (device instanceof OpenCLDevice) {
numGroups = ((OpenCLDevice) device).getMaxComputeUnits();
workGroupSize = device.getMaxWorkGroupSize();
} else {
throw new AssertionError("Unknown device type " + device.getClass());
}
if (config.options.isVerboseBenchmark()) {
System.out.println("Using Aparapi OpenCL device: " + device);
System.out.println(" - numGroups : " + numGroups);
System.out.println(" - workGroupSize : " + workGroupSize);
String showCL = System.getProperty("com.aparapi.enableShowGeneratedOpenCL");
if (showCL == null || !showCL.equals("true")) {
System.out.println(
"(Add `-Dcom.aparapi.enableShowGeneratedOpenCL=true` to show generated OpenCL source)");
}
}
LinkedHashSet<Device> candidate = new LinkedHashSet<>();
candidate.add(device);
kernels = factory.create(config, numGroups, workGroupSize);
KernelManager.instance().setPreferredDevices(kernels, candidate);
}
@Override
public List<String> listDevices() {
return enumerateDevices().stream()
.map(AparapiStreams::deviceName)
.collect(Collectors.toList());
}
@Override
public void initArrays() {
kernels.init();
}
@Override
public void copy() {
kernels.copy();
}
@Override
public void mul() {
kernels.mul();
}
@Override
public void add() {
kernels.add();
}
@Override
public void triad() {
kernels.triad();
}
@Override
public void nstream() {
kernels.nstream();
}
@Override
public T dot() {
return kernels.dot();
}
@Override
public Data<T> data() {
return kernels.syncAndDispose();
}
}
}

View File

@ -0,0 +1,68 @@
package javastream.aparapi;
import com.aparapi.Kernel;
import com.aparapi.Range;
import javastream.JavaStream.Data;
import javastream.Main.Config;
abstract class GenericAparapiStreamKernel<T> extends Kernel {
protected static final int FN_COPY = 1;
protected static final int FN_MUL = 2;
protected static final int FN_ADD = 3;
protected static final int FN_TRIAD = 4;
protected static final int FN_NSTREAM = 5;
protected static final int FN_DOT = 6;
protected final Config<T> config;
protected final int arraysize, numGroups, workGroupSize;
interface Factory<T> {
GenericAparapiStreamKernel<T> create(Config<T> config, int numGroups, int workGroupSize);
}
GenericAparapiStreamKernel(Config<T> config, int numGroups, int workGroupSize) {
this.config = config;
this.arraysize = config.options.arraysize;
this.numGroups = numGroups;
this.workGroupSize = workGroupSize;
setExplicit(true);
}
protected int function;
public abstract void init();
public void copy() {
function = FN_COPY;
execute(arraysize);
}
public void mul() {
function = FN_MUL;
execute(arraysize);
}
public void add() {
function = FN_ADD;
execute(arraysize);
}
public void triad() {
function = FN_TRIAD;
execute(arraysize);
}
public void nstream() {
function = FN_NSTREAM;
execute(arraysize);
}
protected Kernel partialDot() {
function = FN_DOT;
return execute(Range.create(numGroups * workGroupSize, workGroupSize));
}
abstract T dot();
abstract Data<T> syncAndDispose();
}

View File

@ -0,0 +1,74 @@
package javastream.aparapi;
import java.util.Arrays;
import javastream.JavaStream;
import javastream.JavaStream.Data;
import javastream.Main.Config;
final class SpecialisedDoubleKernel extends GenericAparapiStreamKernel<Double> {
private final double scalar;
final double[] a, b, c;
private final double[] partialSum;
@Local private final double[] workGroupSum;
SpecialisedDoubleKernel(Config<Double> config, int numGroups, int workGroupSize) {
super(config, numGroups, workGroupSize);
this.scalar = config.scalar;
this.a = new double[this.arraysize];
this.b = new double[this.arraysize];
this.c = new double[this.arraysize];
this.partialSum = new double[numGroups];
this.workGroupSum = new double[workGroupSize];
}
@SuppressWarnings("DuplicatedCode")
@Override
public void run() {
int i = getGlobalId();
if (function == FN_COPY) {
c[i] = a[i];
} else if (function == FN_MUL) {
b[i] = scalar * c[i];
} else if (function == FN_ADD) {
c[i] = a[i] + b[i];
} else if (function == FN_TRIAD) {
a[i] = b[i] + scalar * c[i];
} else if (function == FN_NSTREAM) {
a[i] += b[i] + scalar * c[i];
} else if (function == FN_DOT) {
int localId = getLocalId(0);
workGroupSum[localId] = 0.0;
for (; i < arraysize; i += getGlobalSize(0)) workGroupSum[localId] += a[i] * b[i];
for (int offset = getLocalSize(0) / 2; offset > 0; offset /= 2) {
localBarrier();
if (localId < offset) {
workGroupSum[localId] += workGroupSum[localId + offset];
}
}
if (localId == 0) partialSum[getGroupId(0)] = workGroupSum[localId];
}
}
@Override
public void init() {
Arrays.fill(a, config.initA);
Arrays.fill(b, config.initB);
Arrays.fill(c, config.initC);
put(a).put(b).put(c);
}
@Override
public Double dot() {
partialDot().get(partialSum);
double sum = 0;
for (double v : partialSum) sum += v;
return sum;
}
@Override
public Data<Double> syncAndDispose() {
get(a).get(b).get(c).dispose();
return new Data<>(JavaStream.boxed(a), JavaStream.boxed(b), JavaStream.boxed(c));
}
}

View File

@ -0,0 +1,75 @@
package javastream.aparapi;
import static javastream.JavaStream.boxed;
import java.util.Arrays;
import javastream.JavaStream.Data;
import javastream.Main.Config;
final class SpecialisedFloatKernel extends GenericAparapiStreamKernel<Float> {
private final float scalar;
final float[] a, b, c;
private final float[] partialSum;
@Local private final float[] workGroupSum;
SpecialisedFloatKernel(Config<Float> config, int numGroups, int workGroupSize) {
super(config, numGroups, workGroupSize);
this.scalar = config.scalar;
this.a = new float[this.arraysize];
this.b = new float[this.arraysize];
this.c = new float[this.arraysize];
this.partialSum = new float[numGroups];
this.workGroupSum = new float[workGroupSize];
}
@SuppressWarnings("DuplicatedCode")
@Override
public void run() {
int i = getGlobalId();
if (function == FN_COPY) {
c[i] = a[i];
} else if (function == FN_MUL) {
b[i] = scalar * c[i];
} else if (function == FN_ADD) {
c[i] = a[i] + b[i];
} else if (function == FN_TRIAD) {
a[i] = b[i] + scalar * c[i];
} else if (function == FN_NSTREAM) {
a[i] += b[i] + scalar * c[i];
} else if (function == FN_DOT) {
int localId = getLocalId(0);
workGroupSum[localId] = 0.f;
for (; i < arraysize; i += getGlobalSize(0)) workGroupSum[localId] += a[i] * b[i];
for (int offset = getLocalSize(0) / 2; offset > 0; offset /= 2) {
localBarrier();
if (localId < offset) {
workGroupSum[localId] += workGroupSum[localId + offset];
}
}
if (localId == 0) partialSum[getGroupId(0)] = workGroupSum[localId];
}
}
@Override
public void init() {
Arrays.fill(a, config.initA);
Arrays.fill(b, config.initB);
Arrays.fill(c, config.initC);
put(a).put(b).put(c);
}
@Override
public Float dot() {
partialDot().get(partialSum);
float sum = 0;
for (float v : partialSum) sum += v;
return sum;
}
@Override
public Data<Float> syncAndDispose() {
get(a).get(b).get(c).dispose();
return new Data<>(boxed(a), boxed(b), boxed(c));
}
}

View File

@ -0,0 +1,92 @@
package javastream.jdk;
import static javastream.FractionalMaths.from;
import static javastream.FractionalMaths.plus;
import static javastream.FractionalMaths.times;
import java.lang.reflect.Array;
import java.util.Collections;
import java.util.List;
import javastream.JavaStream;
import javastream.Main.Config;
final class GenericPlainStream<T extends Number> extends JavaStream<T> {
private final T[] a;
private final T[] b;
private final T[] c;
@SuppressWarnings("unchecked")
GenericPlainStream(Config<T> config) {
super(config);
this.a = (T[]) Array.newInstance(config.evidence, config.options.arraysize);
this.b = (T[]) Array.newInstance(config.evidence, config.options.arraysize);
this.c = (T[]) Array.newInstance(config.evidence, config.options.arraysize);
}
@Override
public List<String> listDevices() {
return Collections.singletonList("JVM");
}
@Override
public void initArrays() {
for (int i = 0; i < config.options.arraysize; i++) {
a[i] = config.initA;
b[i] = config.initB;
c[i] = config.initC;
}
}
@SuppressWarnings("ManualArrayCopy")
@Override
public void copy() {
for (int i = 0; i < config.options.arraysize; i++) {
c[i] = a[i];
}
}
@Override
public void mul() {
for (int i = 0; i < config.options.arraysize; i++) {
b[i] = times(config.scalar, c[i]);
}
}
@Override
public void add() {
for (int i = 0; i < config.options.arraysize; i++) {
c[i] = plus(a[i], b[i]);
}
}
@Override
public void triad() {
for (int i = 0; i < config.options.arraysize; i++) {
a[i] = plus(b[i], times(config.scalar, c[i]));
}
}
@Override
public void nstream() {
for (int i = 0; i < config.options.arraysize; i++) {
a[i] = plus(a[i], plus(b[i], times(config.scalar, c[i])));
}
}
@Override
public T dot() {
T acc = from(config.evidence, 0);
for (int i = 0; i < config.options.arraysize; i++) {
acc = plus(acc, times(a[i], b[i]));
}
return acc;
}
@Override
public Data<T> data() {
return new Data<>(a, b, c);
}
}

View File

@ -0,0 +1,86 @@
package javastream.jdk;
import static javastream.FractionalMaths.from;
import static javastream.FractionalMaths.plus;
import static javastream.FractionalMaths.times;
import java.lang.reflect.Array;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.stream.IntStream;
import javastream.FractionalMaths;
import javastream.JavaStream;
import javastream.Main.Config;
/**
* We use
*
* <pre>Arrays.parallelSetAll</pre>
*
* <p>here as it internally calls
*
* <pre>IntStream.range(0, array.length).parallel().forEach(...)</pre>
*/
final class GenericStream<T extends Number> extends JavaStream<T> {
private final T[] a, b, c;
@SuppressWarnings("unchecked")
GenericStream(Config<T> config) {
super(config);
this.a = (T[]) Array.newInstance(config.evidence, config.options.arraysize);
this.b = (T[]) Array.newInstance(config.evidence, config.options.arraysize);
this.c = (T[]) Array.newInstance(config.evidence, config.options.arraysize);
}
@Override
public List<String> listDevices() {
return Collections.singletonList("JVM");
}
@Override
public void initArrays() {
Arrays.parallelSetAll(a, i -> config.initA);
Arrays.parallelSetAll(b, i -> config.initB);
Arrays.parallelSetAll(c, i -> config.initC);
}
@Override
public void copy() {
Arrays.parallelSetAll(c, i -> a[i]);
}
@Override
public void mul() {
Arrays.parallelSetAll(b, i -> times(config.scalar, c[i]));
}
@Override
public void add() {
Arrays.parallelSetAll(c, i -> plus(a[i], b[i]));
}
@Override
public void triad() {
Arrays.parallelSetAll(a, i -> plus(b[i], times(config.scalar, c[i])));
}
@Override
public void nstream() {
Arrays.parallelSetAll(a, i -> plus(a[i], plus(b[i], times(config.scalar, c[i]))));
}
@Override
public T dot() {
return IntStream.range(0, config.options.arraysize)
.parallel()
.mapToObj(i -> times(a[i], b[i]))
.reduce(from(config.evidence, 0), FractionalMaths::plus);
}
@Override
public Data<T> data() {
return new Data<>(a, b, c);
}
}
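A small self-contained sketch of the equivalence the Javadoc above appeals to, namely that Arrays.parallelSetAll behaves like an explicit parallel IntStream over the indices; the ParallelSetAllExample class is hypothetical and illustrative only.
package javastream.jdk;
import java.util.Arrays;
import java.util.stream.IntStream;
/** Hypothetical illustration of the parallelSetAll equivalence noted above. */
final class ParallelSetAllExample {
  public static void main(String[] args) {
    double[] viaSetAll = new double[8];
    double[] viaStream = new double[8];
    // Fill every index from the same generator, potentially in parallel...
    Arrays.parallelSetAll(viaSetAll, i -> 0.1 * i);
    // ...which is effectively IntStream.range(0, n).parallel().forEach(...) spelled out by hand.
    IntStream.range(0, viaStream.length).parallel().forEach(i -> viaStream[i] = 0.1 * i);
    System.out.println(Arrays.equals(viaSetAll, viaStream)); // prints true
  }
}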

View File

@ -0,0 +1,26 @@
package javastream.jdk;
import java.util.AbstractMap.SimpleImmutableEntry;
import java.util.function.Function;
import javastream.JavaStream;
import javastream.JavaStream.EnumeratedStream;
import javastream.Main.Config;
public final class JdkStreams {
private JdkStreams() {}
public static final Function<Config<Float>, JavaStream<Float>> FLOAT =
config ->
new EnumeratedStream<>(
config,
new SimpleImmutableEntry<>("specialised", SpecialisedFloatStream::new),
new SimpleImmutableEntry<>("generic", GenericStream::new));
public static final Function<Config<Double>, JavaStream<Double>> DOUBLE =
config ->
new EnumeratedStream<>(
config,
new SimpleImmutableEntry<>("specialised", SpecialisedDoubleStream::new),
new SimpleImmutableEntry<>("generic", GenericStream::new));
}

View File

@ -0,0 +1,26 @@
package javastream.jdk;
import java.util.AbstractMap.SimpleImmutableEntry;
import java.util.function.Function;
import javastream.JavaStream;
import javastream.JavaStream.EnumeratedStream;
import javastream.Main.Config;
public final class PlainStream {
private PlainStream() {}
public static final Function<Config<Float>, JavaStream<Float>> FLOAT =
config ->
new EnumeratedStream<>(
config,
new SimpleImmutableEntry<>("specialised", SpecialisedPlainFloatStream::new),
new SimpleImmutableEntry<>("generic", GenericPlainStream::new));
public static final Function<Config<Double>, JavaStream<Double>> DOUBLE =
config ->
new EnumeratedStream<>(
config,
new SimpleImmutableEntry<>("specialised", SpecialisedPlainDoubleStream::new),
new SimpleImmutableEntry<>("generic", GenericPlainStream::new));
}

View File

@ -0,0 +1,84 @@
package javastream.jdk;
import java.util.Collections;
import java.util.List;
import java.util.stream.IntStream;
import javastream.JavaStream;
import javastream.Main.Config;
final class SpecialisedDoubleStream extends JavaStream<Double> {
private final double[] a, b, c;
SpecialisedDoubleStream(Config<Double> config) {
super(config);
this.a = new double[config.options.arraysize];
this.b = new double[config.options.arraysize];
this.c = new double[config.options.arraysize];
}
@Override
public List<String> listDevices() {
return Collections.singletonList("JVM");
}
@Override
public void initArrays() {
IntStream.range(0, config.options.arraysize) //
.parallel()
.forEach(
i -> {
a[i] = config.initA;
b[i] = config.initB;
c[i] = config.initC;
});
}
@Override
public void copy() {
IntStream.range(0, config.options.arraysize) //
.parallel()
.forEach(i -> c[i] = a[i]);
}
@Override
public void mul() {
IntStream.range(0, config.options.arraysize) //
.parallel()
.forEach(i -> b[i] = config.scalar * c[i]);
}
@Override
public void add() {
IntStream.range(0, config.options.arraysize) //
.parallel()
.forEach(i -> c[i] = a[i] + b[i]);
}
@Override
public void triad() {
IntStream.range(0, config.options.arraysize) //
.parallel()
.forEach(i -> a[i] = b[i] + config.scalar * c[i]);
}
@Override
public void nstream() {
IntStream.range(0, config.options.arraysize) //
.parallel()
.forEach(i -> a[i] += b[i] + config.scalar * c[i]);
}
@Override
public Double dot() {
return IntStream.range(0, config.options.arraysize)
.parallel()
.mapToDouble(i -> a[i] * b[i])
.reduce(0f, Double::sum);
}
@Override
public Data<Double> data() {
return new Data<>(boxed(a), boxed(b), boxed(c));
}
}

View File

@ -0,0 +1,84 @@
package javastream.jdk;
import java.util.Collections;
import java.util.List;
import java.util.stream.IntStream;
import javastream.JavaStream;
import javastream.Main.Config;
final class SpecialisedFloatStream extends JavaStream<Float> {
private final float[] a, b, c;
SpecialisedFloatStream(Config<Float> config) {
super(config);
this.a = new float[config.options.arraysize];
this.b = new float[config.options.arraysize];
this.c = new float[config.options.arraysize];
}
@Override
public List<String> listDevices() {
return Collections.singletonList("JVM");
}
@Override
public void initArrays() {
IntStream.range(0, config.options.arraysize) //
.parallel()
.forEach(
i -> {
a[i] = config.initA;
b[i] = config.initB;
c[i] = config.initC;
});
}
@Override
public void copy() {
IntStream.range(0, config.options.arraysize) //
.parallel()
.forEach(i -> c[i] = a[i]);
}
@Override
public void mul() {
IntStream.range(0, config.options.arraysize) //
.parallel()
.forEach(i -> b[i] = config.scalar * c[i]);
}
@Override
public void add() {
IntStream.range(0, config.options.arraysize) //
.parallel()
.forEach(i -> c[i] = a[i] + b[i]);
}
@Override
public void triad() {
IntStream.range(0, config.options.arraysize) //
.parallel()
.forEach(i -> a[i] = b[i] + config.scalar * c[i]);
}
@Override
public void nstream() {
IntStream.range(0, config.options.arraysize) //
.parallel()
.forEach(i -> a[i] += b[i] + config.scalar * c[i]);
}
@Override
public Float dot() {
return IntStream.range(0, config.options.arraysize) //
.parallel()
.mapToObj(i -> a[i] * b[i]) // XXX there isn't a specialised Stream for floats
.reduce(0f, Float::sum);
}
@Override
public Data<Float> data() {
return new Data<>(boxed(a), boxed(b), boxed(c));
}
}

View File

@ -0,0 +1,84 @@
package javastream.jdk;
import java.util.Collections;
import java.util.List;
import javastream.JavaStream;
import javastream.Main.Config;
final class SpecialisedPlainDoubleStream extends JavaStream<Double> {
private final double[] a;
private final double[] b;
private final double[] c;
SpecialisedPlainDoubleStream(Config<Double> config) {
super(config);
this.a = new double[config.options.arraysize];
this.b = new double[config.options.arraysize];
this.c = new double[config.options.arraysize];
}
@Override
public List<String> listDevices() {
return Collections.singletonList("JVM");
}
@Override
public void initArrays() {
for (int i = 0; i < config.options.arraysize; i++) {
a[i] = config.initA;
b[i] = config.initB;
c[i] = config.initC;
}
}
@SuppressWarnings("ManualArrayCopy")
@Override
public void copy() {
for (int i = 0; i < config.options.arraysize; i++) {
c[i] = a[i];
}
}
@Override
public void mul() {
for (int i = 0; i < config.options.arraysize; i++) {
b[i] = config.scalar * c[i];
}
}
@Override
public void add() {
for (int i = 0; i < config.options.arraysize; i++) {
c[i] = a[i] + b[i];
}
}
@Override
public void triad() {
for (int i = 0; i < config.options.arraysize; i++) {
a[i] = b[i] + config.scalar * c[i];
}
}
@Override
public void nstream() {
for (int i = 0; i < config.options.arraysize; i++) {
a[i] += b[i] + config.scalar * c[i];
}
}
@Override
public Double dot() {
double acc = 0f;
for (int i = 0; i < config.options.arraysize; i++) {
acc += a[i] * b[i];
}
return acc;
}
@Override
public Data<Double> data() {
return new Data<>(boxed(a), boxed(b), boxed(c));
}
}

View File

@ -0,0 +1,84 @@
package javastream.jdk;
import java.util.Collections;
import java.util.List;
import javastream.JavaStream;
import javastream.Main.Config;
final class SpecialisedPlainFloatStream extends JavaStream<Float> {
private final float[] a;
private final float[] b;
private final float[] c;
SpecialisedPlainFloatStream(Config<Float> config) {
super(config);
this.a = new float[config.options.arraysize];
this.b = new float[config.options.arraysize];
this.c = new float[config.options.arraysize];
}
@Override
public List<String> listDevices() {
return Collections.singletonList("JVM");
}
@Override
public void initArrays() {
for (int i = 0; i < config.options.arraysize; i++) {
a[i] = config.initA;
b[i] = config.initB;
c[i] = config.initC;
}
}
@SuppressWarnings("ManualArrayCopy")
@Override
public void copy() {
for (int i = 0; i < config.options.arraysize; i++) {
c[i] = a[i];
}
}
@Override
public void mul() {
for (int i = 0; i < config.options.arraysize; i++) {
b[i] = config.scalar * c[i];
}
}
@Override
public void add() {
for (int i = 0; i < config.options.arraysize; i++) {
c[i] = a[i] + b[i];
}
}
@Override
public void triad() {
for (int i = 0; i < config.options.arraysize; i++) {
a[i] = b[i] + config.scalar * c[i];
}
}
@Override
public void nstream() {
for (int i = 0; i < config.options.arraysize; i++) {
a[i] += b[i] + config.scalar * c[i];
}
}
@Override
public Float dot() {
float acc = 0f;
for (int i = 0; i < config.options.arraysize; i++) {
acc += a[i] * b[i];
}
return acc;
}
@Override
public Data<Float> data() {
return new Data<>(boxed(a), boxed(b), boxed(c));
}
}

View File

@ -0,0 +1,98 @@
package javastream.tornadovm;
import java.util.List;
import java.util.stream.Collectors;
import javastream.JavaStream;
import javastream.Main.Config;
import uk.ac.manchester.tornado.api.TaskSchedule;
import uk.ac.manchester.tornado.api.TornadoRuntimeCI;
import uk.ac.manchester.tornado.api.common.TornadoDevice;
import uk.ac.manchester.tornado.api.runtime.TornadoRuntime;
abstract class GenericTornadoVMStream<T> extends JavaStream<T> {
protected final TornadoDevice device;
protected TaskSchedule copyTask;
protected TaskSchedule mulTask;
protected TaskSchedule addTask;
protected TaskSchedule triadTask;
protected TaskSchedule nstreamTask;
protected TaskSchedule dotTask;
GenericTornadoVMStream(Config<T> config) {
super(config);
try {
TornadoRuntimeCI runtime = TornadoRuntime.getTornadoRuntime();
List<TornadoDevice> devices = TornadoVMStreams.enumerateDevices(runtime);
device = devices.get(config.options.device);
if (config.options.isVerboseBenchmark()) {
System.out.println("Using TornadoVM device:");
System.out.println(" - Name : " + device.getDescription());
System.out.println(" - Id : " + device.getDeviceName());
System.out.println(" - Platform : " + device.getPlatformName());
System.out.println(" - Backend : " + device.getTornadoVMBackend().name());
}
} catch (Throwable e) {
throw new RuntimeException(
"Unable to initialise TornadoVM, make sure you are running the binary with the `tornado -jar ...` wrapper and not `java -jar ...`",
e);
}
}
protected static TaskSchedule mkSchedule() {
return new TaskSchedule("");
}
@Override
public List<String> listDevices() {
return TornadoVMStreams.enumerateDevices(TornadoRuntime.getTornadoRuntime()).stream()
.map(d -> d.getDescription() + "(" + d.getDeviceName() + ")")
.collect(Collectors.toList());
}
@Override
public void initArrays() {
this.copyTask.warmup();
this.mulTask.warmup();
this.addTask.warmup();
this.triadTask.warmup();
this.nstreamTask.warmup();
this.dotTask.warmup();
}
@Override
public void copy() {
this.copyTask.execute();
}
@Override
public void mul() {
this.mulTask.execute();
}
@Override
public void add() {
this.addTask.execute();
}
@Override
public void triad() {
this.triadTask.execute();
}
@Override
public void nstream() {
this.nstreamTask.execute();
}
protected abstract T getSum();
@Override
public T dot() {
this.dotTask.execute();
return getSum();
}
}
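As the exception message in the constructor indicates, this backend is expected to be launched through the TornadoVM wrapper rather than plain java; assuming the shaded jar name from the pom and the default Maven output directory, a plausible invocation would be `tornado -jar target/java-stream.jar --impl tornadovm --arraysize 2048`, whereas `java -jar` lands in the RuntimeException path above.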

View File

@ -0,0 +1,88 @@
package javastream.tornadovm;
import java.util.Arrays;
import javastream.Main.Config;
import uk.ac.manchester.tornado.api.annotations.Parallel;
import uk.ac.manchester.tornado.api.annotations.Reduce;
final class SpecialisedDouble extends GenericTornadoVMStream<Double> {
@SuppressWarnings("ManualArrayCopy")
private static void copy(int size, double[] a, double[] c) {
for (@Parallel int i = 0; i < size; i++) {
c[i] = a[i];
}
}
private static void mul(int size, double[] b, double[] c, double scalar) {
for (@Parallel int i = 0; i < size; i++) {
b[i] = scalar * c[i];
}
}
private static void add(int size, double[] a, double[] b, double[] c) {
for (@Parallel int i = 0; i < size; i++) {
c[i] = a[i] + b[i];
}
}
private static void triad(int size, double[] a, double[] b, double[] c, double scalar) {
for (@Parallel int i = 0; i < size; i++) {
a[i] = b[i] + scalar * c[i];
}
}
private static void nstream(int size, double[] a, double[] b, double[] c, double scalar) {
for (@Parallel int i = 0; i < size; i++) {
a[i] += b[i] + scalar * c[i];
}
}
private static void dot_(
double[] a, double[] b, @Reduce double[] acc) { // prevent name clash with CL's dot
acc[0] = 0;
for (@Parallel int i = 0; i < a.length; i++) {
acc[0] += a[i] * b[i];
}
}
private final double[] a, b, c;
private final double[] dotSum;
@SuppressWarnings({"PrimitiveArrayArgumentToVarargsMethod", "DuplicatedCode"})
SpecialisedDouble(Config<Double> config) {
super(config);
final int size = config.options.arraysize;
final double scalar = config.scalar;
a = new double[size];
b = new double[size];
c = new double[size];
dotSum = new double[1];
this.copyTask = mkSchedule().task("", SpecialisedDouble::copy, size, a, c);
this.mulTask = mkSchedule().task("", SpecialisedDouble::mul, size, b, c, scalar);
this.addTask = mkSchedule().task("", SpecialisedDouble::add, size, a, b, c);
this.triadTask = mkSchedule().task("", SpecialisedDouble::triad, size, a, b, c, scalar);
this.nstreamTask = mkSchedule().task("", SpecialisedDouble::nstream, size, a, b, c, scalar);
this.dotTask = mkSchedule().task("", SpecialisedDouble::dot_, a, b, dotSum).streamOut(dotSum);
}
@Override
public void initArrays() {
super.initArrays();
Arrays.fill(a, config.initA);
Arrays.fill(b, config.initB);
Arrays.fill(c, config.initC);
TornadoVMStreams.xferToDevice(device, a, b, c);
}
@Override
protected Double getSum() {
return dotSum[0];
}
@Override
public Data<Double> data() {
TornadoVMStreams.xferFromDevice(device, a, b, c);
return new Data<>(boxed(a), boxed(b), boxed(c));
}
}

View File

@ -0,0 +1,88 @@
package javastream.tornadovm;
import java.util.Arrays;
import javastream.Main.Config;
import uk.ac.manchester.tornado.api.annotations.Parallel;
import uk.ac.manchester.tornado.api.annotations.Reduce;
final class SpecialisedFloat extends GenericTornadoVMStream<Float> {
@SuppressWarnings("ManualArrayCopy")
private static void copy(int size, float[] a, float[] c) {
for (@Parallel int i = 0; i < size; i++) {
c[i] = a[i];
}
}
private static void mul(int size, float[] b, float[] c, float scalar) {
for (@Parallel int i = 0; i < size; i++) {
b[i] = scalar * c[i];
}
}
private static void add(int size, float[] a, float[] b, float[] c) {
for (@Parallel int i = 0; i < size; i++) {
c[i] = a[i] + b[i];
}
}
private static void triad(int size, float[] a, float[] b, float[] c, float scalar) {
for (@Parallel int i = 0; i < size; i++) {
a[i] = b[i] + scalar * c[i];
}
}
private static void nstream(int size, float[] a, float[] b, float[] c, float scalar) {
for (@Parallel int i = 0; i < size; i++) {
a[i] += b[i] + scalar * c[i];
}
}
private static void dot_(
float[] a, float[] b, @Reduce float[] acc) { // prevent name clash with CL's dot
acc[0] = 0;
for (@Parallel int i = 0; i < a.length; i++) {
acc[0] += a[i] * b[i];
}
}
private final float[] a, b, c;
private final float[] dotSum;
@SuppressWarnings({"PrimitiveArrayArgumentToVarargsMethod", "DuplicatedCode"})
SpecialisedFloat(Config<Float> config) {
super(config);
final int size = config.options.arraysize;
final float scalar = config.scalar;
a = new float[size];
b = new float[size];
c = new float[size];
dotSum = new float[1];
this.copyTask = mkSchedule().task("", SpecialisedFloat::copy, size, a, c);
this.mulTask = mkSchedule().task("", SpecialisedFloat::mul, size, b, c, scalar);
this.addTask = mkSchedule().task("", SpecialisedFloat::add, size, a, b, c);
this.triadTask = mkSchedule().task("", SpecialisedFloat::triad, size, a, b, c, scalar);
this.nstreamTask = mkSchedule().task("", SpecialisedFloat::nstream, size, a, b, c, scalar);
this.dotTask = mkSchedule().task("", SpecialisedFloat::dot_, a, b, dotSum).streamOut(dotSum);
}
@Override
public void initArrays() {
super.initArrays();
Arrays.fill(a, config.initA);
Arrays.fill(b, config.initB);
Arrays.fill(c, config.initC);
TornadoVMStreams.xferToDevice(device, a, b, c);
}
@Override
protected Float getSum() {
return dotSum[0];
}
@Override
public Data<Float> data() {
TornadoVMStreams.xferFromDevice(device, a, b, c);
return new Data<>(boxed(a), boxed(b), boxed(c));
}
}

View File

@ -0,0 +1,42 @@
package javastream.tornadovm;
import java.util.List;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import javastream.JavaStream;
import javastream.Main.Config;
import uk.ac.manchester.tornado.api.TornadoRuntimeCI;
import uk.ac.manchester.tornado.api.common.TornadoDevice;
import uk.ac.manchester.tornado.api.mm.TornadoGlobalObjectState;
import uk.ac.manchester.tornado.api.runtime.TornadoRuntime;
public final class TornadoVMStreams {
private TornadoVMStreams() {}
static void xferToDevice(TornadoDevice device, Object... xs) {
for (Object x : xs) {
TornadoGlobalObjectState state = TornadoRuntime.getTornadoRuntime().resolveObject(x);
List<Integer> writeEvent = device.ensurePresent(x, state.getDeviceState(device), null, 0, 0);
if (writeEvent != null) writeEvent.forEach(e -> device.resolveEvent(e).waitOn());
}
}
static void xferFromDevice(TornadoDevice device, Object... xs) {
for (Object x : xs) {
TornadoGlobalObjectState state = TornadoRuntime.getTornadoRuntime().resolveObject(x);
device.resolveEvent(device.streamOut(x, 0, state.getDeviceState(device), null)).waitOn();
}
}
static List<TornadoDevice> enumerateDevices(TornadoRuntimeCI runtime) {
return IntStream.range(0, runtime.getNumDrivers())
.mapToObj(runtime::getDriver)
.flatMap(d -> IntStream.range(0, d.getDeviceCount()).mapToObj(d::getDevice))
.collect(Collectors.toList());
}
public static final Function<Config<Float>, JavaStream<Float>> FLOAT = SpecialisedFloat::new;
public static final Function<Config<Double>, JavaStream<Double>> DOUBLE = SpecialisedDouble::new;
}

View File

@ -0,0 +1,93 @@
package javastream;
import java.io.ByteArrayOutputStream;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.Arguments;
import org.junit.jupiter.params.provider.MethodSource;
public class SmokeTest {
// taken from https://stackoverflow.com/a/32146095/896997
private static <T> Stream<List<T>> ofCombinations(
List<? extends Collection<T>> collections, List<T> current) {
return collections.isEmpty()
? Stream.of(current)
: collections.get(0).stream()
.flatMap(
e -> {
List<T> list = new ArrayList<>(current);
list.add(e);
return ofCombinations(collections.subList(1, collections.size()), list);
});
}
@SuppressWarnings("unused")
private static Stream<Arguments> options() {
LinkedHashMap<String, List<Integer>> impls = new LinkedHashMap<>();
impls.put("jdk-stream", Arrays.asList(0, 1));
impls.put("jdk-plain", Arrays.asList(0, 1));
// skip aparapi as none of the jdk fallbacks work correctly
// skip tornadovm as it has no jdk fallback
List<String> configs =
impls.entrySet().stream()
.flatMap(
e ->
Stream.concat(Stream.of(""), e.getValue().stream().map(i -> "--device " + i))
.map(d -> "--impl " + e.getKey() + " " + d))
.collect(Collectors.toList());
return ofCombinations(
new ArrayList<>(
Arrays.asList(
configs,
Arrays.asList("", "--csv"),
// XXX floats usually have a ~1.0e-5 error, which misses the default 1.0e-8 dot tolerance
Arrays.asList("", "--float --dot-tolerance 1.0e-5"),
Arrays.asList("", "--triad-only", "--nstream-only"),
Arrays.asList("", "--mibibytes"))),
Collections.emptyList())
.map(
xs ->
Arguments.of(
xs.stream() //
.map(String::trim) //
.collect(Collectors.joining(" "))
.trim()));
}
@ParameterizedTest
@MethodSource("options")
void testIt(String args) {
String line = "--arraysize 2048 " + args;
// redirect stdout/stderr and only print if anything fails
ByteArrayOutputStream outContent = new ByteArrayOutputStream();
ByteArrayOutputStream errContent = new ByteArrayOutputStream();
PrintStream originalOut = System.out;
PrintStream originalErr = System.err;
System.setOut(new PrintStream(outContent));
System.setErr(new PrintStream(errContent));
int run = Main.run(line.split("\\s+"));
System.setOut(originalOut);
System.setErr(originalErr);
if (run != 0) {
System.out.println(outContent);
System.err.println(errContent);
Assertions.assertEquals(0, run, "`" + line + "` did not return 0");
}
}
}

View File

@ -0,0 +1,2 @@
indent = 2
margin = 100

5
src/julia/JuliaStream.jl/.gitignore vendored Normal file
View File

@ -0,0 +1,5 @@
*.jl.cov
*.jl.*.cov
*.jl.mem
/docs/build/
/docs/Manifest.toml

View File

@ -0,0 +1,415 @@
# This file is machine-generated - editing it directly is not advised
[[AMDGPU]]
deps = ["AbstractFFTs", "Adapt", "BinaryProvider", "CEnum", "GPUArrays", "GPUCompiler", "HIP_jll", "LLVM", "Libdl", "LinearAlgebra", "MacroTools", "Pkg", "Printf", "ROCmDeviceLibs_jll", "Random", "Requires", "Setfield", "hsa_rocr_jll"]
git-tree-sha1 = "d64c97447a753cfbf0158d6c7be513f34526d559"
uuid = "21141c5a-9bdb-4563-92ae-f87d6854732e"
version = "0.2.12"
[[AbstractFFTs]]
deps = ["LinearAlgebra"]
git-tree-sha1 = "485ee0867925449198280d4af84bdb46a2a404d0"
uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c"
version = "1.0.1"
[[Adapt]]
deps = ["LinearAlgebra"]
git-tree-sha1 = "84918055d15b3114ede17ac6a7182f68870c16f7"
uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
version = "3.3.1"
[[ArgParse]]
deps = ["Logging", "TextWrap"]
git-tree-sha1 = "3102bce13da501c9104df33549f511cd25264d7d"
uuid = "c7e460c6-2fb9-53a9-8c5b-16f535851c63"
version = "1.1.4"
[[ArgTools]]
uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f"
[[Artifacts]]
uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33"
[[Base64]]
uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f"
[[BinaryProvider]]
deps = ["Libdl", "Logging", "SHA"]
git-tree-sha1 = "ecdec412a9abc8db54c0efc5548c64dfce072058"
uuid = "b99e7846-7c00-51b0-8f62-c81ae34c0232"
version = "0.5.10"
[[Bzip2_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "19a35467a82e236ff51bc17a3a44b69ef35185a2"
uuid = "6e34b625-4abd-537c-b88f-471c36dfa7a0"
version = "1.0.8+0"
[[CEnum]]
git-tree-sha1 = "215a9aa4a1f23fbd05b92769fdd62559488d70e9"
uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82"
version = "0.4.1"
[[ConstructionBase]]
deps = ["LinearAlgebra"]
git-tree-sha1 = "f74e9d5388b8620b4cee35d4c5a618dd4dc547f4"
uuid = "187b0558-2788-49d3-abe0-74a17ed4e7c9"
version = "1.3.0"
[[Dates]]
deps = ["Printf"]
uuid = "ade2ca70-3891-5945-98fb-dc099432e06a"
[[Downloads]]
deps = ["ArgTools", "LibCURL", "NetworkOptions"]
uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6"
[[Elfutils_jll]]
deps = ["Artifacts", "Bzip2_jll", "JLLWrappers", "Libdl", "Pkg", "XZ_jll", "Zlib_jll", "argp_standalone_jll", "fts_jll", "obstack_jll"]
git-tree-sha1 = "8f9fcde6d89b0a3ca51cb2028beab462705c5436"
uuid = "ab5a07f8-06af-567f-a878-e8bb879eba5a"
version = "0.182.0+0"
[[ExprTools]]
git-tree-sha1 = "b7e3d17636b348f005f11040025ae8c6f645fe92"
uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04"
version = "0.1.6"
[[Future]]
deps = ["Random"]
uuid = "9fa8497b-333b-5362-9e8d-4d0656e87820"
[[GPUArrays]]
deps = ["AbstractFFTs", "Adapt", "LinearAlgebra", "Printf", "Random", "Serialization", "Statistics"]
git-tree-sha1 = "ececbf05f8904c92814bdbd0aafd5540b0bf2e9a"
uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
version = "7.0.1"
[[GPUCompiler]]
deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "TimerOutputs", "UUIDs"]
git-tree-sha1 = "4ed2616d5e656c8716736b64da86755467f26cf5"
uuid = "61eb1bfa-7361-4325-ad38-22787b887f55"
version = "0.12.9"
[[HIP_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "ROCmCompilerSupport_jll", "ROCmDeviceLibs_jll", "ROCmOpenCLRuntime_jll", "hsa_rocr_jll"]
git-tree-sha1 = "5097d8f7b6842156ab0928371b3d03fefd8decab"
uuid = "2696aab5-0948-5276-aa9a-2a86a37016b8"
version = "4.0.0+1"
[[InteractiveUtils]]
deps = ["Markdown"]
uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
[[JLLWrappers]]
deps = ["Preferences"]
git-tree-sha1 = "642a199af8b68253517b80bd3bfd17eb4e84df6e"
uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210"
version = "1.3.0"
[[LLVM]]
deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Printf", "Unicode"]
git-tree-sha1 = "23a47d417a3cd9c2e73c854bac7dd4731c105ef7"
uuid = "929cbde3-209d-540e-8aea-75f648917ca0"
version = "4.4.0"
[[LLVMExtra_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "9c360e5ce980b88bb31a7b086dbb19469008154b"
uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab"
version = "0.0.10+0"
[[LibCURL]]
deps = ["LibCURL_jll", "MozillaCACerts_jll"]
uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21"
[[LibCURL_jll]]
deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"]
uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0"
[[LibGit2]]
deps = ["Base64", "NetworkOptions", "Printf", "SHA"]
uuid = "76f85450-5226-5b5a-8eaa-529ad045b433"
[[LibSSH2_jll]]
deps = ["Artifacts", "Libdl", "MbedTLS_jll"]
uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8"
[[Libdl]]
uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
[[Libgcrypt_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Libgpg_error_jll", "Pkg"]
git-tree-sha1 = "64613c82a59c120435c067c2b809fc61cf5166ae"
uuid = "d4300ac3-e22c-5743-9152-c294e39db1e4"
version = "1.8.7+0"
[[Libglvnd_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libX11_jll", "Xorg_libXext_jll"]
git-tree-sha1 = "7739f837d6447403596a75d19ed01fd08d6f56bf"
uuid = "7e76a0d4-f3c7-5321-8279-8d96eeed0f29"
version = "1.3.0+3"
[[Libgpg_error_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "c333716e46366857753e273ce6a69ee0945a6db9"
uuid = "7add5ba3-2f88-524e-9cd5-f83b8a55f7b8"
version = "1.42.0+0"
[[Libiconv_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "42b62845d70a619f063a7da093d995ec8e15e778"
uuid = "94ce4f54-9a6c-5748-9c1c-f9c7231a4531"
version = "1.16.1+1"
[[LinearAlgebra]]
deps = ["Libdl"]
uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
[[Logging]]
uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"
[[MacroTools]]
deps = ["Markdown", "Random"]
git-tree-sha1 = "0fb723cd8c45858c22169b2e42269e53271a6df7"
uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
version = "0.5.7"
[[Markdown]]
deps = ["Base64"]
uuid = "d6f4376e-aef5-505a-96c1-9c027394607a"
[[MbedTLS_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1"
[[MozillaCACerts_jll]]
uuid = "14a3606d-f60d-562e-9121-12d972cd8159"
[[NUMA_jll]]
deps = ["Libdl", "Pkg"]
git-tree-sha1 = "778f9bd14400cff2c32ed357e12766ac0e3d766e"
uuid = "7f51dc2b-bb24-59f8-b771-bb1490e4195d"
version = "2.0.13+1"
[[NetworkOptions]]
uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908"
[[OrderedCollections]]
git-tree-sha1 = "85f8e6578bf1f9ee0d11e7bb1b1456435479d47c"
uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
version = "1.4.1"
[[Parameters]]
deps = ["OrderedCollections", "UnPack"]
git-tree-sha1 = "2276ac65f1e236e0a6ea70baff3f62ad4c625345"
uuid = "d96e819e-fc66-5662-9728-84c9c7592b0a"
version = "0.12.2"
[[Pkg]]
deps = ["Artifacts", "Dates", "Downloads", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"]
uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
[[Preferences]]
deps = ["TOML"]
git-tree-sha1 = "00cfd92944ca9c760982747e9a1d0d5d86ab1e5a"
uuid = "21216c6a-2e73-6563-6e65-726566657250"
version = "1.2.2"
[[Printf]]
deps = ["Unicode"]
uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7"
[[REPL]]
deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"]
uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb"
[[ROCmCompilerSupport_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "ROCmDeviceLibs_jll", "hsa_rocr_jll"]
git-tree-sha1 = "56ddcfb5d8b60c9f8c1bc619886f8d363fd1926d"
uuid = "8fbdd1d2-db62-5cd0-981e-905da1486e17"
version = "4.0.0+1"
[[ROCmDeviceLibs_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Zlib_jll"]
git-tree-sha1 = "d764f0f28b5af89aa004871a6a38e5d061f77257"
uuid = "873c0968-716b-5aa7-bb8d-d1e2e2aeff2d"
version = "4.0.0+0"
[[ROCmOpenCLRuntime_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Libglvnd_jll", "Pkg", "ROCmCompilerSupport_jll", "ROCmDeviceLibs_jll", "Xorg_libX11_jll", "Xorg_xorgproto_jll", "hsa_rocr_jll"]
git-tree-sha1 = "f9e3e2cb40a7990535efa7da9b9dd0e0b458a973"
uuid = "10ae2a08-2eea-53f8-8c20-eec175020e9f"
version = "4.0.0+1"
[[Random]]
deps = ["Serialization"]
uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
[[Requires]]
deps = ["UUIDs"]
git-tree-sha1 = "4036a3bd08ac7e968e27c203d45f5fff15020621"
uuid = "ae029012-a4dd-5104-9daa-d747884805df"
version = "1.1.3"
[[SHA]]
uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce"
[[Serialization]]
uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
[[Setfield]]
deps = ["ConstructionBase", "Future", "MacroTools", "Requires"]
git-tree-sha1 = "fca29e68c5062722b5b4435594c3d1ba557072a3"
uuid = "efcf1570-3423-57d1-acb7-fd33fddbac46"
version = "0.7.1"
[[Sockets]]
uuid = "6462fe0b-24de-5631-8697-dd941f90decc"
[[SparseArrays]]
deps = ["LinearAlgebra", "Random"]
uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
[[Statistics]]
deps = ["LinearAlgebra", "SparseArrays"]
uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
[[TOML]]
deps = ["Dates"]
uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76"
[[Tar]]
deps = ["ArgTools", "SHA"]
uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e"
[[TextWrap]]
git-tree-sha1 = "9250ef9b01b66667380cf3275b3f7488d0e25faf"
uuid = "b718987f-49a8-5099-9789-dcd902bef87d"
version = "1.0.1"
[[TimerOutputs]]
deps = ["ExprTools", "Printf"]
git-tree-sha1 = "209a8326c4f955e2442c07b56029e88bb48299c7"
uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
version = "0.5.12"
[[UUIDs]]
deps = ["Random", "SHA"]
uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
[[UnPack]]
git-tree-sha1 = "387c1f73762231e86e0c9c5443ce3b4a0a9a0c2b"
uuid = "3a884ed6-31ef-47d7-9d2a-63182c4928ed"
version = "1.0.2"
[[Unicode]]
uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
[[XML2_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Libiconv_jll", "Pkg", "Zlib_jll"]
git-tree-sha1 = "1acf5bdf07aa0907e0a37d3718bb88d4b687b74a"
uuid = "02c8fc9c-b97f-50b9-bbe4-9be30ff0a78a"
version = "2.9.12+0"
[[XSLT_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Libgcrypt_jll", "Libgpg_error_jll", "Libiconv_jll", "Pkg", "XML2_jll", "Zlib_jll"]
git-tree-sha1 = "91844873c4085240b95e795f692c4cec4d805f8a"
uuid = "aed1982a-8fda-507f-9586-7b0439959a61"
version = "1.1.34+0"
[[XZ_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "a921669cd9a45c23031fd4eb904f5cc3d20de415"
uuid = "ffd25f8a-64ca-5728-b0f7-c24cf3aae800"
version = "5.2.5+2"
[[Xorg_libX11_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libxcb_jll", "Xorg_xtrans_jll"]
git-tree-sha1 = "5be649d550f3f4b95308bf0183b82e2582876527"
uuid = "4f6342f7-b3d2-589e-9d20-edeb45f2b2bc"
version = "1.6.9+4"
[[Xorg_libXau_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "4e490d5c960c314f33885790ed410ff3a94ce67e"
uuid = "0c0b7dd1-d40b-584c-a123-a41640f87eec"
version = "1.0.9+4"
[[Xorg_libXdmcp_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "4fe47bd2247248125c428978740e18a681372dd4"
uuid = "a3789734-cfe1-5b06-b2d0-1dd0d9d62d05"
version = "1.1.3+4"
[[Xorg_libXext_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libX11_jll"]
git-tree-sha1 = "b7c0aa8c376b31e4852b360222848637f481f8c3"
uuid = "1082639a-0dae-5f34-9b06-72781eeb8cb3"
version = "1.3.4+4"
[[Xorg_libpthread_stubs_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "6783737e45d3c59a4a4c4091f5f88cdcf0908cbb"
uuid = "14d82f49-176c-5ed1-bb49-ad3f5cbd8c74"
version = "0.1.0+3"
[[Xorg_libxcb_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "XSLT_jll", "Xorg_libXau_jll", "Xorg_libXdmcp_jll", "Xorg_libpthread_stubs_jll"]
git-tree-sha1 = "daf17f441228e7a3833846cd048892861cff16d6"
uuid = "c7cfdc94-dc32-55de-ac96-5a1b8d977c5b"
version = "1.13.0+3"
[[Xorg_xorgproto_jll]]
deps = ["Libdl", "Pkg"]
git-tree-sha1 = "9a9eb8ce756fe0bca01b4be16da770e18d264972"
uuid = "c4d99508-4286-5418-9131-c86396af500b"
version = "2019.2.0+2"
[[Xorg_xtrans_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "79c31e7844f6ecf779705fbc12146eb190b7d845"
uuid = "c5fb5394-a638-5e4d-96e5-b29de1b5cf10"
version = "1.4.0+3"
[[Zlib_jll]]
deps = ["Libdl"]
uuid = "83775a58-1f1d-513f-b197-d71354ab007a"
[[argp_standalone_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "feaf9f6293003c2bf53056fd6930d677ed340b34"
uuid = "c53206cc-00f7-50bf-ad1e-3ae1f6e49bc3"
version = "1.3.1+0"
[[fts_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "78732b942383d2cb521df8a1a0814911144e663d"
uuid = "d65627f6-89bd-53e8-8ab5-8b75ff535eee"
version = "1.2.7+1"
[[hsa_rocr_jll]]
deps = ["Artifacts", "Elfutils_jll", "JLLWrappers", "Libdl", "NUMA_jll", "Pkg", "Zlib_jll", "hsakmt_roct_jll"]
git-tree-sha1 = "df8d73efec8b1e53ad527d208f5343c0368f0fcd"
uuid = "dd59ff1a-a01a-568d-8b29-0669330f116a"
version = "4.0.0+0"
[[hsakmt_roct_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "NUMA_jll", "Pkg"]
git-tree-sha1 = "80e0c9940e15cfd6f1f1e9d9f3953ec4d48d3d4a"
uuid = "1cecccd7-a9b6-5045-9cdc-a44c19b16d76"
version = "4.0.0+0"
[[nghttp2_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d"
[[obstack_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "1c4a6b66e934fc6db4649cb2910c72f53bbfea7e"
uuid = "c88a4935-d25e-5644-aacc-5db6f1b8ef79"
version = "1.2.2+0"
[[p7zip_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0"

View File

@ -0,0 +1,7 @@
[deps]
AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
ArgParse = "c7e460c6-2fb9-53a9-8c5b-16f535851c63"
Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a"
[compat]
julia = "1.6"

View File

@ -0,0 +1,316 @@
# This file is machine-generated - editing it directly is not advised
[[AbstractFFTs]]
deps = ["LinearAlgebra"]
git-tree-sha1 = "485ee0867925449198280d4af84bdb46a2a404d0"
uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c"
version = "1.0.1"
[[Adapt]]
deps = ["LinearAlgebra"]
git-tree-sha1 = "84918055d15b3114ede17ac6a7182f68870c16f7"
uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
version = "3.3.1"
[[ArgParse]]
deps = ["Logging", "TextWrap"]
git-tree-sha1 = "3102bce13da501c9104df33549f511cd25264d7d"
uuid = "c7e460c6-2fb9-53a9-8c5b-16f535851c63"
version = "1.1.4"
[[ArgTools]]
uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f"
[[Artifacts]]
uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33"
[[BFloat16s]]
deps = ["LinearAlgebra", "Test"]
git-tree-sha1 = "4af69e205efc343068dc8722b8dfec1ade89254a"
uuid = "ab4f0b2a-ad5b-11e8-123f-65d77653426b"
version = "0.1.0"
[[Base64]]
uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f"
[[CEnum]]
git-tree-sha1 = "215a9aa4a1f23fbd05b92769fdd62559488d70e9"
uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82"
version = "0.4.1"
[[CUDA]]
deps = ["AbstractFFTs", "Adapt", "BFloat16s", "CEnum", "CompilerSupportLibraries_jll", "ExprTools", "GPUArrays", "GPUCompiler", "LLVM", "LazyArtifacts", "Libdl", "LinearAlgebra", "Logging", "Printf", "Random", "Random123", "RandomNumbers", "Reexport", "Requires", "SparseArrays", "SpecialFunctions", "TimerOutputs"]
git-tree-sha1 = "c583f3ccdce071b8a8bce9bf3d5d5409eaf36d2b"
uuid = "052768ef-5323-5732-b1bb-66c8b64840ba"
version = "3.4.1"
[[ChainRulesCore]]
deps = ["Compat", "LinearAlgebra", "SparseArrays"]
git-tree-sha1 = "bdc0937269321858ab2a4f288486cb258b9a0af7"
uuid = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
version = "1.3.0"
[[Compat]]
deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "SHA", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"]
git-tree-sha1 = "727e463cfebd0c7b999bbf3e9e7e16f254b94193"
uuid = "34da2185-b29b-5c13-b0c7-acf172513d20"
version = "3.34.0"
[[CompilerSupportLibraries_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae"
[[Dates]]
deps = ["Printf"]
uuid = "ade2ca70-3891-5945-98fb-dc099432e06a"
[[DelimitedFiles]]
deps = ["Mmap"]
uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab"
[[Distributed]]
deps = ["Random", "Serialization", "Sockets"]
uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b"
[[DocStringExtensions]]
deps = ["LibGit2"]
git-tree-sha1 = "a32185f5428d3986f47c2ab78b1f216d5e6cc96f"
uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae"
version = "0.8.5"
[[Downloads]]
deps = ["ArgTools", "LibCURL", "NetworkOptions"]
uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6"
[[ExprTools]]
git-tree-sha1 = "b7e3d17636b348f005f11040025ae8c6f645fe92"
uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04"
version = "0.1.6"
[[GPUArrays]]
deps = ["Adapt", "LinearAlgebra", "Printf", "Random", "Serialization", "Statistics"]
git-tree-sha1 = "8fac1cf7d6ce0f2249c7acaf25d22e1e85c4a07f"
uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
version = "8.0.2"
[[GPUCompiler]]
deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "TimerOutputs", "UUIDs"]
git-tree-sha1 = "4ed2616d5e656c8716736b64da86755467f26cf5"
uuid = "61eb1bfa-7361-4325-ad38-22787b887f55"
version = "0.12.9"
[[InteractiveUtils]]
deps = ["Markdown"]
uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
[[IrrationalConstants]]
git-tree-sha1 = "f76424439413893a832026ca355fe273e93bce94"
uuid = "92d709cd-6900-40b7-9082-c6be49f344b6"
version = "0.1.0"
[[JLLWrappers]]
deps = ["Preferences"]
git-tree-sha1 = "642a199af8b68253517b80bd3bfd17eb4e84df6e"
uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210"
version = "1.3.0"
[[LLVM]]
deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Printf", "Unicode"]
git-tree-sha1 = "23a47d417a3cd9c2e73c854bac7dd4731c105ef7"
uuid = "929cbde3-209d-540e-8aea-75f648917ca0"
version = "4.4.0"
[[LLVMExtra_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "9c360e5ce980b88bb31a7b086dbb19469008154b"
uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab"
version = "0.0.10+0"
[[LazyArtifacts]]
deps = ["Artifacts", "Pkg"]
uuid = "4af54fe1-eca0-43a8-85a7-787d91b784e3"
[[LibCURL]]
deps = ["LibCURL_jll", "MozillaCACerts_jll"]
uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21"
[[LibCURL_jll]]
deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"]
uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0"
[[LibGit2]]
deps = ["Base64", "NetworkOptions", "Printf", "SHA"]
uuid = "76f85450-5226-5b5a-8eaa-529ad045b433"
[[LibSSH2_jll]]
deps = ["Artifacts", "Libdl", "MbedTLS_jll"]
uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8"
[[Libdl]]
uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
[[LinearAlgebra]]
deps = ["Libdl"]
uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
[[LogExpFunctions]]
deps = ["DocStringExtensions", "IrrationalConstants", "LinearAlgebra"]
git-tree-sha1 = "3d682c07e6dd250ed082f883dc88aee7996bf2cc"
uuid = "2ab3a3ac-af41-5b50-aa03-7779005ae688"
version = "0.3.0"
[[Logging]]
uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"
[[Markdown]]
deps = ["Base64"]
uuid = "d6f4376e-aef5-505a-96c1-9c027394607a"
[[MbedTLS_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1"
[[Mmap]]
uuid = "a63ad114-7e13-5084-954f-fe012c677804"
[[MozillaCACerts_jll]]
uuid = "14a3606d-f60d-562e-9121-12d972cd8159"
[[NetworkOptions]]
uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908"
[[OpenSpecFun_jll]]
deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "13652491f6856acfd2db29360e1bbcd4565d04f1"
uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e"
version = "0.5.5+0"
[[OrderedCollections]]
git-tree-sha1 = "85f8e6578bf1f9ee0d11e7bb1b1456435479d47c"
uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
version = "1.4.1"
[[Parameters]]
deps = ["OrderedCollections", "UnPack"]
git-tree-sha1 = "2276ac65f1e236e0a6ea70baff3f62ad4c625345"
uuid = "d96e819e-fc66-5662-9728-84c9c7592b0a"
version = "0.12.2"
[[Pkg]]
deps = ["Artifacts", "Dates", "Downloads", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"]
uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
[[Preferences]]
deps = ["TOML"]
git-tree-sha1 = "00cfd92944ca9c760982747e9a1d0d5d86ab1e5a"
uuid = "21216c6a-2e73-6563-6e65-726566657250"
version = "1.2.2"
[[Printf]]
deps = ["Unicode"]
uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7"
[[REPL]]
deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"]
uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb"
[[Random]]
deps = ["Serialization"]
uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
[[Random123]]
deps = ["Libdl", "Random", "RandomNumbers"]
git-tree-sha1 = "0e8b146557ad1c6deb1367655e052276690e71a3"
uuid = "74087812-796a-5b5d-8853-05524746bad3"
version = "1.4.2"
[[RandomNumbers]]
deps = ["Random", "Requires"]
git-tree-sha1 = "043da614cc7e95c703498a491e2c21f58a2b8111"
uuid = "e6cf234a-135c-5ec9-84dd-332b85af5143"
version = "1.5.3"
[[Reexport]]
git-tree-sha1 = "45e428421666073eab6f2da5c9d310d99bb12f9b"
uuid = "189a3867-3050-52da-a836-e630ba90ab69"
version = "1.2.2"
[[Requires]]
deps = ["UUIDs"]
git-tree-sha1 = "4036a3bd08ac7e968e27c203d45f5fff15020621"
uuid = "ae029012-a4dd-5104-9daa-d747884805df"
version = "1.1.3"
[[SHA]]
uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce"
[[Serialization]]
uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
[[SharedArrays]]
deps = ["Distributed", "Mmap", "Random", "Serialization"]
uuid = "1a1011a3-84de-559e-8e89-a11a2f7dc383"
[[Sockets]]
uuid = "6462fe0b-24de-5631-8697-dd941f90decc"
[[SparseArrays]]
deps = ["LinearAlgebra", "Random"]
uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
[[SpecialFunctions]]
deps = ["ChainRulesCore", "LogExpFunctions", "OpenSpecFun_jll"]
git-tree-sha1 = "a322a9493e49c5f3a10b50df3aedaf1cdb3244b7"
uuid = "276daf66-3868-5448-9aa4-cd146d93841b"
version = "1.6.1"
[[Statistics]]
deps = ["LinearAlgebra", "SparseArrays"]
uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
[[TOML]]
deps = ["Dates"]
uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76"
[[Tar]]
deps = ["ArgTools", "SHA"]
uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e"
[[Test]]
deps = ["InteractiveUtils", "Logging", "Random", "Serialization"]
uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
[[TextWrap]]
git-tree-sha1 = "9250ef9b01b66667380cf3275b3f7488d0e25faf"
uuid = "b718987f-49a8-5099-9789-dcd902bef87d"
version = "1.0.1"
[[TimerOutputs]]
deps = ["ExprTools", "Printf"]
git-tree-sha1 = "209a8326c4f955e2442c07b56029e88bb48299c7"
uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
version = "0.5.12"
[[UUIDs]]
deps = ["Random", "SHA"]
uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
[[UnPack]]
git-tree-sha1 = "387c1f73762231e86e0c9c5443ce3b4a0a9a0c2b"
uuid = "3a884ed6-31ef-47d7-9d2a-63182c4928ed"
version = "1.0.2"
[[Unicode]]
uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
[[Zlib_jll]]
deps = ["Libdl"]
uuid = "83775a58-1f1d-513f-b197-d71354ab007a"
[[nghttp2_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d"
[[p7zip_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0"

View File

@ -0,0 +1,7 @@
[deps]
ArgParse = "c7e460c6-2fb9-53a9-8c5b-16f535851c63"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a"
[compat]
julia = "1.6"

View File

@ -0,0 +1,547 @@
# This file is machine-generated - editing it directly is not advised
[[AMDGPU]]
deps = ["AbstractFFTs", "Adapt", "BinaryProvider", "CEnum", "GPUArrays", "GPUCompiler", "HIP_jll", "LLVM", "Libdl", "LinearAlgebra", "MacroTools", "Pkg", "Printf", "ROCmDeviceLibs_jll", "Random", "Requires", "Setfield", "hsa_rocr_jll"]
git-tree-sha1 = "d64c97447a753cfbf0158d6c7be513f34526d559"
uuid = "21141c5a-9bdb-4563-92ae-f87d6854732e"
version = "0.2.12"
[[AbstractFFTs]]
deps = ["LinearAlgebra"]
git-tree-sha1 = "485ee0867925449198280d4af84bdb46a2a404d0"
uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c"
version = "1.0.1"
[[Adapt]]
deps = ["LinearAlgebra"]
git-tree-sha1 = "84918055d15b3114ede17ac6a7182f68870c16f7"
uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
version = "3.3.1"
[[ArgParse]]
deps = ["Logging", "TextWrap"]
git-tree-sha1 = "3102bce13da501c9104df33549f511cd25264d7d"
uuid = "c7e460c6-2fb9-53a9-8c5b-16f535851c63"
version = "1.1.4"
[[ArgTools]]
uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f"
[[Artifacts]]
uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33"
[[BFloat16s]]
deps = ["LinearAlgebra", "Test"]
git-tree-sha1 = "4af69e205efc343068dc8722b8dfec1ade89254a"
uuid = "ab4f0b2a-ad5b-11e8-123f-65d77653426b"
version = "0.1.0"
[[Base64]]
uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f"
[[BinaryProvider]]
deps = ["Libdl", "Logging", "SHA"]
git-tree-sha1 = "ecdec412a9abc8db54c0efc5548c64dfce072058"
uuid = "b99e7846-7c00-51b0-8f62-c81ae34c0232"
version = "0.5.10"
[[Bzip2_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "19a35467a82e236ff51bc17a3a44b69ef35185a2"
uuid = "6e34b625-4abd-537c-b88f-471c36dfa7a0"
version = "1.0.8+0"
[[CEnum]]
git-tree-sha1 = "215a9aa4a1f23fbd05b92769fdd62559488d70e9"
uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82"
version = "0.4.1"
[[CUDA]]
deps = ["AbstractFFTs", "Adapt", "BFloat16s", "CEnum", "CompilerSupportLibraries_jll", "DataStructures", "ExprTools", "GPUArrays", "GPUCompiler", "LLVM", "LazyArtifacts", "Libdl", "LinearAlgebra", "Logging", "Printf", "Random", "Random123", "RandomNumbers", "Reexport", "Requires", "SparseArrays", "SpecialFunctions", "TimerOutputs"]
git-tree-sha1 = "5e696e37e51b01ae07bd9f700afe6cbd55250bce"
uuid = "052768ef-5323-5732-b1bb-66c8b64840ba"
version = "3.3.4"
[[CUDAKernels]]
deps = ["Adapt", "CUDA", "Cassette", "KernelAbstractions", "SpecialFunctions", "StaticArrays"]
git-tree-sha1 = "81f76297b63c67723b1d60f5e7e002ae3393974b"
uuid = "72cfdca4-0801-4ab0-bf6a-d52aa10adc57"
version = "0.3.0"
[[Cassette]]
git-tree-sha1 = "b4b1d61ebbae2bc69a45e3a6b8439b4e411bc131"
uuid = "7057c7e9-c182-5462-911a-8362d720325c"
version = "0.3.8"
[[ChainRulesCore]]
deps = ["Compat", "LinearAlgebra", "SparseArrays"]
git-tree-sha1 = "bdc0937269321858ab2a4f288486cb258b9a0af7"
uuid = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
version = "1.3.0"
[[Compat]]
deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "SHA", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"]
git-tree-sha1 = "727e463cfebd0c7b999bbf3e9e7e16f254b94193"
uuid = "34da2185-b29b-5c13-b0c7-acf172513d20"
version = "3.34.0"
[[CompilerSupportLibraries_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae"
[[ConstructionBase]]
deps = ["LinearAlgebra"]
git-tree-sha1 = "f74e9d5388b8620b4cee35d4c5a618dd4dc547f4"
uuid = "187b0558-2788-49d3-abe0-74a17ed4e7c9"
version = "1.3.0"
[[DataStructures]]
deps = ["Compat", "InteractiveUtils", "OrderedCollections"]
git-tree-sha1 = "7d9d316f04214f7efdbb6398d545446e246eff02"
uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
version = "0.18.10"
[[Dates]]
deps = ["Printf"]
uuid = "ade2ca70-3891-5945-98fb-dc099432e06a"
[[DelimitedFiles]]
deps = ["Mmap"]
uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab"
[[Distributed]]
deps = ["Random", "Serialization", "Sockets"]
uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b"
[[DocStringExtensions]]
deps = ["LibGit2"]
git-tree-sha1 = "a32185f5428d3986f47c2ab78b1f216d5e6cc96f"
uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae"
version = "0.8.5"
[[Downloads]]
deps = ["ArgTools", "LibCURL", "NetworkOptions"]
uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6"
[[Elfutils_jll]]
deps = ["Artifacts", "Bzip2_jll", "JLLWrappers", "Libdl", "Pkg", "XZ_jll", "Zlib_jll", "argp_standalone_jll", "fts_jll", "obstack_jll"]
git-tree-sha1 = "8f9fcde6d89b0a3ca51cb2028beab462705c5436"
uuid = "ab5a07f8-06af-567f-a878-e8bb879eba5a"
version = "0.182.0+0"
[[ExprTools]]
git-tree-sha1 = "b7e3d17636b348f005f11040025ae8c6f645fe92"
uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04"
version = "0.1.6"
[[Future]]
deps = ["Random"]
uuid = "9fa8497b-333b-5362-9e8d-4d0656e87820"
[[GPUArrays]]
deps = ["AbstractFFTs", "Adapt", "LinearAlgebra", "Printf", "Random", "Serialization", "Statistics"]
git-tree-sha1 = "ececbf05f8904c92814bdbd0aafd5540b0bf2e9a"
uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
version = "7.0.1"
[[GPUCompiler]]
deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "TimerOutputs", "UUIDs"]
git-tree-sha1 = "4ed2616d5e656c8716736b64da86755467f26cf5"
uuid = "61eb1bfa-7361-4325-ad38-22787b887f55"
version = "0.12.9"
[[HIP_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "ROCmCompilerSupport_jll", "ROCmDeviceLibs_jll", "ROCmOpenCLRuntime_jll", "hsa_rocr_jll"]
git-tree-sha1 = "5097d8f7b6842156ab0928371b3d03fefd8decab"
uuid = "2696aab5-0948-5276-aa9a-2a86a37016b8"
version = "4.0.0+1"
[[InteractiveUtils]]
deps = ["Markdown"]
uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
[[IrrationalConstants]]
git-tree-sha1 = "f76424439413893a832026ca355fe273e93bce94"
uuid = "92d709cd-6900-40b7-9082-c6be49f344b6"
version = "0.1.0"
[[JLLWrappers]]
deps = ["Preferences"]
git-tree-sha1 = "642a199af8b68253517b80bd3bfd17eb4e84df6e"
uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210"
version = "1.3.0"
[[KernelAbstractions]]
deps = ["Adapt", "Cassette", "InteractiveUtils", "MacroTools", "SpecialFunctions", "StaticArrays", "UUIDs"]
git-tree-sha1 = "5e6c70389c1b1e40adb81664ca8cea6ce8127afc"
uuid = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
version = "0.7.0"
[[LLVM]]
deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Printf", "Unicode"]
git-tree-sha1 = "23a47d417a3cd9c2e73c854bac7dd4731c105ef7"
uuid = "929cbde3-209d-540e-8aea-75f648917ca0"
version = "4.4.0"
[[LLVMExtra_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "9c360e5ce980b88bb31a7b086dbb19469008154b"
uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab"
version = "0.0.10+0"
[[LazyArtifacts]]
deps = ["Artifacts", "Pkg"]
uuid = "4af54fe1-eca0-43a8-85a7-787d91b784e3"
[[LibCURL]]
deps = ["LibCURL_jll", "MozillaCACerts_jll"]
uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21"
[[LibCURL_jll]]
deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"]
uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0"
[[LibGit2]]
deps = ["Base64", "NetworkOptions", "Printf", "SHA"]
uuid = "76f85450-5226-5b5a-8eaa-529ad045b433"
[[LibSSH2_jll]]
deps = ["Artifacts", "Libdl", "MbedTLS_jll"]
uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8"
[[Libdl]]
uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
[[Libgcrypt_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Libgpg_error_jll", "Pkg"]
git-tree-sha1 = "64613c82a59c120435c067c2b809fc61cf5166ae"
uuid = "d4300ac3-e22c-5743-9152-c294e39db1e4"
version = "1.8.7+0"
[[Libglvnd_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libX11_jll", "Xorg_libXext_jll"]
git-tree-sha1 = "7739f837d6447403596a75d19ed01fd08d6f56bf"
uuid = "7e76a0d4-f3c7-5321-8279-8d96eeed0f29"
version = "1.3.0+3"
[[Libgpg_error_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "c333716e46366857753e273ce6a69ee0945a6db9"
uuid = "7add5ba3-2f88-524e-9cd5-f83b8a55f7b8"
version = "1.42.0+0"
[[Libiconv_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "42b62845d70a619f063a7da093d995ec8e15e778"
uuid = "94ce4f54-9a6c-5748-9c1c-f9c7231a4531"
version = "1.16.1+1"
[[LinearAlgebra]]
deps = ["Libdl"]
uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
[[LogExpFunctions]]
deps = ["DocStringExtensions", "IrrationalConstants", "LinearAlgebra"]
git-tree-sha1 = "3d682c07e6dd250ed082f883dc88aee7996bf2cc"
uuid = "2ab3a3ac-af41-5b50-aa03-7779005ae688"
version = "0.3.0"
[[Logging]]
uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"
[[MacroTools]]
deps = ["Markdown", "Random"]
git-tree-sha1 = "0fb723cd8c45858c22169b2e42269e53271a6df7"
uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
version = "0.5.7"
[[Markdown]]
deps = ["Base64"]
uuid = "d6f4376e-aef5-505a-96c1-9c027394607a"
[[MbedTLS_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1"
[[Mmap]]
uuid = "a63ad114-7e13-5084-954f-fe012c677804"
[[MozillaCACerts_jll]]
uuid = "14a3606d-f60d-562e-9121-12d972cd8159"
[[NUMA_jll]]
deps = ["Libdl", "Pkg"]
git-tree-sha1 = "778f9bd14400cff2c32ed357e12766ac0e3d766e"
uuid = "7f51dc2b-bb24-59f8-b771-bb1490e4195d"
version = "2.0.13+1"
[[NetworkOptions]]
uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908"
[[OpenSpecFun_jll]]
deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "13652491f6856acfd2db29360e1bbcd4565d04f1"
uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e"
version = "0.5.5+0"
[[OrderedCollections]]
git-tree-sha1 = "85f8e6578bf1f9ee0d11e7bb1b1456435479d47c"
uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
version = "1.4.1"
[[Parameters]]
deps = ["OrderedCollections", "UnPack"]
git-tree-sha1 = "2276ac65f1e236e0a6ea70baff3f62ad4c625345"
uuid = "d96e819e-fc66-5662-9728-84c9c7592b0a"
version = "0.12.2"
[[Pkg]]
deps = ["Artifacts", "Dates", "Downloads", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"]
uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
[[Preferences]]
deps = ["TOML"]
git-tree-sha1 = "00cfd92944ca9c760982747e9a1d0d5d86ab1e5a"
uuid = "21216c6a-2e73-6563-6e65-726566657250"
version = "1.2.2"
[[Printf]]
deps = ["Unicode"]
uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7"
[[REPL]]
deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"]
uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb"
[[ROCKernels]]
deps = ["AMDGPU", "Adapt", "Cassette", "KernelAbstractions", "SpecialFunctions", "StaticArrays"]
git-tree-sha1 = "41105b861342637dde17797bdd9aaa537aca646b"
uuid = "7eb9e9f0-4bd3-4c4c-8bef-26bd9629d9b9"
version = "0.2.0"
[[ROCmCompilerSupport_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "ROCmDeviceLibs_jll", "hsa_rocr_jll"]
git-tree-sha1 = "56ddcfb5d8b60c9f8c1bc619886f8d363fd1926d"
uuid = "8fbdd1d2-db62-5cd0-981e-905da1486e17"
version = "4.0.0+1"
[[ROCmDeviceLibs_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Zlib_jll"]
git-tree-sha1 = "d764f0f28b5af89aa004871a6a38e5d061f77257"
uuid = "873c0968-716b-5aa7-bb8d-d1e2e2aeff2d"
version = "4.0.0+0"
[[ROCmOpenCLRuntime_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Libglvnd_jll", "Pkg", "ROCmCompilerSupport_jll", "ROCmDeviceLibs_jll", "Xorg_libX11_jll", "Xorg_xorgproto_jll", "hsa_rocr_jll"]
git-tree-sha1 = "f9e3e2cb40a7990535efa7da9b9dd0e0b458a973"
uuid = "10ae2a08-2eea-53f8-8c20-eec175020e9f"
version = "4.0.0+1"
[[Random]]
deps = ["Serialization"]
uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
[[Random123]]
deps = ["Libdl", "Random", "RandomNumbers"]
git-tree-sha1 = "0e8b146557ad1c6deb1367655e052276690e71a3"
uuid = "74087812-796a-5b5d-8853-05524746bad3"
version = "1.4.2"
[[RandomNumbers]]
deps = ["Random", "Requires"]
git-tree-sha1 = "043da614cc7e95c703498a491e2c21f58a2b8111"
uuid = "e6cf234a-135c-5ec9-84dd-332b85af5143"
version = "1.5.3"
[[Reexport]]
git-tree-sha1 = "45e428421666073eab6f2da5c9d310d99bb12f9b"
uuid = "189a3867-3050-52da-a836-e630ba90ab69"
version = "1.2.2"
[[Requires]]
deps = ["UUIDs"]
git-tree-sha1 = "4036a3bd08ac7e968e27c203d45f5fff15020621"
uuid = "ae029012-a4dd-5104-9daa-d747884805df"
version = "1.1.3"
[[SHA]]
uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce"
[[Serialization]]
uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
[[Setfield]]
deps = ["ConstructionBase", "Future", "MacroTools", "Requires"]
git-tree-sha1 = "fca29e68c5062722b5b4435594c3d1ba557072a3"
uuid = "efcf1570-3423-57d1-acb7-fd33fddbac46"
version = "0.7.1"
[[SharedArrays]]
deps = ["Distributed", "Mmap", "Random", "Serialization"]
uuid = "1a1011a3-84de-559e-8e89-a11a2f7dc383"
[[Sockets]]
uuid = "6462fe0b-24de-5631-8697-dd941f90decc"
[[SparseArrays]]
deps = ["LinearAlgebra", "Random"]
uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
[[SpecialFunctions]]
deps = ["ChainRulesCore", "LogExpFunctions", "OpenSpecFun_jll"]
git-tree-sha1 = "a322a9493e49c5f3a10b50df3aedaf1cdb3244b7"
uuid = "276daf66-3868-5448-9aa4-cd146d93841b"
version = "1.6.1"
[[StaticArrays]]
deps = ["LinearAlgebra", "Random", "Statistics"]
git-tree-sha1 = "3240808c6d463ac46f1c1cd7638375cd22abbccb"
uuid = "90137ffa-7385-5640-81b9-e52037218182"
version = "1.2.12"
[[Statistics]]
deps = ["LinearAlgebra", "SparseArrays"]
uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
[[TOML]]
deps = ["Dates"]
uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76"
[[Tar]]
deps = ["ArgTools", "SHA"]
uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e"
[[Test]]
deps = ["InteractiveUtils", "Logging", "Random", "Serialization"]
uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
[[TextWrap]]
git-tree-sha1 = "9250ef9b01b66667380cf3275b3f7488d0e25faf"
uuid = "b718987f-49a8-5099-9789-dcd902bef87d"
version = "1.0.1"
[[TimerOutputs]]
deps = ["ExprTools", "Printf"]
git-tree-sha1 = "209a8326c4f955e2442c07b56029e88bb48299c7"
uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
version = "0.5.12"
[[UUIDs]]
deps = ["Random", "SHA"]
uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
[[UnPack]]
git-tree-sha1 = "387c1f73762231e86e0c9c5443ce3b4a0a9a0c2b"
uuid = "3a884ed6-31ef-47d7-9d2a-63182c4928ed"
version = "1.0.2"
[[Unicode]]
uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
[[XML2_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Libiconv_jll", "Pkg", "Zlib_jll"]
git-tree-sha1 = "1acf5bdf07aa0907e0a37d3718bb88d4b687b74a"
uuid = "02c8fc9c-b97f-50b9-bbe4-9be30ff0a78a"
version = "2.9.12+0"
[[XSLT_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Libgcrypt_jll", "Libgpg_error_jll", "Libiconv_jll", "Pkg", "XML2_jll", "Zlib_jll"]
git-tree-sha1 = "91844873c4085240b95e795f692c4cec4d805f8a"
uuid = "aed1982a-8fda-507f-9586-7b0439959a61"
version = "1.1.34+0"
[[XZ_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "a921669cd9a45c23031fd4eb904f5cc3d20de415"
uuid = "ffd25f8a-64ca-5728-b0f7-c24cf3aae800"
version = "5.2.5+2"
[[Xorg_libX11_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libxcb_jll", "Xorg_xtrans_jll"]
git-tree-sha1 = "5be649d550f3f4b95308bf0183b82e2582876527"
uuid = "4f6342f7-b3d2-589e-9d20-edeb45f2b2bc"
version = "1.6.9+4"
[[Xorg_libXau_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "4e490d5c960c314f33885790ed410ff3a94ce67e"
uuid = "0c0b7dd1-d40b-584c-a123-a41640f87eec"
version = "1.0.9+4"
[[Xorg_libXdmcp_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "4fe47bd2247248125c428978740e18a681372dd4"
uuid = "a3789734-cfe1-5b06-b2d0-1dd0d9d62d05"
version = "1.1.3+4"
[[Xorg_libXext_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libX11_jll"]
git-tree-sha1 = "b7c0aa8c376b31e4852b360222848637f481f8c3"
uuid = "1082639a-0dae-5f34-9b06-72781eeb8cb3"
version = "1.3.4+4"
[[Xorg_libpthread_stubs_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "6783737e45d3c59a4a4c4091f5f88cdcf0908cbb"
uuid = "14d82f49-176c-5ed1-bb49-ad3f5cbd8c74"
version = "0.1.0+3"
[[Xorg_libxcb_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "XSLT_jll", "Xorg_libXau_jll", "Xorg_libXdmcp_jll", "Xorg_libpthread_stubs_jll"]
git-tree-sha1 = "daf17f441228e7a3833846cd048892861cff16d6"
uuid = "c7cfdc94-dc32-55de-ac96-5a1b8d977c5b"
version = "1.13.0+3"
[[Xorg_xorgproto_jll]]
deps = ["Libdl", "Pkg"]
git-tree-sha1 = "9a9eb8ce756fe0bca01b4be16da770e18d264972"
uuid = "c4d99508-4286-5418-9131-c86396af500b"
version = "2019.2.0+2"
[[Xorg_xtrans_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "79c31e7844f6ecf779705fbc12146eb190b7d845"
uuid = "c5fb5394-a638-5e4d-96e5-b29de1b5cf10"
version = "1.4.0+3"
[[Zlib_jll]]
deps = ["Libdl"]
uuid = "83775a58-1f1d-513f-b197-d71354ab007a"
[[argp_standalone_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "feaf9f6293003c2bf53056fd6930d677ed340b34"
uuid = "c53206cc-00f7-50bf-ad1e-3ae1f6e49bc3"
version = "1.3.1+0"
[[fts_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "78732b942383d2cb521df8a1a0814911144e663d"
uuid = "d65627f6-89bd-53e8-8ab5-8b75ff535eee"
version = "1.2.7+1"
[[hsa_rocr_jll]]
deps = ["Artifacts", "Elfutils_jll", "JLLWrappers", "Libdl", "NUMA_jll", "Pkg", "Zlib_jll", "hsakmt_roct_jll"]
git-tree-sha1 = "df8d73efec8b1e53ad527d208f5343c0368f0fcd"
uuid = "dd59ff1a-a01a-568d-8b29-0669330f116a"
version = "4.0.0+0"
[[hsakmt_roct_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "NUMA_jll", "Pkg"]
git-tree-sha1 = "80e0c9940e15cfd6f1f1e9d9f3953ec4d48d3d4a"
uuid = "1cecccd7-a9b6-5045-9cdc-a44c19b16d76"
version = "4.0.0+0"
[[nghttp2_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d"
[[obstack_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "1c4a6b66e934fc6db4649cb2910c72f53bbfea7e"
uuid = "c88a4935-d25e-5644-aacc-5db6f1b8ef79"
version = "1.2.2+0"
[[p7zip_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0"

View File

@ -0,0 +1,11 @@
[deps]
AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
ArgParse = "c7e460c6-2fb9-53a9-8c5b-16f535851c63"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
CUDAKernels = "72cfdca4-0801-4ab0-bf6a-d52aa10adc57"
KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a"
ROCKernels = "7eb9e9f0-4bd3-4c4c-8bef-26bd9629d9b9"
[compat]
julia = "1.6"

View File

@ -0,0 +1,493 @@
# This file is machine-generated - editing it directly is not advised
[[AMDGPU]]
deps = ["AbstractFFTs", "Adapt", "BinaryProvider", "CEnum", "GPUArrays", "GPUCompiler", "LLVM", "Libdl", "LinearAlgebra", "MacroTools", "Printf", "Random", "Requires", "Setfield", "hsa_rocr_jll", "hsakmt_roct_jll"]
git-tree-sha1 = "04fdb3923ac6f55fa7347dce0f0f6f10e321e2e9"
uuid = "21141c5a-9bdb-4563-92ae-f87d6854732e"
version = "0.2.7"
[[AbstractFFTs]]
deps = ["LinearAlgebra"]
git-tree-sha1 = "485ee0867925449198280d4af84bdb46a2a404d0"
uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c"
version = "1.0.1"
[[Adapt]]
deps = ["LinearAlgebra"]
git-tree-sha1 = "84918055d15b3114ede17ac6a7182f68870c16f7"
uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
version = "3.3.1"
[[ArgParse]]
deps = ["Logging", "TextWrap"]
git-tree-sha1 = "3102bce13da501c9104df33549f511cd25264d7d"
uuid = "c7e460c6-2fb9-53a9-8c5b-16f535851c63"
version = "1.1.4"
[[ArgTools]]
uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f"
[[Artifacts]]
uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33"
[[BFloat16s]]
deps = ["LinearAlgebra", "Test"]
git-tree-sha1 = "4af69e205efc343068dc8722b8dfec1ade89254a"
uuid = "ab4f0b2a-ad5b-11e8-123f-65d77653426b"
version = "0.1.0"
[[Base64]]
uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f"
[[BinaryProvider]]
deps = ["Libdl", "Logging", "SHA"]
git-tree-sha1 = "ecdec412a9abc8db54c0efc5548c64dfce072058"
uuid = "b99e7846-7c00-51b0-8f62-c81ae34c0232"
version = "0.5.10"
[[Bzip2_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "19a35467a82e236ff51bc17a3a44b69ef35185a2"
uuid = "6e34b625-4abd-537c-b88f-471c36dfa7a0"
version = "1.0.8+0"
[[CEnum]]
git-tree-sha1 = "215a9aa4a1f23fbd05b92769fdd62559488d70e9"
uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82"
version = "0.4.1"
[[CUDA]]
deps = ["AbstractFFTs", "Adapt", "BFloat16s", "CEnum", "CompilerSupportLibraries_jll", "DataStructures", "ExprTools", "GPUArrays", "GPUCompiler", "LLVM", "LazyArtifacts", "Libdl", "LinearAlgebra", "Logging", "MacroTools", "Memoize", "Printf", "Random", "Random123", "RandomNumbers", "Reexport", "Requires", "SparseArrays", "SpecialFunctions", "TimerOutputs"]
git-tree-sha1 = "364179416eabc34c9ca32126a6bdb431680c3bad"
uuid = "052768ef-5323-5732-b1bb-66c8b64840ba"
version = "3.2.1"
[[CUDAKernels]]
deps = ["Adapt", "CUDA", "Cassette", "KernelAbstractions", "SpecialFunctions", "StaticArrays"]
git-tree-sha1 = "81f76297b63c67723b1d60f5e7e002ae3393974b"
uuid = "72cfdca4-0801-4ab0-bf6a-d52aa10adc57"
version = "0.3.0"
[[Cassette]]
git-tree-sha1 = "b4b1d61ebbae2bc69a45e3a6b8439b4e411bc131"
uuid = "7057c7e9-c182-5462-911a-8362d720325c"
version = "0.3.8"
[[ChainRulesCore]]
deps = ["Compat", "LinearAlgebra", "SparseArrays"]
git-tree-sha1 = "bdc0937269321858ab2a4f288486cb258b9a0af7"
uuid = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
version = "1.3.0"
[[Compat]]
deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "SHA", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"]
git-tree-sha1 = "727e463cfebd0c7b999bbf3e9e7e16f254b94193"
uuid = "34da2185-b29b-5c13-b0c7-acf172513d20"
version = "3.34.0"
[[CompilerSupportLibraries_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae"
[[ConstructionBase]]
deps = ["LinearAlgebra"]
git-tree-sha1 = "f74e9d5388b8620b4cee35d4c5a618dd4dc547f4"
uuid = "187b0558-2788-49d3-abe0-74a17ed4e7c9"
version = "1.3.0"
[[DataStructures]]
deps = ["Compat", "InteractiveUtils", "OrderedCollections"]
git-tree-sha1 = "7d9d316f04214f7efdbb6398d545446e246eff02"
uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
version = "0.18.10"
[[Dates]]
deps = ["Printf"]
uuid = "ade2ca70-3891-5945-98fb-dc099432e06a"
[[DelimitedFiles]]
deps = ["Mmap"]
uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab"
[[Distributed]]
deps = ["Random", "Serialization", "Sockets"]
uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b"
[[DocStringExtensions]]
deps = ["LibGit2"]
git-tree-sha1 = "a32185f5428d3986f47c2ab78b1f216d5e6cc96f"
uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae"
version = "0.8.5"
[[Downloads]]
deps = ["ArgTools", "LibCURL", "NetworkOptions"]
uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6"
[[Elfutils_jll]]
deps = ["Artifacts", "Bzip2_jll", "JLLWrappers", "Libdl", "Pkg", "XZ_jll", "Zlib_jll", "argp_standalone_jll", "fts_jll", "obstack_jll"]
git-tree-sha1 = "8f9fcde6d89b0a3ca51cb2028beab462705c5436"
uuid = "ab5a07f8-06af-567f-a878-e8bb879eba5a"
version = "0.182.0+0"
[[ExprTools]]
git-tree-sha1 = "b7e3d17636b348f005f11040025ae8c6f645fe92"
uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04"
version = "0.1.6"
[[Future]]
deps = ["Random"]
uuid = "9fa8497b-333b-5362-9e8d-4d0656e87820"
[[GPUArrays]]
deps = ["AbstractFFTs", "Adapt", "LinearAlgebra", "Printf", "Random", "Serialization", "Statistics"]
git-tree-sha1 = "df5b8569904c5c10e84c640984cfff054b18c086"
uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
version = "6.4.1"
[[GPUCompiler]]
deps = ["DataStructures", "ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "Scratch", "Serialization", "TimerOutputs", "UUIDs"]
git-tree-sha1 = "42d635f6d87af125b86288df3819f805fb4d851a"
uuid = "61eb1bfa-7361-4325-ad38-22787b887f55"
version = "0.11.5"
[[InteractiveUtils]]
deps = ["Markdown"]
uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
[[IrrationalConstants]]
git-tree-sha1 = "f76424439413893a832026ca355fe273e93bce94"
uuid = "92d709cd-6900-40b7-9082-c6be49f344b6"
version = "0.1.0"
[[JLLWrappers]]
deps = ["Preferences"]
git-tree-sha1 = "642a199af8b68253517b80bd3bfd17eb4e84df6e"
uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210"
version = "1.3.0"
[[KernelAbstractions]]
deps = ["Adapt", "Cassette", "InteractiveUtils", "MacroTools", "SpecialFunctions", "StaticArrays", "UUIDs"]
git-tree-sha1 = "5e6c70389c1b1e40adb81664ca8cea6ce8127afc"
uuid = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
version = "0.7.0"
[[LLVM]]
deps = ["CEnum", "Libdl", "Printf", "Unicode"]
git-tree-sha1 = "f57ac3fd2045b50d3db081663837ac5b4096947e"
uuid = "929cbde3-209d-540e-8aea-75f648917ca0"
version = "3.9.0"
[[LazyArtifacts]]
deps = ["Artifacts", "Pkg"]
uuid = "4af54fe1-eca0-43a8-85a7-787d91b784e3"
[[LibCURL]]
deps = ["LibCURL_jll", "MozillaCACerts_jll"]
uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21"
[[LibCURL_jll]]
deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"]
uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0"
[[LibGit2]]
deps = ["Base64", "NetworkOptions", "Printf", "SHA"]
uuid = "76f85450-5226-5b5a-8eaa-529ad045b433"
[[LibSSH2_jll]]
deps = ["Artifacts", "Libdl", "MbedTLS_jll"]
uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8"
[[Libdl]]
uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
[[LinearAlgebra]]
deps = ["Libdl"]
uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
[[LogExpFunctions]]
deps = ["DocStringExtensions", "IrrationalConstants", "LinearAlgebra"]
git-tree-sha1 = "3d682c07e6dd250ed082f883dc88aee7996bf2cc"
uuid = "2ab3a3ac-af41-5b50-aa03-7779005ae688"
version = "0.3.0"
[[Logging]]
uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"
[[MacroTools]]
deps = ["Markdown", "Random"]
git-tree-sha1 = "0fb723cd8c45858c22169b2e42269e53271a6df7"
uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
version = "0.5.7"
[[Markdown]]
deps = ["Base64"]
uuid = "d6f4376e-aef5-505a-96c1-9c027394607a"
[[MbedTLS_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1"
[[Memoize]]
deps = ["MacroTools"]
git-tree-sha1 = "2b1dfcba103de714d31c033b5dacc2e4a12c7caa"
uuid = "c03570c3-d221-55d1-a50c-7939bbd78826"
version = "0.4.4"
[[Mmap]]
uuid = "a63ad114-7e13-5084-954f-fe012c677804"
[[MozillaCACerts_jll]]
uuid = "14a3606d-f60d-562e-9121-12d972cd8159"
[[NEO_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "gmmlib_jll", "libigc_jll", "oneAPI_Level_Zero_Headers_jll"]
git-tree-sha1 = "c753dd029eb0837658bf8eaee041c19e4ce5bb8c"
uuid = "700fe977-ac61-5f37-bbc8-c6c4b2b6a9fd"
version = "21.12.19358+0"
[[NUMA_jll]]
deps = ["Libdl", "Pkg"]
git-tree-sha1 = "778f9bd14400cff2c32ed357e12766ac0e3d766e"
uuid = "7f51dc2b-bb24-59f8-b771-bb1490e4195d"
version = "2.0.13+1"
[[NetworkOptions]]
uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908"
[[OpenSpecFun_jll]]
deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "13652491f6856acfd2db29360e1bbcd4565d04f1"
uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e"
version = "0.5.5+0"
[[OrderedCollections]]
git-tree-sha1 = "85f8e6578bf1f9ee0d11e7bb1b1456435479d47c"
uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
version = "1.4.1"
[[Parameters]]
deps = ["OrderedCollections", "UnPack"]
git-tree-sha1 = "2276ac65f1e236e0a6ea70baff3f62ad4c625345"
uuid = "d96e819e-fc66-5662-9728-84c9c7592b0a"
version = "0.12.2"
[[Pkg]]
deps = ["Artifacts", "Dates", "Downloads", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"]
uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
[[Preferences]]
deps = ["TOML"]
git-tree-sha1 = "00cfd92944ca9c760982747e9a1d0d5d86ab1e5a"
uuid = "21216c6a-2e73-6563-6e65-726566657250"
version = "1.2.2"
[[Printf]]
deps = ["Unicode"]
uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7"
[[REPL]]
deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"]
uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb"
[[ROCKernels]]
deps = ["AMDGPU", "Adapt", "Cassette", "KernelAbstractions", "SpecialFunctions", "StaticArrays"]
git-tree-sha1 = "41105b861342637dde17797bdd9aaa537aca646b"
uuid = "7eb9e9f0-4bd3-4c4c-8bef-26bd9629d9b9"
version = "0.2.0"
[[Random]]
deps = ["Serialization"]
uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
[[Random123]]
deps = ["Libdl", "Random", "RandomNumbers"]
git-tree-sha1 = "0e8b146557ad1c6deb1367655e052276690e71a3"
uuid = "74087812-796a-5b5d-8853-05524746bad3"
version = "1.4.2"
[[RandomNumbers]]
deps = ["Random", "Requires"]
git-tree-sha1 = "043da614cc7e95c703498a491e2c21f58a2b8111"
uuid = "e6cf234a-135c-5ec9-84dd-332b85af5143"
version = "1.5.3"
[[Reexport]]
git-tree-sha1 = "45e428421666073eab6f2da5c9d310d99bb12f9b"
uuid = "189a3867-3050-52da-a836-e630ba90ab69"
version = "1.2.2"
[[Requires]]
deps = ["UUIDs"]
git-tree-sha1 = "4036a3bd08ac7e968e27c203d45f5fff15020621"
uuid = "ae029012-a4dd-5104-9daa-d747884805df"
version = "1.1.3"
[[SHA]]
uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce"
[[SPIRV_LLVM_Translator_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "8cca87d57f6ddf19373cc9791fddc741406c8fbf"
uuid = "4a5d46fc-d8cf-5151-a261-86b458210efb"
version = "11.0.0+2"
[[SPIRV_Tools_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "c0324b7e07bc4649f755bfe7e00f7c6ed6aa353f"
uuid = "6ac6d60f-d740-5983-97d7-a4482c0689f4"
version = "2021.2.0+0"
[[Scratch]]
deps = ["Dates"]
git-tree-sha1 = "0b4b7f1393cff97c33891da2a0bf69c6ed241fda"
uuid = "6c6a2e73-6563-6170-7368-637461726353"
version = "1.1.0"
[[Serialization]]
uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
[[Setfield]]
deps = ["ConstructionBase", "Future", "MacroTools", "Requires"]
git-tree-sha1 = "fca29e68c5062722b5b4435594c3d1ba557072a3"
uuid = "efcf1570-3423-57d1-acb7-fd33fddbac46"
version = "0.7.1"
[[SharedArrays]]
deps = ["Distributed", "Mmap", "Random", "Serialization"]
uuid = "1a1011a3-84de-559e-8e89-a11a2f7dc383"
[[Sockets]]
uuid = "6462fe0b-24de-5631-8697-dd941f90decc"
[[SparseArrays]]
deps = ["LinearAlgebra", "Random"]
uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
[[SpecialFunctions]]
deps = ["ChainRulesCore", "LogExpFunctions", "OpenSpecFun_jll"]
git-tree-sha1 = "a322a9493e49c5f3a10b50df3aedaf1cdb3244b7"
uuid = "276daf66-3868-5448-9aa4-cd146d93841b"
version = "1.6.1"
[[StaticArrays]]
deps = ["LinearAlgebra", "Random", "Statistics"]
git-tree-sha1 = "3240808c6d463ac46f1c1cd7638375cd22abbccb"
uuid = "90137ffa-7385-5640-81b9-e52037218182"
version = "1.2.12"
[[Statistics]]
deps = ["LinearAlgebra", "SparseArrays"]
uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
[[TOML]]
deps = ["Dates"]
uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76"
[[Tar]]
deps = ["ArgTools", "SHA"]
uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e"
[[Test]]
deps = ["InteractiveUtils", "Logging", "Random", "Serialization"]
uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
[[TextWrap]]
git-tree-sha1 = "9250ef9b01b66667380cf3275b3f7488d0e25faf"
uuid = "b718987f-49a8-5099-9789-dcd902bef87d"
version = "1.0.1"
[[TimerOutputs]]
deps = ["ExprTools", "Printf"]
git-tree-sha1 = "209a8326c4f955e2442c07b56029e88bb48299c7"
uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
version = "0.5.12"
[[UUIDs]]
deps = ["Random", "SHA"]
uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
[[UnPack]]
git-tree-sha1 = "387c1f73762231e86e0c9c5443ce3b4a0a9a0c2b"
uuid = "3a884ed6-31ef-47d7-9d2a-63182c4928ed"
version = "1.0.2"
[[Unicode]]
uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
[[XZ_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "a921669cd9a45c23031fd4eb904f5cc3d20de415"
uuid = "ffd25f8a-64ca-5728-b0f7-c24cf3aae800"
version = "5.2.5+2"
[[Zlib_jll]]
deps = ["Libdl"]
uuid = "83775a58-1f1d-513f-b197-d71354ab007a"
[[argp_standalone_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "feaf9f6293003c2bf53056fd6930d677ed340b34"
uuid = "c53206cc-00f7-50bf-ad1e-3ae1f6e49bc3"
version = "1.3.1+0"
[[fts_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "78732b942383d2cb521df8a1a0814911144e663d"
uuid = "d65627f6-89bd-53e8-8ab5-8b75ff535eee"
version = "1.2.7+1"
[[gmmlib_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "4067ef455d4fa67febe26efc3f9565a9bb7ba911"
uuid = "09858cae-167c-5acb-9302-fddc6874d481"
version = "20.3.2+0"
[[hsa_rocr_jll]]
deps = ["Artifacts", "Elfutils_jll", "JLLWrappers", "Libdl", "NUMA_jll", "Pkg", "Zlib_jll", "hsakmt_roct_jll"]
git-tree-sha1 = "42189f176d6ae4f37c0c0e652fec339bb0bfab5d"
uuid = "dd59ff1a-a01a-568d-8b29-0669330f116a"
version = "3.7.0+1"
[[hsakmt_roct_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "NUMA_jll", "Pkg"]
git-tree-sha1 = "8a9ee6c091e952e4ea6585d15131d43f789ae041"
uuid = "1cecccd7-a9b6-5045-9cdc-a44c19b16d76"
version = "3.8.0+0"
[[libigc_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "6140dbf267f7ab57fb791b49f2114374218b5c20"
uuid = "94295238-5935-5bd7-bb0f-b00942e9bdd5"
version = "1.0.6712+0"
[[nghttp2_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d"
[[obstack_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "1c4a6b66e934fc6db4649cb2910c72f53bbfea7e"
uuid = "c88a4935-d25e-5644-aacc-5db6f1b8ef79"
version = "1.2.2+0"
[[oneAPI]]
deps = ["Adapt", "CEnum", "ExprTools", "GPUArrays", "GPUCompiler", "LLVM", "LinearAlgebra", "NEO_jll", "Printf", "Random", "SPIRV_LLVM_Translator_jll", "SPIRV_Tools_jll", "SpecialFunctions", "oneAPI_Level_Zero_Loader_jll"]
git-tree-sha1 = "b4a4b84c864e75fe885a1643525f0c97ce310dd9"
uuid = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b"
version = "0.1.3"
[[oneAPI_Level_Zero_Headers_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "48982fbfd2f3d0a30d644563dcf96892d252b395"
uuid = "f4bc562b-d309-54f8-9efb-476e56f0410d"
version = "1.1.2+1"
[[oneAPI_Level_Zero_Loader_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "oneAPI_Level_Zero_Headers_jll"]
git-tree-sha1 = "1fa53dfdd32a732f09c254c86403e1abab653fb2"
uuid = "13eca655-d68d-5b81-8367-6d99d727ab01"
version = "1.3.6+0"
[[p7zip_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0"

View File

@ -0,0 +1,19 @@
name = "JuliaStream"
uuid = "1bdcc9b7-f5ed-4705-bc7b-be1b748ec681"
authors = ["Wei-Chen Lin <wl14928@bristol.ac.uk>"]
version = "3.4.0"
[deps]
AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
ArgParse = "c7e460c6-2fb9-53a9-8c5b-16f535851c63"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
CUDAKernels = "72cfdca4-0801-4ab0-bf6a-d52aa10adc57"
Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b"
ExprTools = "e2ba6199-217a-4e67-a87a-7c52f15ade04"
KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a"
ROCKernels = "7eb9e9f0-4bd3-4c4c-8bef-26bd9629d9b9"
oneAPI = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b"
[compat]
julia = "1.6"

View File

@ -0,0 +1,67 @@
JuliaStream.jl
==============
This is an implementation of BabelStream in Julia which contains the following variants (a brief kernel sketch follows the list):
* `PlainStream.jl` - Single-threaded implementation using plain `for` loops
* `ThreadedStream.jl` - Threaded implementation using the `Threads.@threads` macro
* `DistributedStream.jl` - Process-based parallelism using the `@distributed` macro
* `CUDAStream.jl` - Direct port of BabelStream's native CUDA implementation using [CUDA.jl](https://github.com/JuliaGPU/CUDA.jl)
* `AMDGPUStream.jl` - Direct port of BabelStream's native HIP implementation using [AMDGPU.jl](https://github.com/JuliaGPU/AMDGPU.jl)
* `oneAPIStream.jl` - Direct port of BabelStream's native SYCL implementation using [oneAPI.jl](https://github.com/JuliaGPU/oneAPI.jl)
* `KernelAbstractions.jl` - Direct port of miniBUDE's native CUDA implementation using [KernelAbstractions.jl](https://github.com/JuliaGPU/KernelAbstractions.jl)
### Build & Run
Prerequisites
* Julia >= 1.6
A set of reduced-dependency projects is available for the following backends and implementations:
* `AMDGPU` supports:
- `AMDGPUStream.jl`
* `CUDA` supports:
- `CUDAStream.jl`
* `oneAPI` supports:
- `oneAPIStream.jl`
* `KernelAbstractions` supports:
- `KernelAbstractionsStream.jl`
* `Threaded` supports:
- `PlainStream.jl`
- `ThreadedStream.jl`
- `DistributedStream.jl`
With Julia on the `PATH`, run your selected benchmark with:
```shell
> cd JuliaStream.jl
> julia --project=<BACKEND> -e 'import Pkg; Pkg.instantiate()' # only required on first run
> julia --project=<BACKEND> src/<IMPL>Stream.jl
```
For example, to run the CUDA implementation:
```shell
> cd JuliaStream.jl
> julia --project=CUDA -e 'import Pkg; Pkg.instantiate()'
> julia --project=CUDA src/CUDAStream.jl
```
**Important:**
* Julia is 1-indexed, so N >= 1 in `--device N`.
* Thread count for `ThreadedStream` must be set via the `JULIA_NUM_THREADS` environment variable (e.g. `export JULIA_NUM_THREADS=$(nproc)`), otherwise it defaults to 1 (see the example after this list).
* Worker count for `DistributedStream` is set with `-p <N>`, as per the [documentation](https://docs.julialang.org/en/v1/manual/distributed-computing).
* Certain implementations such as CUDA and AMDGPU perform hardware detection at runtime and may download and/or compile further software packages for the platform.
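For instance, here is a minimal sketch of configuring the thread and worker counts using the reduced `Threaded` project; the counts and the `--arraysize` value are illustrative, not defaults:
```shell
> cd JuliaStream.jl
> julia --project=Threaded -e 'import Pkg; Pkg.instantiate()' # only required on first run
# ThreadedStream: thread count is taken from the environment variable
> export JULIA_NUM_THREADS=4
> julia --project=Threaded src/ThreadedStream.jl --arraysize 2048
# DistributedStream: -p adds two worker processes alongside the master
> julia -p 2 --project=Threaded src/DistributedStream.jl --arraysize 2048
```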
***
Alternatively, the top-level project `Project.toml` contains all dependencies needed to run all implementations in `src`.
There may be instances where some packages are locked to an older version because of transitive dependency requirements.
To use the top-level project, run the benchmark with:
```shell
> cd JuliaStream.jl
> julia --project -e 'import Pkg; Pkg.instantiate()'
> julia --project src/<IMPL>Stream.jl
```


@ -0,0 +1,31 @@
# This file is machine-generated - editing it directly is not advised
[[ArgParse]]
deps = ["Logging", "TextWrap"]
git-tree-sha1 = "3102bce13da501c9104df33549f511cd25264d7d"
uuid = "c7e460c6-2fb9-53a9-8c5b-16f535851c63"
version = "1.1.4"
[[Logging]]
uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"
[[OrderedCollections]]
git-tree-sha1 = "85f8e6578bf1f9ee0d11e7bb1b1456435479d47c"
uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
version = "1.4.1"
[[Parameters]]
deps = ["OrderedCollections", "UnPack"]
git-tree-sha1 = "2276ac65f1e236e0a6ea70baff3f62ad4c625345"
uuid = "d96e819e-fc66-5662-9728-84c9c7592b0a"
version = "0.12.2"
[[TextWrap]]
git-tree-sha1 = "9250ef9b01b66667380cf3275b3f7488d0e25faf"
uuid = "b718987f-49a8-5099-9789-dcd902bef87d"
version = "1.0.1"
[[UnPack]]
git-tree-sha1 = "387c1f73762231e86e0c9c5443ce3b4a0a9a0c2b"
uuid = "3a884ed6-31ef-47d7-9d2a-63182c4928ed"
version = "1.0.2"


@ -0,0 +1,6 @@
[deps]
ArgParse = "c7e460c6-2fb9-53a9-8c5b-16f535851c63"
Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a"
[compat]
julia = "1.6"


@ -0,0 +1,319 @@
# This file is machine-generated - editing it directly is not advised
[[Adapt]]
deps = ["LinearAlgebra"]
git-tree-sha1 = "84918055d15b3114ede17ac6a7182f68870c16f7"
uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
version = "3.3.1"
[[ArgParse]]
deps = ["Logging", "TextWrap"]
git-tree-sha1 = "3102bce13da501c9104df33549f511cd25264d7d"
uuid = "c7e460c6-2fb9-53a9-8c5b-16f535851c63"
version = "1.1.4"
[[ArgTools]]
uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f"
[[Artifacts]]
uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33"
[[Base64]]
uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f"
[[CEnum]]
git-tree-sha1 = "215a9aa4a1f23fbd05b92769fdd62559488d70e9"
uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82"
version = "0.4.1"
[[ChainRulesCore]]
deps = ["Compat", "LinearAlgebra", "SparseArrays"]
git-tree-sha1 = "bdc0937269321858ab2a4f288486cb258b9a0af7"
uuid = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
version = "1.3.0"
[[Compat]]
deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "SHA", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"]
git-tree-sha1 = "727e463cfebd0c7b999bbf3e9e7e16f254b94193"
uuid = "34da2185-b29b-5c13-b0c7-acf172513d20"
version = "3.34.0"
[[CompilerSupportLibraries_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae"
[[Dates]]
deps = ["Printf"]
uuid = "ade2ca70-3891-5945-98fb-dc099432e06a"
[[DelimitedFiles]]
deps = ["Mmap"]
uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab"
[[Distributed]]
deps = ["Random", "Serialization", "Sockets"]
uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b"
[[DocStringExtensions]]
deps = ["LibGit2"]
git-tree-sha1 = "a32185f5428d3986f47c2ab78b1f216d5e6cc96f"
uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae"
version = "0.8.5"
[[Downloads]]
deps = ["ArgTools", "LibCURL", "NetworkOptions"]
uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6"
[[ExprTools]]
git-tree-sha1 = "b7e3d17636b348f005f11040025ae8c6f645fe92"
uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04"
version = "0.1.6"
[[GPUArrays]]
deps = ["Adapt", "LinearAlgebra", "Printf", "Random", "Serialization", "Statistics"]
git-tree-sha1 = "8fac1cf7d6ce0f2249c7acaf25d22e1e85c4a07f"
uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
version = "8.0.2"
[[GPUCompiler]]
deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "TimerOutputs", "UUIDs"]
git-tree-sha1 = "4ed2616d5e656c8716736b64da86755467f26cf5"
uuid = "61eb1bfa-7361-4325-ad38-22787b887f55"
version = "0.12.9"
[[InteractiveUtils]]
deps = ["Markdown"]
uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
[[IrrationalConstants]]
git-tree-sha1 = "f76424439413893a832026ca355fe273e93bce94"
uuid = "92d709cd-6900-40b7-9082-c6be49f344b6"
version = "0.1.0"
[[JLLWrappers]]
deps = ["Preferences"]
git-tree-sha1 = "642a199af8b68253517b80bd3bfd17eb4e84df6e"
uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210"
version = "1.3.0"
[[LLVM]]
deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Printf", "Unicode"]
git-tree-sha1 = "23a47d417a3cd9c2e73c854bac7dd4731c105ef7"
uuid = "929cbde3-209d-540e-8aea-75f648917ca0"
version = "4.4.0"
[[LLVMExtra_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "9c360e5ce980b88bb31a7b086dbb19469008154b"
uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab"
version = "0.0.10+0"
[[LibCURL]]
deps = ["LibCURL_jll", "MozillaCACerts_jll"]
uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21"
[[LibCURL_jll]]
deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"]
uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0"
[[LibGit2]]
deps = ["Base64", "NetworkOptions", "Printf", "SHA"]
uuid = "76f85450-5226-5b5a-8eaa-529ad045b433"
[[LibSSH2_jll]]
deps = ["Artifacts", "Libdl", "MbedTLS_jll"]
uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8"
[[Libdl]]
uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
[[LinearAlgebra]]
deps = ["Libdl"]
uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
[[LogExpFunctions]]
deps = ["DocStringExtensions", "IrrationalConstants", "LinearAlgebra"]
git-tree-sha1 = "3d682c07e6dd250ed082f883dc88aee7996bf2cc"
uuid = "2ab3a3ac-af41-5b50-aa03-7779005ae688"
version = "0.3.0"
[[Logging]]
uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"
[[Markdown]]
deps = ["Base64"]
uuid = "d6f4376e-aef5-505a-96c1-9c027394607a"
[[MbedTLS_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1"
[[Mmap]]
uuid = "a63ad114-7e13-5084-954f-fe012c677804"
[[MozillaCACerts_jll]]
uuid = "14a3606d-f60d-562e-9121-12d972cd8159"
[[NEO_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "gmmlib_jll", "libigc_jll", "oneAPI_Level_Zero_Headers_jll"]
git-tree-sha1 = "2bfc354b5684821dcc88f1e477cefd0dd03c60b5"
uuid = "700fe977-ac61-5f37-bbc8-c6c4b2b6a9fd"
version = "21.31.20514+0"
[[NetworkOptions]]
uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908"
[[OpenSpecFun_jll]]
deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "13652491f6856acfd2db29360e1bbcd4565d04f1"
uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e"
version = "0.5.5+0"
[[OrderedCollections]]
git-tree-sha1 = "85f8e6578bf1f9ee0d11e7bb1b1456435479d47c"
uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
version = "1.4.1"
[[Parameters]]
deps = ["OrderedCollections", "UnPack"]
git-tree-sha1 = "2276ac65f1e236e0a6ea70baff3f62ad4c625345"
uuid = "d96e819e-fc66-5662-9728-84c9c7592b0a"
version = "0.12.2"
[[Pkg]]
deps = ["Artifacts", "Dates", "Downloads", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"]
uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
[[Preferences]]
deps = ["TOML"]
git-tree-sha1 = "00cfd92944ca9c760982747e9a1d0d5d86ab1e5a"
uuid = "21216c6a-2e73-6563-6e65-726566657250"
version = "1.2.2"
[[Printf]]
deps = ["Unicode"]
uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7"
[[REPL]]
deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"]
uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb"
[[Random]]
deps = ["Serialization"]
uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
[[SHA]]
uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce"
[[SPIRV_LLVM_Translator_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "8cca87d57f6ddf19373cc9791fddc741406c8fbf"
uuid = "4a5d46fc-d8cf-5151-a261-86b458210efb"
version = "11.0.0+2"
[[SPIRV_Tools_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "c0324b7e07bc4649f755bfe7e00f7c6ed6aa353f"
uuid = "6ac6d60f-d740-5983-97d7-a4482c0689f4"
version = "2021.2.0+0"
[[Serialization]]
uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
[[SharedArrays]]
deps = ["Distributed", "Mmap", "Random", "Serialization"]
uuid = "1a1011a3-84de-559e-8e89-a11a2f7dc383"
[[Sockets]]
uuid = "6462fe0b-24de-5631-8697-dd941f90decc"
[[SparseArrays]]
deps = ["LinearAlgebra", "Random"]
uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
[[SpecialFunctions]]
deps = ["ChainRulesCore", "LogExpFunctions", "OpenSpecFun_jll"]
git-tree-sha1 = "a322a9493e49c5f3a10b50df3aedaf1cdb3244b7"
uuid = "276daf66-3868-5448-9aa4-cd146d93841b"
version = "1.6.1"
[[Statistics]]
deps = ["LinearAlgebra", "SparseArrays"]
uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
[[TOML]]
deps = ["Dates"]
uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76"
[[Tar]]
deps = ["ArgTools", "SHA"]
uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e"
[[Test]]
deps = ["InteractiveUtils", "Logging", "Random", "Serialization"]
uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
[[TextWrap]]
git-tree-sha1 = "9250ef9b01b66667380cf3275b3f7488d0e25faf"
uuid = "b718987f-49a8-5099-9789-dcd902bef87d"
version = "1.0.1"
[[TimerOutputs]]
deps = ["ExprTools", "Printf"]
git-tree-sha1 = "209a8326c4f955e2442c07b56029e88bb48299c7"
uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
version = "0.5.12"
[[UUIDs]]
deps = ["Random", "SHA"]
uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
[[UnPack]]
git-tree-sha1 = "387c1f73762231e86e0c9c5443ce3b4a0a9a0c2b"
uuid = "3a884ed6-31ef-47d7-9d2a-63182c4928ed"
version = "1.0.2"
[[Unicode]]
uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
[[Zlib_jll]]
deps = ["Libdl"]
uuid = "83775a58-1f1d-513f-b197-d71354ab007a"
[[gmmlib_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "0d5e5461d21b14853b4c332045c57d2601c403bd"
uuid = "09858cae-167c-5acb-9302-fddc6874d481"
version = "21.2.1+0"
[[libigc_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "b30a895e7ea52991a3f984ab0302c42858d766c0"
uuid = "94295238-5935-5bd7-bb0f-b00942e9bdd5"
version = "1.0.8173+0"
[[nghttp2_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d"
[[oneAPI]]
deps = ["Adapt", "CEnum", "ExprTools", "GPUArrays", "GPUCompiler", "LLVM", "LinearAlgebra", "NEO_jll", "Printf", "Random", "SPIRV_LLVM_Translator_jll", "SPIRV_Tools_jll", "SpecialFunctions", "oneAPI_Level_Zero_Headers_jll", "oneAPI_Level_Zero_Loader_jll"]
git-tree-sha1 = "92e8eefdd4694597994590230ab329545804bdb3"
uuid = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b"
version = "0.2.0"
[[oneAPI_Level_Zero_Headers_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "e1d123ff9ada6c469a1eaf57e33a74c3cb26a5a4"
uuid = "f4bc562b-d309-54f8-9efb-476e56f0410d"
version = "1.2.13+0"
[[oneAPI_Level_Zero_Loader_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "oneAPI_Level_Zero_Headers_jll"]
git-tree-sha1 = "50124857f7e87420655929a9c8ca86749826af11"
uuid = "13eca655-d68d-5b81-8367-6d99d727ab01"
version = "1.4.1+0"
[[p7zip_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0"


@ -0,0 +1,7 @@
[deps]
ArgParse = "c7e460c6-2fb9-53a9-8c5b-16f535851c63"
Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a"
oneAPI = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b"
[compat]
julia = "1.6"


@ -0,0 +1,167 @@
# AMDGPU.jl doesn't support CPU agents, so this isn't a feature-complete ROCmStream, only AMD GPUs
include("Stream.jl")
using AMDGPU
const ROCData = StreamData{T,ROCArray{T}} where {T}
const TBSize = 1024::Int
const DotBlocks = 256::Int
function devices()::Vector{DeviceWithRepr}
try
# AMDGPU.agents()'s internal iteration order isn't stable
sorted = sort(AMDGPU.get_agents(:gpu), by = repr)
map(x -> (x, repr(x), "AMDGPU.jl"), sorted)
catch
# probably unsupported
String[]
end
end
function make_stream(
arraysize::Int,
scalar::T,
device::DeviceWithRepr,
silent::Bool,
)::Tuple{ROCData{T},Nothing} where {T}
if arraysize % TBSize != 0
error("arraysize ($(arraysize)) must be divisible by $(TBSize)!")
end
# XXX AMDGPU doesn't expose an API for setting the default like CUDA.device!()
# but AMDGPU.get_default_agent returns DEFAULT_AGENT so we can do it by hand
AMDGPU.DEFAULT_AGENT[] = device[1]
selected = AMDGPU.get_default_agent()
if !silent
println("Using GPU HSA device: $(AMDGPU.get_name(selected)) ($(repr(selected)))")
println("Kernel parameters : <<<$(arraysize),$(TBSize)>>>")
end
return (
ROCData{T}(
ROCArray{T}(undef, arraysize),
ROCArray{T}(undef, arraysize),
ROCArray{T}(undef, arraysize),
scalar,
arraysize,
),
nothing,
)
end
function init_arrays!(data::ROCData{T}, _, init::Tuple{T,T,T}) where {T}
AMDGPU.fill!(data.a, init[1])
AMDGPU.fill!(data.b, init[2])
AMDGPU.fill!(data.c, init[3])
end
function copy!(data::ROCData{T}, _) where {T}
function kernel(a::AbstractArray{T}, c::AbstractArray{T})
i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x # only workgroupIdx starts at 1
@inbounds c[i] = a[i]
return
end
AMDGPU.wait(
@roc groupsize = TBSize gridsize = data.size kernel(data.a, data.c)
)
end
function mul!(data::ROCData{T}, _) where {T}
function kernel(b::AbstractArray{T}, c::AbstractArray{T}, scalar::T)
i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x # only workgroupIdx starts at 1
@inbounds b[i] = scalar * c[i]
return
end
AMDGPU.wait(
@roc groupsize = TBSize gridsize = data.size kernel(data.b, data.c, data.scalar)
)
end
function add!(data::ROCData{T}, _) where {T}
function kernel(a::AbstractArray{T}, b::AbstractArray{T}, c::AbstractArray{T})
i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x # only workgroupIdx starts at 1
@inbounds c[i] = a[i] + b[i]
return
end
AMDGPU.wait(
@roc groupsize = TBSize gridsize = data.size kernel(data.a, data.b, data.c)
)
end
function triad!(data::ROCData{T}, _) where {T}
function kernel(a::AbstractArray{T}, b::AbstractArray{T}, c::AbstractArray{T}, scalar::T)
i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x # only workgroupIdx starts at 1
@inbounds a[i] = b[i] + (scalar * c[i])
return
end
AMDGPU.wait(
@roc groupsize = TBSize gridsize = data.size kernel(
data.a,
data.b,
data.c,
data.scalar,
)
)
end
function nstream!(data::ROCData{T}, _) where {T}
function kernel(a::AbstractArray{T}, b::AbstractArray{T}, c::AbstractArray{T}, scalar::T)
i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x # only workgroupIdx starts at 1
@inbounds a[i] += b[i] + scalar * c[i]
return
end
AMDGPU.wait(
@roc groupsize = TBSize gridsize = data.size kernel(
data.a,
data.b,
data.c,
data.scalar,
)
)
end
function dot(data::ROCData{T}, _) where {T}
function kernel(a::AbstractArray{T}, b::AbstractArray{T}, size::Int, partial::AbstractArray{T})
tb_sum = ROCDeviceArray((TBSize,), alloc_local(:reduce, T, TBSize))
local_i = workitemIdx().x
@inbounds tb_sum[local_i] = 0.0
# do dot first
i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x # only workgroupIdx starts at 1
while i <= size
@inbounds tb_sum[local_i] += a[i] * b[i]
i += TBSize * DotBlocks # XXX don't use (workgroupDim().x * gridDimWG().x) here
end
# then tree reduction
offset = workgroupDim().x ÷ 2
while offset > 0
sync_workgroup()
if (local_i - 1) < offset
@inbounds tb_sum[local_i] += tb_sum[local_i+offset]
end
offset ÷= 2
end
if (local_i == 1)
@inbounds partial[workgroupIdx().x] = tb_sum[local_i]
end
return
end
partial_sum = ROCArray{T}(undef, DotBlocks)
AMDGPU.wait(
@roc groupsize = TBSize gridsize = TBSize * DotBlocks kernel(
data.a,
data.b,
data.size,
partial_sum,
)
)
return sum(partial_sum)
end
function read_data(data::ROCData{T}, _)::VectorData{T} where {T}
return VectorData{T}(data.a, data.b, data.c, data.scalar, data.size)
end
main()


@ -0,0 +1,152 @@
include("Stream.jl")
using CUDA
const CuData = StreamData{T,CuArray{T}} where {T}
const TBSize = 1024::Int
const DotBlocks = 256::Int
function devices()::Vector{DeviceWithRepr}
return !CUDA.functional(false) ? String[] :
map(d -> (d, "$(CUDA.name(d)) ($(repr(d)))", "CUDA.jl"), CUDA.devices())
end
function make_stream(
arraysize::Int,
scalar::T,
device::DeviceWithRepr,
silent::Bool,
)::Tuple{CuData{T},Nothing} where {T}
if arraysize % TBSize != 0
error("arraysize ($(arraysize)) must be divisible by $(TBSize)!")
end
CUDA.device!(device[1])
selected = CUDA.device()
# show_reason is set to true here so it dumps CUDA info
# for us regardless of whether it's functional
if !CUDA.functional(true)
error("Non-functional CUDA configuration")
end
if !silent
println("Using CUDA device: $(CUDA.name(selected)) ($(repr(selected)))")
println("Kernel parameters: <<<$(arraysize ÷ TBSize),$(TBSize)>>>")
end
return (
CuData{T}(
CuArray{T}(undef, arraysize),
CuArray{T}(undef, arraysize),
CuArray{T}(undef, arraysize),
scalar,
arraysize,
),
nothing,
)
end
function init_arrays!(data::CuData{T}, _, init::Tuple{T,T,T}) where {T}
fill!(data.a, init[1])
fill!(data.b, init[2])
fill!(data.c, init[3])
end
function copy!(data::CuData{T}, _) where {T}
function kernel(a::AbstractArray{T}, c::AbstractArray{T})
i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
@inbounds c[i] = a[i]
return
end
@cuda blocks = data.size ÷ TBSize threads = TBSize kernel(data.a, data.c)
CUDA.synchronize()
end
function mul!(data::CuData{T}, _) where {T}
function kernel(b::AbstractArray{T}, c::AbstractArray{T}, scalar::T)
i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
@inbounds b[i] = scalar * c[i]
return
end
@cuda blocks = data.size ÷ TBSize threads = TBSize kernel(data.b, data.c, data.scalar)
CUDA.synchronize()
end
function add!(data::CuData{T}, _) where {T}
function kernel(a::AbstractArray{T}, b::AbstractArray{T}, c::AbstractArray{T})
i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
@inbounds c[i] = a[i] + b[i]
return
end
@cuda blocks = data.size ÷ TBSize threads = TBSize kernel(data.a, data.b, data.c)
CUDA.synchronize()
end
function triad!(data::CuData{T}, _) where {T}
function kernel(a::AbstractArray{T}, b::AbstractArray{T}, c::AbstractArray{T}, scalar::T)
i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
@inbounds a[i] = b[i] + (scalar * c[i])
return
end
@cuda blocks = data.size ÷ TBSize threads = TBSize kernel(
data.a,
data.b,
data.c,
data.scalar,
)
CUDA.synchronize()
end
function nstream!(data::CuData{T}, _) where {T}
function kernel(a::AbstractArray{T}, b::AbstractArray{T}, c::AbstractArray{T}, scalar::T)
i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
@inbounds a[i] += b[i] + scalar * c[i]
return
end
@cuda blocks = data.size ÷ TBSize threads = TBSize kernel(
data.a,
data.b,
data.c,
data.scalar,
)
CUDA.synchronize()
end
function dot(data::CuData{T}, _) where {T}
# direct port of the reduction in CUDAStream.cu
function kernel(a::AbstractArray{T}, b::AbstractArray{T}, size::Int, partial::AbstractArray{T})
tb_sum = @cuStaticSharedMem(T, TBSize)
local_i = threadIdx().x
@inbounds tb_sum[local_i] = 0.0
# do dot first
i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
while i <= size
@inbounds tb_sum[local_i] += a[i] * b[i]
i += blockDim().x * gridDim().x
end
# then tree reduction
offset = blockDim().x ÷ 2
while offset > 0
sync_threads()
if (local_i - 1) < offset
@inbounds tb_sum[local_i] += tb_sum[local_i+offset]
end
offset ÷= 2
end
if (local_i == 1)
@inbounds partial[blockIdx().x] = tb_sum[local_i]
end
return
end
partial_sum = CuArray{T}(undef, DotBlocks)
@cuda blocks = DotBlocks threads = TBSize kernel(data.a, data.b, data.size, partial_sum)
return sum(partial_sum)
end
function read_data(data::CuData{T}, _)::VectorData{T} where {T}
return VectorData{T}(data.a, data.b, data.c, data.scalar, data.size)
end
main()


@ -0,0 +1,85 @@
using Distributed
@everywhere using Pkg
@everywhere Pkg.activate("."; io = devnull) # don't spam `Activating environment at...`
@everywhere include("StreamData.jl")
@everywhere include("Stream.jl")
@everywhere using SharedArrays
@everywhere const SharedArrayData = StreamData{T,SharedArray{T}} where {T}
function devices()::Vector{DeviceWithRepr}
return [(undef, "CPU (localhost) $(nworkers())P", "Distributed.jl")]
end
function make_stream(
arraysize::Int,
scalar::T,
_::DeviceWithRepr,
silent::Bool,
)::Tuple{SharedArrayData{T},Nothing} where {T}
if !silent
println("Using max $(nworkers()) process(es) + 1 master")
end
return (
SharedArrayData{T}(
SharedArray{T}(arraysize),
SharedArray{T}(arraysize),
SharedArray{T}(arraysize),
scalar,
arraysize,
),
nothing,
)
end
function init_arrays!(data::SharedArrayData{T}, _, init::Tuple{T,T,T}) where {T}
@sync @distributed for i = 1:data.size
@inbounds data.a[i] = init[1]
@inbounds data.b[i] = init[2]
@inbounds data.c[i] = init[3]
end
end
function copy!(data::SharedArrayData{T}, _) where {T}
@sync @distributed for i = 1:data.size
@inbounds data.c[i] = data.a[i]
end
end
function mul!(data::SharedArrayData{T}, _) where {T}
@sync @distributed for i = 1:data.size
@inbounds data.b[i] = data.scalar * data.c[i]
end
end
function add!(data::SharedArrayData{T}, _) where {T}
@sync @distributed for i = 1:data.size
@inbounds data.c[i] = data.a[i] + data.b[i]
end
end
function triad!(data::SharedArrayData{T}, _) where {T}
@sync @distributed for i = 1:data.size
@inbounds data.a[i] = data.b[i] + (data.scalar * data.c[i])
end
end
function nstream!(data::SharedArrayData{T}, _) where {T}
@sync @distributed for i = 1:data.size
@inbounds data.a[i] += data.b[i] + data.scalar * data.c[i]
end
end
function dot(data::SharedArrayData{T}, _) where {T}
return @distributed (+) for i = 1:data.size
@inbounds data.a[i] * data.b[i]
end
end
function read_data(data::SharedArrayData{T}, _)::VectorData{T} where {T}
return VectorData{T}(data.a, data.b, data.c, data.scalar, data.size)
end
main()


@ -0,0 +1,4 @@
module JuliaStream
end
println("Please run benchmarks directly via `julia --project src/<IMPL>Stream.jl`")


@ -0,0 +1,255 @@
using ROCKernels, CUDAKernels, KernelAbstractions, CUDA, AMDGPU
include("Stream.jl")
const CuData = StreamData{T,CUDA.CuArray{T}} where {T}
const ROCData = StreamData{T,AMDGPU.ROCArray{T}} where {T}
const TBSize = 1024::Int
const DotBlocks = 256::Int
@enum Backend cuda rocm cpu
struct Context
backend::Backend
device::Device
end
function list_rocm_devices()::Vector{DeviceWithRepr}
try
# AMDGPU.agents()'s internal iteration order isn't stable
sorted = sort(AMDGPU.get_agents(:gpu), by = repr)
map(x -> (x, repr(x), rocm), sorted)
catch
# probably unsupported
[]
end
end
function list_cuda_devices()::Vector{DeviceWithRepr}
return !CUDA.functional(false) ? String[] :
map(d -> (d, "$(CUDA.name(d)) ($(repr(d)))", cuda), CUDA.devices())
end
function devices()::Vector{DeviceWithRepr}
cudas = list_cuda_devices()
rocms = list_rocm_devices()
cpus = [(undef, "$(Sys.cpu_info()[1].model) ($(Threads.nthreads())T)", cpu)]
vcat(cpus, cudas, rocms)
end
function make_stream(
arraysize::Int,
scalar::T,
device::DeviceWithRepr,
silent::Bool,
) where {T}
if arraysize % TBSize != 0
error("arraysize ($(arraysize)) must be divisible by $(TBSize)!")
end
(selected, _, backend) = device
if backend == cpu
if !silent
println("Using CPU with max $(Threads.nthreads()) threads")
end
partialsum = Vector{T}(undef, DotBlocks)
data = VectorData{T}(
Vector{T}(undef, arraysize),
Vector{T}(undef, arraysize),
Vector{T}(undef, arraysize),
scalar,
arraysize,
)
backenddevice = CPU()
elseif backend == cuda
CUDA.device!(selected)
if CUDA.device() != selected
error("Cannot select CUDA device, expecting $selected, but got $(CUDA.device())")
end
if !CUDA.functional(true)
error("Non-functional CUDA configuration")
end
if !silent
println("Using CUDA device: $(CUDA.name(selected)) ($(repr(selected)))")
end
partialsum = CuArray{T}(undef, DotBlocks)
data = CuData{T}(
CuArray{T}(undef, arraysize),
CuArray{T}(undef, arraysize),
CuArray{T}(undef, arraysize),
scalar,
arraysize,
)
backenddevice = CUDADevice()
elseif backend == rocm
AMDGPU.DEFAULT_AGENT[] = selected
if AMDGPU.get_default_agent() != selected
error(
"Cannot select HSA device, expecting $selected, but got $(AMDGPU.get_default_agent())",
)
end
if !silent
println("Using GPU HSA device: $(AMDGPU.get_name(selected)) ($(repr(selected)))")
end
partialsum = ROCArray{T}(undef, DotBlocks)
data = ROCData{T}(
ROCArray{T}(undef, arraysize),
ROCArray{T}(undef, arraysize),
ROCArray{T}(undef, arraysize),
scalar,
arraysize,
)
backenddevice = ROCDevice()
else
error("unsupported backend $(backend)")
end
if !silent
println("Kernel parameters : <<<$(data.size),$(TBSize)>>>")
end
return (data, Context(backend, backenddevice))
end
function init_arrays!(
data::StreamData{T,C},
context::Context,
init::Tuple{T,T,T},
) where {T,C}
if context.backend == cpu
Threads.@threads for i = 1:data.size
@inbounds data.a[i] = init[1]
@inbounds data.b[i] = init[2]
@inbounds data.c[i] = init[3]
end
elseif context.backend == cuda
CUDA.fill!(data.a, init[1])
CUDA.fill!(data.b, init[2])
CUDA.fill!(data.c, init[3])
elseif context.backend == rocm
AMDGPU.fill!(data.a, init[1])
AMDGPU.fill!(data.b, init[2])
AMDGPU.fill!(data.c, init[3])
else
error("unsupported backend $(backend)")
end
end
function copy!(data::StreamData{T,C}, context::Context) where {T,C}
@kernel function kernel(@Const(a::AbstractArray{T}), c)
i = @index(Global)
@inbounds c[i] = a[i]
end
wait(kernel(context.device, TBSize)(data.a, data.c, ndrange = data.size))
end
function mul!(data::StreamData{T,C}, context::Context) where {T,C}
@kernel function kernel(b::AbstractArray{T}, @Const(c::AbstractArray{T}), scalar::T)
i = @index(Global)
@inbounds b[i] = scalar * c[i]
end
wait(kernel(context.device, TBSize)(data.b, data.c, data.scalar, ndrange = data.size))
end
function add!(data::StreamData{T,C}, context::Context) where {T,C}
@kernel function kernel(@Const(a::AbstractArray{T}), @Const(b::AbstractArray{T}), c)
i = @index(Global)
@inbounds c[i] = a[i] + b[i]
end
wait(kernel(context.device, TBSize)(data.a, data.b, data.c, ndrange = data.size))
end
function triad!(data::StreamData{T,C}, context::Context) where {T,C}
@kernel function kernel(a::AbstractArray{T}, @Const(b::AbstractArray{T}), @Const(c), scalar::T)
i = @index(Global)
@inbounds a[i] = b[i] + (scalar * c[i])
end
wait(
kernel(context.device, TBSize)(
data.a,
data.b,
data.c,
data.scalar,
ndrange = data.size,
),
)
end
function nstream!(data::StreamData{T,C}, context::Context) where {T,C}
@kernel function kernel(a::AbstractArray{T}, @Const(b::AbstractArray{T}), @Const(c), scalar::T)
i = @index(Global)
@inbounds a[i] += b[i] + scalar * c[i]
end
wait(
kernel(context.device, TBSize)(
data.a,
data.b,
data.c,
data.scalar,
ndrange = data.size,
),
)
end
function dot(data::StreamData{T,C}, context::Context) where {T,C}
@kernel function kernel(@Const(a::AbstractArray{T}), @Const(b::AbstractArray{T}), size::Int, partial::AbstractArray{T})
local_i = @index(Local)
group_i = @index(Group)
tb_sum = @localmem T TBSize
@inbounds tb_sum[local_i] = 0.0
# do dot first
i = @index(Global)
while i <= size
@inbounds tb_sum[local_i] += a[i] * b[i]
i += TBSize * DotBlocks
end
# then tree reduction
# FIXME this does not compile when targeting CPUs:
# see https://github.com/JuliaGPU/KernelAbstractions.jl/issues/262
offset = @private Int64 (1,)
@inbounds begin
offset[1] = @groupsize()[1] ÷ 2
while offset[1] > 0
@synchronize
if (local_i - 1) < offset[1]
tb_sum[local_i] += tb_sum[local_i+offset[1]]
end
offset[1] ÷= 2
end
end
if (local_i == 1)
@inbounds partial[group_i] = tb_sum[local_i]
end
end
if context.backend == cpu
partial_sum = Vector{T}(undef, DotBlocks)
elseif context.backend == cuda
partial_sum = CuArray{T}(undef, DotBlocks)
elseif context.backend == rocm
partial_sum = ROCArray{T}(undef, DotBlocks)
else
error("unsupported backend $(backend)")
end
wait(
kernel(context.device, TBSize)(
data.a,
data.b,
data.size,
partial_sum,
ndrange = TBSize * DotBlocks,
),
)
return sum(partial_sum)
end
function read_data(data::StreamData{T,C}, _::Context)::VectorData{T} where {T,C}
return VectorData{T}(data.a, data.b, data.c, data.scalar, data.size)
end
main()


@ -0,0 +1,75 @@
include("Stream.jl")
function devices()::Vector{DeviceWithRepr}
return [(undef, "CPU", "Palin")]
end
function make_stream(
arraysize::Int,
scalar::T,
_::DeviceWithRepr,
silent::Bool,
)::Tuple{VectorData{T},Nothing} where {T}
return (
VectorData{T}(
Vector{T}(undef, arraysize),
Vector{T}(undef, arraysize),
Vector{T}(undef, arraysize),
scalar,
arraysize,
),
nothing
)
end
function init_arrays!(data::VectorData{T}, _, init::Tuple{T,T,T}) where {T}
for i = 1:data.size
@inbounds data.a[i] = init[1]
@inbounds data.b[i] = init[2]
@inbounds data.c[i] = init[3]
end
end
function copy!(data::VectorData{T}, _) where {T}
for i = 1:data.size
@inbounds data.c[i] = data.a[i]
end
end
function mul!(data::VectorData{T}, _) where {T}
for i = 1:data.size
@inbounds data.b[i] = data.scalar * data.c[i]
end
end
function add!(data::VectorData{T}, _) where {T}
for i = 1:data.size
@inbounds data.c[i] = data.a[i] + data.b[i]
end
end
function triad!(data::VectorData{T}, _) where {T}
for i = 1:data.size
@inbounds data.a[i] = data.b[i] + (data.scalar * data.c[i])
end
end
function nstream!(data::VectorData{T}, _) where {T}
for i = 1:data.size
@inbounds data.a[i] += data.b[i] + data.scalar * data.c[i]
end
end
function dot(data::VectorData{T}, _) where {T}
sum = zero(T)
for i = 1:data.size
@inbounds sum += data.a[i] * data.b[i]
end
return sum
end
function read_data(data::VectorData{T}, _)::VectorData{T} where {T}
return data
end
main()


@ -0,0 +1,300 @@
using ArgParse
using Parameters
using Printf
using Base: Float64, Int
include("StreamData.jl")
const VectorData = StreamData{T,Vector{T}} where {T}
const DeviceWithRepr = Tuple{Any,String,Any}
struct Timings
copy::Vector{Float64}
mul::Vector{Float64}
add::Vector{Float64}
triad::Vector{Float64}
dot::Vector{Float64}
Timings(n) = new(zeros(n), zeros(n), zeros(n), zeros(n), zeros(n))
end
@enum Benchmark All Triad Nstream
function run_all!(data::StreamData{T,C}, context, times::Int)::Tuple{Timings,T} where {T,C}
timings = Timings(times)
lastSum::T = 0
for i = 1:times
@inbounds timings.copy[i] = @elapsed copy!(data, context)
@inbounds timings.mul[i] = @elapsed mul!(data, context)
@inbounds timings.add[i] = @elapsed add!(data, context)
@inbounds timings.triad[i] = @elapsed triad!(data, context)
@inbounds timings.dot[i] = @elapsed lastSum = dot(data, context)
end
return (timings, lastSum)
end
function run_triad!(data::StreamData{T,C}, context, times::Int)::Float64 where {T,C}
return @elapsed for _ = 1:times
triad!(data, context)
end
end
function run_nstream!(
data::StreamData{T,C},
context,
times::Int,
)::Vector{Float64} where {T,C}
timings::Vector{Float64} = zeros(times)
for i = 1:times
@inbounds timings[i] = @elapsed nstream!(data, context)
end
return timings
end
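# Replays the kernel sequence on scalar "gold" values and checks the device arrays (and the optional dot result) against them within tolerance.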
function check_solutions(
data::StreamData{T,C},
times::Int,
init::Tuple{T,T,T},
benchmark::Benchmark,
dot::Union{T,Nothing},
) where {T,C}
(gold_a, gold_b, gold_c) = init
for _ = 1:times
if benchmark == All
gold_c = gold_a
gold_b = data.scalar * gold_c
gold_c = gold_a + gold_b
gold_a = gold_b + data.scalar * gold_c
elseif benchmark == Triad
gold_a = gold_b + data.scalar * gold_c
elseif benchmark == Nstream
gold_a += gold_b + data.scalar * gold_c
else
error("Unknown benchmark", benchmark)
end
end
tolerance = eps(T) * 100
function validate_xs(name::String, xs::AbstractArray{T}, from::T)
error = (map(x -> abs(x - from), xs) |> sum) / length(xs)
failed = error > tolerance
if failed
println("Validation failed on $name. Average error $error")
end
!failed
end
a_valid = validate_xs("a", data.a, gold_a)
b_valid = validate_xs("b", data.b, gold_b)
c_valid = validate_xs("c", data.c, gold_c)
dot_valid =
dot !== nothing ?
begin
gold_sum = gold_a * gold_b * data.size
error = abs((dot - gold_sum) / gold_sum)
failed = error > 1.0e-8
if failed
println(
"Validation failed on sum. Error $error \nSum was $dot but should be $gold_sum",
)
end
!failed
end : true
a_valid && b_valid && c_valid && dot_valid
end
@with_kw mutable struct Config
list::Bool = false
device::Int = 1
numtimes::Int = 100
arraysize::Int = 33554432
float::Bool = false
triad_only::Bool = false
nstream_only::Bool = false
csv::Bool = false
mibibytes::Bool = false
end
function parse_options(given::Config)
s = ArgParseSettings()
@add_arg_table s begin
"--list"
help = "List available devices"
action = :store_true
"--device", "-d"
help = "Select device at DEVICE, NOTE: Julia is 1-indexed"
arg_type = Int
default = given.device
"--numtimes", "-n"
help = "Run the test NUMTIMES times (NUM >= 2)"
arg_type = Int
default = given.numtimes
"--arraysize", "-s"
help = "Use ARRAYSIZE elements in the array"
arg_type = Int
default = given.arraysize
"--float"
help = "Use floats (rather than doubles)"
action = :store_true
"--triad_only"
help = "Only run triad"
action = :store_true
"--nstream_only"
help = "Only run nstream"
action = :store_true
"--csv"
help = "Output as csv table"
action = :store_true
"--mibibytes"
help = "Use MiB=2^20 for bandwidth calculation (default MB=10^6)"
action = :store_true
end
args = parse_args(s)
# surely there's a better way than doing this:
for (arg, val) in args
setproperty!(given, Symbol(arg), val)
end
end
const DefaultInit = (0.1, 0.2, 0.0)
const DefaultScalar = 0.4
const Version = "3.4.0"
function main()
config::Config = Config()
parse_options(config)
if config.list
for (i, (_, repr, impl)) in enumerate(devices())
println("[$i] ($impl) $repr")
end
exit(0)
end
ds = devices()
# TODO implement substring device match
if config.device < 1 || config.device > length(ds)
error(
"Device $(config.device) out of range (1..$(length(ds))), NOTE: Julia is 1-indexed",
)
else
device = ds[config.device]
end
type = config.float ? Float32 : Float64
if config.nstream_only && !config.triad_only
benchmark = Nstream
elseif !config.nstream_only && config.triad_only
benchmark = Triad
elseif !config.nstream_only && !config.triad_only
benchmark = All
elseif config.nstream_only && config.triad_only
error("Both triad and nstream are enabled, pick one or omit both to run all benchmarks")
else
error("Invalid config: $(repr(config))")
end
array_bytes = config.arraysize * sizeof(type)
total_bytes = array_bytes * 3
(mega_scale, mega_suffix, giga_scale, giga_suffix) =
!config.mibibytes ? (1.0e-6, "MB", 1.0e-9, "GB") : (2.0^-20, "MiB", 2.0^-30, "GiB")
if !config.csv
println("""BabelStream
Version: $Version
Implementation: Julia; $(PROGRAM_FILE)""")
println("Running kernels $(config.numtimes) times")
if benchmark == Triad
println("Number of elements: $(config.arraysize)")
end
println("Precision: $(config.float ? "float" : "double")")
r1 = n -> round(n; digits = 1)
println(
"Array size: $(r1(mega_scale * array_bytes)) $mega_suffix(=$(r1(giga_scale * array_bytes)) $giga_suffix)",
)
println(
"Total size: $(r1(mega_scale * total_bytes)) $mega_suffix(=$(r1(giga_scale * total_bytes)) $giga_suffix)",
)
end
function mk_row(xs::Vector{Float64}, name::String, total_bytes::Int)
tail = xs[2:end] # ignore the first iteration, as per the BabelStream convention
min = minimum(tail)
max = maximum(tail)
avg = sum(tail) / length(tail)
mbps = mega_scale * total_bytes / min
if config.csv
return [
("function", name),
("num_times", config.numtimes),
("n_elements", config.arraysize),
("sizeof", total_bytes),
("max_m$( config.mibibytes ? "i" : "")bytes_per_sec", mbps),
("min_runtime", min),
("max_runtime", max),
("avg_runtime", avg),
]
else
return [
("Function", name),
("M$(config.mibibytes ? "i" : "")Bytes/sec", round(mbps; digits = 3)),
("Min (sec)", round(min; digits = 5)),
("Max", round(max; digits = 5)),
("Average", round(avg; digits = 5)),
]
end
end
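# Prints rows either as CSV or as fixed-width columns; the first row also supplies the column headers.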
function tabulate(rows::Vector{Tuple{String,Any}}...)
header = Base.first(rows)
padding = config.csv ? 0 : 12
sep = config.csv ? "," : ""
map(x -> rpad(x[1], padding), header) |> x -> join(x, sep) |> println
for row in rows
map(x -> rpad(x[2], padding), row) |> x -> join(x, sep) |> println
end
end
init::Tuple{type,type,type} = DefaultInit
scalar::type = DefaultScalar
GC.enable(false)
(data, context) = make_stream(config.arraysize, scalar, device, config.csv)
init_arrays!(data, context, init)
if benchmark == All
(timings, sum) = run_all!(data, context, config.numtimes)
valid = check_solutions(read_data(data, context), config.numtimes, init, benchmark, sum)
tabulate(
mk_row(timings.copy, "Copy", 2 * array_bytes),
mk_row(timings.mul, "Mul", 2 * array_bytes),
mk_row(timings.add, "Add", 3 * array_bytes),
mk_row(timings.triad, "Triad", 3 * array_bytes),
mk_row(timings.dot, "Dot", 2 * array_bytes),
)
elseif benchmark == Nstream
timings = run_nstream!(data, context, config.numtimes)
valid =
check_solutions(read_data(data, context), config.numtimes, init, benchmark, nothing)
tabulate(mk_row(timings, "Nstream", 4 * array_bytes))
elseif benchmark == Triad
elapsed = run_triad!(data, context, config.numtimes)
valid =
check_solutions(read_data(data, context), config.numtimes, init, benchmark, nothing)
total_bytes = 3 * array_bytes * config.numtimes
bandwidth = giga_scale * (total_bytes / elapsed) # scale must match the giga_suffix label below
println("Runtime (seconds): $(round(elapsed; digits=5))")
println("Bandwidth ($giga_suffix/s): $(round(bandwidth; digits=3))")
else
error("Bad benchmark $(benchmark)")
end
GC.enable(true)
if !valid
exit(1)
end
end


@ -0,0 +1,7 @@
struct StreamData{T,C<:AbstractArray{T}}
a::C
b::C
c::C
scalar::T
size::Int
end


@ -0,0 +1,112 @@
include("Stream.jl")
function devices()::Vector{DeviceWithRepr}
return [(undef, "$(Sys.cpu_info()[1].model) ($(Threads.nthreads())T)", "Threaded")]
end
function make_stream(
arraysize::Int,
scalar::T,
_::DeviceWithRepr,
silent::Bool,
)::Tuple{VectorData{T},Nothing} where {T}
if !silent
println("Using max $(Threads.nthreads()) threads")
end
return (
VectorData{T}(
Vector{T}(undef, arraysize),
Vector{T}(undef, arraysize),
Vector{T}(undef, arraysize),
scalar,
arraysize,
),
nothing
)
end
function init_arrays!(data::VectorData{T}, _, init::Tuple{T,T,T}) where {T}
Threads.@threads for i = 1:data.size
@inbounds data.a[i] = init[1]
@inbounds data.b[i] = init[2]
@inbounds data.c[i] = init[3]
end
end
function copy!(data::VectorData{T}, _) where {T}
Threads.@threads for i = 1:data.size
@inbounds data.c[i] = data.a[i]
end
end
function mul!(data::VectorData{T}, _) where {T}
Threads.@threads for i = 1:data.size
@inbounds data.b[i] = data.scalar * data.c[i]
end
end
function add!(data::VectorData{T}, _) where {T}
Threads.@threads for i = 1:data.size
@inbounds data.c[i] = data.a[i] + data.b[i]
end
end
function triad!(data::VectorData{T}, _) where {T}
Threads.@threads for i = 1:data.size
@inbounds data.a[i] = data.b[i] + (data.scalar * data.c[i])
end
end
function nstream!(data::VectorData{T}, _) where {T}
Threads.@threads for i = 1:data.size
@inbounds data.a[i] += data.b[i] + data.scalar * data.c[i]
end
end
# Threads.@threads/Threads.@spawn doesn't support OpenMP's firstprivate, etc
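# Statically partitions 1:range into n contiguous chunks and runs f(group, first, last) on a task pinned to thread `group` (bounds are 1-based and inclusive).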
function static_par_ranged(f::Function, range::Int, n::Int)
stride = range ÷ n
rem = range % n
strides = map(0:n) do i
width = stride + (i < rem ? 1 : 0)
offset = i < rem ? (stride + 1) * i : ((stride + 1) * rem) + (stride * (i - rem))
(offset, width)
end
ccall(:jl_enter_threaded_region, Cvoid, ())
try
foreach(wait, map(1:n) do group
(offset, size) = strides[group]
task = Task(() -> f(group, offset+1, offset+size))
task.sticky = true
ccall(:jl_set_task_tid, Cvoid, (Any, Cint), task, group-1) # ccall, so 0-based for group
schedule(task)
end)
finally
ccall(:jl_exit_threaded_region, Cvoid, ())
end
end
function dot(data::VectorData{T}, _) where {T}
partial = Vector{T}(undef, Threads.nthreads())
static_par_ranged(data.size, Threads.nthreads()) do group, startidx, endidx
acc = zero(T)
@simd for i = startidx:endidx
@inbounds acc += data.a[i] * data.b[i]
end
@inbounds partial[group] = acc
end
return sum(partial)
# This doesn't do well on aarch64 because of the excessive Threads.threadid() ccall
# and inhibited vectorisation from the lack of @simd
# partial = zeros(T, Threads.nthreads())
# Threads.@threads for i = 1:data.size
# @inbounds partial[Threads.threadid()] += (data.a[i] * data.b[i])
# end
# return sum(partial)
end
function read_data(data::VectorData{T}, _)::VectorData{T} where {T}
return data
end
main()


@ -0,0 +1,170 @@
include("Stream.jl")
using oneAPI
const oneData = StreamData{T,oneArray{T}} where {T}
const DotWGSize = 256::Int
function devices()::Vector{DeviceWithRepr}
all = map(oneL0.devices, oneL0.drivers()) |> Iterators.flatten |> collect
map(dev -> (dev, repr("text/plain", dev), "oneAPI.jl"), all)
end
function make_stream(
arraysize::Int,
scalar::T,
device::DeviceWithRepr,
silent::Bool,
)::Tuple{oneData{T},Int} where {T}
oneAPI.allowscalar(false)
oneAPI.device!(device[1])
props = oneL0.compute_properties(oneAPI.device())
groupsize = min(props.maxTotalGroupSize, arraysize)
if arraysize % groupsize != 0
error("arraysize ($(arraysize)) must be divisible by $(groupsize)!")
end
if !silent
println("Using L0 device: $(repr("text/plain",device[1]))")
println("Kernel parameters : <<<$(arraysize),$(groupsize)>>>")
end
return (
oneData{T}(
oneArray{T}(undef, arraysize),
oneArray{T}(undef, arraysize),
oneArray{T}(undef, arraysize),
scalar,
arraysize,
),
groupsize,
)
end
function init_arrays!(data::oneData{T}, _, init::Tuple{T,T,T}) where {T}
oneAPI.fill!(data.a, init[1])
oneAPI.fill!(data.b, init[2])
oneAPI.fill!(data.c, init[3])
end
function copy!(data::oneData{T}, groupsize::Int) where {T}
function kernel(a::AbstractArray{T}, c::AbstractArray{T})
i = get_global_id()
@inbounds c[i] = a[i]
return
end
@oneapi items = groupsize groups = data.size ÷ groupsize kernel( #
data.a,
data.c,
)
oneAPI.synchronize()
end
function mul!(data::oneData{T}, groupsize::Int) where {T}
function kernel(b::AbstractArray{T}, c::AbstractArray{T}, scalar::T)
i = get_global_id()
@inbounds b[i] = scalar * c[i]
return
end
@oneapi items = groupsize groups = data.size ÷ groupsize kernel( #
data.b,
data.c,
data.scalar,
)
oneAPI.synchronize()
end
function add!(data::oneData{T}, groupsize::Int) where {T}
function kernel(a::AbstractArray{T}, b::AbstractArray{T}, c::AbstractArray{T})
i = get_global_id()
@inbounds c[i] = a[i] + b[i]
return
end
@oneapi items = groupsize groups = data.size ÷ groupsize kernel( #
data.a,
data.b,
data.c,
)
oneAPI.synchronize()
end
function triad!(data::oneData{T}, groupsize::Int) where {T}
function kernel(a::AbstractArray{T}, b::AbstractArray{T}, c::AbstractArray{T}, scalar::T)
i = get_global_id()
@inbounds a[i] = b[i] + (scalar * c[i])
return
end
@oneapi items = groupsize groups = data.size ÷ groupsize kernel( #
data.a,
data.b,
data.c,
data.scalar,
)
oneAPI.synchronize()
end
function nstream!(data::oneData{T}, groupsize::Int) where {T}
function kernel(a::AbstractArray{T}, b::AbstractArray{T}, c::AbstractArray{T}, scalar::T)
i = get_global_id()
@inbounds a[i] += b[i] + scalar * c[i]
return
end
@oneapi items = groupsize groups = data.size ÷ groupsize kernel( #
data.a,
data.b,
data.c,
data.scalar,
)
oneAPI.synchronize()
end
function dot(data::oneData{T}, groupsize::Int) where {T}
function kernel(a::AbstractArray{T}, b::AbstractArray{T}, size::Int, partial::AbstractArray{T})
wg_sum = @LocalMemory(T, (DotWGSize,))
li = get_local_id()
@inbounds wg_sum[li] = 0.0
# do dot first
i = get_global_id()
while i <= size
@inbounds wg_sum[li] += a[i] * b[i]
i += get_global_size()
end
# then tree reduction
offset = get_local_size() ÷ 2
while offset > 0
barrier()
if li <= offset
@inbounds wg_sum[li] += wg_sum[li+offset]
end
offset ÷= 2
end
if li == 1
@inbounds partial[get_group_id()] = wg_sum[li]
end
return
end
partial_sum = oneArray{T}(undef, DotWGSize) # one partial result per workgroup (groups = DotWGSize below)
@oneapi items = groupsize groups = DotWGSize kernel(
data.a,
data.b,
data.size,
partial_sum,
)
oneAPI.synchronize()
return sum(partial_sum)
end
function read_data(data::oneData{T}, _)::VectorData{T} where {T}
return VectorData{T}(data.a, data.b, data.c, data.scalar, data.size)
end
main()

Some files were not shown because too many files have changed in this diff.