From 3f7bb631e18dd80b7798c7bb8218b041197b1975 Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Tue, 5 Sep 2023 03:29:16 +0100 Subject: [PATCH] Initial SYCL2020 USM implementation --- CHANGELOG.md | 3 + CMakeLists.txt | 9 +- .../SYCLStream2020.cpp | 9 +- .../SYCLStream2020.h | 2 +- src/sycl2020-acc/model.cmake | 91 ++++++ src/sycl2020-usm/SYCLStream2020.cpp | 269 ++++++++++++++++++ src/sycl2020-usm/SYCLStream2020.h | 54 ++++ src/{sycl2020 => sycl2020-usm}/model.cmake | 3 - 8 files changed, 433 insertions(+), 7 deletions(-) rename src/{sycl2020 => sycl2020-acc}/SYCLStream2020.cpp (94%) rename src/{sycl2020 => sycl2020-acc}/SYCLStream2020.h (95%) create mode 100644 src/sycl2020-acc/model.cmake create mode 100644 src/sycl2020-usm/SYCLStream2020.cpp create mode 100644 src/sycl2020-usm/SYCLStream2020.h rename src/{sycl2020 => sycl2020-usm}/model.cmake (96%) diff --git a/CHANGELOG.md b/CHANGELOG.md index 371e241..6bf53a3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,7 @@ All notable changes to this project will be documented in this file. ### Added - Ability to build Kokkos and RAJA versions against existing packages. - Thrust managed memory. +- New implementation using SYCL2020 USM (sycl2020-acc) and renamed original `sycl2020` to `sycl2020-acc`. ### Changed - RAJA CUDA CMake build issues resolved. @@ -13,6 +14,8 @@ All notable changes to this project will be documented in this file. - OneAPI DPCPP compiler is deprecated in favour of ICPX, so added new build option to SYCL 2020 version. - Updates to the HIP kernels and API usage. - Number of thread-blocks in CUDA dot kernel implementation changed to 1024. +- Fix compatibility of `sycl2020` (now `sycl2020-acc`) with hipSYCL. + ## [v4.0] - 2021-12-22 diff --git a/CMakeLists.txt b/CMakeLists.txt index 89b3a78..da112a4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -145,7 +145,8 @@ register_model(hip HIP HIPStream.cpp) register_model(cuda CUDA CUDAStream.cu) register_model(kokkos KOKKOS KokkosStream.cpp) register_model(sycl SYCL SYCLStream.cpp) -register_model(sycl2020 SYCL2020 SYCLStream2020.cpp) +register_model(sycl2020-acc SYCL2020 SYCLStream2020.cpp) +register_model(sycl2020-usm SYCL2020 SYCLStream2020.cpp) register_model(acc ACC ACCStream.cpp) # defining RAJA collides with the RAJA namespace so USE_RAJA register_model(raja USE_RAJA RAJAStream.cpp) @@ -162,6 +163,12 @@ else () message(STATUS "Selected model : ${MODEL}") endif () +if (MODEL STREQUAL "sycl2020") + message(FATAL_ERROR " + Model sycl2020 has been renamed to sycl2020-acc, and a new sycl2020-usm model is now available. + Please use sycl2020-acc for SYCL2020 style accessors and sycl2020-usm for USM") +endif () + # load the $MODEL.cmake file and setup the correct IMPL_* based on $MODEL load_model(${MODEL}) diff --git a/src/sycl2020/SYCLStream2020.cpp b/src/sycl2020-acc/SYCLStream2020.cpp similarity index 94% rename from src/sycl2020/SYCLStream2020.cpp rename to src/sycl2020-acc/SYCLStream2020.cpp index 17a5ab5..f88cbbb 100644 --- a/src/sycl2020/SYCLStream2020.cpp +++ b/src/sycl2020-acc/SYCLStream2020.cpp @@ -164,8 +164,13 @@ T SYCLStream::dot() sycl::accessor kb {d_b, cgh, sycl::read_only}; cgh.parallel_for(sycl::range<1>{array_size}, - // Reduction object, to perform summation - initialises the result to zero - sycl::reduction(d_sum, cgh, std::plus(), sycl::property::reduction::initialize_to_identity{}), + // Reduction object, to perform summation - initialises the result to zero + // hipSYCL doesn't sypport the initialize_to_identity property yet +#if defined(__HIPSYCL__) || defined(__OPENSYCL__) + sycl::reduction(d_sum. template get_access(cgh), sycl::plus()), +#else + sycl::reduction(d_sum, cgh sycl::plus()), +#endif [=](sycl::id<1> idx, auto& sum) { sum += ka[idx] * kb[idx]; diff --git a/src/sycl2020/SYCLStream2020.h b/src/sycl2020-acc/SYCLStream2020.h similarity index 95% rename from src/sycl2020/SYCLStream2020.h rename to src/sycl2020-acc/SYCLStream2020.h index 7481d16..caaeae9 100644 --- a/src/sycl2020/SYCLStream2020.h +++ b/src/sycl2020-acc/SYCLStream2020.h @@ -14,7 +14,7 @@ #include -#define IMPLEMENTATION_STRING "SYCL 2020" +#define IMPLEMENTATION_STRING "SYCL2020 accessors" template class SYCLStream : public Stream diff --git a/src/sycl2020-acc/model.cmake b/src/sycl2020-acc/model.cmake new file mode 100644 index 0000000..0cd8c92 --- /dev/null +++ b/src/sycl2020-acc/model.cmake @@ -0,0 +1,91 @@ + +register_flag_optional(CMAKE_CXX_COMPILER + "Any CXX compiler that is supported by CMake detection, this is used for host compilation when required by the SYCL compiler" + "c++") + +register_flag_required(SYCL_COMPILER + "Compile using the specified SYCL compiler implementation + Supported values are + ONEAPI-ICPX - icpx as a standalone compiler + ONEAPI-Clang - oneAPI's Clang driver (enabled via `source /opt/intel/oneapi/setvars.sh --include-intel-llvm`) + DPCPP - dpc++ as a standalone compiler (https://github.com/intel/llvm) + HIPSYCL - hipSYCL compiler (https://github.com/illuhad/hipSYCL) + COMPUTECPP - ComputeCpp compiler (https://developer.codeplay.com/products/computecpp/ce/home)") + +register_flag_optional(SYCL_COMPILER_DIR + "Absolute path to the selected SYCL compiler directory, most are packaged differently so set the path according to `SYCL_COMPILER`: + ONEAPI-ICPX - `icpx` must be used for OneAPI 2023 and later on releases (i.e `source /opt/intel/oneapi/setvars.sh` first) + ONEAPI-Clang - set to the directory that contains the Intel clang++ binary. + HIPSYCL|DPCPP|COMPUTECPP - set to the root of the binary distribution that contains at least `bin/`, `include/`, and `lib/`" + "") + +macro(setup) + set(CMAKE_CXX_STANDARD 17) + + + if (${SYCL_COMPILER} STREQUAL "HIPSYCL") + + + set(hipSYCL_DIR ${SYCL_COMPILER_DIR}/lib/cmake/hipSYCL) + + if (NOT EXISTS "${hipSYCL_DIR}") + message(WARNING "Falling back to hipSYCL < 0.9.0 CMake structure") + set(hipSYCL_DIR ${SYCL_COMPILER_DIR}/lib/cmake) + endif () + if (NOT EXISTS "${hipSYCL_DIR}") + message(FATAL_ERROR "Can't find the appropriate CMake definitions for hipSYCL") + endif () + + # register_definitions(_GLIBCXX_USE_CXX11_ABI=0) + find_package(hipSYCL CONFIG REQUIRED) + message(STATUS "ok") + + elseif (${SYCL_COMPILER} STREQUAL "COMPUTECPP") + + list(APPEND CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/cmake/Modules) + set(ComputeCpp_DIR ${SYCL_COMPILER_DIR}) + + # don't point to the CL dir as the imports already have the CL prefix + set(OpenCL_INCLUDE_DIR "${CMAKE_SOURCE_DIR}") + + register_definitions(CL_TARGET_OPENCL_VERSION=220 _GLIBCXX_USE_CXX11_ABI=0) + # ComputeCpp needs OpenCL + find_package(ComputeCpp REQUIRED) + + # this must come after FindComputeCpp (!) + set(COMPUTECPP_USER_FLAGS -O3 -no-serial-memop) + + elseif (${SYCL_COMPILER} STREQUAL "DPCPP") + set(CMAKE_CXX_COMPILER ${SYCL_COMPILER_DIR}/bin/clang++) + include_directories(${SYCL_COMPILER_DIR}/include/sycl) + register_append_cxx_flags(ANY -fsycl) + register_append_link_flags(-fsycl) + elseif (${SYCL_COMPILER} STREQUAL "ONEAPI-ICPX") + set(CMAKE_CXX_COMPILER icpx) + set(CMAKE_C_COMPILER icx) + register_append_cxx_flags(ANY -fsycl) + register_append_link_flags(-fsycl) + elseif (${SYCL_COMPILER} STREQUAL "ONEAPI-Clang") + set(CMAKE_CXX_COMPILER clang++) + set(CMAKE_C_COMPILER clang) + register_append_cxx_flags(ANY -fsycl) + register_append_link_flags(-fsycl) + else () + message(FATAL_ERROR "SYCL_COMPILER=${SYCL_COMPILER} is unsupported") + endif () + +endmacro() + + +macro(setup_target NAME) + if ( + (${SYCL_COMPILER} STREQUAL "COMPUTECPP") OR + (${SYCL_COMPILER} STREQUAL "HIPSYCL")) + # so ComputeCpp and hipSYCL has this weird (and bad) CMake usage where they append their + # own custom integration header flags AFTER the target has been specified + # hence this macro here + add_sycl_to_target( + TARGET ${NAME} + SOURCES ${IMPL_SOURCES}) + endif () +endmacro() diff --git a/src/sycl2020-usm/SYCLStream2020.cpp b/src/sycl2020-usm/SYCLStream2020.cpp new file mode 100644 index 0000000..21a8a47 --- /dev/null +++ b/src/sycl2020-usm/SYCLStream2020.cpp @@ -0,0 +1,269 @@ + +// Copyright (c) 2015-23 Tom Deakin, Simon McIntosh-Smith, and Tom Lin +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#include "SYCLStream2020.h" + +#include + +// Cache list of devices +bool cached = false; +std::vector devices; +void getDeviceList(void); + +template +SYCLStream::SYCLStream(const size_t ARRAY_SIZE, const int device_index) +: array_size {ARRAY_SIZE} +{ + if (!cached) + getDeviceList(); + + if (device_index >= devices.size()) + throw std::runtime_error("Invalid device index"); + + sycl::device dev = devices[device_index]; + + // Print out device information + std::cout << "Using SYCL device " << getDeviceName(device_index) << std::endl; + std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl; + + // Check device can support FP64 if needed + if (sizeof(T) == sizeof(double)) + { + if (!dev.has(sycl::aspect::fp64)) + { + throw std::runtime_error("Device does not support double precision, please use --float"); + } + } + + queue = std::make_unique(dev, sycl::async_handler{[&](sycl::exception_list l) + { + bool error = false; + for(auto e: l) + { + try + { + std::rethrow_exception(e); + } + catch (sycl::exception e) + { + std::cout << e.what(); + error = true; + } + } + if(error) + { + throw std::runtime_error("SYCL errors detected"); + } + }}); + + a = sycl::malloc_shared(array_size, *queue); + b = sycl::malloc_shared(array_size, *queue); + c = sycl::malloc_shared(array_size, *queue); + sum = sycl::malloc_shared(1, *queue); + + // No longer need list of devices + devices.clear(); + cached = true; + + +} + +template +SYCLStream::~SYCLStream() { + sycl::free(a, *queue); + sycl::free(b, *queue); + sycl::free(c, *queue); + sycl::free(sum, *queue); +} + +template +void SYCLStream::copy() +{ + queue->submit([&](sycl::handler &cgh) + { + cgh.parallel_for(sycl::range<1>{array_size}, [=, c = this->c, a = this->a](sycl::id<1> idx) + { + c[idx] = a[idx]; + }); + }); + queue->wait(); +} + +template +void SYCLStream::mul() +{ + const T scalar = startScalar; + queue->submit([&](sycl::handler &cgh) + { + cgh.parallel_for(sycl::range<1>{array_size}, [=, b = this->b, c = this->c](sycl::id<1> idx) + { + b[idx] = scalar * c[idx]; + }); + }); + queue->wait(); +} + +template +void SYCLStream::add() +{ + queue->submit([&](sycl::handler &cgh) + { + cgh.parallel_for(sycl::range<1>{array_size}, [=, c = this->c, a = this->a, b = this->b](sycl::id<1> idx) + { + c[idx] = a[idx] + b[idx]; + }); + }); + queue->wait(); +} + +template +void SYCLStream::triad() +{ + const T scalar = startScalar; + queue->submit([&](sycl::handler &cgh) + { + cgh.parallel_for(sycl::range<1>{array_size}, [=, a = this->a, b = this->b, c = this->c](sycl::id<1> idx) + { + a[idx] = b[idx] + scalar * c[idx]; + }); + }); + queue->wait(); +} + +template +void SYCLStream::nstream() +{ + const T scalar = startScalar; + + queue->submit([&](sycl::handler &cgh) + { + cgh.parallel_for(sycl::range<1>{array_size}, [=, a = this->a, b = this->b, c = this->c](sycl::id<1> idx) + { + a[idx] += b[idx] + scalar * c[idx]; + }); + }); + queue->wait(); +} + +template +T SYCLStream::dot() +{ + queue->submit([&](sycl::handler &cgh) + { + cgh.parallel_for(sycl::range<1>{array_size}, + // Reduction object, to perform summation - initialises the result to zero + // hipSYCL doesn't sypport the initialize_to_identity property yet +#if defined(__HIPSYCL__) || defined(__OPENSYCL__) + sycl::reduction(sum, sycl::plus()), +#else + sycl::reduction(sum, sycl::plus(), sycl::property::reduction::initialize_to_identity{}), +#endif + [a = this->a, b = this->b](sycl::id<1> idx, auto& sum) + { + sum += a[idx] * b[idx]; + }); + + }); + queue->wait(); + return *sum; +} + +template +void SYCLStream::init_arrays(T initA, T initB, T initC) +{ + queue->submit([&](sycl::handler &cgh) + { + cgh.parallel_for(sycl::range<1>{array_size}, [=, a = this->a, b = this->b, c = this->c](sycl::id<1> idx) + { + a[idx] = initA; + b[idx] = initB; + c[idx] = initC; + }); + }); + + queue->wait(); +} + +template +void SYCLStream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) +{ + for (int i = 0; i < array_size; i++) + { + h_a[i] = a[i]; + h_b[i] = b[i]; + h_c[i] = c[i]; + } +} + +void getDeviceList(void) +{ + // Ask SYCL runtime for all devices in system + devices = sycl::device::get_devices(); + cached = true; +} + +void listDevices(void) +{ + getDeviceList(); + + // Print device names + if (devices.size() == 0) + { + std::cerr << "No devices found." << std::endl; + } + else + { + std::cout << std::endl; + std::cout << "Devices:" << std::endl; + for (int i = 0; i < devices.size(); i++) + { + std::cout << i << ": " << getDeviceName(i) << std::endl; + } + std::cout << std::endl; + } +} + +std::string getDeviceName(const int device) +{ + if (!cached) + getDeviceList(); + + std::string name; + + if (device < devices.size()) + { + name = devices[device].get_info(); + } + else + { + throw std::runtime_error("Error asking for name for non-existant device"); + } + + return name; +} + +std::string getDeviceDriver(const int device) +{ + if (!cached) + getDeviceList(); + + std::string driver; + + if (device < devices.size()) + { + driver = devices[device].get_info(); + } + else + { + throw std::runtime_error("Error asking for driver for non-existant device"); + } + + return driver; +} + +template class SYCLStream; +template class SYCLStream; diff --git a/src/sycl2020-usm/SYCLStream2020.h b/src/sycl2020-usm/SYCLStream2020.h new file mode 100644 index 0000000..0b2dc0d --- /dev/null +++ b/src/sycl2020-usm/SYCLStream2020.h @@ -0,0 +1,54 @@ + +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#pragma once + +#include +#include + +#include "Stream.h" + +#include + +#define IMPLEMENTATION_STRING "SYCL2020 USM" + +template +class SYCLStream : public Stream +{ + protected: + // Size of arrays + size_t array_size; + + // SYCL objects + // Queue is a pointer because we allow device selection + std::unique_ptr queue; + + // Buffers + T *a{}; + T *b{}; + T *c{}; + T *sum{}; + + public: + + SYCLStream(const size_t, const int); + ~SYCLStream(); + + virtual void copy() override; + virtual void add() override; + virtual void mul() override; + virtual void triad() override; + virtual void nstream() override; + virtual T dot() override; + + virtual void init_arrays(T initA, T initB, T initC) override; + virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; + +}; + +// Populate the devices list +void getDeviceList(void); diff --git a/src/sycl2020/model.cmake b/src/sycl2020-usm/model.cmake similarity index 96% rename from src/sycl2020/model.cmake rename to src/sycl2020-usm/model.cmake index 6a517c1..81ad9d7 100644 --- a/src/sycl2020/model.cmake +++ b/src/sycl2020-usm/model.cmake @@ -19,9 +19,6 @@ register_flag_optional(SYCL_COMPILER_DIR HIPSYCL|DPCPP|COMPUTECPP - set to the root of the binary distribution that contains at least `bin/`, `include/`, and `lib/`" "") -register_flag_optional(OpenCL_LIBRARY - "[ComputeCpp only] Path to OpenCL library, usually called libOpenCL.so" - "${OpenCL_LIBRARY}") macro(setup) set(CMAKE_CXX_STANDARD 17)