Add oneDPL for std models

This commit is contained in:
Tom Lin 2022-07-28 15:03:26 +01:00
parent 5197a4e561
commit dfb4eb06b2
13 changed files with 269 additions and 65 deletions

View File

@ -76,13 +76,15 @@ option(USE_TBB "Enable oneTBB library for *supported* models. Enabling this on m
model on how this is used." OFF)
if (USE_TBB)
include(FetchContent)
FetchContent_Declare(
TBB
GIT_REPOSITORY https://github.com/oneapi-src/oneTBB.git
GIT_TAG faaf43c4ab22cb4b4267d65d5e218fa58800eea8
)
# Not using FetchContent_MakeAvailable because we need EXCLUDE_FROM_ALL
# Don't fail builds on waring (TBB has -Wall while not being free of warnings from unused symbols...)
set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)
set(TBB_STRICT OFF)
# Not using FetchContent_MakeAvailable (CMake>= 3.14) because we need EXCLUDE_FROM_ALL
FetchContent_GetProperties(TBB)
if (NOT TBB_POPULATED)
FetchContent_Populate(TBB)
@ -90,6 +92,25 @@ if (USE_TBB)
endif ()
endif ()
option(USE_TBB "Enable oneDPL library for *supported* models. Enabling this on models that
don't explicitly link against DPL is a no-op, see description of your selected
model on how this is used." OFF)
if (USE_ONEDPL)
FetchContent_Declare(
oneDPL
GIT_REPOSITORY https://github.com/oneapi-src/oneDPL.git
GIT_TAG oneDPL-2021.7.0-release
)
# Not using FetchContent_MakeAvailable (CMake>= 3.14) because we need EXCLUDE_FROM_ALL
FetchContent_GetProperties(oneDPL)
if (NOT oneDPL_POPULATED)
FetchContent_Populate(oneDPL)
add_subdirectory(${onedpl_SOURCE_DIR} ${onedpl_BINARY_DIR} EXCLUDE_FROM_ALL)
endif ()
endif()
# include our macros
include(cmake/register_models.cmake)
@ -170,6 +191,7 @@ include_directories(src)
add_executable(${EXE_NAME} ${IMPL_SOURCES} src/main.cpp)
target_link_libraries(${EXE_NAME} PUBLIC ${LINK_LIBRARIES})
target_compile_definitions(${EXE_NAME} PUBLIC ${IMPL_DEFINITIONS})
target_include_directories(${EXE_NAME} PUBLIC ${IMPL_DIRECTORIES})
if (CXX_EXTRA_LIBRARIES)
target_link_libraries(${EXE_NAME} PUBLIC ${CXX_EXTRA_LIBRARIES})

View File

@ -71,6 +71,10 @@ macro(register_definitions)
list(APPEND IMPL_DEFINITIONS ${ARGN})
endmacro()
macro(register_directories)
list(APPEND IMPL_DIRECTORIES ${ARGN})
endmacro()
macro(register_flag_required NAME DESCRIPTION)
list(APPEND CUSTOM_FLAGS_TRIPLE "${NAME}" "${DESCRIPTION}" ON "")
endmacro()

27
cmake/shim_onedpl.cmake Normal file
View File

@ -0,0 +1,27 @@
if (USE_ONEDPL)
# # XXX see https://www.intel.com/content/www/us/en/develop/documentation/oneapi-dpcpp-library-guide/top/oneapi-dpc-library-onedpl-overview.html
# # this is to avoid the system TBB headers (if exists) from having precedence which isn't compatible with oneDPL's par implementation
# register_definitions(
# PSTL_USE_PARALLEL_POLICIES=0
# _GLIBCXX_USE_TBB_PAR_BACKEND=0
# )
register_definitions(USE_ONEDPL)
if (USE_ONEDPL STREQUAL "TBB")
register_definitions(ONEDPL_USE_TBB_BACKEND=1)
# TBB will either be linked later (USE_TBB==ON) or via extra libraries, don't do anything here
elseif (USE_ONEDPL STREQUAL "OPENMP")
register_definitions(ONEDPL_USE_OPENMP_BACKEND=1)
# Link OpenMP via CMAKE
find_package(OpenMP REQUIRED)
register_link_library(OpenMP::OpenMP_CXX)
elseif (USE_ONEDPL STREQUAL "SYCL")
register_definitions(ONEDPL_USE_DPCPP_BACKEND=1)
# This needs a SYCL compiler, will fail if CXX doesn't SYCL2020
register_append_cxx_flags(ANY -fsycl-unnamed-lambda -fsycl)
else ()
message(FATAL_ERROR "Unsupported USE_ONEDPL backend: ${USE_ONEDPL}, see USE_ONEDPL flag description for available values.")
endif ()
register_directories(ANY ${onedpl_SOURCE_DIR}/include)
endif ()

75
src/dpl_shim.h Normal file
View File

@ -0,0 +1,75 @@
#pragma once
#include <cstdlib>
#include <cstddef>
#ifndef ALIGNMENT
#define ALIGNMENT (2*1024*1024) // 2MB
#endif
#ifdef USE_ONEDPL
// oneDPL C++17 PSTL
#include <oneapi/dpl/execution>
#include <oneapi/dpl/algorithm>
#include <oneapi/dpl/numeric>
#ifdef ONEDPL_USE_DPCPP_BACKEND
#include <CL/sycl.hpp>
const static auto exe_policy = oneapi::dpl::execution::device_policy<>{
oneapi::dpl::execution::make_device_policy(cl::sycl::default_selector{})
};
template<typename T> using Allocator = sycl::usm_allocator<T, sycl::usm::alloc::shared>;
template<class T>
constexpr Allocator<T> alloc_vec() { return {exe_policy.queue()}; };
template<typename T>
T *alloc_raw(size_t size) { return sycl::malloc_shared<T>(size, exe_policy.queue()); }
template<typename T>
void dealloc_raw(T *ptr) { sycl::free(ptr, exe_policy.queue()); }
#else
// auto exe_policy = dpl::execution::seq;
// auto exe_policy = dpl::execution::par;
static constexpr auto exe_policy = dpl::execution::par_unseq;
#define USE_STD_PTR_ALLOC_DEALLOC
#endif
#else
// Normal C++17 PSTL
#include <algorithm>
#include <execution>
#include <numeric>
// auto exe_policy = std::execution::seq;
// auto exe_policy = std::execution::par;
static constexpr auto exe_policy = std::execution::par_unseq;
#define USE_STD_PTR_ALLOC_DEALLOC
#endif
#ifdef USE_STD_PTR_ALLOC_DEALLOC
template<typename T> using Allocator = std::allocator<T>;
template<class T>
constexpr Allocator<T> alloc_vec() { return {}; };
template<typename T>
T *alloc_raw(size_t size) { return (T *) aligned_alloc(ALIGNMENT, sizeof(T) * size); }
template<typename T>
void dealloc_raw(T *ptr) { free(ptr); }
#endif

View File

@ -6,14 +6,6 @@
#include "STDDataStream.h"
#include <algorithm>
#include <execution>
#include <numeric>
#ifndef ALIGNMENT
#define ALIGNMENT (2*1024*1024) // 2MB
#endif
#ifdef USE_VECTOR
#define BEGIN(x) (x).begin()
#define END(x) (x).end()
@ -22,23 +14,39 @@
#define END(x) ((x) + array_size)
#endif
// There are three execution policies:
// auto exe_policy = std::execution::seq;
// auto exe_policy = std::execution::par;
constexpr auto exe_policy = std::execution::par_unseq;
template <class T>
STDDataStream<T>::STDDataStream(const int ARRAY_SIZE, int device)
noexcept : array_size{ARRAY_SIZE},
#ifdef USE_VECTOR
a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE)
a(ARRAY_SIZE, alloc_vec<T>()), b(ARRAY_SIZE, alloc_vec<T>()), c(ARRAY_SIZE, alloc_vec<T>())
#else
a((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)),
b((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)),
c((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE))
a(alloc_raw<T>(ARRAY_SIZE)), b(alloc_raw<T>(ARRAY_SIZE)), c(alloc_raw<T>(ARRAY_SIZE))
#endif
{ std::cout <<"Backing storage typeid: " << typeid(a).name() << std::endl; }
{
std::cout << "Backing storage typeid: " << typeid(a).name() << std::endl;
#if USE_ONEDPL
std::cout << "Using oneDPL backend: ";
#if defined(ONEDPL_USE_DPCPP_BACKEND)
std::cout << "SYCL USM (device=" << exe_policy.queue().get_device().get_info<sycl::info::device::name>() << ")";
#elif defined(ONEDPL_USE_TBB_BACKEND)
std::cout << "TBB " TBB_VERSION_STRING;
#elif defined(ONEDPL_USE_OPENMP_BACKEND)
std::cout << "OpenMP";
#else
std::cout << "Default";
#endif
std::cout << std::endl;
#endif
}
template<class T>
STDDataStream<T>::~STDDataStream() {
#ifndef USE_VECTOR
dealloc_raw(a);
dealloc_raw(b);
dealloc_raw(c);
#endif
}
template <class T>
void STDDataStream<T>::init_arrays(T initA, T initB, T initC)

View File

@ -5,6 +5,7 @@
// source code
#pragma once
#include "dpl_shim.h"
#include <iostream>
#include <stdexcept>
@ -22,7 +23,7 @@ class STDDataStream : public Stream<T>
// Device side pointers
#ifdef USE_VECTOR
std::vector<T> a, b, c;
std::vector<T, Allocator<T>> a, b, c;
#else
T *a, *b, *c;
#endif
@ -30,7 +31,7 @@ class STDDataStream : public Stream<T>
public:
STDDataStream(const int, int) noexcept;
~STDDataStream() = default;
~STDDataStream();
virtual void copy() override;
virtual void add() override;

View File

@ -27,9 +27,23 @@ register_flag_optional(USE_TBB
"No-op if ONE_TBB_DIR is set. Link against an in-tree oneTBB via FetchContent_Declare, see top level CMakeLists.txt for details."
"OFF")
register_flag_optional(USE_ONEDPL
"Link oneDPL which implements C++17 executor policies (via execution_policy_tag) for different backends.
Possible values are:
OPENMP - Implements policies using OpenMP.
CMake will handle any flags needed to enable OpenMP if the compiler supports it.
TBB - Implements policies using TBB.
TBB must be linked via USE_TBB or be available in LD_LIBRARY_PATH.
SYCL - Implements policies through SYCL2020.
This requires the DPC++ compiler (other SYCL compilers are untested), required SYCL flags are added automatically."
"OFF")
macro(setup)
set(CMAKE_CXX_STANDARD 17)
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/shim_onedpl.cmake)
if (NVHPC_OFFLOAD)
set(NVHPC_FLAGS -stdpar -gpu=${NVHPC_OFFLOAD})
# propagate flags to linker so that it links with the gpu stuff as well

View File

@ -22,22 +22,39 @@
#define END(x) ((x) + array_size)
#endif
// There are three execution policies:
// auto exe_policy = std::execution::seq;
// auto exe_policy = std::execution::par;
constexpr auto exe_policy = std::execution::par_unseq;
template <class T>
STDIndicesStream<T>::STDIndicesStream(const int ARRAY_SIZE, int device)
noexcept : array_size{ARRAY_SIZE}, range(0, array_size),
#ifdef USE_VECTOR
a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE)
a(ARRAY_SIZE, alloc_vec<T>()), b(ARRAY_SIZE, alloc_vec<T>()), c(ARRAY_SIZE, alloc_vec<T>())
#else
a((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)),
b((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)),
c((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE))
a(alloc_raw<T>(ARRAY_SIZE)), b(alloc_raw<T>(ARRAY_SIZE)), c(alloc_raw<T>(ARRAY_SIZE))
#endif
{ std::cout <<"Backing storage typeid: " << typeid(a).name() << std::endl; }
{
std::cout << "Backing storage typeid: " << typeid(a).name() << std::endl;
#if USE_ONEDPL
std::cout << "Using oneDPL backend: ";
#if defined(ONEDPL_USE_DPCPP_BACKEND)
std::cout << "SYCL USM (device=" << exe_policy.queue().get_device().get_info<sycl::info::device::name>() << ")";
#elif defined(ONEDPL_USE_TBB_BACKEND)
std::cout << "TBB " TBB_VERSION_STRING;
#elif defined(ONEDPL_USE_OPENMP_BACKEND)
std::cout << "OpenMP";
#else
std::cout << "Default";
#endif
std::cout << std::endl;
#endif
}
template<class T>
STDIndicesStream<T>::~STDIndicesStream() {
#ifndef USE_VECTOR
dealloc_raw(a);
dealloc_raw(b);
dealloc_raw(c);
#endif
}
template <class T>
void STDIndicesStream<T>::init_arrays(T initA, T initB, T initC)

View File

@ -5,19 +5,14 @@
// source code
#pragma once
#include "dpl_shim.h"
#include <iostream>
#include <stdexcept>
#include "Stream.h"
#ifdef USE_SPAN
#include <span>
#endif
#define IMPLEMENTATION_STRING "STD (index-oriented)"
// A lightweight counting iterator which will be used by the STL algorithms
// NB: C++ <= 17 doesn't have this built-in, and it's only added later in ranges-v3 (C++2a) which this
// implementation doesn't target
@ -78,7 +73,7 @@ class STDIndicesStream : public Stream<T>
// Device side pointers
#ifdef USE_VECTOR
std::vector<T> a, b, c;
std::vector<T, Allocator<T>> a, b, c;
#else
T *a, *b, *c;
#endif
@ -86,7 +81,7 @@ class STDIndicesStream : public Stream<T>
public:
STDIndicesStream(const int, int) noexcept;
~STDIndicesStream() = default;
~STDIndicesStream();
virtual void copy() override;
virtual void add() override;

View File

@ -27,10 +27,21 @@ register_flag_optional(USE_TBB
"Link against an in-tree oneTBB via FetchContent_Declare, see top level CMakeLists.txt for details."
"OFF")
register_flag_optional(USE_ONEDPL
"Link oneDPL which implements C++17 executor policies (via execution_policy_tag) for different backends.
Possible values are:
OPENMP - Implements policies using OpenMP.
CMake will handle any flags needed to enable OpenMP if the compiler supports it.
TBB - Implements policies using TBB.
TBB must be linked via USE_TBB or be available in LD_LIBRARY_PATH.
SYCL - Implements policies through SYCL2020.
This requires the DPC++ compiler (other SYCL compilers are untested), required SYCL flags are added automatically."
"OFF")
macro(setup)
set(CMAKE_CXX_STANDARD 17)
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/shim_onedpl.cmake)
if (NVHPC_OFFLOAD)
set(NVHPC_FLAGS -stdpar -gpu=${NVHPC_OFFLOAD})
# propagate flags to linker so that it links with the gpu stuff as well

View File

@ -22,22 +22,39 @@
#define END(x) ((x) + array_size)
#endif
// There are three execution policies:
// auto exe_policy = std::execution::seq;
// auto exe_policy = std::execution::par;
constexpr auto exe_policy = std::execution::par_unseq;
template <class T>
STDRangesStream<T>::STDRangesStream(const int ARRAY_SIZE, int device)
: array_size{ARRAY_SIZE},
noexcept : array_size{ARRAY_SIZE},
#ifdef USE_VECTOR
a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE)
a(ARRAY_SIZE, alloc_vec<T>()), b(ARRAY_SIZE, alloc_vec<T>()), c(ARRAY_SIZE, alloc_vec<T>())
#else
a((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)),
b((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)),
c((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE))
a(alloc_raw<T>(ARRAY_SIZE)), b(alloc_raw<T>(ARRAY_SIZE)), c(alloc_raw<T>(ARRAY_SIZE))
#endif
{ std::cout <<"Backing storage typeid: " << typeid(a).name() << std::endl; }
{
std::cout << "Backing storage typeid: " << typeid(a).name() << std::endl;
#if USE_ONEDPL
std::cout << "Using oneDPL backend: ";
#if defined(ONEDPL_USE_DPCPP_BACKEND)
std::cout << "SYCL USM (device=" << exe_policy.queue().get_device().get_info<sycl::info::device::name>() << ")";
#elif defined(ONEDPL_USE_TBB_BACKEND)
std::cout << "TBB " TBB_VERSION_STRING;
#elif defined(ONEDPL_USE_OPENMP_BACKEND)
std::cout << "OpenMP";
#else
std::cout << "Default";
#endif
std::cout << std::endl;
#endif
}
template<class T>
STDRangesStream<T>::~STDRangesStream() {
#ifndef USE_VECTOR
dealloc_raw(a);
dealloc_raw(b);
dealloc_raw(c);
#endif
}
template <class T>
void STDRangesStream<T>::init_arrays(T initA, T initB, T initC)

View File

@ -5,10 +5,10 @@
// source code
#pragma once
#include "dpl_shim.h"
#include <iostream>
#include <vector>
#include <stdexcept>
#include "Stream.h"
#define IMPLEMENTATION_STRING "STD C++ ranges"
@ -22,14 +22,14 @@ class STDRangesStream : public Stream<T>
// Device side pointers
#ifdef USE_VECTOR
std::vector<T> a, b, c;
std::vector<T, Allocator<T>> a, b, c;
#else
T *a, *b, *c;
#endif
public:
STDRangesStream(const int, int);
~STDRangesStream() = default;
STDRangesStream(const int, int) noexcept;
~STDRangesStream();
virtual void copy() override;
virtual void add() override;

View File

@ -11,6 +11,18 @@ register_flag_optional(USE_TBB
"No-op if ONE_TBB_DIR is set. Link against an in-tree oneTBB via FetchContent_Declare, see top level CMakeLists.txt for details."
"OFF")
register_flag_optional(USE_ONEDPL
"Link oneDPL which implements C++17 executor policies (via execution_policy_tag) for different backends.
Possible values are:
OPENMP - Implements policies using OpenMP.
CMake will handle any flags needed to enable OpenMP if the compiler supports it.
TBB - Implements policies using TBB.
TBB must be linked via USE_TBB or be available in LD_LIBRARY_PATH.
SYCL - Implements policies through SYCL2020.
This requires the DPC++ compiler (other SYCL compilers are untested), required SYCL flags are added automatically."
"OFF")
macro(setup)
# TODO this needs to eventually be removed when CMake adds proper C++20 support or at least update the flag used here
@ -21,6 +33,7 @@ macro(setup)
unset(CMAKE_CXX_STANDARD) # drop any existing standard we have set by default
# and append our own:
register_append_cxx_flags(ANY -std=c++2a)
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/shim_onedpl.cmake)
if (USE_VECTOR)
register_definitions(USE_VECTOR)
endif ()