Add oneDPL for std models

2022-07-28 15:03:26 +01:00 · 2022-07-28 15:03:26 +01:00 · dfb4eb06b2
commit dfb4eb06b2
parent 5197a4e561
13 changed files with 269 additions and 65 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -76,13 +76,15 @@ option(USE_TBB "Enable oneTBB library for *supported* models. Enabling this on m
                model on how this is used." OFF)

 if (USE_TBB)
-    include(FetchContent)
    FetchContent_Declare(
            TBB
            GIT_REPOSITORY https://github.com/oneapi-src/oneTBB.git
            GIT_TAG faaf43c4ab22cb4b4267d65d5e218fa58800eea8
    )
-    # Not using FetchContent_MakeAvailable because we need EXCLUDE_FROM_ALL
+    # Don't fail builds on warnings (TBB has -Wall while not being free of warnings from unused symbols...)
+    set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)
+    set(TBB_STRICT OFF)
+    # Not using FetchContent_MakeAvailable (CMake>= 3.14) because we need EXCLUDE_FROM_ALL
    FetchContent_GetProperties(TBB)
    if (NOT TBB_POPULATED)
        FetchContent_Populate(TBB)
@ -90,6 +92,25 @@ if (USE_TBB)
    endif ()
 endif ()

+set(USE_ONEDPL "OFF" CACHE STRING "Enable oneDPL library for *supported* models. Enabling this on models that
+                don't explicitly link against DPL is a no-op, see description of your selected
+                model on how this is used.")  # string-valued (backend name), hence set(... CACHE STRING) instead of option()
+
+if (USE_ONEDPL)  # fetch the oneDPL sources in-tree; the backend itself is selected in cmake/shim_onedpl.cmake
+    FetchContent_Declare(
+            oneDPL
+            GIT_REPOSITORY https://github.com/oneapi-src/oneDPL.git
+            GIT_TAG oneDPL-2021.7.0-release
+    )
+    # Not using FetchContent_MakeAvailable (CMake>= 3.14) because we need EXCLUDE_FROM_ALL
+    FetchContent_GetProperties(oneDPL)
+    if (NOT onedpl_POPULATED)  # FetchContent publishes its result variables under the lower-cased name
+        FetchContent_Populate(oneDPL)
+        add_subdirectory(${onedpl_SOURCE_DIR} ${onedpl_BINARY_DIR} EXCLUDE_FROM_ALL)
+    endif ()
+endif()
+
+
 # include our macros
 include(cmake/register_models.cmake)

@ -170,6 +191,7 @@ include_directories(src)
 add_executable(${EXE_NAME} ${IMPL_SOURCES} src/main.cpp)
 target_link_libraries(${EXE_NAME} PUBLIC ${LINK_LIBRARIES})
 target_compile_definitions(${EXE_NAME} PUBLIC ${IMPL_DEFINITIONS})
+target_include_directories(${EXE_NAME} PUBLIC ${IMPL_DIRECTORIES})

 if (CXX_EXTRA_LIBRARIES)
    target_link_libraries(${EXE_NAME} PUBLIC ${CXX_EXTRA_LIBRARIES})
--- a/cmake/register_models.cmake
+++ b/cmake/register_models.cmake
@ -71,6 +71,10 @@ macro(register_definitions)
    list(APPEND IMPL_DEFINITIONS ${ARGN})
 endmacro()

+macro(register_directories)  # appends every argument, verbatim, to IMPL_DIRECTORIES (no qualifier such as ANY is parsed)
+    list(APPEND IMPL_DIRECTORIES ${ARGN})
+endmacro()
+
 macro(register_flag_required NAME DESCRIPTION)
    list(APPEND CUSTOM_FLAGS_TRIPLE "${NAME}" "${DESCRIPTION}" ON "")
 endmacro()
--- a/cmake/shim_onedpl.cmake
+++ b/cmake/shim_onedpl.cmake
@ -0,0 +1,27 @@
+
+
+if (USE_ONEDPL)  # configure the oneDPL backend named by USE_ONEDPL (TBB, OPENMP or SYCL)
+    #        # XXX see https://www.intel.com/content/www/us/en/develop/documentation/oneapi-dpcpp-library-guide/top/oneapi-dpc-library-onedpl-overview.html
+    #        # this is to avoid the system TBB headers (if exists) from having precedence which isn't compatible with oneDPL's par implementation
+    #        register_definitions(
+    #                PSTL_USE_PARALLEL_POLICIES=0
+    #                _GLIBCXX_USE_TBB_PAR_BACKEND=0
+    #        )
+    register_definitions(USE_ONEDPL)
+    if (USE_ONEDPL STREQUAL "TBB")
+        register_definitions(ONEDPL_USE_TBB_BACKEND=1)
+        # TBB will either be linked later (USE_TBB==ON) or via extra libraries, don't do anything here
+    elseif (USE_ONEDPL STREQUAL "OPENMP")
+        register_definitions(ONEDPL_USE_OPENMP_BACKEND=1)
+        # Link OpenMP via CMAKE
+        find_package(OpenMP REQUIRED)
+        register_link_library(OpenMP::OpenMP_CXX)
+    elseif (USE_ONEDPL STREQUAL "SYCL")
+        register_definitions(ONEDPL_USE_DPCPP_BACKEND=1)
+        # This needs a SYCL compiler, will fail if CXX doesn't support SYCL2020
+        register_append_cxx_flags(ANY -fsycl-unnamed-lambda -fsycl)
+    else ()
+        message(FATAL_ERROR "Unsupported USE_ONEDPL backend: ${USE_ONEDPL}, see USE_ONEDPL flag description for available values.")
+    endif ()
+    register_directories(${onedpl_SOURCE_DIR}/include)  # paths only: this macro appends every argument as an include directory
+endif ()
--- a/src/dpl_shim.h
+++ b/src/dpl_shim.h
@ -0,0 +1,75 @@
+#pragma once
+
+#include <cstdlib>
+#include <cstddef>
+
+#ifndef ALIGNMENT
+#define ALIGNMENT (2*1024*1024) // 2MB
+#endif
+
+#ifdef USE_ONEDPL
+
+// oneDPL C++17 PSTL
+
+#include <oneapi/dpl/execution>
+#include <oneapi/dpl/algorithm>
+#include <oneapi/dpl/numeric>
+
+#ifdef ONEDPL_USE_DPCPP_BACKEND
+// SYCL backend: exe_policy is a oneDPL device policy and the helpers below allocate through its queue
+#include <CL/sycl.hpp>
+// Device policy built from SYCL's default selector
+const static auto exe_policy = oneapi::dpl::execution::device_policy<>{
+        oneapi::dpl::execution::make_device_policy(cl::sycl::default_selector{})
+};
+// Allocator type for vector-backed storage: a SYCL USM allocator using usm::alloc::shared
+template<typename T> using Allocator = sycl::usm_allocator<T, sycl::usm::alloc::shared>;
+// Returns an Allocator<T> constructed from the queue of exe_policy
+template<class T>
+constexpr Allocator<T> alloc_vec() { return {exe_policy.queue()}; };
+// Allocates `size` elements of T via sycl::malloc_shared on the queue of exe_policy
+template<typename T>
+T *alloc_raw(size_t size) { return sycl::malloc_shared<T>(size, exe_policy.queue()); }
+// Frees `ptr` via sycl::free on the queue of exe_policy
+template<typename T>
+void dealloc_raw(T *ptr) { sycl::free(ptr, exe_policy.queue()); }
+
+#else
+// Any other oneDPL backend: use oneDPL's par_unseq policy and request the std allocation helpers
+// auto exe_policy = dpl::execution::seq;
+// auto exe_policy = dpl::execution::par;
+static constexpr auto exe_policy = dpl::execution::par_unseq;
+#define USE_STD_PTR_ALLOC_DEALLOC
+
+#endif
+
+#else
+
+// Normal C++17 PSTL
+
+#include <algorithm>
+#include <execution>
+#include <numeric>
+
+// auto exe_policy = std::execution::seq;
+// auto exe_policy = std::execution::par;
+static constexpr auto exe_policy = std::execution::par_unseq;
+#define USE_STD_PTR_ALLOC_DEALLOC
+
+
+#endif
+
+#ifdef USE_STD_PTR_ALLOC_DEALLOC
+// Host-memory helpers, enabled whenever USE_STD_PTR_ALLOC_DEALLOC was defined above
+template<typename T> using Allocator = std::allocator<T>;
+// Returns a default-constructed std::allocator for vector-backed storage
+template<class T>
+constexpr Allocator<T> alloc_vec() { return {}; }
+// Allocates `size` elements of T; aligned_alloc needs a byte count that is a multiple of the alignment, so round up
+template<typename T>
+T *alloc_raw(size_t size) { const size_t align = (ALIGNMENT); return (T *) aligned_alloc(align, (sizeof(T) * size + align - 1) / align * align); }
+// Releases memory obtained from alloc_raw
+template<typename T>
+void dealloc_raw(T *ptr) { free(ptr); }
+
+#endif
--- a/src/std-data/STDDataStream.cpp
+++ b/src/std-data/STDDataStream.cpp
@ -6,14 +6,6 @@

 #include "STDDataStream.h"

-#include <algorithm>
-#include <execution>
-#include <numeric>
-
-#ifndef ALIGNMENT
-#define ALIGNMENT (2*1024*1024) // 2MB
-#endif
-
 #ifdef USE_VECTOR
 #define BEGIN(x) (x).begin()
 #define END(x) (x).end()
@ -22,23 +14,39 @@
 #define END(x) ((x) + array_size)
 #endif

-// There are three execution policies:
-// auto exe_policy = std::execution::seq;
-// auto exe_policy = std::execution::par;
-constexpr auto exe_policy = std::execution::par_unseq;
-
-
 template <class T>
 STDDataStream<T>::STDDataStream(const int ARRAY_SIZE, int device)
  noexcept : array_size{ARRAY_SIZE},
 #ifdef USE_VECTOR
-  a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE)
+  a(ARRAY_SIZE, alloc_vec<T>()), b(ARRAY_SIZE, alloc_vec<T>()), c(ARRAY_SIZE, alloc_vec<T>())
 #else
-  a((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)),
-  b((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)),
-  c((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE))
+  a(alloc_raw<T>(ARRAY_SIZE)), b(alloc_raw<T>(ARRAY_SIZE)), c(alloc_raw<T>(ARRAY_SIZE))
 #endif
-{ std::cout <<"Backing storage typeid: " << typeid(a).name() << std::endl; }
+{
+    std::cout << "Backing storage typeid: " << typeid(a).name() << std::endl;
+#if USE_ONEDPL
+    std::cout << "Using oneDPL backend: ";
+#if defined(ONEDPL_USE_DPCPP_BACKEND)
+    std::cout << "SYCL USM (device=" << exe_policy.queue().get_device().get_info<sycl::info::device::name>() << ")";
+#elif defined(ONEDPL_USE_TBB_BACKEND)
+    std::cout << "TBB " TBB_VERSION_STRING;
+#elif defined(ONEDPL_USE_OPENMP_BACKEND)
+    std::cout << "OpenMP";
+#else
+    std::cout << "Default";
+#endif
+    std::cout << std::endl;
+#endif
+}
+
+template<class T>
+STDDataStream<T>::~STDDataStream() { // frees the raw a/b/c buffers; nothing to do here when USE_VECTOR is defined
+#ifndef USE_VECTOR
+    dealloc_raw(a); // counterpart of alloc_raw<T>() used in the constructor
+    dealloc_raw(b);
+    dealloc_raw(c);
+#endif
+}

 template <class T>
 void STDDataStream<T>::init_arrays(T initA, T initB, T initC)
--- a/src/std-data/STDDataStream.h
+++ b/src/std-data/STDDataStream.h
@ -5,6 +5,7 @@
 // source code

 #pragma once
+#include "dpl_shim.h"

 #include <iostream>
 #include <stdexcept>
@ -22,7 +23,7 @@ class STDDataStream : public Stream<T>

    // Device side pointers
 #ifdef USE_VECTOR
-    std::vector<T> a, b, c;
+    std::vector<T, Allocator<T>> a, b, c;
 #else
    T *a, *b, *c;
 #endif
@ -30,7 +31,7 @@ class STDDataStream : public Stream<T>

  public:
    STDDataStream(const int, int) noexcept;
-    ~STDDataStream() = default;
+    ~STDDataStream();

    virtual void copy() override;
    virtual void add() override;
--- a/src/std-data/model.cmake
+++ b/src/std-data/model.cmake
@ -27,9 +27,23 @@ register_flag_optional(USE_TBB
        "No-op if ONE_TBB_DIR is set. Link against an in-tree oneTBB via FetchContent_Declare, see top level CMakeLists.txt for details."
        "OFF")

+register_flag_optional(USE_ONEDPL
+        "Link oneDPL which implements C++17 executor policies (via execution_policy_tag) for different backends.
+
+        Possible values are:
+          OPENMP - Implements policies using OpenMP.
+                   CMake will handle any flags needed to enable OpenMP if the compiler supports it.
+          TBB    - Implements policies using TBB.
+                   TBB must be linked via USE_TBB or be available in LD_LIBRARY_PATH.
+          SYCL   - Implements policies through SYCL2020.
+                   This requires the DPC++ compiler (other SYCL compilers are untested), required SYCL flags are added automatically."
+        "OFF")
+
 macro(setup)
    set(CMAKE_CXX_STANDARD 17)

+    include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/shim_onedpl.cmake)
+
    if (NVHPC_OFFLOAD)
        set(NVHPC_FLAGS -stdpar -gpu=${NVHPC_OFFLOAD})
        # propagate flags to linker so that it links with the gpu stuff as well
--- a/src/std-indices/STDIndicesStream.cpp
+++ b/src/std-indices/STDIndicesStream.cpp
@ -22,22 +22,39 @@
 #define END(x) ((x) + array_size)
 #endif

-// There are three execution policies:
-// auto exe_policy = std::execution::seq;
-// auto exe_policy = std::execution::par;
-constexpr auto exe_policy = std::execution::par_unseq;
-
 template <class T>
 STDIndicesStream<T>::STDIndicesStream(const int ARRAY_SIZE, int device)
 noexcept : array_size{ARRAY_SIZE}, range(0, array_size),
 #ifdef USE_VECTOR
-  a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE)
+  a(ARRAY_SIZE, alloc_vec<T>()), b(ARRAY_SIZE, alloc_vec<T>()), c(ARRAY_SIZE, alloc_vec<T>())
 #else
-  a((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)),
-  b((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)),
-  c((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE))
+  a(alloc_raw<T>(ARRAY_SIZE)), b(alloc_raw<T>(ARRAY_SIZE)), c(alloc_raw<T>(ARRAY_SIZE))
 #endif
-{ std::cout <<"Backing storage typeid: " << typeid(a).name() << std::endl; }
+{
+    std::cout << "Backing storage typeid: " << typeid(a).name() << std::endl;
+#if USE_ONEDPL
+    std::cout << "Using oneDPL backend: ";
+#if defined(ONEDPL_USE_DPCPP_BACKEND)
+    std::cout << "SYCL USM (device=" << exe_policy.queue().get_device().get_info<sycl::info::device::name>() << ")";
+#elif defined(ONEDPL_USE_TBB_BACKEND)
+    std::cout << "TBB " TBB_VERSION_STRING;
+#elif defined(ONEDPL_USE_OPENMP_BACKEND)
+    std::cout << "OpenMP";
+#else
+    std::cout << "Default";
+#endif
+    std::cout << std::endl;
+#endif
+}
+
+template<class T>
+STDIndicesStream<T>::~STDIndicesStream() { // frees the raw a/b/c buffers; nothing to do here when USE_VECTOR is defined
+#ifndef USE_VECTOR
+    dealloc_raw(a); // counterpart of alloc_raw<T>() used in the constructor
+    dealloc_raw(b);
+    dealloc_raw(c);
+#endif
+}

 template <class T>
 void STDIndicesStream<T>::init_arrays(T initA, T initB, T initC)
--- a/src/std-indices/STDIndicesStream.h
+++ b/src/std-indices/STDIndicesStream.h
@ -5,19 +5,14 @@
 // source code

 #pragma once
+#include "dpl_shim.h"

 #include <iostream>
 #include <stdexcept>
 #include "Stream.h"

-#ifdef USE_SPAN
-#include <span>
-#endif
-
-
 #define IMPLEMENTATION_STRING "STD (index-oriented)"

-
 // A lightweight counting iterator which will be used by the STL algorithms
 // NB: C++ <= 17 doesn't have this built-in, and it's only added later in ranges-v3 (C++2a) which this
 // implementation doesn't target
@ -78,7 +73,7 @@ class STDIndicesStream : public Stream<T>

    // Device side pointers
 #ifdef USE_VECTOR
-    std::vector<T> a, b, c;
+    std::vector<T, Allocator<T>> a, b, c;
 #else
    T *a, *b, *c;
 #endif
@ -86,7 +81,7 @@ class STDIndicesStream : public Stream<T>

  public:
    STDIndicesStream(const int, int) noexcept;
-    ~STDIndicesStream() = default;
+    ~STDIndicesStream();

    virtual void copy() override;
    virtual void add() override;
--- a/src/std-indices/model.cmake
+++ b/src/std-indices/model.cmake
@ -27,10 +27,21 @@ register_flag_optional(USE_TBB
        "Link against an in-tree oneTBB via FetchContent_Declare, see top level CMakeLists.txt for details."
        "OFF")

+register_flag_optional(USE_ONEDPL
+        "Link oneDPL which implements C++17 executor policies (via execution_policy_tag) for different backends.
+
+        Possible values are:
+          OPENMP - Implements policies using OpenMP.
+                   CMake will handle any flags needed to enable OpenMP if the compiler supports it.
+          TBB    - Implements policies using TBB.
+                   TBB must be linked via USE_TBB or be available in LD_LIBRARY_PATH.
+          SYCL   - Implements policies through SYCL2020.
+                   This requires the DPC++ compiler (other SYCL compilers are untested), required SYCL flags are added automatically."
+        "OFF")

 macro(setup)
    set(CMAKE_CXX_STANDARD 17)
-
+    include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/shim_onedpl.cmake)
    if (NVHPC_OFFLOAD)
        set(NVHPC_FLAGS -stdpar -gpu=${NVHPC_OFFLOAD})
        # propagate flags to linker so that it links with the gpu stuff as well
--- a/src/std-ranges/STDRangesStream.cpp
+++ b/src/std-ranges/STDRangesStream.cpp
@ -22,22 +22,39 @@
 #define END(x) ((x) + array_size)
 #endif

-// There are three execution policies:
-// auto exe_policy = std::execution::seq;
-// auto exe_policy = std::execution::par;
-constexpr auto exe_policy = std::execution::par_unseq;
-
 template <class T>
 STDRangesStream<T>::STDRangesStream(const int ARRAY_SIZE, int device)
- : array_size{ARRAY_SIZE},
+noexcept : array_size{ARRAY_SIZE},
 #ifdef USE_VECTOR
-  a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE)
+  a(ARRAY_SIZE, alloc_vec<T>()), b(ARRAY_SIZE, alloc_vec<T>()), c(ARRAY_SIZE, alloc_vec<T>())
 #else
-  a((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)),
-  b((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)),
-  c((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE))
+  a(alloc_raw<T>(ARRAY_SIZE)), b(alloc_raw<T>(ARRAY_SIZE)), c(alloc_raw<T>(ARRAY_SIZE))
 #endif
-{ std::cout <<"Backing storage typeid: " << typeid(a).name() << std::endl; }
+{
+    std::cout << "Backing storage typeid: " << typeid(a).name() << std::endl;
+#if USE_ONEDPL
+    std::cout << "Using oneDPL backend: ";
+#if defined(ONEDPL_USE_DPCPP_BACKEND)
+    std::cout << "SYCL USM (device=" << exe_policy.queue().get_device().get_info<sycl::info::device::name>() << ")";
+#elif defined(ONEDPL_USE_TBB_BACKEND)
+    std::cout << "TBB " TBB_VERSION_STRING;
+#elif defined(ONEDPL_USE_OPENMP_BACKEND)
+    std::cout << "OpenMP";
+#else
+    std::cout << "Default";
+#endif
+    std::cout << std::endl;
+#endif
+}
+
+template<class T>
+STDRangesStream<T>::~STDRangesStream() { // frees the raw a/b/c buffers; nothing to do here when USE_VECTOR is defined
+#ifndef USE_VECTOR
+    dealloc_raw(a); // counterpart of alloc_raw<T>() used in the constructor
+    dealloc_raw(b);
+    dealloc_raw(c);
+#endif
+}

 template <class T>
 void STDRangesStream<T>::init_arrays(T initA, T initB, T initC)
--- a/src/std-ranges/STDRangesStream.hpp
+++ b/src/std-ranges/STDRangesStream.hpp
@ -5,10 +5,10 @@
 // source code

 #pragma once
+#include "dpl_shim.h"

 #include <iostream>
-#include <vector>
-
+#include <stdexcept>
 #include "Stream.h"

 #define IMPLEMENTATION_STRING "STD C++ ranges"
@ -22,14 +22,14 @@ class STDRangesStream : public Stream<T>

    // Device side pointers
 #ifdef USE_VECTOR
-    std::vector<T> a, b, c;
+    std::vector<T, Allocator<T>> a, b, c;
 #else
    T *a, *b, *c;
 #endif

  public:
-    STDRangesStream(const int, int);
-    ~STDRangesStream() = default;
+    STDRangesStream(const int, int) noexcept;
+    ~STDRangesStream();

    virtual void copy() override;
    virtual void add() override;
--- a/src/std-ranges/model.cmake
+++ b/src/std-ranges/model.cmake
@ -11,6 +11,18 @@ register_flag_optional(USE_TBB
        "No-op if ONE_TBB_DIR is set. Link against an in-tree oneTBB via FetchContent_Declare, see top level CMakeLists.txt for details."
        "OFF")

+register_flag_optional(USE_ONEDPL
+        "Link oneDPL which implements C++17 executor policies (via execution_policy_tag) for different backends.
+
+        Possible values are:
+          OPENMP - Implements policies using OpenMP.
+                   CMake will handle any flags needed to enable OpenMP if the compiler supports it.
+          TBB    - Implements policies using TBB.
+                   TBB must be linked via USE_TBB or be available in LD_LIBRARY_PATH.
+          SYCL   - Implements policies through SYCL2020.
+                   This requires the DPC++ compiler (other SYCL compilers are untested), required SYCL flags are added automatically."
+        "OFF")
+
 macro(setup)

    # TODO this needs to eventually be removed when CMake adds proper C++20 support or at least update the flag used here
@ -21,6 +33,7 @@ macro(setup)
    unset(CMAKE_CXX_STANDARD) # drop any existing standard we have set by default
    # and append our own:
    register_append_cxx_flags(ANY -std=c++2a)
+    include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/shim_onedpl.cmake)
    if (USE_VECTOR)
        register_definitions(USE_VECTOR)
    endif ()