Make partitioner a compile option

Inline all abstractions
Add Intel compilers for Make
2021-06-03 13:43:12 +01:00 · 2021-06-03 13:43:12 +01:00 · 0e3727d8f8
commit 0e3727d8f8
parent 0867115d8d
4 changed files with 113 additions and 90 deletions
--- a/TBB.cmake
+++ b/TBB.cmake
@@ -4,15 +4,26 @@ register_flag_optional(ONE_TBB_DIR
         If unspecified, the system TBB (with header `tbb/tbb.h`) will be used via CMake's find_package(TBB)." 
        "")

+
+register_flag_optional(PARTITIONER
+        "Partitioner specifies how a loop template should partition its work among threads.
+         Possible values are:
+            AUTO     - Optimize range subdivision based on work-stealing events.
+            AFFINITY - Proportional splitting that optimizes for cache affinity.
+            STATIC   - Distribute work uniformly with no additional load balancing.
+            SIMPLE   - Recursively split its range until it cannot be further subdivided.
+            See https://spec.oneapi.com/versions/latest/elements/oneTBB/source/algorithms.html#partitioners for more details."
+        "AUTO")
+
 macro(setup)
    if(ONE_TBB_DIR)
        set(TBB_ROOT "${ONE_TBB_DIR}") # see https://github.com/Kitware/VTK/blob/0a31a9a3c1531ae238ac96a372fec4be42282863/CMake/FindTBB.cmake#L34
-        # docs on Intel's website refers to TBB_DIR which hasn't been correct for 6 years
+        # docs on Intel's website refers to TBB_DIR which is not correct
    endif()
    

-    set(CMAKE_CXX_STANDARD 14) # we use auto in lambda parameters for the different partitioners
    # see https://github.com/oneapi-src/oneTBB/blob/master/cmake/README.md#tbbconfig---integration-of-binary-packages
    find_package(TBB REQUIRED)
    register_link_library(TBB::tbb)
+    register_definitions(PARTITIONER_${PARTITIONER})
 endmacro()
--- a/TBB.make
+++ b/TBB.make
@@ -3,24 +3,52 @@ ifndef COMPILER
 define compiler_help
 Set COMPILER to change flags (defaulting to GNU).
 Available compilers are:
-  GNU
+  GNU INTEL INTEL_LEGACY

 endef
 $(info $(compiler_help))
 COMPILER=GNU
 endif

-TBB_LIB=

-COMPILER_GNU = g++
+CXX_GNU          = g++
+CXX_INTEL        = icpx
+CXX_INTEL_LEGACY = icpc
 CXX = $(COMPILER_$(COMPILER))

-FLAGS_GNU = -O3 -std=c++14 -march=native
-CXXFLAGS = $(FLAGS_$(COMPILER))
+CXXFLAGS_GNU          = -march=native
+CXXFLAGS_INTEL        = -march=native
+CXXFLAGS_INTEL_LEGACY = -qopt-streaming-stores=always
+
+CXX = $(CXX_$(COMPILER))
+CXXFLAGS = -std=c++11 -O3 $(CXXFLAGS_$(COMPILER))
+
+
+
+ifndef PARTITIONER
+define partitioner_help
+Set PARTITIONER to select TBB's partitioner.
+Partitioner specifies how a loop template should partition its work among threads.
+
+Available options:
+  AUTO     - Optimize range subdivision based on work-stealing events.
+  AFFINITY - Proportional splitting that optimizes for cache affinity.
+  STATIC   - Distribute work uniformly with no additional load balancing.
+  SIMPLE   - Recursively split its range until it cannot be further subdivided.
+
+See https://spec.oneapi.com/versions/latest/elements/oneTBB/source/algorithms.html#partitioners
+for more details.
+
+endef
+$(info $(partitioner_help))
+PARTITIONER=AUTO
+endif
+
+PARTITIONER_MODE = -DPARTITIONER_$(PARTITIONER)


 tbb-stream: main.cpp TBBStream.cpp
-	$(CXX) -DTBB $(CXXFLAGS) $^ $(EXTRA_FLAGS) -I$(TBB_DIR)/include -Wl,-rpath,$(TBB_DIR)/lib/intel64/gcc4.8 $(TBB_DIR)/lib/intel64/gcc4.8/libtbb.so -o $@
+	$(CXX) -DTBB $(PARTITIONER_MODE) $(CXXFLAGS) $^ $(EXTRA_FLAGS) -I$(TBB_DIR)/include -Wl,-rpath,$(TBB_DIR)/lib/intel64/gcc4.8 $(TBB_DIR)/lib/intel64/gcc4.8/libtbb.so -o $@

 .PHONY: clean
 clean:
--- a/TBBStream.cpp
+++ b/TBBStream.cpp
@@ -8,62 +8,26 @@

 template <class T>
 TBBStream<T>::TBBStream(const int ARRAY_SIZE, int device)
- : partitioner(static_cast<Partitioner>(device)), range(0, ARRAY_SIZE), a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE)
+ : partitioner(), range(0, ARRAY_SIZE), a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE)
 {
-  std::cout << "Using TBB partitioner: " << getDeviceName(device) << std::endl;
+  if(device != 0){
+    throw std::runtime_error("Device != 0 is not supported by TBB");
+  }
+  std::cout << "Using TBB partitioner: " PARTITIONER_NAME << std::endl;
 }

-template <class T>
-template <typename U, typename F>
-U TBBStream<T>::with_partitioner(const F &f) 
-{
-  switch(partitioner){
-    case Partitioner::Auto:      return f(tbb::auto_partitioner{});
-    case Partitioner::Affinity:  { tbb::affinity_partitioner p; return f(p); }  //  parallel_* doesn't take const affinity_partitioner here
-    case Partitioner::Static:    return f(tbb::static_partitioner{});
-    case Partitioner::Simple:    return f(tbb::simple_partitioner{});
-    default:                     throw std::runtime_error("Error asking for name for non-existant device");
-  }
-}
-
-template <class T>
-template <typename F>
-void TBBStream<T>::parallel_for(const F &f) 
-{
-  // using size_t as per the range type (also used in the official documentation)
-  with_partitioner<std::nullptr_t>([&](auto &&p) { 
-    tbb::parallel_for(range, [&](const tbb::blocked_range<size_t>& r) {
-      for (size_t i = r.begin(); i < r.end(); ++i) { 
-        f(i);
-      }
-    }, p);
-    return nullptr; // what we really want here is std::monostate, but we don't want to be C++17 only so nullptr_t it is
-  });
-}
-
-template <class T>
-template <typename F, typename Op>
-T TBBStream<T>::parallel_reduce(T init, const Op &op, const F &f) 
-{
-  return with_partitioner<T>([&](auto &&p) {
-    return tbb::parallel_reduce(range, init, [&](const tbb::blocked_range<size_t>& r, T acc) {
-      for (size_t i = r.begin(); i < r.end(); ++i) { 
-        acc = op(acc, f(i));
-      }
-      return acc;
-    }, op, p);
-  });
-}

 template <class T>
 void TBBStream<T>::init_arrays(T initA, T initB, T initC)
 {

-  parallel_for([&](size_t i){ 
+  tbb::parallel_for(range, [&](const tbb::blocked_range<size_t>& r) {
+    for (size_t i = r.begin(); i < r.end(); ++i) {
      a[i] = initA;
      b[i] = initB;
      c[i] = initC;
-  });
+    }
+  }, partitioner);

 }

@@ -79,7 +43,11 @@ void TBBStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::ve
 template <class T>
 void TBBStream<T>::copy()
 {
-  parallel_for([&](size_t i){ c[i] = a[i]; });
+  tbb::parallel_for(range, [&](const tbb::blocked_range<size_t>& r) {
+    for (size_t i = r.begin(); i < r.end(); ++i) {
+       c[i] = a[i];
+    }
+  }, partitioner);
 }

 template <class T>
@@ -87,7 +55,11 @@ void TBBStream<T>::mul()
 {
  const T scalar = startScalar;

-  parallel_for([&](size_t i){ b[i] = scalar * c[i]; });
+  tbb::parallel_for(range, [&](const tbb::blocked_range<size_t>& r) {
+    for (size_t i = r.begin(); i < r.end(); ++i) {
+       b[i] = scalar * c[i];
+    }
+  }, partitioner);

 }

@@ -95,7 +67,11 @@ template <class T>
 void TBBStream<T>::add()
 {

-  parallel_for([&](size_t i){ c[i] = a[i] + b[i]; });
+  tbb::parallel_for(range, [&](const tbb::blocked_range<size_t>& r) {
+    for (size_t i = r.begin(); i < r.end(); ++i) {
+       c[i] = a[i] + b[i];
+    }
+  }, partitioner);

 }

@@ -104,7 +80,11 @@ void TBBStream<T>::triad()
 {
  const T scalar = startScalar;

-  parallel_for([&](size_t i){ a[i] = b[i] + scalar * c[i]; });
+  tbb::parallel_for(range, [&](const tbb::blocked_range<size_t>& r) {
+    for (size_t i = r.begin(); i < r.end(); ++i) {
+       a[i] = b[i] + scalar * c[i];
+    }
+  }, partitioner);

 }

@@ -113,7 +93,11 @@ void TBBStream<T>::nstream()
 {
  const T scalar = startScalar;

-  parallel_for([&](size_t i){ a[i] += b[i] + scalar * c[i]; });
+  tbb::parallel_for(range, [&](const tbb::blocked_range<size_t>& r) {
+    for (size_t i = r.begin(); i < r.end(); ++i) {
+       a[i] += b[i] + scalar * c[i];
+    }
+  }, partitioner);

 }

@@ -121,29 +105,23 @@ template <class T>
 T TBBStream<T>::dot()
 {
  // sum += a[i] * b[i];
-  return parallel_reduce(0.0, std::plus<T>(), [&](size_t i) { return a[i] * b[i]; });
+  return
+    tbb::parallel_reduce(range, T{}, [&](const tbb::blocked_range<size_t>& r, T acc) {
+      for (size_t i = r.begin(); i < r.end(); ++i) {
+        acc += a[i] * b[i];
+      }
+      return acc;
+    }, std::plus<T>(), partitioner);
 }

 void listDevices(void)
 {
-  std::cout 
-    << "[" << static_cast<int>(Partitioner::Auto) << "] auto partitioner\n" 
-    << "[" << static_cast<int>(Partitioner::Affinity) << "] affinity partitioner\n" 
-    << "[" << static_cast<int>(Partitioner::Static) << "] static partitioner\n" 
-    << "[" << static_cast<int>(Partitioner::Simple) << "] simple partitioner\n" 
-    << "See https://spec.oneapi.com/versions/latest/elements/oneTBB/source/algorithms.html#partitioners for more details" 
-    << std::endl;
+   std::cout << "Listing devices is not supported by TBB" << std::endl;
 }

 std::string getDeviceName(const int device)
 {
-  switch(static_cast<Partitioner>(device)){
-    case Partitioner::Auto:      return "auto_partitioner";
-    case Partitioner::Affinity:  return "affinity_partitioner";
-    case Partitioner::Static:    return "static_partitioner";
-    case Partitioner::Simple:    return "simple_partitioner";
-    default:                     throw std::runtime_error("Error asking for name for non-existant device");
-  }
+  return std::string("Device name unavailable");
 }

 std::string getDeviceDriver(const int)
--- a/TBBStream.hpp
+++ b/TBBStream.hpp
@@ -13,31 +13,37 @@

 #define IMPLEMENTATION_STRING "TBB"

-enum class Partitioner : int { Auto = 0, Affinity, Static, Simple};
+#if defined(PARTITIONER_AUTO)
+using tbb_partitioner = tbb::auto_partitioner;
+#define PARTITIONER_NAME  "auto_partitioner"
+#elif defined(PARTITIONER_AFFINITY)
+using tbb_partitioner = tbb::affinity_partitioner;
+#define PARTITIONER_NAME  "affinity_partitioner"
+#elif defined(PARTITIONER_STATIC)
+using tbb_partitioner = tbb::static_partitioner;
+#define PARTITIONER_NAME  "static_partitioner"
+#elif defined(PARTITIONER_SIMPLE)
+using tbb_partitioner = tbb::simple_partitioner;
+#define PARTITIONER_NAME  "simple_partitioner"
+#else
+// default to auto
+using tbb_partitioner = tbb::auto_partitioner;
+#define PARTITIONER_NAME  "auto_partitioner"
+#endif
+

 template <class T>
 class TBBStream : public Stream<T>
 {
  protected:
  
-
-    Partitioner partitioner;
+    tbb_partitioner partitioner;
    tbb::blocked_range<size_t> range;
    // Device side pointers
    std::vector<T> a;
    std::vector<T> b;
    std::vector<T> c;
 
-
-    template < typename U, typename F>
-    U with_partitioner(const F &f);
- 
-    template <typename F>
-    void parallel_for(const F &f);
-
-    template <typename F, typename Op>
-    T parallel_reduce(T init, const Op &op, const F &f);
-
  public:
    TBBStream(const int, int);
    ~TBBStream() = default;