diff --git a/TBB.cmake b/TBB.cmake index 99e31f7..e4d6bac 100644 --- a/TBB.cmake +++ b/TBB.cmake @@ -4,15 +4,26 @@ register_flag_optional(ONE_TBB_DIR If unspecified, the system TBB (with header `tbb/tbb.h`) will be used via CMake's find_package(TBB)." "") + +register_flag_optional(PARTITIONER + "Partitioner specifies how a loop template should partition its work among threads. + Possible values are: + AUTO - Optimize range subdivision based on work-stealing events. + AFFINITY - Proportional splitting that optimizes for cache affinity. + STATIC - Distribute work uniformly with no additional load balancing. + SIMPLE - Recursively split its range until it cannot be further subdivided. + See https://spec.oneapi.com/versions/latest/elements/oneTBB/source/algorithms.html#partitioners for more details." + "AUTO") + macro(setup) if(ONE_TBB_DIR) set(TBB_ROOT "${ONE_TBB_DIR}") # see https://github.com/Kitware/VTK/blob/0a31a9a3c1531ae238ac96a372fec4be42282863/CMake/FindTBB.cmake#L34 - # docs on Intel's website refers to TBB_DIR which hasn't been correct for 6 years + # docs on Intel's website refers to TBB_DIR which is not correct endif() - set(CMAKE_CXX_STANDARD 14) # we use auto in lambda parameters for the different partitioners # see https://github.com/oneapi-src/oneTBB/blob/master/cmake/README.md#tbbconfig---integration-of-binary-packages find_package(TBB REQUIRED) register_link_library(TBB::tbb) + register_definitions(PARTITIONER_${PARTITIONER}) endmacro() diff --git a/TBB.make b/TBB.make index e3b5c86..c224a5a 100644 --- a/TBB.make +++ b/TBB.make @@ -3,24 +3,52 @@ ifndef COMPILER define compiler_help Set COMPILER to change flags (defaulting to GNU). Available compilers are: - GNU + GNU INTEL INTEL_LEGACY endef $(info $(compiler_help)) COMPILER=GNU endif -TBB_LIB= -COMPILER_GNU = g++ +CXX_GNU = g++ +CXX_INTEL = icpx +CXX_INTEL_LEGACY = icpc CXX = $(COMPILER_$(COMPILER)) -FLAGS_GNU = -O3 -std=c++14 -march=native -CXXFLAGS = $(FLAGS_$(COMPILER)) +CXXFLAGS_GNU = -march=native +CXXFLAGS_INTEL = -march=native +CXXFLAGS_INTEL_LEGACY = -qopt-streaming-stores=always + +CXX = $(CXX_$(COMPILER)) +CXXFLAGS = -std=c++11 -O3 $(CXXFLAGS_$(COMPILER)) + + + +ifndef PARTITIONER +define partitioner_help +Set PARTITIONER to select TBB's partitioner. +Partitioner specifies how a loop template should partition its work among threads. + +Available options: + AUTO - Optimize range subdivision based on work-stealing events. + AFFINITY - Proportional splitting that optimizes for cache affinity. + STATIC - Distribute work uniformly with no additional load balancing. + SIMPLE - Recursively split its range until it cannot be further subdivided. + +See https://spec.oneapi.com/versions/latest/elements/oneTBB/source/algorithms.html#partitioners +for more details. + +endef +$(info $(partitioner_help)) +PARTITIONER=AUTO +endif + +PARTITIONER_MODE = -DPARTITIONER_$(PARTITIONER) tbb-stream: main.cpp TBBStream.cpp - $(CXX) -DTBB $(CXXFLAGS) $^ $(EXTRA_FLAGS) -I$(TBB_DIR)/include -Wl,-rpath,$(TBB_DIR)/lib/intel64/gcc4.8 $(TBB_DIR)/lib/intel64/gcc4.8/libtbb.so -o $@ + $(CXX) -DTBB $(PARTITIONER_MODE) $(CXXFLAGS) $^ $(EXTRA_FLAGS) -I$(TBB_DIR)/include -Wl,-rpath,$(TBB_DIR)/lib/intel64/gcc4.8 $(TBB_DIR)/lib/intel64/gcc4.8/libtbb.so -o $@ .PHONY: clean clean: diff --git a/TBBStream.cpp b/TBBStream.cpp index 08b83c8..9c34a50 100644 --- a/TBBStream.cpp +++ b/TBBStream.cpp @@ -8,62 +8,26 @@ template TBBStream::TBBStream(const int ARRAY_SIZE, int device) - : partitioner(static_cast(device)), range(0, ARRAY_SIZE), a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE) + : partitioner(), range(0, ARRAY_SIZE), a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE) { - std::cout << "Using TBB partitioner: " << getDeviceName(device) << std::endl; -} - -template -template -U TBBStream::with_partitioner(const F &f) -{ - switch(partitioner){ - case Partitioner::Auto: return f(tbb::auto_partitioner{}); - case Partitioner::Affinity: { tbb::affinity_partitioner p; return f(p); } // parallel_* doesn't take const affinity_partitioner here - case Partitioner::Static: return f(tbb::static_partitioner{}); - case Partitioner::Simple: return f(tbb::simple_partitioner{}); - default: throw std::runtime_error("Error asking for name for non-existant device"); + if(device != 0){ + throw std::runtime_error("Device != 0 is not supported by TBB"); } + std::cout << "Using TBB partitioner: " PARTITIONER_NAME << std::endl; } -template -template -void TBBStream::parallel_for(const F &f) -{ - // using size_t as per the range type (also used in the official documentation) - with_partitioner([&](auto &&p) { - tbb::parallel_for(range, [&](const tbb::blocked_range& r) { - for (size_t i = r.begin(); i < r.end(); ++i) { - f(i); - } - }, p); - return nullptr; // what we really want here is std::monostate, but we don't want to be C++17 only so nullptr_t it is - }); -} - -template -template -T TBBStream::parallel_reduce(T init, const Op &op, const F &f) -{ - return with_partitioner([&](auto &&p) { - return tbb::parallel_reduce(range, init, [&](const tbb::blocked_range& r, T acc) { - for (size_t i = r.begin(); i < r.end(); ++i) { - acc = op(acc, f(i)); - } - return acc; - }, op, p); - }); -} template void TBBStream::init_arrays(T initA, T initB, T initC) { - parallel_for([&](size_t i){ - a[i] = initA; - b[i] = initB; - c[i] = initC; - }); + tbb::parallel_for(range, [&](const tbb::blocked_range& r) { + for (size_t i = r.begin(); i < r.end(); ++i) { + a[i] = initA; + b[i] = initB; + c[i] = initC; + } + }, partitioner); } @@ -79,23 +43,35 @@ void TBBStream::read_arrays(std::vector& h_a, std::vector& h_b, std::ve template void TBBStream::copy() { - parallel_for([&](size_t i){ c[i] = a[i]; }); + tbb::parallel_for(range, [&](const tbb::blocked_range& r) { + for (size_t i = r.begin(); i < r.end(); ++i) { + c[i] = a[i]; + } + }, partitioner); } template void TBBStream::mul() { const T scalar = startScalar; - - parallel_for([&](size_t i){ b[i] = scalar * c[i]; }); - + + tbb::parallel_for(range, [&](const tbb::blocked_range& r) { + for (size_t i = r.begin(); i < r.end(); ++i) { + b[i] = scalar * c[i]; + } + }, partitioner); + } template void TBBStream::add() { - parallel_for([&](size_t i){ c[i] = a[i] + b[i]; }); + tbb::parallel_for(range, [&](const tbb::blocked_range& r) { + for (size_t i = r.begin(); i < r.end(); ++i) { + c[i] = a[i] + b[i]; + } + }, partitioner); } @@ -104,7 +80,11 @@ void TBBStream::triad() { const T scalar = startScalar; - parallel_for([&](size_t i){ a[i] = b[i] + scalar * c[i]; }); + tbb::parallel_for(range, [&](const tbb::blocked_range& r) { + for (size_t i = r.begin(); i < r.end(); ++i) { + a[i] = b[i] + scalar * c[i]; + } + }, partitioner); } @@ -113,7 +93,11 @@ void TBBStream::nstream() { const T scalar = startScalar; - parallel_for([&](size_t i){ a[i] += b[i] + scalar * c[i]; }); + tbb::parallel_for(range, [&](const tbb::blocked_range& r) { + for (size_t i = r.begin(); i < r.end(); ++i) { + a[i] += b[i] + scalar * c[i]; + } + }, partitioner); } @@ -121,29 +105,23 @@ template T TBBStream::dot() { // sum += a[i] * b[i]; - return parallel_reduce(0.0, std::plus(), [&](size_t i) { return a[i] * b[i]; }); + return + tbb::parallel_reduce(range, T{}, [&](const tbb::blocked_range& r, T acc) { + for (size_t i = r.begin(); i < r.end(); ++i) { + acc += a[i] * b[i]; + } + return acc; + }, std::plus(), partitioner); } void listDevices(void) { - std::cout - << "[" << static_cast(Partitioner::Auto) << "] auto partitioner\n" - << "[" << static_cast(Partitioner::Affinity) << "] affinity partitioner\n" - << "[" << static_cast(Partitioner::Static) << "] static partitioner\n" - << "[" << static_cast(Partitioner::Simple) << "] simple partitioner\n" - << "See https://spec.oneapi.com/versions/latest/elements/oneTBB/source/algorithms.html#partitioners for more details" - << std::endl; + std::cout << "Listing devices is not supported by TBB" << std::endl; } std::string getDeviceName(const int device) { - switch(static_cast(device)){ - case Partitioner::Auto: return "auto_partitioner"; - case Partitioner::Affinity: return "affinity_partitioner"; - case Partitioner::Static: return "static_partitioner"; - case Partitioner::Simple: return "simple_partitioner"; - default: throw std::runtime_error("Error asking for name for non-existant device"); - } + return std::string("Device name unavailable"); } std::string getDeviceDriver(const int) diff --git a/TBBStream.hpp b/TBBStream.hpp index 6ba9741..90763a9 100644 --- a/TBBStream.hpp +++ b/TBBStream.hpp @@ -13,31 +13,37 @@ #define IMPLEMENTATION_STRING "TBB" -enum class Partitioner : int { Auto = 0, Affinity, Static, Simple}; +#if defined(PARTITIONER_AUTO) +using tbb_partitioner = tbb::auto_partitioner; +#define PARTITIONER_NAME "auto_partitioner" +#elif defined(PARTITIONER_AFFINITY) +using tbb_partitioner = tbb::affinity_partitioner; +#define PARTITIONER_NAME "affinity_partitioner" +#elif defined(PARTITIONER_STATIC) +using tbb_partitioner = tbb::static_partitioner; +#define PARTITIONER_NAME "static_partitioner" +#elif defined(PARTITIONER_SIMPLE) +using tbb_partitioner = tbb::simple_partitioner; +#define PARTITIONER_NAME "simple_partitioner" +#else +// default to auto +using tbb_partitioner = tbb::auto_partitioner; +#define PARTITIONER_NAME "auto_partitioner" +#endif + template class TBBStream : public Stream { protected: - - Partitioner partitioner; + tbb_partitioner partitioner; tbb::blocked_range range; // Device side pointers std::vector a; std::vector b; std::vector c; - - - template < typename U, typename F> - U with_partitioner(const F &f); - template - void parallel_for(const F &f); - - template - T parallel_reduce(T init, const Op &op, const F &f); - public: TBBStream(const int, int); ~TBBStream() = default;