Make partitioner a compile option
Inline all abstractions
Add Intel compilers for Make
This commit is contained in:
parent
0867115d8d
commit
0e3727d8f8
15
TBB.cmake
15
TBB.cmake
@ -4,15 +4,26 @@ register_flag_optional(ONE_TBB_DIR
|
||||
If unspecified, the system TBB (with header `tbb/tbb.h`) will be used via CMake's find_package(TBB)."
|
||||
"")
|
||||
|
||||
|
||||
register_flag_optional(PARTITIONER
|
||||
"Partitioner specifies how a loop template should partition its work among threads.
|
||||
Possible values are:
|
||||
AUTO - Optimize range subdivision based on work-stealing events.
|
||||
AFFINITY - Proportional splitting that optimizes for cache affinity.
|
||||
STATIC - Distribute work uniformly with no additional load balancing.
|
||||
SIMPLE - Recursively split its range until it cannot be further subdivided.
|
||||
See https://spec.oneapi.com/versions/latest/elements/oneTBB/source/algorithms.html#partitioners for more details."
|
||||
"AUTO")
|
||||
|
||||
macro(setup)
|
||||
if(ONE_TBB_DIR)
|
||||
set(TBB_ROOT "${ONE_TBB_DIR}") # see https://github.com/Kitware/VTK/blob/0a31a9a3c1531ae238ac96a372fec4be42282863/CMake/FindTBB.cmake#L34
|
||||
# docs on Intel's website refers to TBB_DIR which hasn't been correct for 6 years
|
||||
# docs on Intel's website refer to TBB_DIR, which is not correct
|
||||
endif()
|
||||
|
||||
|
||||
set(CMAKE_CXX_STANDARD 14) # we use auto in lambda parameters for the different partitioners
|
||||
# see https://github.com/oneapi-src/oneTBB/blob/master/cmake/README.md#tbbconfig---integration-of-binary-packages
|
||||
find_package(TBB REQUIRED)
|
||||
register_link_library(TBB::tbb)
|
||||
register_definitions(PARTITIONER_${PARTITIONER})
|
||||
endmacro()
|
||||
|
||||
40
TBB.make
40
TBB.make
@ -3,24 +3,52 @@ ifndef COMPILER
|
||||
define compiler_help
|
||||
Set COMPILER to change flags (defaulting to GNU).
|
||||
Available compilers are:
|
||||
GNU
|
||||
GNU INTEL INTEL_LEGACY
|
||||
|
||||
endef
|
||||
$(info $(compiler_help))
|
||||
COMPILER=GNU
|
||||
endif
|
||||
|
||||
TBB_LIB=
|
||||
|
||||
COMPILER_GNU = g++
|
||||
CXX_GNU = g++
|
||||
CXX_INTEL = icpx
|
||||
CXX_INTEL_LEGACY = icpc
|
||||
CXX = $(COMPILER_$(COMPILER))
|
||||
|
||||
FLAGS_GNU = -O3 -std=c++14 -march=native
|
||||
CXXFLAGS = $(FLAGS_$(COMPILER))
|
||||
CXXFLAGS_GNU = -march=native
|
||||
CXXFLAGS_INTEL = -march=native
|
||||
CXXFLAGS_INTEL_LEGACY = -qopt-streaming-stores=always
|
||||
|
||||
CXX = $(CXX_$(COMPILER))
|
||||
CXXFLAGS = -std=c++11 -O3 $(CXXFLAGS_$(COMPILER))
|
||||
|
||||
|
||||
|
||||
ifndef PARTITIONER
|
||||
define partitioner_help
|
||||
Set PARTITIONER to select TBB's partitioner.
|
||||
Partitioner specifies how a loop template should partition its work among threads.
|
||||
|
||||
Available options:
|
||||
AUTO - Optimize range subdivision based on work-stealing events.
|
||||
AFFINITY - Proportional splitting that optimizes for cache affinity.
|
||||
STATIC - Distribute work uniformly with no additional load balancing.
|
||||
SIMPLE - Recursively split its range until it cannot be further subdivided.
|
||||
|
||||
See https://spec.oneapi.com/versions/latest/elements/oneTBB/source/algorithms.html#partitioners
|
||||
for more details.
|
||||
|
||||
endef
|
||||
$(info $(partitioner_help))
|
||||
PARTITIONER=AUTO
|
||||
endif
|
||||
|
||||
PARTITIONER_MODE = -DPARTITIONER_$(PARTITIONER)
|
||||
|
||||
|
||||
tbb-stream: main.cpp TBBStream.cpp
|
||||
$(CXX) -DTBB $(CXXFLAGS) $^ $(EXTRA_FLAGS) -I$(TBB_DIR)/include -Wl,-rpath,$(TBB_DIR)/lib/intel64/gcc4.8 $(TBB_DIR)/lib/intel64/gcc4.8/libtbb.so -o $@
|
||||
$(CXX) -DTBB $(PARTITIONER_MODE) $(CXXFLAGS) $^ $(EXTRA_FLAGS) -I$(TBB_DIR)/include -Wl,-rpath,$(TBB_DIR)/lib/intel64/gcc4.8 $(TBB_DIR)/lib/intel64/gcc4.8/libtbb.so -o $@
|
||||
|
||||
.PHONY: clean
|
||||
clean:
|
||||
|
||||
108
TBBStream.cpp
108
TBBStream.cpp
@ -8,62 +8,26 @@
|
||||
|
||||
template <class T>
|
||||
TBBStream<T>::TBBStream(const int ARRAY_SIZE, int device)
|
||||
: partitioner(static_cast<Partitioner>(device)), range(0, ARRAY_SIZE), a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE)
|
||||
: partitioner(), range(0, ARRAY_SIZE), a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE)
|
||||
{
|
||||
std::cout << "Using TBB partitioner: " << getDeviceName(device) << std::endl;
|
||||
if(device != 0){
|
||||
throw std::runtime_error("Device != 0 is not supported by TBB");
|
||||
}
|
||||
std::cout << "Using TBB partitioner: " PARTITIONER_NAME << std::endl;
|
||||
}
|
||||
|
||||
template <class T>
|
||||
template <typename U, typename F>
|
||||
U TBBStream<T>::with_partitioner(const F &f)
|
||||
{
|
||||
switch(partitioner){
|
||||
case Partitioner::Auto: return f(tbb::auto_partitioner{});
|
||||
case Partitioner::Affinity: { tbb::affinity_partitioner p; return f(p); } // parallel_* doesn't take const affinity_partitioner here
|
||||
case Partitioner::Static: return f(tbb::static_partitioner{});
|
||||
case Partitioner::Simple: return f(tbb::simple_partitioner{});
|
||||
default: throw std::runtime_error("Error asking for name for non-existant device");
|
||||
}
|
||||
}
|
||||
|
||||
template <class T>
|
||||
template <typename F>
|
||||
void TBBStream<T>::parallel_for(const F &f)
|
||||
{
|
||||
// using size_t as per the range type (also used in the official documentation)
|
||||
with_partitioner<std::nullptr_t>([&](auto &&p) {
|
||||
tbb::parallel_for(range, [&](const tbb::blocked_range<size_t>& r) {
|
||||
for (size_t i = r.begin(); i < r.end(); ++i) {
|
||||
f(i);
|
||||
}
|
||||
}, p);
|
||||
return nullptr; // what we really want here is std::monostate, but we don't want to be C++17 only so nullptr_t it is
|
||||
});
|
||||
}
|
||||
|
||||
template <class T>
|
||||
template <typename F, typename Op>
|
||||
T TBBStream<T>::parallel_reduce(T init, const Op &op, const F &f)
|
||||
{
|
||||
return with_partitioner<T>([&](auto &&p) {
|
||||
return tbb::parallel_reduce(range, init, [&](const tbb::blocked_range<size_t>& r, T acc) {
|
||||
for (size_t i = r.begin(); i < r.end(); ++i) {
|
||||
acc = op(acc, f(i));
|
||||
}
|
||||
return acc;
|
||||
}, op, p);
|
||||
});
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void TBBStream<T>::init_arrays(T initA, T initB, T initC)
|
||||
{
|
||||
|
||||
parallel_for([&](size_t i){
|
||||
tbb::parallel_for(range, [&](const tbb::blocked_range<size_t>& r) {
|
||||
for (size_t i = r.begin(); i < r.end(); ++i) {
|
||||
a[i] = initA;
|
||||
b[i] = initB;
|
||||
c[i] = initC;
|
||||
});
|
||||
}
|
||||
}, partitioner);
|
||||
|
||||
}
|
||||
|
||||
@ -79,7 +43,11 @@ void TBBStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::ve
|
||||
template <class T>
|
||||
void TBBStream<T>::copy()
|
||||
{
|
||||
parallel_for([&](size_t i){ c[i] = a[i]; });
|
||||
tbb::parallel_for(range, [&](const tbb::blocked_range<size_t>& r) {
|
||||
for (size_t i = r.begin(); i < r.end(); ++i) {
|
||||
c[i] = a[i];
|
||||
}
|
||||
}, partitioner);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
@ -87,7 +55,11 @@ void TBBStream<T>::mul()
|
||||
{
|
||||
const T scalar = startScalar;
|
||||
|
||||
parallel_for([&](size_t i){ b[i] = scalar * c[i]; });
|
||||
tbb::parallel_for(range, [&](const tbb::blocked_range<size_t>& r) {
|
||||
for (size_t i = r.begin(); i < r.end(); ++i) {
|
||||
b[i] = scalar * c[i];
|
||||
}
|
||||
}, partitioner);
|
||||
|
||||
}
|
||||
|
||||
@ -95,7 +67,11 @@ template <class T>
|
||||
void TBBStream<T>::add()
|
||||
{
|
||||
|
||||
parallel_for([&](size_t i){ c[i] = a[i] + b[i]; });
|
||||
tbb::parallel_for(range, [&](const tbb::blocked_range<size_t>& r) {
|
||||
for (size_t i = r.begin(); i < r.end(); ++i) {
|
||||
c[i] = a[i] + b[i];
|
||||
}
|
||||
}, partitioner);
|
||||
|
||||
}
|
||||
|
||||
@ -104,7 +80,11 @@ void TBBStream<T>::triad()
|
||||
{
|
||||
const T scalar = startScalar;
|
||||
|
||||
parallel_for([&](size_t i){ a[i] = b[i] + scalar * c[i]; });
|
||||
tbb::parallel_for(range, [&](const tbb::blocked_range<size_t>& r) {
|
||||
for (size_t i = r.begin(); i < r.end(); ++i) {
|
||||
a[i] = b[i] + scalar * c[i];
|
||||
}
|
||||
}, partitioner);
|
||||
|
||||
}
|
||||
|
||||
@ -113,7 +93,11 @@ void TBBStream<T>::nstream()
|
||||
{
|
||||
const T scalar = startScalar;
|
||||
|
||||
parallel_for([&](size_t i){ a[i] += b[i] + scalar * c[i]; });
|
||||
tbb::parallel_for(range, [&](const tbb::blocked_range<size_t>& r) {
|
||||
for (size_t i = r.begin(); i < r.end(); ++i) {
|
||||
a[i] += b[i] + scalar * c[i];
|
||||
}
|
||||
}, partitioner);
|
||||
|
||||
}
|
||||
|
||||
@ -121,29 +105,23 @@ template <class T>
|
||||
T TBBStream<T>::dot()
|
||||
{
|
||||
// sum += a[i] * b[i];
|
||||
return parallel_reduce(0.0, std::plus<T>(), [&](size_t i) { return a[i] * b[i]; });
|
||||
return
|
||||
tbb::parallel_reduce(range, T{}, [&](const tbb::blocked_range<size_t>& r, T acc) {
|
||||
for (size_t i = r.begin(); i < r.end(); ++i) {
|
||||
acc += a[i] * b[i];
|
||||
}
|
||||
return acc;
|
||||
}, std::plus<T>(), partitioner);
|
||||
}
|
||||
|
||||
void listDevices(void)
|
||||
{
|
||||
std::cout
|
||||
<< "[" << static_cast<int>(Partitioner::Auto) << "] auto partitioner\n"
|
||||
<< "[" << static_cast<int>(Partitioner::Affinity) << "] affinity partitioner\n"
|
||||
<< "[" << static_cast<int>(Partitioner::Static) << "] static partitioner\n"
|
||||
<< "[" << static_cast<int>(Partitioner::Simple) << "] simple partitioner\n"
|
||||
<< "See https://spec.oneapi.com/versions/latest/elements/oneTBB/source/algorithms.html#partitioners for more details"
|
||||
<< std::endl;
|
||||
std::cout << "Listing devices is not supported by TBB" << std::endl;
|
||||
}
|
||||
|
||||
std::string getDeviceName(const int device)
|
||||
{
|
||||
switch(static_cast<Partitioner>(device)){
|
||||
case Partitioner::Auto: return "auto_partitioner";
|
||||
case Partitioner::Affinity: return "affinity_partitioner";
|
||||
case Partitioner::Static: return "static_partitioner";
|
||||
case Partitioner::Simple: return "simple_partitioner";
|
||||
default: throw std::runtime_error("Error asking for name for non-existant device");
|
||||
}
|
||||
return std::string("Device name unavailable");
|
||||
}
|
||||
|
||||
std::string getDeviceDriver(const int)
|
||||
|
||||
@ -13,31 +13,37 @@
|
||||
|
||||
#define IMPLEMENTATION_STRING "TBB"
|
||||
|
||||
enum class Partitioner : int { Auto = 0, Affinity, Static, Simple};
|
||||
#if defined(PARTITIONER_AUTO)
|
||||
using tbb_partitioner = tbb::auto_partitioner;
|
||||
#define PARTITIONER_NAME "auto_partitioner"
|
||||
#elif defined(PARTITIONER_AFFINITY)
|
||||
using tbb_partitioner = tbb::affinity_partitioner;
|
||||
#define PARTITIONER_NAME "affinity_partitioner"
|
||||
#elif defined(PARTITIONER_STATIC)
|
||||
using tbb_partitioner = tbb::static_partitioner;
|
||||
#define PARTITIONER_NAME "static_partitioner"
|
||||
#elif defined(PARTITIONER_SIMPLE)
|
||||
using tbb_partitioner = tbb::simple_partitioner;
|
||||
#define PARTITIONER_NAME "simple_partitioner"
|
||||
#else
|
||||
// default to auto
|
||||
using tbb_partitioner = tbb::auto_partitioner;
|
||||
#define PARTITIONER_NAME "auto_partitioner"
|
||||
#endif
|
||||
|
||||
|
||||
template <class T>
|
||||
class TBBStream : public Stream<T>
|
||||
{
|
||||
protected:
|
||||
|
||||
|
||||
Partitioner partitioner;
|
||||
tbb_partitioner partitioner;
|
||||
tbb::blocked_range<size_t> range;
|
||||
// Device side pointers
|
||||
std::vector<T> a;
|
||||
std::vector<T> b;
|
||||
std::vector<T> c;
|
||||
|
||||
|
||||
template < typename U, typename F>
|
||||
U with_partitioner(const F &f);
|
||||
|
||||
template <typename F>
|
||||
void parallel_for(const F &f);
|
||||
|
||||
template <typename F, typename Op>
|
||||
T parallel_reduce(T init, const Op &op, const F &f);
|
||||
|
||||
public:
|
||||
TBBStream(const int, int);
|
||||
~TBBStream() = default;
|
||||
|
||||
Loading…
Reference in New Issue
Block a user