Merge branch 'main' into sycl-2020

Conflicts:
	SYCLStream.h
Tom Deakin 2021-02-22 15:18:34 +00:00
commit 5490253342
23 changed files with 251 additions and 51 deletions

View File

@@ -120,6 +120,22 @@ void ACCStream<T>::triad()
   }
 }
 
+template <class T>
+void ACCStream<T>::nstream()
+{
+  const T scalar = startScalar;
+
+  int array_size = this->array_size;
+  T * restrict a = this->a;
+  T * restrict b = this->b;
+  T * restrict c = this->c;
+  #pragma acc parallel loop present(a[0:array_size], b[0:array_size], c[0:array_size]) wait
+  for (int i = 0; i < array_size; i++)
+  {
+    a[i] += b[i] + scalar * c[i];
+  }
+}
+
 template <class T>
 T ACCStream<T>::dot()
 {

View File

@@ -35,6 +35,7 @@ class ACCStream : public Stream<T>
     virtual void add() override;
     virtual void mul() override;
     virtual void triad() override;
+    virtual void nstream() override;
     virtual T dot() override;
     virtual void init_arrays(T initA, T initB, T initC) override;

View File

@@ -11,6 +11,7 @@ All notable changes to this project will be documented in this file.
 - Kokkos 3 build system (No code changes made).
 - SYCL build rules for ComputeCpp, DPCPP and HipSYCL.
 - Support for CUDA Managed Memory and Page Fault memory.
+- Added nstream kernel from PRK, with an associated command line option.
 
 ### Changed
 - Default branch renamed from `master` to `main`.
@@ -25,6 +26,7 @@ All notable changes to this project will be documented in this file.
 - Reorder OpenCL objects in class so destructors are called in safe order.
 - Ensure all OpenCL kernels are present in destructor.
 - Unified run function in driver code to reduce code duplication; output should be unaffected.
+- Normalise sum result by expected value to help avoid false negative errors.
 
 ### Removed
 - Pre-building of kernels in SYCL version to ensure compatibility with SYCL 1.2.1.

View File

@@ -212,6 +212,23 @@ void CUDAStream<T>::triad()
   check_error();
 }
 
+template <typename T>
+__global__ void nstream_kernel(T * a, const T * b, const T * c)
+{
+  const T scalar = startScalar;
+  const int i = blockDim.x * blockIdx.x + threadIdx.x;
+  a[i] += b[i] + scalar * c[i];
+}
+
+template <class T>
+void CUDAStream<T>::nstream()
+{
+  nstream_kernel<<<array_size/TBSIZE, TBSIZE>>>(d_a, d_b, d_c);
+  check_error();
+  cudaDeviceSynchronize();
+  check_error();
+}
+
 template <class T>
 __global__ void dot_kernel(const T * a, const T * b, T * sum, int array_size)
 {

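Note on the launch configuration: as with BabelStream's other CUDA kernels, nstream_kernel launches exactly array_size/TBSIZE blocks and omits a bounds check, which is only safe because the array size must be a multiple of the block size. A bounds-checked variant (a hypothetical sketch, not part of this commit; it assumes the same startScalar and TBSIZE as the file above) would round the grid up and guard the access:

  // Hypothetical guarded variant (not in this commit): takes the element
  // count and protects against the partial final block.
  template <typename T>
  __global__ void nstream_kernel_guarded(T * a, const T * b, const T * c, int n)
  {
    const T scalar = startScalar;
    const int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < n)
      a[i] += b[i] + scalar * c[i];
  }

  // Launch with the grid rounded up rather than assuming divisibility:
  // nstream_kernel_guarded<<<(array_size + TBSIZE - 1)/TBSIZE, TBSIZE>>>(d_a, d_b, d_c, array_size);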
View File

@@ -50,6 +50,7 @@ class CUDAStream : public Stream<T>
    virtual void add() override;
    virtual void mul() override;
    virtual void triad() override;
+   virtual void nstream() override;
    virtual T dot() override;
    virtual void init_arrays(T initA, T initB, T initC) override;

View File

@@ -182,6 +182,23 @@ void HIPStream<T>::triad()
   check_error();
 }
 
+template <typename T>
+__global__ void nstream_kernel(T * a, const T * b, const T * c)
+{
+  const T scalar = startScalar;
+  const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
+  a[i] += b[i] + scalar * c[i];
+}
+
+template <class T>
+void HIPStream<T>::nstream()
+{
+  hipLaunchKernelGGL(HIP_KERNEL_NAME(nstream_kernel<T>), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_a, d_b, d_c);
+  check_error();
+  hipDeviceSynchronize();
+  check_error();
+}
+
 template <class T>
 __global__ void dot_kernel(const T * a, const T * b, T * sum, int array_size)
 {

View File

@@ -41,6 +41,7 @@ class HIPStream : public Stream<T>
    virtual void add() override;
    virtual void mul() override;
    virtual void triad() override;
+   virtual void nstream() override;
    virtual T dot() override;
    virtual void init_arrays(T initA, T initB, T initC) override;

View File

@@ -119,6 +119,21 @@ void KokkosStream<T>::triad()
   Kokkos::fence();
 }
 
+template <class T>
+void KokkosStream<T>::nstream()
+{
+  Kokkos::View<T*> a(*d_a);
+  Kokkos::View<T*> b(*d_b);
+  Kokkos::View<T*> c(*d_c);
+
+  const T scalar = startScalar;
+  Kokkos::parallel_for(array_size, KOKKOS_LAMBDA (const long index)
+  {
+    a[index] += b[index] + scalar*c[index];
+  });
+  Kokkos::fence();
+}
+
 template <class T>
 T KokkosStream<T>::dot()
 {

View File

@@ -41,6 +41,7 @@ class KokkosStream : public Stream<T>
    virtual void add() override;
    virtual void mul() override;
    virtual void triad() override;
+   virtual void nstream() override;
    virtual T dot() override;
    virtual void init_arrays(T initA, T initB, T initC) override;

View File

@@ -61,6 +61,14 @@ std::string kernels{R"CLC(
     const size_t i = get_global_id(0);
     a[i] = b[i] + scalar * c[i];
   }
+  kernel void nstream(
+    global TYPE * restrict a,
+    global const TYPE * restrict b,
+    global const TYPE * restrict c)
+  {
+    const size_t i = get_global_id(0);
+    a[i] += b[i] + scalar * c[i];
+  }
 
   kernel void stream_dot(
     global const TYPE * restrict a,
@@ -157,6 +165,7 @@ OCLStream<T>::OCLStream(const int ARRAY_SIZE, const int device_index)
   mul_kernel = new cl::KernelFunctor<cl::Buffer, cl::Buffer>(program, "mul");
   add_kernel = new cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer>(program, "add");
   triad_kernel = new cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer>(program, "triad");
+  nstream_kernel = new cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer>(program, "nstream");
   dot_kernel = new cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer, cl::LocalSpaceArg, cl_int>(program, "stream_dot");
 
   array_size = ARRAY_SIZE;
@@ -186,6 +195,7 @@ OCLStream<T>::~OCLStream()
   delete mul_kernel;
   delete add_kernel;
   delete triad_kernel;
+  delete nstream_kernel;
   delete dot_kernel;
 
   devices.clear();
@@ -231,6 +241,16 @@ void OCLStream<T>::triad()
   queue.finish();
 }
 
+template <class T>
+void OCLStream<T>::nstream()
+{
+  (*nstream_kernel)(
+    cl::EnqueueArgs(queue, cl::NDRange(array_size)),
+    d_a, d_b, d_c
+  );
+  queue.finish();
+}
+
 template <class T>
 T OCLStream<T>::dot()
 {

View File

@@ -47,6 +47,7 @@ class OCLStream : public Stream<T>
     cl::KernelFunctor<cl::Buffer, cl::Buffer> * mul_kernel;
     cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer> *add_kernel;
     cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer> *triad_kernel;
+    cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer> *nstream_kernel;
     cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer, cl::LocalSpaceArg, cl_int> *dot_kernel;
 
     // NDRange configuration for the dot kernel
@@ -62,6 +63,7 @@ class OCLStream : public Stream<T>
     virtual void add() override;
     virtual void mul() override;
     virtual void triad() override;
+    virtual void nstream() override;
     virtual T dot() override;
     virtual void init_arrays(T initA, T initB, T initC) override;

View File

@@ -191,6 +191,31 @@ void OMPStream<T>::triad()
 #endif
 }
 
+template <class T>
+void OMPStream<T>::nstream()
+{
+  const T scalar = startScalar;
+
+#ifdef OMP_TARGET_GPU
+  int array_size = this->array_size;
+  T *a = this->a;
+  T *b = this->b;
+  T *c = this->c;
+  #pragma omp target teams distribute parallel for simd
+#else
+  #pragma omp parallel for
+#endif
+  for (int i = 0; i < array_size; i++)
+  {
+    a[i] += b[i] + scalar * c[i];
+  }
+
+#if defined(OMP_TARGET_GPU) && defined(_CRAYC)
+  // If using the Cray compiler, the kernels do not block, so this update forces
+  // a small copy to ensure blocking so that timing is correct
+  #pragma omp target update from(a[0:0])
+#endif
+}
+
 template <class T>
 T OMPStream<T>::dot()
 {

View File

@@ -36,6 +36,7 @@ class OMPStream : public Stream<T>
     virtual void add() override;
     virtual void mul() override;
     virtual void triad() override;
+    virtual void nstream() override;
     virtual T dot() override;
     virtual void init_arrays(T initA, T initB, T initC) override;

View File

@@ -38,6 +38,15 @@ But this information is not typically available in real HPC codes today, where t
 
 BabelStream therefore provides a measure of what memory bandwidth performance can be attained (by a particular programming model) if you follow today's parallel programming best practice.
 
+BabelStream also includes the nstream kernel from the Parallel Research Kernels (PRK) project, available on [GitHub](https://github.com/ParRes/Kernels).
+Details about PRK can be found in the following references:
+
+> Van der Wijngaart, Rob F., and Timothy G. Mattson. The Parallel Research Kernels. IEEE High Performance Extreme Computing Conference (HPEC), IEEE, 2014.
+
+> R. F. Van der Wijngaart, A. Kayi, J. R. Hammond, G. Jost, T. St. John, S. Sridharan, T. G. Mattson, J. Abercrombie, and J. Nelson. Comparing runtime systems with exascale ambitions using the Parallel Research Kernels. ISC 2016, [DOI: 10.1007/978-3-319-41321-1_17](https://doi.org/10.1007/978-3-319-41321-1_17).
+
+> Jeff R. Hammond and Timothy G. Mattson. Evaluating data parallelism in C++ using the Parallel Research Kernels. IWOCL 2019, [DOI: 10.1145/3318170.3318192](https://doi.org/10.1145/3318170.3318192).
+
 Website
 -------

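For orientation, the nstream kernel added across the models in this commit performs the update a[i] += b[i] + scalar * c[i] over the three benchmark arrays. A minimal serial C++ sketch of the operation (reference only, mirroring the per-model implementations in this commit):

  // Serial reference for the PRK nstream update. Each element reads a, b
  // and c and writes a, i.e. four array accesses per iteration.
  template <typename T>
  void nstream_reference(T * a, const T * b, const T * c, int n, T scalar)
  {
    for (int i = 0; i < n; i++)
      a[i] += b[i] + scalar * c[i];
  }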
View File

@@ -94,6 +94,20 @@ void STD20Stream<T>::triad()
   );
 }
 
+template <class T>
+void STD20Stream<T>::nstream()
+{
+  const T scalar = startScalar;
+
+  std::for_each_n(
+    std::execution::par_unseq,
+    std::views::iota(0).begin(), array_size,
+    [&] (int i) {
+      a[i] += b[i] + scalar * c[i];
+    }
+  );
+}
+
 template <class T>
 T STD20Stream<T>::dot()
 {

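The combination of std::views::iota(0).begin() and std::for_each_n above is the C++20 idiom for a parallel loop over indices: iota supplies the integers 0..array_size-1 without allocating an index array, and the lambda writes through the raw pointers. A self-contained sketch of the same idiom (a hypothetical standalone example, not BabelStream code):

  #include <algorithm>
  #include <execution>
  #include <ranges>
  #include <vector>

  int main()
  {
    std::vector<double> a(1024, 0.1), b(1024, 0.2), c(1024, 0.3);
    const double scalar = 0.4;
    double *pa = a.data(), *pb = b.data(), *pc = c.data();
    // Visit indices 0..n-1 in parallel; iota supplies the indices.
    std::for_each_n(std::execution::par_unseq,
                    std::views::iota(0).begin(), a.size(),
                    [=](int i) { pa[i] += pb[i] + scalar * pc[i]; });
  }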
View File

@@ -33,6 +33,7 @@ class STD20Stream : public Stream<T>
     virtual void add() override;
     virtual void mul() override;
     virtual void triad() override;
+    virtual void nstream() override;
     virtual T dot() override;
     virtual void init_arrays(T initA, T initB, T initC) override;

View File

@@ -72,6 +72,17 @@ void STDStream<T>::triad()
   std::transform(exe_policy, b, b+array_size, c, a, [](T bi, T ci){ return bi+startScalar*ci; });
 }
 
+template <class T>
+void STDStream<T>::nstream()
+{
+  // a[i] += b[i] + scalar * c[i];
+  // Need to do in two stages with C++11 STL.
+  // 1: a[i] += b[i]
+  // 2: a[i] += scalar * c[i];
+  std::transform(exe_policy, a, a+array_size, b, a, [](T ai, T bi){ return ai + bi; });
+  std::transform(exe_policy, a, a+array_size, c, a, [](T ai, T ci){ return ai + startScalar*ci; });
+}
+
 template <class T>
 T STDStream<T>::dot()
 {

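The two-pass split noted in the comments above is algebraically identical to the fused update, (a + b) + scalar*c versus a + (b + scalar*c), differing only in floating-point rounding order, but it reads and writes a[] twice, so this version moves roughly one extra array of memory traffic per iteration compared with the single-pass kernels. A quick check of the equivalence (a hypothetical test, not BabelStream code; an integer-valued scalar keeps the comparison exact):

  #include <algorithm>
  #include <cassert>
  #include <vector>

  int main()
  {
    const double s = 2.0; // integer-valued so both orderings round identically
    std::vector<double> a{1, 2}, b{3, 4}, c{5, 6}, fused = a;
    for (std::size_t i = 0; i < fused.size(); i++)
      fused[i] += b[i] + s * c[i];                                    // one-pass update
    std::transform(a.begin(), a.end(), b.begin(), a.begin(),
                   [](double ai, double bi){ return ai + bi; });      // stage 1
    std::transform(a.begin(), a.end(), c.begin(), a.begin(),
                   [=](double ai, double ci){ return ai + s * ci; }); // stage 2
    assert(a == fused);
  }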
View File

@@ -31,6 +31,7 @@ class STDStream : public Stream<T>
     virtual void add() override;
     virtual void mul() override;
     virtual void triad() override;
+    virtual void nstream() override;
     virtual T dot() override;
     virtual void init_arrays(T initA, T initB, T initC) override;

View File

@@ -33,66 +33,49 @@ Set ARCH to change device (defaulting to "").
 endef
 
-ifneq ($(COMPILER), DPCPP)
+ifeq ($(COMPILER), HIPSYCL)
+ifneq ($(TARGET), CPU)
 $(info $(arch_help))
 ARCH=
+endif
 endif
 endif
 
-SYCL_COMPUTECPP_SYCLFLAGS = $(shell $(SYCL_SDK_DIR)/bin/computecpp_info --dump-device-compiler-flags)
-SYCL_COMPUTECPP_SYCLFLAGS_AMD = $(SYCL_COMPUTECPP_SYCLFLAGS)
+SYCL_COMPUTECPP_SYCLFLAGS = $(shell $(SYCL_SDK_DIR)/bin/computecpp_info --dump-device-compiler-flags) -no-serial-memop -sycl-driver
 SYCL_COMPUTECPP_SYCLFLAGS_CPU = $(SYCL_COMPUTECPP_SYCLFLAGS)
+SYCL_COMPUTECPP_SYCLFLAGS_AMD = $(SYCL_COMPUTECPP_SYCLFLAGS)
 SYCL_COMPUTECPP_SYCLFLAGS_NVIDIA = $(SYCL_COMPUTECPP_SYCLFLAGS) -sycl-target ptx64
 SYCL_COMPUTECPP_SYCLCXX = $(SYCL_SDK_DIR)/bin/compute++
-SYCL_COMPUTECPP_FLAGS = -O3 --std=c++17
-SYCL_COMPUTECPP_LINK_FLAGS = -L$(SYCL_SDK_DIR)/lib -lComputeCpp -lOpenCL -Wl,--rpath=$(SYCL_SDK_DIR)/lib/
+SYCL_COMPUTECPP_FLAGS = -O3 -std=c++17
+SYCL_COMPUTECPP_LINK_FLAGS = -Wl,-rpath=$(SYCL_SDK_DIR)/lib/ $(SYCL_SDK_DIR)/lib/libComputeCpp.so -lOpenCL
 SYCL_COMPUTECPP_INCLUDE = -I$(SYCL_SDK_DIR)/include
-SYCL_COMPUTECPP_CXX = g++
-SYCL_COMPUTECPP_DEPS = SYCLStream.sycl
 
-SYCL_HIPSYCL_SYCLFLAGS_CPU = -O3 --std=c++17 --hipsycl-platform=cpu
-SYCL_HIPSYCL_SYCLFLAGS_AMD = -O3 --std=c++17 --hipsycl-platform=rocm --hipsycl-gpu-arch=$(ARCH)
-SYCL_HIPSYCL_SYCLFLAGS_NVIDIA = -O3 --std=c++17 --hipsycl-platform=cuda --hipsycl-gpu-arch=$(ARCH)
+SYCL_HIPSYCL_SYCLFLAGS_CPU = --hipsycl-platform=cpu
+SYCL_HIPSYCL_SYCLFLAGS_AMD = --hipsycl-platform=rocm --hipsycl-gpu-arch=$(ARCH)
+SYCL_HIPSYCL_SYCLFLAGS_NVIDIA = --hipsycl-platform=cuda --hipsycl-gpu-arch=$(ARCH)
 SYCL_HIPSYCL_SYCLCXX = $(SYCL_SDK_DIR)/bin/syclcc
-SYCL_HIPSYCL_FLAGS = $(SYCL_HIPSYCL_SYCLFLAGS_$(TARGET))
+SYCL_HIPSYCL_FLAGS = -O3 --std=c++17
 SYCL_HIPSYCL_LINK_FLAGS = -L$(SYCL_SDK_DIR)/lib -Wl,-rpath,$(SYCL_SDK_DIR)/lib
 SYCL_HIPSYCL_INCLUDE =
-SYCL_HIPSYCL_CXX = $(SYCL_HIPSYCL_SYCLCXX)
-SYCL_HIPSYCL_DEPS =
 
-SYCL_DPCPP_SYCLFLAGS_CPU = -O3 --std=c++17
-SYCL_DPCPP_SYCLFLAGS_NVIDIA = -O3 --std=c++17 -fsycl -fsycl-targets=nvptx64-nvidia-cuda-sycldevice -fsycl-unnamed-lambda
+SYCL_DPCPP_SYCLFLAGS_NVIDIA = -fsycl -fsycl-targets=nvptx64-nvidia-cuda-sycldevice -fsycl-unnamed-lambda
 SYCL_DPCPP_SYCLCXX = dpcpp
-SYCL_DPCPP_FLAGS = $(SYCL_DPCPP_SYCLFLAGS_CPU)
+SYCL_DPCPP_FLAGS = -O3 --std=c++17
 SYCL_DPCPP_LINK_FLAGS =
 SYCL_DPCPP_INCLUDE =
-SYCL_DPCPP_CXX = dpcpp
-SYCL_DPCPP_DEPS =
 
 SYCL_SYCLFLAGS = $(SYCL_$(COMPILER)_SYCLFLAGS_$(TARGET))
 SYCL_SYCLCXX = $(SYCL_$(COMPILER)_SYCLCXX)
 SYCL_FLAGS = $(SYCL_$(COMPILER)_FLAGS)
 SYCL_LINK_FLAGS = $(SYCL_$(COMPILER)_LINK_FLAGS)
 SYCL_INCLUDE = $(SYCL_$(COMPILER)_INCLUDE)
-SYCL_CXX = $(SYCL_$(COMPILER)_CXX)
-SYCL_DEPS = $(SYCL_$(COMPILER)_DEPS)
 
-sycl-stream: main.o SYCLStream.o $(SYCL_DEPS)
-	$(SYCL_CXX) $(SYCL_FLAGS) -DSYCL main.o SYCLStream.o $(EXTRA_FLAGS) $(SYCL_LINK_FLAGS) -o $@
-
-main.o: main.cpp
-	$(SYCL_CXX) $(SYCL_FLAGS) -DSYCL main.cpp -c $(SYCL_INCLUDE) $(EXTRA_FLAGS) -o $@
-
-SYCLStream.o: SYCLStream.cpp $(SYCL_DEPS)
-	$(SYCL_CXX) $(SYCL_FLAGS) -DSYCL SYCLStream.cpp -c $(SYCL_INCLUDE) $(EXTRA_FLAGS) -o $@
-
-SYCLStream.sycl: SYCLStream.cpp
-	$(SYCL_SYCLCXX) -DSYCL SYCLStream.cpp $(SYCL_SYCLFLAGS) -c $(SYCL_INCLUDE) -o $@
+# only ComputeCpp generates .sycl files which is a bit odd to deal with so we opted to compile everything together
+sycl-stream: main.cpp SYCLStream.cpp
+	$(SYCL_SYCLCXX) $(SYCL_SYCLFLAGS) $(SYCL_FLAGS) $(SYCL_INCLUDE) -DSYCL $(EXTRA_FLAGS) $(SYCL_LINK_FLAGS) $^ -o $@
 
 .PHONY: clean
 clean:
-	rm -f sycl-stream SYCLStream.sycl main.o SYCLStream.o
+	rm -f sycl-stream

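With the rules above, the SYCL version now builds in a single step from the two source files, e.g. `make -f SYCL.make COMPILER=HIPSYCL TARGET=NVIDIA ARCH=sm_70 sycl-stream` (variable names taken from the makefile above; the ARCH value is device-specific, and the ComputeCpp and DPCPP paths also assume SYCL_SDK_DIR is set or dpcpp is on the PATH, respectively).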
View File

@@ -127,6 +127,23 @@ void SYCLStream<T>::triad()
   queue->wait();
 }
 
+template <class T>
+void SYCLStream<T>::nstream()
+{
+  const T scalar = startScalar;
+  queue->submit([&](handler &cgh)
+  {
+    auto ka = d_a->template get_access<access::mode::read_write>(cgh);
+    auto kb = d_b->template get_access<access::mode::read>(cgh);
+    auto kc = d_c->template get_access<access::mode::read>(cgh);
+    cgh.parallel_for<nstream_kernel>(range<1>{array_size}, [=](id<1> idx)
+    {
+      ka[idx] += kb[idx] + scalar * kc[idx];
+    });
+  });
+  queue->wait();
+}
+
 template <class T>
 T SYCLStream<T>::dot()
 {

View File

@@ -42,6 +42,7 @@ class SYCLStream : public Stream<T>
     virtual void add() override;
     virtual void mul() override;
     virtual void triad() override;
+    virtual void nstream() override;
     virtual T dot() override;
     virtual void init_arrays(T initA, T initB, T initC) override;

View File

@@ -29,6 +29,7 @@ class Stream
     virtual void mul() = 0;
     virtual void add() = 0;
     virtual void triad() = 0;
+    virtual void nstream() = 0;
     virtual T dot() = 0;
 
     // Copy memory between host and device

View File

@@ -58,13 +58,11 @@ void check_solution(const unsigned int ntimes, std::vector<T>& a, std::vector<T>
 template <typename T>
 void run();
 
-template <typename T>
-void run_triad();
-
 // Options for running the benchmark:
 // - All 5 kernels (Copy, Add, Mul, Triad, Dot).
 // - Triad only.
-enum class Benchmark {All, Triad};
+// - Nstream only.
+enum class Benchmark {All, Triad, Nstream};
 
 // Selected run options.
 Benchmark selection = Benchmark::All;
@@ -166,6 +164,27 @@ std::vector<std::vector<double>> run_triad(Stream<T> *stream)
   return timings;
 }
 
+// Run the Nstream kernel
+template <typename T>
+std::vector<std::vector<double>> run_nstream(Stream<T> *stream)
+{
+  std::vector<std::vector<double>> timings(1);
+
+  // Declare timers
+  std::chrono::high_resolution_clock::time_point t1, t2;
+
+  // Run nstream in loop
+  for (int k = 0; k < num_times; k++) {
+    t1 = std::chrono::high_resolution_clock::now();
+    stream->nstream();
+    t2 = std::chrono::high_resolution_clock::now();
+    timings[0].push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
+  }
+
+  return timings;
+}
+
 // Generic run routine
 // Runs the kernel(s) and prints output.
@@ -275,6 +294,10 @@ void run()
       break;
     case Benchmark::Triad:
       timings = run_triad<T>(stream);
+      break;
+    case Benchmark::Nstream:
+      timings = run_nstream<T>(stream);
+      break;
   };
 
   // Check solutions
@@ -313,17 +336,26 @@ void run()
   }
 
-  if (selection == Benchmark::All)
+  if (selection == Benchmark::All || selection == Benchmark::Nstream)
   {
-    std::string labels[5] = {"Copy", "Mul", "Add", "Triad", "Dot"};
-    size_t sizes[5] = {
-      2 * sizeof(T) * ARRAY_SIZE,
-      2 * sizeof(T) * ARRAY_SIZE,
-      3 * sizeof(T) * ARRAY_SIZE,
-      3 * sizeof(T) * ARRAY_SIZE,
-      2 * sizeof(T) * ARRAY_SIZE
-    };
+    std::vector<std::string> labels;
+    std::vector<size_t> sizes;
+
+    if (selection == Benchmark::All)
+    {
+      labels = {"Copy", "Mul", "Add", "Triad", "Dot"};
+      sizes = {
+        2 * sizeof(T) * ARRAY_SIZE,
+        2 * sizeof(T) * ARRAY_SIZE,
+        3 * sizeof(T) * ARRAY_SIZE,
+        3 * sizeof(T) * ARRAY_SIZE,
+        2 * sizeof(T) * ARRAY_SIZE};
+    } else if (selection == Benchmark::Nstream)
+    {
+      labels = {"Nstream"};
+      sizes = {4 * sizeof(T) * ARRAY_SIZE};
+    }
 
     for (int i = 0; i < timings.size(); ++i)
     {
@@ -416,13 +448,19 @@ void check_solution(const unsigned int ntimes, std::vector<T>& a, std::vector<T>
   for (unsigned int i = 0; i < ntimes; i++)
   {
     // Do STREAM!
-    if (! (selection == Benchmark::Triad))
+    if (selection == Benchmark::All)
     {
       goldC = goldA;
       goldB = scalar * goldC;
       goldC = goldA + goldB;
-    }
       goldA = goldB + scalar * goldC;
+    } else if (selection == Benchmark::Triad)
+    {
+      goldA = goldB + scalar * goldC;
+    } else if (selection == Benchmark::Nstream)
+    {
+      goldA += goldB + scalar * goldC;
+    }
   }
// Do the reduction // Do the reduction
@@ -435,7 +473,7 @@ void check_solution(const unsigned int ntimes, std::vector<T>& a, std::vector<T>
   errB /= b.size();
   double errC = std::accumulate(c.begin(), c.end(), 0.0, [&](double sum, const T val){ return sum + fabs(val - goldC); });
   errC /= c.size();
-  double errSum = fabs(sum - goldSum);
+  double errSum = fabs((sum - goldSum)/goldSum);
 
   double epsi = std::numeric_limits<T>::epsilon() * 100.0;
@@ -452,7 +490,7 @@ void check_solution(const unsigned int ntimes, std::vector<T>& a, std::vector<T>
       << "Validation failed on c[]. Average error " << errC
       << std::endl;
   // Check sum to 8 decimal places
-  if (!(selection == Benchmark::Triad) && errSum > 1.0E-8)
+  if (selection == Benchmark::All && errSum > 1.0E-8)
     std::cerr
       << "Validation failed on sum. Error " << errSum
       << std::endl << std::setprecision(15)
@@ -523,6 +561,10 @@ void parseArguments(int argc, char *argv[])
     {
       selection = Benchmark::Triad;
     }
+    else if (!std::string("--nstream-only").compare(argv[i]))
+    {
+      selection = Benchmark::Nstream;
+    }
    else if (!std::string("--csv").compare(argv[i]))
    {
      output_as_csv = true;
@@ -544,6 +586,7 @@ void parseArguments(int argc, char *argv[])
       std::cout << "  -n --numtimes NUM    Run the test NUM times (NUM >= 2)" << std::endl;
       std::cout << "      --float          Use floats (rather than doubles)" << std::endl;
       std::cout << "      --triad-only     Only run triad" << std::endl;
+      std::cout << "      --nstream-only   Only run nstream" << std::endl;
       std::cout << "      --csv            Output as csv table" << std::endl;
       std::cout << "      --mibibytes      Use MiB=2^20 for bandwidth calculation (default MB=10^6)" << std::endl;
       std::cout << std::endl;
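For the bandwidth report, the Nstream row above charges 4 * sizeof(T) * ARRAY_SIZE bytes per iteration, since each element update reads a, b and c and writes a. A small worked example of how a timing becomes the reported figure (hypothetical numbers; MB = 10^6 unless --mibibytes is given):

  #include <cstdio>

  int main()
  {
    const long array_size = 33554432;   // example: 2^25 doubles
    const double runtime = 0.01;        // example: seconds for one nstream iteration
    const long bytes = 4 * sizeof(double) * array_size; // read a, b, c; write a
    std::printf("%.1f MB/s\n", 1.0E-6 * bytes / runtime);
  }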