Merge pull request #88 from UoB-HPC/nstream

Add PRK Nstream kernel
Tom Deakin 2021-02-22 15:14:21 +00:00 committed by GitHub
commit 9025afec1a
22 changed files with 236 additions and 17 deletions

View File

@@ -120,6 +120,22 @@ void ACCStream<T>::triad()
}
}
template <class T>
void ACCStream<T>::nstream()
{
const T scalar = startScalar;
int array_size = this->array_size;
T * restrict a = this->a;
T * restrict b = this->b;
T * restrict c = this->c;
#pragma acc parallel loop present(a[0:array_size], b[0:array_size], c[0:array_size]) wait
for (int i = 0; i < array_size; i++)
{
a[i] += b[i] + scalar * c[i];
}
}
template <class T>
T ACCStream<T>::dot()
{

View File

@@ -35,6 +35,7 @@ class ACCStream : public Stream<T>
virtual void add() override;
virtual void mul() override;
virtual void triad() override;
virtual void nstream() override;
virtual T dot() override;
virtual void init_arrays(T initA, T initB, T initC) override;

View File

@@ -11,6 +11,7 @@ All notable changes to this project will be documented in this file.
- Kokkos 3 build system (No code changes made).
- SYCL build rules for ComputeCpp, DPCPP and HipSYCL.
- Support for CUDA Managed Memory and Page Fault memory.
- Added the nstream kernel from PRK, with an associated command-line option.
### Changed
- Default branch renamed from `master` to `main`.
@@ -25,6 +26,7 @@ All notable changes to this project will be documented in this file.
- Reorder OpenCL objects in class so destructors are called in safe order.
- Ensure all OpenCL kernels are present in destructor.
- Unified run function in driver code to reduce code duplication; output should be unaffected.
- Normalise the sum result by the expected value to help avoid false-negative errors.
### Removed
- Pre-building of kernels in SYCL version to ensure compatibility with SYCL 1.2.1.

View File

@@ -212,6 +212,23 @@ void CUDAStream<T>::triad()
check_error();
}
template <typename T>
__global__ void nstream_kernel(T * a, const T * b, const T * c)
{
const T scalar = startScalar;
const int i = blockDim.x * blockIdx.x + threadIdx.x;
a[i] += b[i] + scalar * c[i];
}
template <class T>
void CUDAStream<T>::nstream()
{
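// Editorial note: as with the other kernels, this launch assumes array_size
// is a multiple of TBSIZE (BabelStream checks this when the stream is created).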
nstream_kernel<<<array_size/TBSIZE, TBSIZE>>>(d_a, d_b, d_c);
check_error();
cudaDeviceSynchronize();
check_error();
}
template <class T>
__global__ void dot_kernel(const T * a, const T * b, T * sum, int array_size)
{

View File

@@ -50,6 +50,7 @@ class CUDAStream : public Stream<T>
virtual void add() override;
virtual void mul() override;
virtual void triad() override;
virtual void nstream() override;
virtual T dot() override;
virtual void init_arrays(T initA, T initB, T initC) override;

View File

@@ -182,6 +182,23 @@ void HIPStream<T>::triad()
check_error();
}
template <typename T>
__global__ void nstream_kernel(T * a, const T * b, const T * c)
{
const T scalar = startScalar;
const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
a[i] += b[i] + scalar * c[i];
}
template <class T>
void HIPStream<T>::nstream()
{
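// Editorial note: like the other kernels here, the grid assumes array_size
// is a multiple of TBSIZE.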
hipLaunchKernelGGL(HIP_KERNEL_NAME(nstream_kernel<T>), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_a, d_b, d_c);
check_error();
hipDeviceSynchronize();
check_error();
}
template <class T>
__global__ void dot_kernel(const T * a, const T * b, T * sum, int array_size)
{

View File

@@ -41,6 +41,7 @@ class HIPStream : public Stream<T>
virtual void add() override;
virtual void mul() override;
virtual void triad() override;
virtual void nstream() override;
virtual T dot() override;
virtual void init_arrays(T initA, T initB, T initC) override;

View File

@@ -119,6 +119,21 @@ void KokkosStream<T>::triad()
Kokkos::fence();
}
template <class T>
void KokkosStream<T>::nstream()
{
Kokkos::View<T*> a(*d_a);
Kokkos::View<T*> b(*d_b);
Kokkos::View<T*> c(*d_c);
const T scalar = startScalar;
Kokkos::parallel_for(array_size, KOKKOS_LAMBDA (const long index)
{
a[index] += b[index] + scalar*c[index];
});
Kokkos::fence();
}
template <class T>
T KokkosStream<T>::dot()
{

View File

@@ -41,6 +41,7 @@ class KokkosStream : public Stream<T>
virtual void add() override;
virtual void mul() override;
virtual void triad() override;
virtual void nstream() override;
virtual T dot() override;
virtual void init_arrays(T initA, T initB, T initC) override;

View File

@@ -61,6 +61,14 @@ std::string kernels{R"CLC(
const size_t i = get_global_id(0);
a[i] = b[i] + scalar * c[i];
}
kernel void nstream(
global TYPE * restrict a,
global const TYPE * restrict b,
global const TYPE * restrict c)
{
const size_t i = get_global_id(0);
a[i] += b[i] + scalar * c[i];
}
kernel void stream_dot(
global const TYPE * restrict a,
@@ -157,6 +165,7 @@ OCLStream<T>::OCLStream(const int ARRAY_SIZE, const int device_index)
mul_kernel = new cl::KernelFunctor<cl::Buffer, cl::Buffer>(program, "mul");
add_kernel = new cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer>(program, "add");
triad_kernel = new cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer>(program, "triad");
nstream_kernel = new cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer>(program, "nstream");
dot_kernel = new cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer, cl::LocalSpaceArg, cl_int>(program, "stream_dot");
array_size = ARRAY_SIZE;
@@ -186,6 +195,7 @@ OCLStream<T>::~OCLStream()
delete mul_kernel;
delete add_kernel;
delete triad_kernel;
delete nstream_kernel;
delete dot_kernel;
devices.clear();
@@ -231,6 +241,16 @@ void OCLStream<T>::triad()
queue.finish();
}
template <class T>
void OCLStream<T>::nstream()
{
(*nstream_kernel)(
cl::EnqueueArgs(queue, cl::NDRange(array_size)),
d_a, d_b, d_c
);
queue.finish();
}
template <class T>
T OCLStream<T>::dot()
{

View File

@@ -47,6 +47,7 @@ class OCLStream : public Stream<T>
cl::KernelFunctor<cl::Buffer, cl::Buffer> * mul_kernel;
cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer> *add_kernel;
cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer> *triad_kernel;
cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer> *nstream_kernel;
cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer, cl::LocalSpaceArg, cl_int> *dot_kernel;
// NDRange configuration for the dot kernel
@@ -62,6 +63,7 @@ class OCLStream : public Stream<T>
virtual void add() override;
virtual void mul() override;
virtual void triad() override;
virtual void nstream() override;
virtual T dot() override;
virtual void init_arrays(T initA, T initB, T initC) override;

View File

@@ -191,6 +191,31 @@ void OMPStream<T>::triad()
#endif
}
template <class T>
void OMPStream<T>::nstream()
{
const T scalar = startScalar;
#ifdef OMP_TARGET_GPU
int array_size = this->array_size;
T *a = this->a;
T *b = this->b;
T *c = this->c;
#pragma omp target teams distribute parallel for simd
#else
#pragma omp parallel for
#endif
for (int i = 0; i < array_size; i++)
{
a[i] += b[i] + scalar * c[i];
}
#if defined(OMP_TARGET_GPU) && defined(_CRAYC)
// If using the Cray compiler, the kernels do not block, so this zero-length
// update forces a synchronisation to keep the timing correct
#pragma omp target update from(a[0:0])
#endif
}
template <class T>
T OMPStream<T>::dot()
{

View File

@@ -36,6 +36,7 @@ class OMPStream : public Stream<T>
virtual void add() override;
virtual void mul() override;
virtual void triad() override;
virtual void nstream() override;
virtual T dot() override;
virtual void init_arrays(T initA, T initB, T initC) override;

View File

@@ -38,6 +38,15 @@ But this information is not typically available in real HPC codes today, where t
BabelStream therefore provides a measure of what memory bandwidth performance can be attained (by a particular programming model) if you follow today's parallel programming best practice.
BabelStream also includes the nstream kernel from the Parallel Research Kernels (PRK) project, available on [GitHub](https://github.com/ParRes/Kernels).
Details about PRK can be found in the following references:
> Van der Wijngaart, Rob F., and Timothy G. Mattson. The parallel research kernels. IEEE High Performance Extreme Computing Conference (HPEC). IEEE, 2014.
> R. F. Van der Wijngaart, A. Kayi, J. R. Hammond, G. Jost, T. St. John, S. Sridharan, T. G. Mattson, J. Abercrombie, and J. Nelson. Comparing runtime systems with exascale ambitions using the Parallel Research Kernels. ISC 2016, [DOI: 10.1007/978-3-319-41321-1_17](https://doi.org/10.1007/978-3-319-41321-1_17).
> Jeff R. Hammond and Timothy G. Mattson. Evaluating data parallelism in C++ using the Parallel Research Kernels. IWOCL 2019, [DOI: 10.1145/3318170.3318192](https://doi.org/10.1145/3318170.3318192).
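
For reference, every implementation in this commit performs the same element-wise update. The sketch below is an editorial serial reference, not code from BabelStream itself (the function name `nstream_reference` is illustrative):

    // PRK nstream update: reads a, b and c, writes a -- four arrays of traffic.
    template <typename T>
    void nstream_reference(T * a, const T * b, const T * c, int n, T scalar)
    {
      for (int i = 0; i < n; i++)
        a[i] += b[i] + scalar * c[i];
    }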
Website
-------

View File

@@ -94,6 +94,20 @@ void STD20Stream<T>::triad()
);
}
template <class T>
void STD20Stream<T>::nstream()
{
const T scalar = startScalar;
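// std::views::iota(0) provides a counting iterator, so for_each_n visits
// indices 0..array_size-1 in parallel.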
std::for_each_n(
std::execution::par_unseq,
std::views::iota(0).begin(), array_size,
[&] (int i) {
a[i] += b[i] + scalar * c[i];
}
);
}
template <class T>
T STD20Stream<T>::dot()
{

View File

@@ -33,6 +33,7 @@ class STD20Stream : public Stream<T>
virtual void add() override;
virtual void mul() override;
virtual void triad() override;
virtual void nstream() override;
virtual T dot() override;
virtual void init_arrays(T initA, T initB, T initC) override;

View File

@@ -72,6 +72,17 @@ void STDStream<T>::triad()
std::transform(exe_policy, b, b+array_size, c, a, [](T bi, T ci){ return bi+startScalar*ci; });
}
template <class T>
void STDStream<T>::nstream()
{
// a[i] += b[i] + scalar * c[i];
// std::transform takes at most two input ranges, so with the STL this
// must be done in two stages:
//   1: a[i] += b[i]
//   2: a[i] += scalar * c[i]
std::transform(exe_policy, a, a+array_size, b, a, [](T ai, T bi){ return ai + bi; });
std::transform(exe_policy, a, a+array_size, c, a, [](T ai, T ci){ return ai + startScalar*ci; });
}
template <class T>
T STDStream<T>::dot()
{

View File

@@ -31,6 +31,7 @@ class STDStream : public Stream<T>
virtual void add() override;
virtual void mul() override;
virtual void triad() override;
virtual void nstream() override;
virtual T dot() override;
virtual void init_arrays(T initA, T initB, T initC) override;

View File

@@ -148,6 +148,23 @@ void SYCLStream<T>::triad()
queue->wait();
}
template <class T>
void SYCLStream<T>::nstream()
{
const T scalar = startScalar;
queue->submit([&](handler &cgh)
{
auto ka = d_a->template get_access<access::mode::read_write>(cgh);
auto kb = d_b->template get_access<access::mode::read>(cgh);
auto kc = d_c->template get_access<access::mode::read>(cgh);
cgh.parallel_for<nstream_kernel>(range<1>{array_size}, [=](id<1> idx)
{
ka[idx] += kb[idx] + scalar * kc[idx];
});
});
queue->wait();
}
template <class T>
T SYCLStream<T>::dot()
{

View File

@@ -22,6 +22,7 @@ namespace sycl_kernels
template <class T> class mul;
template <class T> class add;
template <class T> class triad;
template <class T> class nstream;
template <class T> class dot;
}
@@ -45,6 +46,7 @@ class SYCLStream : public Stream<T>
typedef sycl_kernels::mul<T> mul_kernel;
typedef sycl_kernels::add<T> add_kernel;
typedef sycl_kernels::triad<T> triad_kernel;
typedef sycl_kernels::nstream<T> nstream_kernel;
typedef sycl_kernels::dot<T> dot_kernel;
// NDRange configuration for the dot kernel
@@ -60,6 +62,7 @@ class SYCLStream : public Stream<T>
virtual void add() override;
virtual void mul() override;
virtual void triad() override;
virtual void nstream() override;
virtual T dot() override;
virtual void init_arrays(T initA, T initB, T initC) override;

View File

@@ -29,6 +29,7 @@ class Stream
virtual void mul() = 0;
virtual void add() = 0;
virtual void triad() = 0;
virtual void nstream() = 0;
virtual T dot() = 0;
// Copy memory between host and device

View File

@@ -58,13 +58,11 @@ void check_solution(const unsigned int ntimes, std::vector<T>& a, std::vector<T>
template <typename T>
void run();
template <typename T>
void run_triad();
// Options for running the benchmark:
// - All 5 kernels (Copy, Add, Mul, Triad, Dot).
// - Triad only.
enum class Benchmark {All, Triad};
// - Nstream only.
enum class Benchmark {All, Triad, Nstream};
// Selected run options.
Benchmark selection = Benchmark::All;
@@ -166,6 +164,27 @@ std::vector<std::vector<double>> run_triad(Stream<T> *stream)
return timings;
}
// Run the Nstream kernel
template <typename T>
std::vector<std::vector<double>> run_nstream(Stream<T> *stream)
{
std::vector<std::vector<double>> timings(1);
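// A single inner vector: only the nstream kernel is timed.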
// Declare timers
std::chrono::high_resolution_clock::time_point t1, t2;
// Run nstream in a loop
for (int k = 0; k < num_times; k++) {
t1 = std::chrono::high_resolution_clock::now();
stream->nstream();
t2 = std::chrono::high_resolution_clock::now();
timings[0].push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
}
return timings;
}
// Generic run routine
// Runs the kernel(s) and prints output.
@@ -275,6 +294,10 @@ void run()
break;
case Benchmark::Triad:
timings = run_triad<T>(stream);
break;
case Benchmark::Nstream:
timings = run_nstream<T>(stream);
break;
};
// Check solutions
@@ -313,17 +336,26 @@ void run()
}
if (selection == Benchmark::All)
if (selection == Benchmark::All || selection == Benchmark::Nstream)
{
std::string labels[5] = {"Copy", "Mul", "Add", "Triad", "Dot"};
size_t sizes[5] = {
2 * sizeof(T) * ARRAY_SIZE,
2 * sizeof(T) * ARRAY_SIZE,
3 * sizeof(T) * ARRAY_SIZE,
3 * sizeof(T) * ARRAY_SIZE,
2 * sizeof(T) * ARRAY_SIZE
};
std::vector<std::string> labels;
std::vector<size_t> sizes;
if (selection == Benchmark::All)
{
labels = {"Copy", "Mul", "Add", "Triad", "Dot"};
sizes = {
2 * sizeof(T) * ARRAY_SIZE,
2 * sizeof(T) * ARRAY_SIZE,
3 * sizeof(T) * ARRAY_SIZE,
3 * sizeof(T) * ARRAY_SIZE,
2 * sizeof(T) * ARRAY_SIZE};
} else if (selection == Benchmark::Nstream)
{
labels = {"Nstream"};
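// Nstream reads a, b and c and writes a, so four arrays of data
// move per iteration.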
sizes = {4 * sizeof(T) * ARRAY_SIZE };
}
for (int i = 0; i < timings.size(); ++i)
{
@@ -416,13 +448,19 @@ void check_solution(const unsigned int ntimes, std::vector<T>& a, std::vector<T>
for (unsigned int i = 0; i < ntimes; i++)
{
// Do STREAM!
if (! (selection == Benchmark::Triad))
if (selection == Benchmark::All)
{
goldC = goldA;
goldB = scalar * goldC;
goldC = goldA + goldB;
goldA = goldB + scalar * goldC;
} else if (selection == Benchmark::Triad)
{
goldA = goldB + scalar * goldC;
} else if (selection == Benchmark::Nstream)
{
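// Mirrors the device kernel: a[i] += b[i] + scalar * c[i]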
goldA += goldB + scalar * goldC;
}
goldA = goldB + scalar * goldC;
}
// Do the reduction
@@ -435,7 +473,7 @@ void check_solution(const unsigned int ntimes, std::vector<T>& a, std::vector<T>
errB /= b.size();
double errC = std::accumulate(c.begin(), c.end(), 0.0, [&](double sum, const T val){ return sum + fabs(val - goldC); });
errC /= c.size();
double errSum = fabs(sum - goldSum);
double errSum = fabs((sum - goldSum)/goldSum);
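// Relative rather than absolute error: normalising by goldSum stops large
// expected sums from tripping the fixed tolerance below (see CHANGELOG).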
double epsi = std::numeric_limits<T>::epsilon() * 100.0;
@@ -452,7 +490,7 @@ void check_solution(const unsigned int ntimes, std::vector<T>& a, std::vector<T>
<< "Validation failed on c[]. Average error " << errC
<< std::endl;
// Check sum to 8 decimal places
if (!(selection == Benchmark::Triad) && errSum > 1.0E-8)
if (selection == Benchmark::All && errSum > 1.0E-8)
std::cerr
<< "Validation failed on sum. Error " << errSum
<< std::endl << std::setprecision(15)
@@ -523,6 +561,10 @@ void parseArguments(int argc, char *argv[])
{
selection = Benchmark::Triad;
}
else if (!std::string("--nstream-only").compare(argv[i]))
{
selection = Benchmark::Nstream;
}
else if (!std::string("--csv").compare(argv[i]))
{
output_as_csv = true;
@@ -544,6 +586,7 @@ void parseArguments(int argc, char *argv[])
std::cout << " -n --numtimes NUM Run the test NUM times (NUM >= 2)" << std::endl;
std::cout << " --float Use floats (rather than doubles)" << std::endl;
std::cout << " --triad-only Only run triad" << std::endl;
std::cout << " --nstream-only Only run nstream" << std::endl;
std::cout << " --csv Output as csv table" << std::endl;
std::cout << " --mibibytes Use MiB=2^20 for bandwidth calculation (default MB=10^6)" << std::endl;
std::cout << std::endl;