From 8c87f9d010becb1501d960273ee25137862d4938 Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Sun, 10 Jan 2021 22:43:12 +0000 Subject: [PATCH 01/21] Fixed sycl kernel linking for computecpp --- SYCL.make | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/SYCL.make b/SYCL.make index 4326da5..0554d67 100644 --- a/SYCL.make +++ b/SYCL.make @@ -41,13 +41,13 @@ endif endif -SYCL_COMPUTECPP_SYCLFLAGS = $(shell $(SYCL_SDK_DIR)/bin/computecpp_info --dump-device-compiler-flags) +SYCL_COMPUTECPP_SYCLFLAGS = $(shell $(SYCL_SDK_DIR)/bin/computecpp_info --dump-device-compiler-flags) -no-serial-memop -sycl-driver SYCL_COMPUTECPP_SYCLFLAGS_AMD = $(SYCL_COMPUTECPP_SYCLFLAGS) SYCL_COMPUTECPP_SYCLFLAGS_CPU = $(SYCL_COMPUTECPP_SYCLFLAGS) SYCL_COMPUTECPP_SYCLFLAGS_NVIDIA = $(SYCL_COMPUTECPP_SYCLFLAGS) -sycl-target ptx64 SYCL_COMPUTECPP_SYCLCXX = $(SYCL_SDK_DIR)/bin/compute++ -SYCL_COMPUTECPP_FLAGS = -O3 --std=c++17 -SYCL_COMPUTECPP_LINK_FLAGS = -L$(SYCL_SDK_DIR)/lib -lComputeCpp -lOpenCL -Wl,--rpath=$(SYCL_SDK_DIR)/lib/ +SYCL_COMPUTECPP_FLAGS = -O3 -std=c++17 +SYCL_COMPUTECPP_LINK_FLAGS = -Wl,-rpath=$(SYCL_SDK_DIR)/lib/ $(SYCL_SDK_DIR)/lib/libComputeCpp.so -lOpenCL SYCL_COMPUTECPP_INCLUDE = -I$(SYCL_SDK_DIR)/include SYCL_COMPUTECPP_CXX = g++ SYCL_COMPUTECPP_DEPS = SYCLStream.sycl @@ -81,6 +81,13 @@ SYCL_INCLUDE = $(SYCL_$(COMPILER)_INCLUDE) SYCL_CXX = $(SYCL_$(COMPILER)_CXX) SYCL_DEPS = $(SYCL_$(COMPILER)_DEPS) +ifeq ($(COMPILER), COMPUTECPP) + +sycl-stream: main.cpp SYCLStream.cpp + $(SYCL_SYCLCXX) $(SYCL_SYCLFLAGS) $(SYCL_FLAGS) $(SYCL_INCLUDE) -DSYCL $(EXTRA_FLAGS) $(SYCL_LINK_FLAGS) $^ -o $@ + +else + sycl-stream: main.o SYCLStream.o $(SYCL_DEPS) $(SYCL_CXX) $(SYCL_FLAGS) -DSYCL main.o SYCLStream.o $(EXTRA_FLAGS) $(SYCL_LINK_FLAGS) -o $@ @@ -93,6 +100,8 @@ SYCLStream.o: SYCLStream.cpp $(SYCL_DEPS) SYCLStream.sycl: SYCLStream.cpp $(SYCL_SYCLCXX) -DSYCL SYCLStream.cpp $(SYCL_SYCLFLAGS) -c $(SYCL_INCLUDE) -o $@ +endif + .PHONY: clean clean: rm -f sycl-stream SYCLStream.sycl main.o SYCLStream.o From bd04e6db3c078b5b52509e31d8b5017c2858b232 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Mon, 1 Feb 2021 17:41:30 +0000 Subject: [PATCH 02/21] Add nstream kernel from PRK PRK has a nstream kernel, which is Triad with a += update. This means there are 3 reads and a write, which is a higher read/write ratio. In addition, non-temporal stores for the write on CPUs will not be beneficial, and so compilers should take care to emit these for the other kernels, but not these. --- Stream.h | 1 + main.cpp | 21 ++++++++++++++++----- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/Stream.h b/Stream.h index ff00a54..eb4ffd4 100644 --- a/Stream.h +++ b/Stream.h @@ -29,6 +29,7 @@ class Stream virtual void mul() = 0; virtual void add() = 0; virtual void triad() = 0; + virtual void nstream() = 0; virtual T dot() = 0; // Copy memory between host and device diff --git a/main.cpp b/main.cpp index fd64546..5b931f7 100644 --- a/main.cpp +++ b/main.cpp @@ -186,7 +186,7 @@ void run() T sum; // List of times - std::vector> timings(5); + std::vector> timings(6); // Declare timers std::chrono::high_resolution_clock::time_point t1, t2; @@ -218,11 +218,17 @@ void run() t2 = std::chrono::high_resolution_clock::now(); timings[3].push_back(std::chrono::duration_cast >(t2 - t1).count()); + // Execute nstream + t1 = std::chrono::high_resolution_clock::now(); + stream->nstream(); + t2 = std::chrono::high_resolution_clock::now(); + timings[4].push_back(std::chrono::duration_cast >(t2 - t1).count()); + // Execute Dot t1 = std::chrono::high_resolution_clock::now(); sum = stream->dot(); t2 = std::chrono::high_resolution_clock::now(); - timings[4].push_back(std::chrono::duration_cast >(t2 - t1).count()); + timings[5].push_back(std::chrono::duration_cast >(t2 - t1).count()); } @@ -262,16 +268,17 @@ void run() - std::string labels[5] = {"Copy", "Mul", "Add", "Triad", "Dot"}; - size_t sizes[5] = { + std::string labels[6] = {"Copy", "Mul", "Add", "Triad", "nstream", "Dot"}; + size_t sizes[6] = { 2 * sizeof(T) * ARRAY_SIZE, 2 * sizeof(T) * ARRAY_SIZE, 3 * sizeof(T) * ARRAY_SIZE, 3 * sizeof(T) * ARRAY_SIZE, + 4 * sizeof(T) * ARRAY_SIZE, 2 * sizeof(T) * ARRAY_SIZE }; - for (int i = 0; i < 5; i++) + for (int i = 0; i < 6; i++) { // Get min/max; ignore the first result auto minmax = std::minmax_element(timings[i].begin()+1, timings[i].end()); @@ -473,6 +480,10 @@ void check_solution(const unsigned int ntimes, std::vector& a, std::vector goldC = goldA + goldB; } goldA = goldB + scalar * goldC; + if (!triad_only) + { + goldA += goldB + scalar * goldC; + } } // Do the reduction From 767df86f1a4f10b02d15d9a2eb6fb5e3694b1104 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 2 Feb 2021 11:24:41 +0000 Subject: [PATCH 03/21] Update README with nstream citations --- README.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/README.md b/README.md index 6177b02..bc1f5d6 100644 --- a/README.md +++ b/README.md @@ -38,6 +38,15 @@ But this information is not typically available in real HPC codes today, where t BabelStream therefore provides a measure of what memory bandwidth performance can be attained (by a particular programming model) if you follow today's best parallel programming best practice. +BabelStream also includes the nstream kernel from the Parallel Research Kernels (PRK) project, available on [GitHub](https://github.com/ParRes/Kernels). +Details about PRK can be found in the following references: + +> Van der Wijngaart, Rob F., and Timothy G. Mattson. The parallel research kernels. IEEE High Performance Extreme Computing Conference (HPEC). IEEE, 2014. + +> R. F. Van der Wijngaart, A. Kayi, J. R. Hammond, G. Jost, T. St. John, S. Sridharan, T. G. Mattson, J. Abercrombie, and J. Nelson. Comparing runtime systems with exascale ambitions using the Parallel Research Kernels. ISC 2016, [DOI: 10.1007/978-3-319-41321-1_17](https://doi.org/10.1007/978-3-319-41321-1_17). + +> Jeff R. Hammond and Timothy G. Mattson. Evaluating data parallelism in C++ using the Parallel Research Kernels. IWOCL 2019, [DOI: 10.1145/3318170.3318192](https://doi.org/10.1145/3318170.3318192). + Website ------- From 5346e1226dcea46ffdc981426a24188da02569eb Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 2 Feb 2021 11:27:54 +0000 Subject: [PATCH 04/21] Update initial array values to ensure dot product works with the nstream kernel --- CHANGELOG.md | 1 + Stream.h | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a1cfe5e..4eeb473 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -23,6 +23,7 @@ All notable changes to this project will be documented in this file. - Cray compiler OpenMP flags updated. - Clang compiler OpenMP flags corrected for NVIDIA target. - Reorder OpenCL objects in class so destructors are called in safe order. +- Initial values updated to support additional kernel. ### Removed - Pre-building of kernels in SYCL version to ensure compatibility with SYCL 1.2.1. diff --git a/Stream.h b/Stream.h index eb4ffd4..dcaa258 100644 --- a/Stream.h +++ b/Stream.h @@ -11,10 +11,10 @@ #include // Array values -#define startA (0.1) -#define startB (0.2) +#define startA (1.0) +#define startB (0.02) #define startC (0.0) -#define startScalar (0.4) +#define startScalar (0.04) template class Stream From 84406024cf87cbf6400a37bfd3bbd5a633e7f722 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 2 Feb 2021 11:28:33 +0000 Subject: [PATCH 05/21] Update CHANGELOG --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4eeb473..0471213 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ All notable changes to this project will be documented in this file. - Kokkos 3 build system (No code changes made). - SYCL build rules for ComputeCpp, DPCPP and HipSYCL. - Support for CUDA Managed Memory and Page Fault memory. +- Added nstream kernel from PRK. ### Changed - Default branch renamed from `master` to `main`. From 4c905e6a8657f1f9a08ecdaa7208eeb15ee8723b Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 2 Feb 2021 11:44:37 +0000 Subject: [PATCH 06/21] Add OpenMP nstream kernel --- OMPStream.cpp | 25 +++++++++++++++++++++++++ OMPStream.h | 1 + 2 files changed, 26 insertions(+) diff --git a/OMPStream.cpp b/OMPStream.cpp index 6b2800d..8063987 100644 --- a/OMPStream.cpp +++ b/OMPStream.cpp @@ -191,6 +191,31 @@ void OMPStream::triad() #endif } +template +void OMPStream::nstream() +{ + const T scalar = startScalar; + +#ifdef OMP_TARGET_GPU + int array_size = this->array_size; + T *a = this->a; + T *b = this->b; + T *c = this->c; + #pragma omp target teams distribute parallel for simd +#else + #pragma omp parallel for +#endif + for (int i = 0; i < array_size; i++) + { + a[i] += b[i] + scalar * c[i]; + } + #if defined(OMP_TARGET_GPU) && defined(_CRAYC) + // If using the Cray compiler, the kernels do not block, so this update forces + // a small copy to ensure blocking so that timing is correct + #pragma omp target update from(a[0:0]) + #endif +} + template T OMPStream::dot() { diff --git a/OMPStream.h b/OMPStream.h index 8c93986..5a5622f 100644 --- a/OMPStream.h +++ b/OMPStream.h @@ -36,6 +36,7 @@ class OMPStream : public Stream virtual void add() override; virtual void mul() override; virtual void triad() override; + virtual void nstream() override; virtual T dot() override; virtual void init_arrays(T initA, T initB, T initC) override; From bda9525b9525db5a2ddf3a370a4754403323ca69 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 2 Feb 2021 12:29:00 +0000 Subject: [PATCH 07/21] Add SYCL 1.2.1 nstream kernel --- SYCLStream.cpp | 17 +++++++++++++++++ SYCLStream.h | 3 +++ 2 files changed, 20 insertions(+) diff --git a/SYCLStream.cpp b/SYCLStream.cpp index 8ab642f..49ad3ac 100644 --- a/SYCLStream.cpp +++ b/SYCLStream.cpp @@ -148,6 +148,23 @@ void SYCLStream::triad() queue->wait(); } +template +void SYCLStream::nstream() +{ + const T scalar = startScalar; + queue->submit([&](handler &cgh) + { + auto ka = d_a->template get_access(cgh); + auto kb = d_b->template get_access(cgh); + auto kc = d_c->template get_access(cgh); + cgh.parallel_for(range<1>{array_size}, [=](id<1> idx) + { + ka[idx] += kb[idx] + scalar * kc[idx]; + }); + }); + queue->wait(); +} + template T SYCLStream::dot() { diff --git a/SYCLStream.h b/SYCLStream.h index df10946..d3988a7 100644 --- a/SYCLStream.h +++ b/SYCLStream.h @@ -22,6 +22,7 @@ namespace sycl_kernels template class mul; template class add; template class triad; + template class nstream; template class dot; } @@ -45,6 +46,7 @@ class SYCLStream : public Stream typedef sycl_kernels::mul mul_kernel; typedef sycl_kernels::add add_kernel; typedef sycl_kernels::triad triad_kernel; + typedef sycl_kernels::nstream nstream_kernel; typedef sycl_kernels::dot dot_kernel; // NDRange configuration for the dot kernel @@ -60,6 +62,7 @@ class SYCLStream : public Stream virtual void add() override; virtual void mul() override; virtual void triad() override; + virtual void nstream() override; virtual T dot() override; virtual void init_arrays(T initA, T initB, T initC) override; From 05e3e5a127320e0a47995931b4a1d86c8cee2a24 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 2 Feb 2021 12:32:33 +0000 Subject: [PATCH 08/21] Add CUDA nstream kernel --- CUDAStream.cu | 17 +++++++++++++++++ CUDAStream.h | 1 + 2 files changed, 18 insertions(+) diff --git a/CUDAStream.cu b/CUDAStream.cu index 32aae49..b467d00 100644 --- a/CUDAStream.cu +++ b/CUDAStream.cu @@ -212,6 +212,23 @@ void CUDAStream::triad() check_error(); } +template +__global__ void nstream_kernel(T * a, const T * b, const T * c) +{ + const T scalar = startScalar; + const int i = blockDim.x * blockIdx.x + threadIdx.x; + a[i] += b[i] + scalar * c[i]; +} + +template +void CUDAStream::nstream() +{ + nstream_kernel<<>>(d_a, d_b, d_c); + check_error(); + cudaDeviceSynchronize(); + check_error(); +} + template __global__ void dot_kernel(const T * a, const T * b, T * sum, int array_size) { diff --git a/CUDAStream.h b/CUDAStream.h index df85802..83b8c66 100644 --- a/CUDAStream.h +++ b/CUDAStream.h @@ -50,6 +50,7 @@ class CUDAStream : public Stream virtual void add() override; virtual void mul() override; virtual void triad() override; + virtual void nstream() override; virtual T dot() override; virtual void init_arrays(T initA, T initB, T initC) override; From 4203ccb017da2dd03aa41aac32332b20bc56418b Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 2 Feb 2021 15:46:53 +0000 Subject: [PATCH 09/21] Add nstream kernel to OpenCL --- OCLStream.cpp | 20 ++++++++++++++++++++ OCLStream.h | 2 ++ 2 files changed, 22 insertions(+) diff --git a/OCLStream.cpp b/OCLStream.cpp index 6c88eda..be88ba9 100644 --- a/OCLStream.cpp +++ b/OCLStream.cpp @@ -61,6 +61,14 @@ std::string kernels{R"CLC( const size_t i = get_global_id(0); a[i] = b[i] + scalar * c[i]; } + kernel void nstream( + global TYPE * restrict a, + global const TYPE * restrict b, + global const TYPE * restrict c) + { + const size_t i = get_global_id(0); + a[i] += b[i] + scalar * c[i]; + } kernel void stream_dot( global const TYPE * restrict a, @@ -157,6 +165,7 @@ OCLStream::OCLStream(const int ARRAY_SIZE, const int device_index) mul_kernel = new cl::KernelFunctor(program, "mul"); add_kernel = new cl::KernelFunctor(program, "add"); triad_kernel = new cl::KernelFunctor(program, "triad"); + nstream_kernel = new cl::KernelFunctor(program, "nstream"); dot_kernel = new cl::KernelFunctor(program, "stream_dot"); array_size = ARRAY_SIZE; @@ -186,6 +195,7 @@ OCLStream::~OCLStream() delete mul_kernel; delete add_kernel; delete triad_kernel; + delete nstream_kernel; delete dot_kernel; devices.clear(); @@ -231,6 +241,16 @@ void OCLStream::triad() queue.finish(); } +template +void OCLStream::nstream() +{ + (*nstream_kernel)( + cl::EnqueueArgs(queue, cl::NDRange(array_size)), + d_a, d_b, d_c + ); + queue.finish(); +} + template T OCLStream::dot() { diff --git a/OCLStream.h b/OCLStream.h index 3085aca..bcdf9ac 100644 --- a/OCLStream.h +++ b/OCLStream.h @@ -47,6 +47,7 @@ class OCLStream : public Stream cl::KernelFunctor * mul_kernel; cl::KernelFunctor *add_kernel; cl::KernelFunctor *triad_kernel; + cl::KernelFunctor *nstream_kernel; cl::KernelFunctor *dot_kernel; // NDRange configuration for the dot kernel @@ -62,6 +63,7 @@ class OCLStream : public Stream virtual void add() override; virtual void mul() override; virtual void triad() override; + virtual void nstream() override; virtual T dot() override; virtual void init_arrays(T initA, T initB, T initC) override; From aefe5af18b610b0c6be22a5a88f92f3adee57bf9 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 2 Feb 2021 15:58:00 +0000 Subject: [PATCH 10/21] Add nstream to Kokkos --- KokkosStream.cpp | 15 +++++++++++++++ KokkosStream.hpp | 1 + 2 files changed, 16 insertions(+) diff --git a/KokkosStream.cpp b/KokkosStream.cpp index 09c1eaf..00efe92 100644 --- a/KokkosStream.cpp +++ b/KokkosStream.cpp @@ -119,6 +119,21 @@ void KokkosStream::triad() Kokkos::fence(); } +template +void KokkosStream::nstream() +{ + Kokkos::View a(*d_a); + Kokkos::View b(*d_b); + Kokkos::View c(*d_c); + + const T scalar = startScalar; + Kokkos::parallel_for(array_size, KOKKOS_LAMBDA (const long index) + { + a[index] += b[index] + scalar*c[index]; + }); + Kokkos::fence(); +} + template T KokkosStream::dot() { diff --git a/KokkosStream.hpp b/KokkosStream.hpp index e88622c..3aa7cf5 100644 --- a/KokkosStream.hpp +++ b/KokkosStream.hpp @@ -41,6 +41,7 @@ class KokkosStream : public Stream virtual void add() override; virtual void mul() override; virtual void triad() override; + virtual void nstream() override; virtual T dot() override; virtual void init_arrays(T initA, T initB, T initC) override; From 210cfb7520eb9f2cae5c0a5a5a720eac21b1d8fb Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 3 Feb 2021 10:14:58 +0000 Subject: [PATCH 11/21] Revert "Update initial array values to ensure dot product works with the nstream kernel" This reverts commit 5346e1226dcea46ffdc981426a24188da02569eb. Conflicts: CHANGELOG.md --- CHANGELOG.md | 1 - Stream.h | 6 +++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 07e7810..42dfed8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -25,7 +25,6 @@ All notable changes to this project will be documented in this file. - Clang compiler OpenMP flags corrected for NVIDIA target. - Reorder OpenCL objects in class so destructors are called in safe order. - Ensure all OpenCL kernels are present in destructor. -- Initial values updated to support additional kernel. ### Removed - Pre-building of kernels in SYCL version to ensure compatibility with SYCL 1.2.1. diff --git a/Stream.h b/Stream.h index dcaa258..eb4ffd4 100644 --- a/Stream.h +++ b/Stream.h @@ -11,10 +11,10 @@ #include // Array values -#define startA (1.0) -#define startB (0.02) +#define startA (0.1) +#define startB (0.2) #define startC (0.0) -#define startScalar (0.04) +#define startScalar (0.4) template class Stream From 579247dc06321b668e82e345f02479f244f135dd Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 3 Feb 2021 10:16:13 +0000 Subject: [PATCH 12/21] Normalise sum result to mitigate errors with large iteration counts --- CHANGELOG.md | 1 + main.cpp | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 42dfed8..7611fd8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -25,6 +25,7 @@ All notable changes to this project will be documented in this file. - Clang compiler OpenMP flags corrected for NVIDIA target. - Reorder OpenCL objects in class so destructors are called in safe order. - Ensure all OpenCL kernels are present in destructor. +- Normalise sum result by expected value to help false negative errors. ### Removed - Pre-building of kernels in SYCL version to ensure compatibility with SYCL 1.2.1. diff --git a/main.cpp b/main.cpp index 5b931f7..302521f 100644 --- a/main.cpp +++ b/main.cpp @@ -496,7 +496,7 @@ void check_solution(const unsigned int ntimes, std::vector& a, std::vector errB /= b.size(); double errC = std::accumulate(c.begin(), c.end(), 0.0, [&](double sum, const T val){ return sum + fabs(val - goldC); }); errC /= c.size(); - double errSum = fabs(sum - goldSum); + double errSum = fabs((sum - goldSum)/goldSum); double epsi = std::numeric_limits::epsilon() * 100.0; From 490af521478771fc598515000142144a1d3a22fa Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 3 Feb 2021 10:54:33 +0000 Subject: [PATCH 13/21] Add nstream to C++ STD version -- untested as compilers not ready --- STD20Stream.cpp | 14 ++++++++++++++ STD20Stream.hpp | 1 + STDStream.cpp | 11 +++++++++++ STDStream.h | 1 + 4 files changed, 27 insertions(+) diff --git a/STD20Stream.cpp b/STD20Stream.cpp index 09f83d4..8290033 100644 --- a/STD20Stream.cpp +++ b/STD20Stream.cpp @@ -94,6 +94,20 @@ void STD20Stream::triad() ); } +template +void STD20Stream::nstream() +{ + const T scalar = startScalar; + + std::for_each_n( + std::execution::par_unseq, + std::views::iota(0).begin(), array_size, + [&] (int i) { + a[i] += b[i] + scalar * c[i]; + } + ); +} + template T STD20Stream::dot() { diff --git a/STD20Stream.hpp b/STD20Stream.hpp index a8a3c4f..e5daa3c 100644 --- a/STD20Stream.hpp +++ b/STD20Stream.hpp @@ -33,6 +33,7 @@ class STD20Stream : public Stream virtual void add() override; virtual void mul() override; virtual void triad() override; + virtual void nstream() override; virtual T dot() override; virtual void init_arrays(T initA, T initB, T initC) override; diff --git a/STDStream.cpp b/STDStream.cpp index cd966ae..30ad420 100644 --- a/STDStream.cpp +++ b/STDStream.cpp @@ -72,6 +72,17 @@ void STDStream::triad() std::transform(exe_policy, b, b+array_size, c, a, [](T bi, T ci){ return bi+startScalar*ci; }); } +template +void STDStream::nstream() +{ + // a[i] += b[i] + scalar * c[i]; + // Need to do in two stages with C++11 STL. + // 1: a[i] += b[i] + // 2: a[i] += scalar * c[i]; + std::transform(exe_policy, a, a+array_size, b, a, [](T ai, T bi){ return ai + bi; }); + std::transform(exe_policy, a, a+array_size, c, a, [](T ai, T ci){ return ai + startScalar*ci; }); +} + template T STDStream::dot() { diff --git a/STDStream.h b/STDStream.h index 538f857..9ff7800 100644 --- a/STDStream.h +++ b/STDStream.h @@ -31,6 +31,7 @@ class STDStream : public Stream virtual void add() override; virtual void mul() override; virtual void triad() override; + virtual void nstream() override; virtual T dot() override; virtual void init_arrays(T initA, T initB, T initC) override; From 44e74b574bd377c23793b3c78e76d366438a1dc9 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 3 Feb 2021 11:21:56 +0000 Subject: [PATCH 14/21] Update initial starting values --- CHANGELOG.md | 1 + Stream.h | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7611fd8..42a06be 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -26,6 +26,7 @@ All notable changes to this project will be documented in this file. - Reorder OpenCL objects in class so destructors are called in safe order. - Ensure all OpenCL kernels are present in destructor. - Normalise sum result by expected value to help false negative errors. +- Update starting values to support new kernel in all models on all devices. ### Removed - Pre-building of kernels in SYCL version to ensure compatibility with SYCL 1.2.1. diff --git a/Stream.h b/Stream.h index eb4ffd4..bb18f1d 100644 --- a/Stream.h +++ b/Stream.h @@ -11,10 +11,10 @@ #include // Array values -#define startA (0.1) -#define startB (0.2) +#define startA (0.001) +#define startB (0.02) #define startC (0.0) -#define startScalar (0.4) +#define startScalar (-0.4) template class Stream From c53b635a3cb1df49dca3e45315515a747d7620ac Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 3 Feb 2021 11:25:26 +0000 Subject: [PATCH 15/21] Add nstream kernel to HIP --- HIPStream.cpp | 17 +++++++++++++++++ HIPStream.h | 1 + 2 files changed, 18 insertions(+) diff --git a/HIPStream.cpp b/HIPStream.cpp index d790ee5..fbc3b71 100644 --- a/HIPStream.cpp +++ b/HIPStream.cpp @@ -182,6 +182,23 @@ void HIPStream::triad() check_error(); } +template +__global__ void nstream_kernel(T * a, const T * b, const T * c) +{ + const T scalar = startScalar; + const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + a[i] += b[i] + scalar * c[i]; +} + +template +void HIPStream::nstream() +{ + hipLaunchKernelGGL(HIP_KERNEL_NAME(nstream_kernel), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_a, d_b, d_c); + check_error(); + hipDeviceSynchronize(); + check_error(); +} + template __global__ void dot_kernel(const T * a, const T * b, T * sum, int array_size) { diff --git a/HIPStream.h b/HIPStream.h index fdab392..44a2893 100644 --- a/HIPStream.h +++ b/HIPStream.h @@ -41,6 +41,7 @@ class HIPStream : public Stream virtual void add() override; virtual void mul() override; virtual void triad() override; + virtual void nstream() override; virtual T dot() override; virtual void init_arrays(T initA, T initB, T initC) override; From f90d911551fb510ec4a77010c9b4327f95dba7bf Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 3 Feb 2021 16:58:47 +0000 Subject: [PATCH 16/21] Add nstream to OpenACC, but it looks like NVHPC is ignoring the += in the kernel so results are wrong --- ACCStream.cpp | 16 ++++++++++++++++ ACCStream.h | 1 + 2 files changed, 17 insertions(+) diff --git a/ACCStream.cpp b/ACCStream.cpp index 664668d..9e0e3e7 100644 --- a/ACCStream.cpp +++ b/ACCStream.cpp @@ -120,6 +120,22 @@ void ACCStream::triad() } } +template +void ACCStream::nstream() +{ + const T scalar = startScalar; + + int array_size = this->array_size; + T * restrict a = this->a; + T * restrict b = this->b; + T * restrict c = this->c; + #pragma acc parallel loop present(a[0:array_size], b[0:array_size], c[0:array_size]) wait + for (int i = 0; i < array_size; i++) + { + a[i] += b[i] + scalar * c[i]; + } +} + template T ACCStream::dot() { diff --git a/ACCStream.h b/ACCStream.h index 3d8695e..4cb9d25 100644 --- a/ACCStream.h +++ b/ACCStream.h @@ -35,6 +35,7 @@ class ACCStream : public Stream virtual void add() override; virtual void mul() override; virtual void triad() override; + virtual void nstream() override; virtual T dot() override; virtual void init_arrays(T initA, T initB, T initC) override; From 94e394845fbbb54cf8f767ed1a8434dd31a3a3d1 Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Wed, 17 Feb 2021 17:17:20 +0000 Subject: [PATCH 17/21] Clean up SYCL.make with unified build target --- SYCL.make | 52 +++++++++++++--------------------------------------- 1 file changed, 13 insertions(+), 39 deletions(-) diff --git a/SYCL.make b/SYCL.make index 0554d67..58df8d0 100644 --- a/SYCL.make +++ b/SYCL.make @@ -33,75 +33,49 @@ Set ARCH to change device (defaulting to ""). endef -ifneq ($(COMPILER), DPCPP) +ifeq ($(COMPILER), HIPSYCL) +ifneq ($(TARGET), CPU) $(info $(arch_help)) ARCH= - +endif endif endif SYCL_COMPUTECPP_SYCLFLAGS = $(shell $(SYCL_SDK_DIR)/bin/computecpp_info --dump-device-compiler-flags) -no-serial-memop -sycl-driver -SYCL_COMPUTECPP_SYCLFLAGS_AMD = $(SYCL_COMPUTECPP_SYCLFLAGS) SYCL_COMPUTECPP_SYCLFLAGS_CPU = $(SYCL_COMPUTECPP_SYCLFLAGS) +SYCL_COMPUTECPP_SYCLFLAGS_AMD = $(SYCL_COMPUTECPP_SYCLFLAGS) SYCL_COMPUTECPP_SYCLFLAGS_NVIDIA = $(SYCL_COMPUTECPP_SYCLFLAGS) -sycl-target ptx64 SYCL_COMPUTECPP_SYCLCXX = $(SYCL_SDK_DIR)/bin/compute++ SYCL_COMPUTECPP_FLAGS = -O3 -std=c++17 SYCL_COMPUTECPP_LINK_FLAGS = -Wl,-rpath=$(SYCL_SDK_DIR)/lib/ $(SYCL_SDK_DIR)/lib/libComputeCpp.so -lOpenCL SYCL_COMPUTECPP_INCLUDE = -I$(SYCL_SDK_DIR)/include -SYCL_COMPUTECPP_CXX = g++ -SYCL_COMPUTECPP_DEPS = SYCLStream.sycl -SYCL_HIPSYCL_SYCLFLAGS_CPU = -O3 --std=c++17 --hipsycl-platform=cpu -SYCL_HIPSYCL_SYCLFLAGS_AMD = -O3 --std=c++17 --hipsycl-platform=rocm --hipsycl-gpu-arch=$(ARCH) -SYCL_HIPSYCL_SYCLFLAGS_NVIDIA = -O3 --std=c++17 --hipsycl-platform=cuda --hipsycl-gpu-arch=$(ARCH) +SYCL_HIPSYCL_SYCLFLAGS_CPU = --hipsycl-platform=cpu +SYCL_HIPSYCL_SYCLFLAGS_AMD = --hipsycl-platform=rocm --hipsycl-gpu-arch=$(ARCH) +SYCL_HIPSYCL_SYCLFLAGS_NVIDIA = --hipsycl-platform=cuda --hipsycl-gpu-arch=$(ARCH) SYCL_HIPSYCL_SYCLCXX = $(SYCL_SDK_DIR)/bin/syclcc -SYCL_HIPSYCL_FLAGS = $(SYCL_HIPSYCL_SYCLFLAGS_$(TARGET)) +SYCL_HIPSYCL_FLAGS = -O3 --std=c++17 SYCL_HIPSYCL_LINK_FLAGS = -L$(SYCL_SDK_DIR)/lib -Wl,-rpath,$(SYCL_SDK_DIR)/lib SYCL_HIPSYCL_INCLUDE = -SYCL_HIPSYCL_CXX = $(SYCL_HIPSYCL_SYCLCXX) -SYCL_HIPSYCL_DEPS = -SYCL_DPCPP_SYCLFLAGS_CPU = -O3 --std=c++17 -SYCL_DPCPP_SYCLFLAGS_NVIDIA = -O3 --std=c++17 -fsycl -fsycl-targets=nvptx64-nvidia-cuda-sycldevice -fsycl-unnamed-lambda +SYCL_DPCPP_SYCLFLAGS_NVIDIA = -fsycl -fsycl-targets=nvptx64-nvidia-cuda-sycldevice -fsycl-unnamed-lambda SYCL_DPCPP_SYCLCXX = dpcpp -SYCL_DPCPP_FLAGS = $(SYCL_DPCPP_SYCLFLAGS_CPU) +SYCL_DPCPP_FLAGS = -O3 --std=c++17 SYCL_DPCPP_LINK_FLAGS = SYCL_DPCPP_INCLUDE = -SYCL_DPCPP_CXX = dpcpp -SYCL_DPCPP_DEPS = SYCL_SYCLFLAGS = $(SYCL_$(COMPILER)_SYCLFLAGS_$(TARGET)) SYCL_SYCLCXX = $(SYCL_$(COMPILER)_SYCLCXX) - SYCL_FLAGS = $(SYCL_$(COMPILER)_FLAGS) SYCL_LINK_FLAGS = $(SYCL_$(COMPILER)_LINK_FLAGS) SYCL_INCLUDE = $(SYCL_$(COMPILER)_INCLUDE) -SYCL_CXX = $(SYCL_$(COMPILER)_CXX) -SYCL_DEPS = $(SYCL_$(COMPILER)_DEPS) - -ifeq ($(COMPILER), COMPUTECPP) +# only ComputeCpp generates .sycl files which is a bit odd to deal with so we opted to compile everything together sycl-stream: main.cpp SYCLStream.cpp - $(SYCL_SYCLCXX) $(SYCL_SYCLFLAGS) $(SYCL_FLAGS) $(SYCL_INCLUDE) -DSYCL $(EXTRA_FLAGS) $(SYCL_LINK_FLAGS) $^ -o $@ - -else - -sycl-stream: main.o SYCLStream.o $(SYCL_DEPS) - $(SYCL_CXX) $(SYCL_FLAGS) -DSYCL main.o SYCLStream.o $(EXTRA_FLAGS) $(SYCL_LINK_FLAGS) -o $@ - -main.o: main.cpp - $(SYCL_CXX) $(SYCL_FLAGS) -DSYCL main.cpp -c $(SYCL_INCLUDE) $(EXTRA_FLAGS) -o $@ - -SYCLStream.o: SYCLStream.cpp $(SYCL_DEPS) - $(SYCL_CXX) $(SYCL_FLAGS) -DSYCL SYCLStream.cpp -c $(SYCL_INCLUDE) $(EXTRA_FLAGS) -o $@ - -SYCLStream.sycl: SYCLStream.cpp - $(SYCL_SYCLCXX) -DSYCL SYCLStream.cpp $(SYCL_SYCLFLAGS) -c $(SYCL_INCLUDE) -o $@ - -endif + $(SYCL_SYCLCXX) $(SYCL_SYCLFLAGS) $(SYCL_FLAGS) $(SYCL_INCLUDE) -DSYCL $(EXTRA_FLAGS) $(SYCL_LINK_FLAGS) $^ -o $@ .PHONY: clean clean: - rm -f sycl-stream SYCLStream.sycl main.o SYCLStream.o + rm -f sycl-stream From 46bbab6ebb8916d475453c4d96f4c7cd1d51ec8d Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 18 Feb 2021 11:06:14 +0000 Subject: [PATCH 18/21] Revert "Update initial starting values" This reverts commit 44e74b574bd377c23793b3c78e76d366438a1dc9. --- CHANGELOG.md | 1 - Stream.h | 6 +++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 42a06be..7611fd8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -26,7 +26,6 @@ All notable changes to this project will be documented in this file. - Reorder OpenCL objects in class so destructors are called in safe order. - Ensure all OpenCL kernels are present in destructor. - Normalise sum result by expected value to help false negative errors. -- Update starting values to support new kernel in all models on all devices. ### Removed - Pre-building of kernels in SYCL version to ensure compatibility with SYCL 1.2.1. diff --git a/Stream.h b/Stream.h index bb18f1d..eb4ffd4 100644 --- a/Stream.h +++ b/Stream.h @@ -11,10 +11,10 @@ #include // Array values -#define startA (0.001) -#define startB (0.02) +#define startA (0.1) +#define startB (0.2) #define startC (0.0) -#define startScalar (-0.4) +#define startScalar (0.4) template class Stream From 487e59c6a95c0b35fe9f349ebb3e4c313a9ea091 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 18 Feb 2021 11:37:49 +0000 Subject: [PATCH 19/21] Don't run nstream in the main benchmark --- main.cpp | 21 +++++---------------- 1 file changed, 5 insertions(+), 16 deletions(-) diff --git a/main.cpp b/main.cpp index 302521f..4c6bde6 100644 --- a/main.cpp +++ b/main.cpp @@ -186,7 +186,7 @@ void run() T sum; // List of times - std::vector> timings(6); + std::vector> timings(5); // Declare timers std::chrono::high_resolution_clock::time_point t1, t2; @@ -218,17 +218,11 @@ void run() t2 = std::chrono::high_resolution_clock::now(); timings[3].push_back(std::chrono::duration_cast >(t2 - t1).count()); - // Execute nstream - t1 = std::chrono::high_resolution_clock::now(); - stream->nstream(); - t2 = std::chrono::high_resolution_clock::now(); - timings[4].push_back(std::chrono::duration_cast >(t2 - t1).count()); - // Execute Dot t1 = std::chrono::high_resolution_clock::now(); sum = stream->dot(); t2 = std::chrono::high_resolution_clock::now(); - timings[5].push_back(std::chrono::duration_cast >(t2 - t1).count()); + timings[4].push_back(std::chrono::duration_cast >(t2 - t1).count()); } @@ -268,17 +262,16 @@ void run() - std::string labels[6] = {"Copy", "Mul", "Add", "Triad", "nstream", "Dot"}; - size_t sizes[6] = { + std::string labels[6] = {"Copy", "Mul", "Add", "Triad", "Dot"}; + size_t sizes[5] = { 2 * sizeof(T) * ARRAY_SIZE, 2 * sizeof(T) * ARRAY_SIZE, 3 * sizeof(T) * ARRAY_SIZE, 3 * sizeof(T) * ARRAY_SIZE, - 4 * sizeof(T) * ARRAY_SIZE, 2 * sizeof(T) * ARRAY_SIZE }; - for (int i = 0; i < 6; i++) + for (int i = 0; i < 5; i++) { // Get min/max; ignore the first result auto minmax = std::minmax_element(timings[i].begin()+1, timings[i].end()); @@ -480,10 +473,6 @@ void check_solution(const unsigned int ntimes, std::vector& a, std::vector goldC = goldA + goldB; } goldA = goldB + scalar * goldC; - if (!triad_only) - { - goldA += goldB + scalar * goldC; - } } // Do the reduction From a6f23ba9005984f2767741cc7bbfa7000b3830f2 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 18 Feb 2021 13:32:35 +0000 Subject: [PATCH 20/21] Add option to run nstream in isolation --- main.cpp | 75 ++++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 59 insertions(+), 16 deletions(-) diff --git a/main.cpp b/main.cpp index 197b8e7..e78d7a1 100644 --- a/main.cpp +++ b/main.cpp @@ -58,13 +58,11 @@ void check_solution(const unsigned int ntimes, std::vector& a, std::vector template void run(); -template -void run_triad(); - // Options for running the benchmark: // - All 5 kernels (Copy, Add, Mul, Triad, Dot). // - Triad only. -enum class Benchmark {All, Triad}; +// - Nstream only. +enum class Benchmark {All, Triad, Nstream}; // Selected run options. Benchmark selection = Benchmark::All; @@ -166,6 +164,27 @@ std::vector> run_triad(Stream *stream) return timings; } +// Run the Nstream kernel +template +std::vector> run_nstream(Stream *stream) +{ + std::vector> timings(1); + + // Declare timers + std::chrono::high_resolution_clock::time_point t1, t2; + + // Run nstream in loop + for (int k = 0; k < num_times; k++) { + t1 = std::chrono::high_resolution_clock::now(); + stream->nstream(); + t2 = std::chrono::high_resolution_clock::now(); + timings[0].push_back(std::chrono::duration_cast >(t2 - t1).count()); + } + + return timings; + +} + // Generic run routine // Runs the kernel(s) and prints output. @@ -275,6 +294,10 @@ void run() break; case Benchmark::Triad: timings = run_triad(stream); + break; + case Benchmark::Nstream: + timings = run_nstream(stream); + break; }; // Check solutions @@ -313,17 +336,26 @@ void run() } - if (selection == Benchmark::All) + if (selection == Benchmark::All || selection == Benchmark::Nstream) { - std::string labels[5] = {"Copy", "Mul", "Add", "Triad", "Dot"}; - size_t sizes[5] = { - 2 * sizeof(T) * ARRAY_SIZE, - 2 * sizeof(T) * ARRAY_SIZE, - 3 * sizeof(T) * ARRAY_SIZE, - 3 * sizeof(T) * ARRAY_SIZE, - 2 * sizeof(T) * ARRAY_SIZE - }; + std::vector labels; + std::vector sizes; + + if (selection == Benchmark::All) + { + labels = {"Copy", "Mul", "Add", "Triad", "Dot"}; + sizes = { + 2 * sizeof(T) * ARRAY_SIZE, + 2 * sizeof(T) * ARRAY_SIZE, + 3 * sizeof(T) * ARRAY_SIZE, + 3 * sizeof(T) * ARRAY_SIZE, + 2 * sizeof(T) * ARRAY_SIZE}; + } else if (selection == Benchmark::Nstream) + { + labels = {"Nstream"}; + sizes = {4 * sizeof(T) * ARRAY_SIZE }; + } for (int i = 0; i < timings.size(); ++i) { @@ -416,13 +448,19 @@ void check_solution(const unsigned int ntimes, std::vector& a, std::vector for (unsigned int i = 0; i < ntimes; i++) { // Do STREAM! - if (! (selection == Benchmark::Triad)) + if (selection == Benchmark::All) { goldC = goldA; goldB = scalar * goldC; goldC = goldA + goldB; + goldA = goldB + scalar * goldC; + } else if (selection == Benchmark::Triad) + { + goldA = goldB + scalar * goldC; + } else if (selection == Benchmark::Nstream) + { + goldA += goldB + scalar * goldC; } - goldA = goldB + scalar * goldC; } // Do the reduction @@ -452,7 +490,7 @@ void check_solution(const unsigned int ntimes, std::vector& a, std::vector << "Validation failed on c[]. Average error " << errC << std::endl; // Check sum to 8 decimal places - if (!(selection == Benchmark::Triad) && errSum > 1.0E-8) + if (selection == Benchmark::All && errSum > 1.0E-8) std::cerr << "Validation failed on sum. Error " << errSum << std::endl << std::setprecision(15) @@ -523,6 +561,10 @@ void parseArguments(int argc, char *argv[]) { selection = Benchmark::Triad; } + else if (!std::string("--nstream-only").compare(argv[i])) + { + selection = Benchmark::Nstream; + } else if (!std::string("--csv").compare(argv[i])) { output_as_csv = true; @@ -544,6 +586,7 @@ void parseArguments(int argc, char *argv[]) std::cout << " -n --numtimes NUM Run the test NUM times (NUM >= 2)" << std::endl; std::cout << " --float Use floats (rather than doubles)" << std::endl; std::cout << " --triad-only Only run triad" << std::endl; + std::cout << " --nstream-only Only run nstream" << std::endl; std::cout << " --csv Output as csv table" << std::endl; std::cout << " --mibibytes Use MiB=2^20 for bandwidth calculation (default MB=10^6)" << std::endl; std::cout << std::endl; From 13c9e0c1c76f0cef8d9b7e00e5608f8ab9c0ac43 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 18 Feb 2021 13:48:45 +0000 Subject: [PATCH 21/21] update changelog --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4ec5c26..8256774 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,7 +11,7 @@ All notable changes to this project will be documented in this file. - Kokkos 3 build system (No code changes made). - SYCL build rules for ComputeCpp, DPCPP and HipSYCL. - Support for CUDA Managed Memory and Page Fault memory. -- Added nstream kernel from PRK. +- Added nstream kernel from PRK with associate command line option. ### Changed - Default branch renamed from `master` to `main`.