From 1e94a41f3c9f9abbb5c7666cfe6381c21eb37e5e Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Mon, 1 Feb 2021 17:41:30 +0000 Subject: [PATCH] Add nstream kernel from PRK PRK has an nstream kernel, which is Triad with a += update. This means there are 3 reads and a write, which is a higher read/write ratio. In addition, non-temporal stores for the write on CPUs will not be beneficial, and so compilers should take care to emit them for the other kernels, but not for this one. --- Stream.h | 1 + main.cpp | 21 ++++++++++++++++----- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/Stream.h b/Stream.h index ff00a54..eb4ffd4 100644 --- a/Stream.h +++ b/Stream.h @@ -29,6 +29,7 @@ class Stream virtual void mul() = 0; virtual void add() = 0; virtual void triad() = 0; + virtual void nstream() = 0; virtual T dot() = 0; // Copy memory between host and device diff --git a/main.cpp b/main.cpp index fd64546..5b931f7 100644 --- a/main.cpp +++ b/main.cpp @@ -186,7 +186,7 @@ void run() T sum; // List of times - std::vector> timings(5); + std::vector> timings(6); // Declare timers std::chrono::high_resolution_clock::time_point t1, t2; @@ -218,11 +218,17 @@ void run() t2 = std::chrono::high_resolution_clock::now(); timings[3].push_back(std::chrono::duration_cast >(t2 - t1).count()); + // Execute nstream + t1 = std::chrono::high_resolution_clock::now(); + stream->nstream(); + t2 = std::chrono::high_resolution_clock::now(); + timings[4].push_back(std::chrono::duration_cast >(t2 - t1).count()); + // Execute Dot t1 = std::chrono::high_resolution_clock::now(); sum = stream->dot(); t2 = std::chrono::high_resolution_clock::now(); - timings[4].push_back(std::chrono::duration_cast >(t2 - t1).count()); + timings[5].push_back(std::chrono::duration_cast >(t2 - t1).count()); } @@ -262,16 +268,17 @@ void run() - std::string labels[5] = {"Copy", "Mul", "Add", "Triad", "Dot"}; - size_t sizes[5] = { + std::string labels[6] = {"Copy", "Mul", "Add", "Triad", "nstream", "Dot"}; + size_t sizes[6] = 
{ 2 * sizeof(T) * ARRAY_SIZE, 2 * sizeof(T) * ARRAY_SIZE, 3 * sizeof(T) * ARRAY_SIZE, 3 * sizeof(T) * ARRAY_SIZE, + 4 * sizeof(T) * ARRAY_SIZE, 2 * sizeof(T) * ARRAY_SIZE }; - for (int i = 0; i < 5; i++) + for (int i = 0; i < 6; i++) { // Get min/max; ignore the first result auto minmax = std::minmax_element(timings[i].begin()+1, timings[i].end()); @@ -473,6 +480,10 @@ void check_solution(const unsigned int ntimes, std::vector& a, std::vector goldC = goldA + goldB; } goldA = goldB + scalar * goldC; + if (!triad_only) + { + goldA += goldB + scalar * goldC; + } } // Do the reduction