From 435a104f6e5f93b3c2465a63508af21efade891f Mon Sep 17 00:00:00 2001
From: Tom Deakin <tom.deakin@bristol.ac.uk>
Date: Tue, 12 Jan 2021 15:30:41 +0000
Subject: [PATCH 1/5] Check input array size is positive

---
 main.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/main.cpp b/main.cpp
index 23a4699..fd64546 100644
--- a/main.cpp
+++ b/main.cpp
@@ -545,7 +545,7 @@ void parseArguments(int argc, char *argv[])
     else if (!std::string("--arraysize").compare(argv[i]) ||
              !std::string("-s").compare(argv[i]))
     {
-      if (++i >= argc || !parseInt(argv[i], &ARRAY_SIZE))
+      if (++i >= argc || !parseInt(argv[i], &ARRAY_SIZE) || ARRAY_SIZE <= 0)
       {
         std::cerr << "Invalid array size." << std::endl;
         exit(EXIT_FAILURE);

From 1e94a41f3c9f9abbb5c7666cfe6381c21eb37e5e Mon Sep 17 00:00:00 2001
From: Tom Deakin <tom.deakin@bristol.ac.uk>
Date: Mon, 1 Feb 2021 17:41:30 +0000
Subject: [PATCH 2/5] Add nstream kernel from PRK

PRK has an nstream kernel, which is Triad with a += update.
This means there are 3 reads and a write, which is a higher
read/write ratio than Triad. In addition, non-temporal stores
for the write on CPUs will not be beneficial, so compilers should
take care to emit them for the other kernels, but not for this one.
---
 Stream.h |  1 +
 main.cpp | 21 ++++++++++++++++-----
 2 files changed, 17 insertions(+), 5 deletions(-)

diff --git a/Stream.h b/Stream.h
index ff00a54..eb4ffd4 100644
--- a/Stream.h
+++ b/Stream.h
@@ -29,6 +29,7 @@ class Stream
     virtual void mul() = 0;
     virtual void add() = 0;
     virtual void triad() = 0;
+    virtual void nstream() = 0;
     virtual T dot() = 0;
 
     // Copy memory between host and device
diff --git a/main.cpp b/main.cpp
index fd64546..5b931f7 100644
--- a/main.cpp
+++ b/main.cpp
@@ -186,7 +186,7 @@ void run()
   T sum;
 
   // List of times
-  std::vector<std::vector<double>> timings(5);
+  std::vector<std::vector<double>> timings(6);
 
   // Declare timers
   std::chrono::high_resolution_clock::time_point t1, t2;
@@ -218,11 +218,17 @@ void run()
     t2 = std::chrono::high_resolution_clock::now();
     timings[3].push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
 
+    // Execute nstream
+    t1 = std::chrono::high_resolution_clock::now();
+    stream->nstream();
+    t2 = std::chrono::high_resolution_clock::now();
+    timings[4].push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
+
     // Execute Dot
     t1 = std::chrono::high_resolution_clock::now();
     sum = stream->dot();
     t2 = std::chrono::high_resolution_clock::now();
-    timings[4].push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
+    timings[5].push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
 
   }
 
@@ -262,16 +268,17 @@ void run()
 
 
 
-  std::string labels[5] = {"Copy", "Mul", "Add", "Triad", "Dot"};
-  size_t sizes[5] = {
+  std::string labels[6] = {"Copy", "Mul", "Add", "Triad", "nstream", "Dot"};
+  size_t sizes[6] = {
     2 * sizeof(T) * ARRAY_SIZE,
     2 * sizeof(T) * ARRAY_SIZE,
     3 * sizeof(T) * ARRAY_SIZE,
     3 * sizeof(T) * ARRAY_SIZE,
+    4 * sizeof(T) * ARRAY_SIZE,
     2 * sizeof(T) * ARRAY_SIZE
   };
 
-  for (int i = 0; i < 5; i++)
+  for (int i = 0; i < 6; i++)
   {
     // Get min/max; ignore the first result
     auto minmax = std::minmax_element(timings[i].begin()+1, timings[i].end());
@@ -473,6 +480,10 @@ void check_solution(const unsigned int ntimes, std::vector<T>& a, std::vector<T>
       goldC = goldA + goldB;
     }
     goldA = goldB + scalar * goldC;
+    if (!triad_only)
+    {
+      goldA += goldB + scalar * goldC;
+    }
   }
 
   // Do the reduction

From cb0c345ad5e27fab90d6d75ee82f9a8fcc7cfa1c Mon Sep 17 00:00:00 2001
From: Tom Deakin <tom.deakin@bristol.ac.uk>
Date: Tue, 2 Feb 2021 11:24:41 +0000
Subject: [PATCH 3/5] Update README with nstream citations

---
 README.md | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/README.md b/README.md
index 6177b02..bc1f5d6 100644
--- a/README.md
+++ b/README.md
@@ -38,6 +38,15 @@ But this information is not typically available in real HPC codes today, where t
 
 BabelStream therefore provides a measure of what memory bandwidth performance can be attained (by a particular programming model) if you follow today's best parallel programming best practice.
 
+BabelStream also includes the nstream kernel from the Parallel Research Kernels (PRK) project, available on [GitHub](https://github.com/ParRes/Kernels).
+Details about PRK can be found in the following references:
+
+> Van der Wijngaart, Rob F., and Timothy G. Mattson. The parallel research kernels. IEEE High Performance Extreme Computing Conference (HPEC). IEEE, 2014.
+
+> R. F. Van der Wijngaart, A. Kayi, J. R. Hammond, G. Jost, T. St. John, S. Sridharan, T. G. Mattson, J. Abercrombie, and J. Nelson. Comparing runtime systems with exascale ambitions using the Parallel Research Kernels. ISC 2016, [DOI: 10.1007/978-3-319-41321-1_17](https://doi.org/10.1007/978-3-319-41321-1_17).
+
+> Jeff R. Hammond and Timothy G. Mattson. Evaluating data parallelism in C++ using the Parallel Research Kernels. IWOCL 2019, [DOI: 10.1145/3318170.3318192](https://doi.org/10.1145/3318170.3318192).
+
 
 Website
 -------

From 877f820282dc40511a61f17e61847315abfa1927 Mon Sep 17 00:00:00 2001
From: Tom Deakin <tom.deakin@bristol.ac.uk>
Date: Tue, 2 Feb 2021 11:25:14 +0000
Subject: [PATCH 4/5] Revert "Update README with nstream citations"

This reverts commit cb0c345ad5e27fab90d6d75ee82f9a8fcc7cfa1c.
---
 README.md | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/README.md b/README.md
index bc1f5d6..6177b02 100644
--- a/README.md
+++ b/README.md
@@ -38,15 +38,6 @@ But this information is not typically available in real HPC codes today, where t
 
 BabelStream therefore provides a measure of what memory bandwidth performance can be attained (by a particular programming model) if you follow today's best parallel programming best practice.
 
-BabelStream also includes the nstream kernel from the Parallel Research Kernels (PRK) project, available on [GitHub](https://github.com/ParRes/Kernels).
-Details about PRK can be found in the following references:
-
-> Van der Wijngaart, Rob F., and Timothy G. Mattson. The parallel research kernels. IEEE High Performance Extreme Computing Conference (HPEC). IEEE, 2014.
-
-> R. F. Van der Wijngaart, A. Kayi, J. R. Hammond, G. Jost, T. St. John, S. Sridharan, T. G. Mattson, J. Abercrombie, and J. Nelson. Comparing runtime systems with exascale ambitions using the Parallel Research Kernels. ISC 2016, [DOI: 10.1007/978-3-319-41321-1_17](https://doi.org/10.1007/978-3-319-41321-1_17).
-
-> Jeff R. Hammond and Timothy G. Mattson. Evaluating data parallelism in C++ using the Parallel Research Kernels. IWOCL 2019, [DOI: 10.1145/3318170.3318192](https://doi.org/10.1145/3318170.3318192).
-
 
 Website
 -------

From f99f8d35d92fc565d9e704a44ddd50fa787f18b8 Mon Sep 17 00:00:00 2001
From: Tom Deakin <tom.deakin@bristol.ac.uk>
Date: Tue, 2 Feb 2021 11:25:27 +0000
Subject: [PATCH 5/5] Revert "Add nstream kernel from PRK"

This reverts commit 1e94a41f3c9f9abbb5c7666cfe6381c21eb37e5e.
---
 Stream.h |  1 -
 main.cpp | 21 +++++----------------
 2 files changed, 5 insertions(+), 17 deletions(-)

diff --git a/Stream.h b/Stream.h
index eb4ffd4..ff00a54 100644
--- a/Stream.h
+++ b/Stream.h
@@ -29,7 +29,6 @@ class Stream
     virtual void mul() = 0;
     virtual void add() = 0;
     virtual void triad() = 0;
-    virtual void nstream() = 0;
     virtual T dot() = 0;
 
     // Copy memory between host and device
diff --git a/main.cpp b/main.cpp
index 5b931f7..fd64546 100644
--- a/main.cpp
+++ b/main.cpp
@@ -186,7 +186,7 @@ void run()
   T sum;
 
   // List of times
-  std::vector<std::vector<double>> timings(6);
+  std::vector<std::vector<double>> timings(5);
 
   // Declare timers
   std::chrono::high_resolution_clock::time_point t1, t2;
@@ -218,17 +218,11 @@ void run()
     t2 = std::chrono::high_resolution_clock::now();
     timings[3].push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
 
-    // Execute nstream
-    t1 = std::chrono::high_resolution_clock::now();
-    stream->nstream();
-    t2 = std::chrono::high_resolution_clock::now();
-    timings[4].push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
-
     // Execute Dot
     t1 = std::chrono::high_resolution_clock::now();
     sum = stream->dot();
     t2 = std::chrono::high_resolution_clock::now();
-    timings[5].push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
+    timings[4].push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
 
   }
 
@@ -268,17 +262,16 @@ void run()
 
 
 
-  std::string labels[6] = {"Copy", "Mul", "Add", "Triad", "nstream", "Dot"};
-  size_t sizes[6] = {
+  std::string labels[5] = {"Copy", "Mul", "Add", "Triad", "Dot"};
+  size_t sizes[5] = {
     2 * sizeof(T) * ARRAY_SIZE,
     2 * sizeof(T) * ARRAY_SIZE,
     3 * sizeof(T) * ARRAY_SIZE,
     3 * sizeof(T) * ARRAY_SIZE,
-    4 * sizeof(T) * ARRAY_SIZE,
     2 * sizeof(T) * ARRAY_SIZE
   };
 
-  for (int i = 0; i < 6; i++)
+  for (int i = 0; i < 5; i++)
   {
     // Get min/max; ignore the first result
     auto minmax = std::minmax_element(timings[i].begin()+1, timings[i].end());
@@ -480,10 +473,6 @@ void check_solution(const unsigned int ntimes, std::vector<T>& a, std::vector<T>
       goldC = goldA + goldB;
     }
     goldA = goldB + scalar * goldC;
-    if (!triad_only)
-    {
-      goldA += goldB + scalar * goldC;
-    }
   }
 
   // Do the reduction