From 829aa15da0260c833b03162effe3d84532f58b8a Mon Sep 17 00:00:00 2001
From: Tom Deakin <tom.deakin@bristol.ac.uk>
Date: Mon, 7 Dec 2020 10:35:27 +0000
Subject: [PATCH] Allocate driver solution check vectors *after* the main
 computation

Each Stream implementation owns its own data, so the driver code
shouldn't allocate large arrays of its own just before the
implementation allocates its data. On processors with strong NUMA
effects and smaller memory capacities per NUMA domain, these checking
vectors can result in the main arrays being allocated in the wrong
NUMA domain.

The fix is simply to delay the driver allocation until after the
computation has finished and we want to check the answers.

This commit only changes the driver; each model will be updated
in subsequent commits.

Fixes #80.
---
 CHANGELOG.md |  1 +
 main.cpp     | 27 ++++++++++++++-------------
 2 files changed, 15 insertions(+), 13 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 3495f2f..52949de 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -12,6 +12,7 @@ All notable changes to this project will be documented in this file.
 
 ### Changed
 - Default branch renamed from `master` to `main`.
+- Driver now delays allocating large checking vectors until after computation has finished.
 - Use cl::sycl::id parameters instead of cl::sycl::item.
 - Update local copy of OpenCL C++ header file.
 - Ensure correct SYCL queue constructor with explicit async_handler.
diff --git a/main.cpp b/main.cpp
index d1031cf..6d2f679 100644
--- a/main.cpp
+++ b/main.cpp
@@ -130,14 +130,6 @@ void run()
 
   }
 
-  // Create host vectors
-  std::vector<T> a(ARRAY_SIZE);
-  std::vector<T> b(ARRAY_SIZE);
-  std::vector<T> c(ARRAY_SIZE);
-
-  // Result of the Dot kernel
-  T sum;
-
   Stream<T> *stream;
 
 #if defined(CUDA)
@@ -184,6 +176,9 @@ void run()
 
   stream->init_arrays(startA, startB, startC);
 
+  // Result of the Dot kernel
+  T sum;
+
   // List of times
   std::vector<std::vector<double>> timings(5);
 
@@ -226,6 +221,11 @@ void run()
   }
 
   // Check solutions
+  // Create host vectors
+  std::vector<T> a(ARRAY_SIZE);
+  std::vector<T> b(ARRAY_SIZE);
+  std::vector<T> c(ARRAY_SIZE);
+
   stream->read_arrays(a, b, c);
   check_solution<T>(num_times, a, b, c, sum);
 
@@ -338,11 +338,6 @@ void run_triad()
     std::cout.precision(ss);
   }
 
-  // Create host vectors
-  std::vector<T> a(ARRAY_SIZE);
-  std::vector<T> b(ARRAY_SIZE);
-  std::vector<T> c(ARRAY_SIZE);
-
   Stream<T> *stream;
 
 #if defined(CUDA)
@@ -399,7 +394,13 @@ void run_triad()
   double runtime = std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count();
 
   // Check solutions
+  // Create host vectors
+  std::vector<T> a(ARRAY_SIZE);
+  std::vector<T> b(ARRAY_SIZE);
+  std::vector<T> c(ARRAY_SIZE);
+
   T sum = 0.0;
+
   stream->read_arrays(a, b, c);
   check_solution<T>(num_times, a, b, c, sum);