Allocate driver solution check vectors *after* the main computation

Each Stream implementation owns its own data, so the driver code shouldn't allocate large arrays of its own just before the Stream is constructed. On processors with strong NUMA effects and smaller memory capacities per NUMA domain, these checking vectors can result in the main arrays being allocated in the wrong NUMA domain. The fix is simply to defer the driver allocation until after the computation has finished and we want to check the answers. This commit only changes the driver; each model will be updated in subsequent commits. Fixes #80.
2020-12-07 10:35:27 +00:00 · 2020-12-07 10:35:27 +00:00 · 829aa15da0
commit 829aa15da0
parent f373927ce8
2 changed files with 15 additions and 13 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -12,6 +12,7 @@ All notable changes to this project will be documented in this file.
 ### Changed
 - Default branch renamed from `master` to `main`.
 - Driver now delays allocating large checking vectors until after computation has finished.
 - Use cl::sycl::id parameters instead of cl::sycl::item.
 - Update local copy of OpenCL C++ header file.
 - Ensure correct SYCL queue constructor with explicit async_handler.
--- a/main.cpp
+++ b/main.cpp
@ -130,14 +130,6 @@ void run()
  }
  // Create host vectors
  std::vector<T> a(ARRAY_SIZE);
  std::vector<T> b(ARRAY_SIZE);
  std::vector<T> c(ARRAY_SIZE);
  // Result of the Dot kernel
  T sum;
  Stream<T> *stream;
 #if defined(CUDA)
@ -184,6 +176,9 @@ void run()
  stream->init_arrays(startA, startB, startC);
  // Result of the Dot kernel
  T sum;
  // List of times
  std::vector<std::vector<double>> timings(5);
@ -226,6 +221,11 @@ void run()
  }
  // Check solutions
  // Create host vectors
  std::vector<T> a(ARRAY_SIZE);
  std::vector<T> b(ARRAY_SIZE);
  std::vector<T> c(ARRAY_SIZE);
  stream->read_arrays(a, b, c);
  check_solution<T>(num_times, a, b, c, sum);
@ -338,11 +338,6 @@ void run_triad()
    std::cout.precision(ss);
  }
  // Create host vectors
  std::vector<T> a(ARRAY_SIZE);
  std::vector<T> b(ARRAY_SIZE);
  std::vector<T> c(ARRAY_SIZE);
  Stream<T> *stream;
 #if defined(CUDA)
@ -399,7 +394,13 @@ void run_triad()
  double runtime = std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count();
  // Check solutions
  // Create host vectors
  std::vector<T> a(ARRAY_SIZE);
  std::vector<T> b(ARRAY_SIZE);
  std::vector<T> c(ARRAY_SIZE);
  T sum = 0.0;
  stream->read_arrays(a, b, c);
  check_solution<T>(num_times, a, b, c, sum);