From 829aa15da0260c833b03162effe3d84532f58b8a Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Mon, 7 Dec 2020 10:35:27 +0000 Subject: [PATCH] Allocate driver solution check vectors *after* the main computation Each Stream implementation owns its own data, so the driver code shouldn't allocate large arrays of its own just beforehand. On processors with strong NUMA effects and smaller memory capacities per NUMA domain, these checking vectors can result in the main arrays being allocated in the wrong NUMA domain. The fix is simply to delay the driver allocation until after the computation has finished, when we want to check the answers. This commit only changes the driver; each model will be updated in subsequent commits. Fixes #80. --- CHANGELOG.md | 1 + main.cpp | 27 ++++++++++++++------------- 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3495f2f..52949de 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ All notable changes to this project will be documented in this file. ### Changed - Default branch renamed from `master` to `main`. +- Driver now delays allocating large checking vectors until after computation has finished. - Use cl::sycl::id parameters instead of cl::sycl::item. - Update local copy of OpenCL C++ header file. - Ensure correct SYCL queue constructor with explicit async_handler. 
diff --git a/main.cpp b/main.cpp index d1031cf..6d2f679 100644 --- a/main.cpp +++ b/main.cpp @@ -130,14 +130,6 @@ void run() } - // Create host vectors - std::vector<T> a(ARRAY_SIZE); - std::vector<T> b(ARRAY_SIZE); - std::vector<T> c(ARRAY_SIZE); - - // Result of the Dot kernel - T sum; - Stream *stream; #if defined(CUDA) @@ -184,6 +176,9 @@ void run() stream->init_arrays(startA, startB, startC); + // Result of the Dot kernel + T sum; + // List of times std::vector<std::vector<double>> timings(5); @@ -226,6 +221,11 @@ void run() } // Check solutions + // Create host vectors + std::vector<T> a(ARRAY_SIZE); + std::vector<T> b(ARRAY_SIZE); + std::vector<T> c(ARRAY_SIZE); + stream->read_arrays(a, b, c); check_solution(num_times, a, b, c, sum); @@ -338,11 +338,6 @@ void run_triad() std::cout.precision(ss); } - // Create host vectors - std::vector<T> a(ARRAY_SIZE); - std::vector<T> b(ARRAY_SIZE); - std::vector<T> c(ARRAY_SIZE); - Stream *stream; #if defined(CUDA) @@ -399,7 +394,13 @@ void run_triad() double runtime = std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count(); // Check solutions + // Create host vectors + std::vector<T> a(ARRAY_SIZE); + std::vector<T> b(ARRAY_SIZE); + std::vector<T> c(ARRAY_SIZE); + T sum = 0.0; + stream->read_arrays(a, b, c); check_solution(num_times, a, b, c, sum);