Add nstream kernel from PRK

The Parallel Research Kernels (PRK) include an nstream kernel, which is
Triad with a += update: a[i] += b[i] + scalar * c[i].
This means there are 3 reads and a write, which is a higher
read/write ratio than the other kernels. In addition, non-temporal
stores for the write on CPUs will not be beneficial here, because the
destination array is also read, and so compilers should take care to
emit them for the other kernels, but not for nstream.
This commit is contained in:
Tom Deakin 2021-02-01 17:41:30 +00:00
parent f99f8d35d9
commit bd04e6db3c
2 changed files with 17 additions and 5 deletions

View File

@ -29,6 +29,7 @@ class Stream
virtual void mul() = 0; virtual void mul() = 0;
virtual void add() = 0; virtual void add() = 0;
virtual void triad() = 0; virtual void triad() = 0;
virtual void nstream() = 0;
virtual T dot() = 0; virtual T dot() = 0;
// Copy memory between host and device // Copy memory between host and device

View File

@ -186,7 +186,7 @@ void run()
T sum; T sum;
// List of times // List of times
std::vector<std::vector<double>> timings(5); std::vector<std::vector<double>> timings(6);
// Declare timers // Declare timers
std::chrono::high_resolution_clock::time_point t1, t2; std::chrono::high_resolution_clock::time_point t1, t2;
@ -218,11 +218,17 @@ void run()
t2 = std::chrono::high_resolution_clock::now(); t2 = std::chrono::high_resolution_clock::now();
timings[3].push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count()); timings[3].push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
// Execute nstream
t1 = std::chrono::high_resolution_clock::now();
stream->nstream();
t2 = std::chrono::high_resolution_clock::now();
timings[4].push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
// Execute Dot // Execute Dot
t1 = std::chrono::high_resolution_clock::now(); t1 = std::chrono::high_resolution_clock::now();
sum = stream->dot(); sum = stream->dot();
t2 = std::chrono::high_resolution_clock::now(); t2 = std::chrono::high_resolution_clock::now();
timings[4].push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count()); timings[5].push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
} }
@ -262,16 +268,17 @@ void run()
std::string labels[5] = {"Copy", "Mul", "Add", "Triad", "Dot"}; std::string labels[6] = {"Copy", "Mul", "Add", "Triad", "nstream", "Dot"};
size_t sizes[5] = { size_t sizes[6] = {
2 * sizeof(T) * ARRAY_SIZE, 2 * sizeof(T) * ARRAY_SIZE,
2 * sizeof(T) * ARRAY_SIZE, 2 * sizeof(T) * ARRAY_SIZE,
3 * sizeof(T) * ARRAY_SIZE, 3 * sizeof(T) * ARRAY_SIZE,
3 * sizeof(T) * ARRAY_SIZE, 3 * sizeof(T) * ARRAY_SIZE,
4 * sizeof(T) * ARRAY_SIZE,
2 * sizeof(T) * ARRAY_SIZE 2 * sizeof(T) * ARRAY_SIZE
}; };
for (int i = 0; i < 5; i++) for (int i = 0; i < 6; i++)
{ {
// Get min/max; ignore the first result // Get min/max; ignore the first result
auto minmax = std::minmax_element(timings[i].begin()+1, timings[i].end()); auto minmax = std::minmax_element(timings[i].begin()+1, timings[i].end());
@ -473,6 +480,10 @@ void check_solution(const unsigned int ntimes, std::vector<T>& a, std::vector<T>
goldC = goldA + goldB; goldC = goldA + goldB;
} }
goldA = goldB + scalar * goldC; goldA = goldB + scalar * goldC;
if (!triad_only)
{
goldA += goldB + scalar * goldC;
}
} }
// Do the reduction // Do the reduction