Add nstream kernel from PRK

PRK has an nstream kernel, which is Triad with a += update.
This means there are 3 reads and a write, which is a higher
read/write ratio. In addition, non-temporal stores for the
write on CPUs will not be beneficial here, so compilers should
take care to emit them for the other kernels, but not for nstream.
This commit is contained in:
Tom Deakin 2021-02-01 17:41:30 +00:00
parent 435a104f6e
commit 1e94a41f3c
2 changed files with 17 additions and 5 deletions

View File

@ -29,6 +29,7 @@ class Stream
virtual void mul() = 0;
virtual void add() = 0;
virtual void triad() = 0;
virtual void nstream() = 0;
virtual T dot() = 0;
// Copy memory between host and device

View File

@ -186,7 +186,7 @@ void run()
T sum;
// List of times
std::vector<std::vector<double>> timings(5);
std::vector<std::vector<double>> timings(6);
// Declare timers
std::chrono::high_resolution_clock::time_point t1, t2;
@ -218,11 +218,17 @@ void run()
t2 = std::chrono::high_resolution_clock::now();
timings[3].push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
// Execute nstream
t1 = std::chrono::high_resolution_clock::now();
stream->nstream();
t2 = std::chrono::high_resolution_clock::now();
timings[4].push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
// Execute Dot
t1 = std::chrono::high_resolution_clock::now();
sum = stream->dot();
t2 = std::chrono::high_resolution_clock::now();
timings[4].push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
timings[5].push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
}
@ -262,16 +268,17 @@ void run()
std::string labels[5] = {"Copy", "Mul", "Add", "Triad", "Dot"};
size_t sizes[5] = {
std::string labels[6] = {"Copy", "Mul", "Add", "Triad", "nstream", "Dot"};
size_t sizes[6] = {
2 * sizeof(T) * ARRAY_SIZE,
2 * sizeof(T) * ARRAY_SIZE,
3 * sizeof(T) * ARRAY_SIZE,
3 * sizeof(T) * ARRAY_SIZE,
4 * sizeof(T) * ARRAY_SIZE,
2 * sizeof(T) * ARRAY_SIZE
};
for (int i = 0; i < 5; i++)
for (int i = 0; i < 6; i++)
{
// Get min/max; ignore the first result
auto minmax = std::minmax_element(timings[i].begin()+1, timings[i].end());
@ -473,6 +480,10 @@ void check_solution(const unsigned int ntimes, std::vector<T>& a, std::vector<T>
goldC = goldA + goldB;
}
goldA = goldB + scalar * goldC;
if (!triad_only)
{
goldA += goldB + scalar * goldC;
}
}
// Do the reduction