397 lines
11 KiB
C++
397 lines
11 KiB
C++
|
|
// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith,
|
|
// University of Bristol HPC
|
|
//
|
|
// For full license terms please see the LICENSE file distributed with this
|
|
// source code
|
|
|
|
#include <algorithm>
#include <cerrno>
#include <chrono>
#include <cmath>
#include <cstdlib>
#include <cstring>
#include <iomanip>
#include <iostream>
#include <limits>
#include <memory>
#include <numeric>
#include <string>
#include <vector>
|
|
|
|
#define VERSION_STRING "3.2"
|
|
|
|
#include "Stream.h"
|
|
|
|
#if defined(CUDA)
|
|
#include "CUDAStream.h"
|
|
#elif defined(HIP)
|
|
#include "HIPStream.h"
|
|
#elif defined(OCL)
|
|
#include "OCLStream.h"
|
|
#elif defined(USE_RAJA)
|
|
#include "RAJAStream.hpp"
|
|
#elif defined(KOKKOS)
|
|
#include "KOKKOSStream.hpp"
|
|
#elif defined(ACC)
|
|
#include "ACCStream.h"
|
|
#elif defined(SYCL)
|
|
#include "SYCLStream.h"
|
|
#elif defined(OMP)
|
|
#include "OMPStream.h"
|
|
#endif
|
|
|
|
// Run configuration. These are the defaults; parseArguments() overwrites them
// from the command line before the benchmark starts.

// Number of elements in each array. Default size of 2^25.
unsigned int ARRAY_SIZE = 33554432;
// Number of times every kernel is executed (-n / --numtimes).
unsigned int num_times = 100;
// Index of the device handed to the Stream backend (--device).
unsigned int deviceIndex = 0;
// Run with float elements instead of double (--float).
bool use_float = false;
// Emit results as CSV rows instead of the aligned table (--csv).
bool output_as_csv = false;
// Field separator used between CSV columns.
std::string csv_separator = ",";
|
|
|
|
// Forward declarations for the functions defined further down in this file.

// Compares the arrays read back from the device, and the dot-product result,
// against reference values recomputed on the host for `ntimes` iterations.
// Mismatches are reported on std::cerr.
template <typename T>
void check_solution(const unsigned int ntimes, std::vector<T>& a, std::vector<T>& b, std::vector<T>& c, T& sum);

// Runs and reports the whole benchmark for element type T.
template <typename T>
void run();

// Fills the global run configuration from the command line.
void parseArguments(int argc, char *argv[]);
|
|
|
|
int main(int argc, char *argv[])
|
|
{
|
|
|
|
parseArguments(argc, argv);
|
|
|
|
if(!output_as_csv){
|
|
std::cout
|
|
<< "BabelStream" << std::endl
|
|
<< "Version: " << VERSION_STRING << std::endl
|
|
<< "Implementation: " << IMPLEMENTATION_STRING << std::endl;
|
|
}
|
|
|
|
// TODO: Fix Kokkos to allow multiple template specializations
|
|
#ifndef KOKKOS
|
|
if (use_float)
|
|
run<float>();
|
|
else
|
|
#endif
|
|
run<double>();
|
|
|
|
}
|
|
|
|
template <typename T>
|
|
void run()
|
|
{
|
|
std::streamsize ss = std::cout.precision();
|
|
|
|
if(!output_as_csv){
|
|
std::cout << "Running kernels " << num_times << " times" << std::endl;
|
|
|
|
if (sizeof(T) == sizeof(float))
|
|
std::cout << "Precision: float" << std::endl;
|
|
else
|
|
std::cout << "Precision: double" << std::endl;
|
|
|
|
|
|
std::cout << std::setprecision(1) << std::fixed
|
|
<< "Array size: " << ARRAY_SIZE*sizeof(T)*1.0E-6 << " MB"
|
|
<< " (=" << ARRAY_SIZE*sizeof(T)*1.0E-9 << " GB)" << std::endl;
|
|
std::cout << "Total size: " << 3.0*ARRAY_SIZE*sizeof(T)*1.0E-6 << " MB"
|
|
<< " (=" << 3.0*ARRAY_SIZE*sizeof(T)*1.0E-9 << " GB)" << std::endl;
|
|
std::cout.precision(ss);
|
|
|
|
}
|
|
|
|
// Create host vectors
|
|
std::vector<T> a(ARRAY_SIZE);
|
|
std::vector<T> b(ARRAY_SIZE);
|
|
std::vector<T> c(ARRAY_SIZE);
|
|
|
|
// Result of the Dot kernel
|
|
T sum;
|
|
|
|
Stream<T> *stream;
|
|
|
|
#if defined(CUDA)
|
|
// Use the CUDA implementation
|
|
stream = new CUDAStream<T>(ARRAY_SIZE, deviceIndex);
|
|
|
|
#elif defined(HIP)
|
|
// Use the HIP implementation
|
|
stream = new HIPStream<T>(ARRAY_SIZE, deviceIndex);
|
|
|
|
#elif defined(OCL)
|
|
// Use the OpenCL implementation
|
|
stream = new OCLStream<T>(ARRAY_SIZE, deviceIndex);
|
|
|
|
#elif defined(USE_RAJA)
|
|
// Use the RAJA implementation
|
|
stream = new RAJAStream<T>(ARRAY_SIZE, deviceIndex);
|
|
|
|
#elif defined(KOKKOS)
|
|
// Use the Kokkos implementation
|
|
stream = new KOKKOSStream<T>(ARRAY_SIZE, deviceIndex);
|
|
|
|
#elif defined(ACC)
|
|
// Use the OpenACC implementation
|
|
stream = new ACCStream<T>(ARRAY_SIZE, a.data(), b.data(), c.data(), deviceIndex);
|
|
|
|
#elif defined(SYCL)
|
|
// Use the SYCL implementation
|
|
stream = new SYCLStream<T>(ARRAY_SIZE, deviceIndex);
|
|
|
|
#elif defined(OMP)
|
|
// Use the OpenMP implementation
|
|
stream = new OMPStream<T>(ARRAY_SIZE, a.data(), b.data(), c.data(), deviceIndex);
|
|
|
|
#endif
|
|
|
|
stream->init_arrays(startA, startB, startC);
|
|
|
|
// List of times
|
|
std::vector<std::vector<double>> timings(5);
|
|
|
|
// Declare timers
|
|
std::chrono::high_resolution_clock::time_point t1, t2;
|
|
|
|
// Main loop
|
|
for (unsigned int k = 0; k < num_times; k++)
|
|
{
|
|
// Execute Copy
|
|
t1 = std::chrono::high_resolution_clock::now();
|
|
stream->copy();
|
|
t2 = std::chrono::high_resolution_clock::now();
|
|
timings[0].push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
|
|
|
|
// Execute Mul
|
|
t1 = std::chrono::high_resolution_clock::now();
|
|
stream->mul();
|
|
t2 = std::chrono::high_resolution_clock::now();
|
|
timings[1].push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
|
|
|
|
// Execute Add
|
|
t1 = std::chrono::high_resolution_clock::now();
|
|
stream->add();
|
|
t2 = std::chrono::high_resolution_clock::now();
|
|
timings[2].push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
|
|
|
|
// Execute Triad
|
|
t1 = std::chrono::high_resolution_clock::now();
|
|
stream->triad();
|
|
t2 = std::chrono::high_resolution_clock::now();
|
|
timings[3].push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
|
|
|
|
// Execute Dot
|
|
t1 = std::chrono::high_resolution_clock::now();
|
|
sum = stream->dot();
|
|
t2 = std::chrono::high_resolution_clock::now();
|
|
timings[4].push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
|
|
|
|
}
|
|
|
|
// Check solutions
|
|
stream->read_arrays(a, b, c);
|
|
check_solution<T>(num_times, a, b, c, sum);
|
|
|
|
// Display timing results
|
|
if(output_as_csv){
|
|
std::cout
|
|
<< "function" << csv_seperator
|
|
<< "nreps" << csv_seperator
|
|
<< "n_elements" << csv_seperator
|
|
<< "sizeof" << csv_seperator
|
|
<< "max_mbytes_per_sec"<< csv_seperator
|
|
<< "min_runtime" << csv_seperator
|
|
<< "max_runtime" << csv_seperator
|
|
<< "avg_runtime" << std::endl;
|
|
}
|
|
else{
|
|
std::cout
|
|
<< std::left << std::setw(12) << "Function"
|
|
<< std::left << std::setw(12) << "MBytes/sec"
|
|
<< std::left << std::setw(12) << "Min (sec)"
|
|
<< std::left << std::setw(12) << "Max"
|
|
<< std::left << std::setw(12) << "Average"
|
|
<< std::endl
|
|
<< std::fixed;
|
|
}
|
|
|
|
|
|
|
|
std::string labels[5] = {"Copy", "Mul", "Add", "Triad", "Dot"};
|
|
size_t sizes[5] = {
|
|
2 * sizeof(T) * ARRAY_SIZE,
|
|
2 * sizeof(T) * ARRAY_SIZE,
|
|
3 * sizeof(T) * ARRAY_SIZE,
|
|
3 * sizeof(T) * ARRAY_SIZE,
|
|
2 * sizeof(T) * ARRAY_SIZE
|
|
};
|
|
|
|
for (int i = 0; i < 5; i++)
|
|
{
|
|
// Get min/max; ignore the first result
|
|
auto minmax = std::minmax_element(timings[i].begin()+1, timings[i].end());
|
|
|
|
// Calculate average; ignore the first result
|
|
double average = std::accumulate(timings[i].begin()+1, timings[i].end(), 0.0) / (double)(num_times - 1);
|
|
|
|
// Display results
|
|
if(output_as_csv){
|
|
std::cout
|
|
<< labels[i] << csv_seperator
|
|
<< num_times << csv_seperator
|
|
<< ARRAY_SIZE << csv_seperator
|
|
<< sizeof(T) << csv_seperator
|
|
<< 1.0E-6 * sizes[i] / (*minmax.first) << csv_seperator
|
|
<< *minmax.first << csv_seperator
|
|
<< *minmax.second << csv_seperator
|
|
<< average << csv_seperator
|
|
<< std::endl;
|
|
}
|
|
else {
|
|
std::cout
|
|
<< std::left << std::setw(12) << labels[i]
|
|
<< std::left << std::setw(12) << std::setprecision(3) << 1.0E-6 * sizes[i] / (*minmax.first)
|
|
<< std::left << std::setw(12) << std::setprecision(5) << *minmax.first
|
|
<< std::left << std::setw(12) << std::setprecision(5) << *minmax.second
|
|
<< std::left << std::setw(12) << std::setprecision(5) << average
|
|
<< std::endl;
|
|
}
|
|
}
|
|
|
|
delete stream;
|
|
|
|
}
|
|
|
|
template <typename T>
|
|
void check_solution(const unsigned int ntimes, std::vector<T>& a, std::vector<T>& b, std::vector<T>& c, T& sum)
|
|
{
|
|
// Generate correct solution
|
|
T goldA = startA;
|
|
T goldB = startB;
|
|
T goldC = startC;
|
|
T goldSum = 0.0;
|
|
|
|
const T scalar = startScalar;
|
|
|
|
for (unsigned int i = 0; i < ntimes; i++)
|
|
{
|
|
// Do STREAM!
|
|
goldC = goldA;
|
|
goldB = scalar * goldC;
|
|
goldC = goldA + goldB;
|
|
goldA = goldB + scalar * goldC;
|
|
}
|
|
|
|
// Do the reduction
|
|
goldSum = goldA * goldB * ARRAY_SIZE;
|
|
|
|
// Calculate the average error
|
|
double errA = std::accumulate(a.begin(), a.end(), 0.0, [&](double sum, const T val){ return sum + fabs(val - goldA); });
|
|
errA /= a.size();
|
|
double errB = std::accumulate(b.begin(), b.end(), 0.0, [&](double sum, const T val){ return sum + fabs(val - goldB); });
|
|
errB /= b.size();
|
|
double errC = std::accumulate(c.begin(), c.end(), 0.0, [&](double sum, const T val){ return sum + fabs(val - goldC); });
|
|
errC /= c.size();
|
|
double errSum = fabs(sum - goldSum);
|
|
|
|
double epsi = std::numeric_limits<T>::epsilon() * 100.0;
|
|
|
|
if (errA > epsi)
|
|
std::cerr
|
|
<< "Validation failed on a[]. Average error " << errA
|
|
<< std::endl;
|
|
if (errB > epsi)
|
|
std::cerr
|
|
<< "Validation failed on b[]. Average error " << errB
|
|
<< std::endl;
|
|
if (errC > epsi)
|
|
std::cerr
|
|
<< "Validation failed on c[]. Average error " << errC
|
|
<< std::endl;
|
|
// Check sum to 8 decimal places
|
|
if (errSum > 1.0E-8)
|
|
std::cerr
|
|
<< "Validation failed on sum. Error " << errSum
|
|
<< std::endl << std::setprecision(15)
|
|
<< "Sum was " << sum << " but should be " << goldSum
|
|
<< std::endl;
|
|
|
|
}
|
|
|
|
// Parses `str` as a base-10 unsigned integer.
//
// str     NUL-terminated text to parse
// output  receives the parsed value; written only on success
//
// Returns 1 when the whole string is a number that fits in unsigned int,
// 0 otherwise. Rejected inputs: null pointers, the empty string, trailing
// non-digit characters, negative numbers and values that are out of range.
int parseUInt(const char *str, unsigned int *output)
{
  if (str == nullptr || output == nullptr)
    return 0;

  // strtoul accepts a leading '-' and returns the negated value wrapped
  // around, which would turn "-1" into a huge count.
  if (std::strchr(str, '-') != nullptr)
    return 0;

  errno = 0;
  char *next = nullptr;
  const unsigned long value = std::strtoul(str, &next, 10);

  // No digits consumed (e.g. empty string) or garbage after the number
  if (next == str || *next != '\0')
    return 0;

  // Overflow of unsigned long, or a value too large for unsigned int
  if (errno == ERANGE || value > std::numeric_limits<unsigned int>::max())
    return 0;

  *output = static_cast<unsigned int>(value);
  return 1;
}
|
|
|
|
void parseArguments(int argc, char *argv[])
|
|
{
|
|
for (int i = 1; i < argc; i++)
|
|
{
|
|
if (!std::string("--list").compare(argv[i]))
|
|
{
|
|
listDevices();
|
|
exit(EXIT_SUCCESS);
|
|
}
|
|
else if (!std::string("--device").compare(argv[i]))
|
|
{
|
|
if (++i >= argc || !parseUInt(argv[i], &deviceIndex))
|
|
{
|
|
std::cerr << "Invalid device index." << std::endl;
|
|
exit(EXIT_FAILURE);
|
|
}
|
|
}
|
|
else if (!std::string("--arraysize").compare(argv[i]) ||
|
|
!std::string("-s").compare(argv[i]))
|
|
{
|
|
if (++i >= argc || !parseUInt(argv[i], &ARRAY_SIZE))
|
|
{
|
|
std::cerr << "Invalid array size." << std::endl;
|
|
exit(EXIT_FAILURE);
|
|
}
|
|
}
|
|
else if (!std::string("--numtimes").compare(argv[i]) ||
|
|
!std::string("-n").compare(argv[i]))
|
|
{
|
|
if (++i >= argc || !parseUInt(argv[i], &num_times))
|
|
{
|
|
std::cerr << "Invalid number of times." << std::endl;
|
|
exit(EXIT_FAILURE);
|
|
}
|
|
if (num_times < 2)
|
|
{
|
|
std::cerr << "Number of times must be 2 or more" << std::endl;
|
|
exit(EXIT_FAILURE);
|
|
}
|
|
}
|
|
else if (!std::string("--float").compare(argv[i]))
|
|
{
|
|
use_float = true;
|
|
}
|
|
else if (!std::string("--csv").compare(argv[i]))
|
|
{
|
|
output_as_csv = true;
|
|
}
|
|
else if (!std::string("--help").compare(argv[i]) ||
|
|
!std::string("-h").compare(argv[i]))
|
|
{
|
|
std::cout << std::endl;
|
|
std::cout << "Usage: " << argv[0] << " [OPTIONS]" << std::endl << std::endl;
|
|
std::cout << "Options:" << std::endl;
|
|
std::cout << " -h --help Print the message" << std::endl;
|
|
std::cout << " --list List available devices" << std::endl;
|
|
std::cout << " --device INDEX Select device at INDEX" << std::endl;
|
|
std::cout << " -s --arraysize SIZE Use SIZE elements in the array" << std::endl;
|
|
std::cout << " -n --numtimes NUM Run the test NUM times (NUM >= 2)" << std::endl;
|
|
std::cout << " --float Use floats (rather than doubles)" << std::endl;
|
|
std::cout << " --csv Output as csv table" << std::endl;
|
|
std::cout << std::endl;
|
|
exit(EXIT_SUCCESS);
|
|
}
|
|
else
|
|
{
|
|
std::cerr << "Unrecognized argument '" << argv[i] << "' (try '--help')"
|
|
<< std::endl;
|
|
exit(EXIT_FAILURE);
|
|
}
|
|
}
|
|
}
|