Merge branch 'main' into sycl-2020
This commit is contained in:
commit
89c676ac91
@ -23,6 +23,8 @@ All notable changes to this project will be documented in this file.
|
|||||||
- Cray compiler OpenMP flags updated.
|
- Cray compiler OpenMP flags updated.
|
||||||
- Clang compiler OpenMP flags corrected for NVIDIA target.
|
- Clang compiler OpenMP flags corrected for NVIDIA target.
|
||||||
- Reorder OpenCL objects in class so destructors are called in safe order.
|
- Reorder OpenCL objects in class so destructors are called in safe order.
|
||||||
|
- Ensure all OpenCL kernels are present in destructor.
|
||||||
|
- Unified run function in driver code to reduce code duplication; output should be unaffected.
|
||||||
|
|
||||||
### Removed
|
### Removed
|
||||||
- Pre-building of kernels in SYCL version to ensure compatibility with SYCL 1.2.1.
|
- Pre-building of kernels in SYCL version to ensure compatibility with SYCL 1.2.1.
|
||||||
|
|||||||
32
CUDA.make
32
CUDA.make
@ -1,8 +1,38 @@
|
|||||||
CXXFLAGS=-O3
|
CXXFLAGS=-O3
|
||||||
CUDA_CXX=nvcc
|
CUDA_CXX=nvcc
|
||||||
|
|
||||||
|
|
||||||
|
ifndef NVARCH
|
||||||
|
define nvarch_help
|
||||||
|
Set NVARCH to select sm_?? version.
|
||||||
|
Default: sm_60
|
||||||
|
|
||||||
|
endef
|
||||||
|
$(info $(nvarch_help))
|
||||||
|
NVARCH=sm_60
|
||||||
|
endif
|
||||||
|
|
||||||
|
|
||||||
|
ifndef MEM
|
||||||
|
define mem_help
|
||||||
|
Set MEM to select memory mode.
|
||||||
|
Available options:
|
||||||
|
DEFAULT - allocate host and device memory pointers.
|
||||||
|
MANAGED - use CUDA Managed Memory.
|
||||||
|
PAGEFAULT - shared memory, only host pointers allocated.
|
||||||
|
|
||||||
|
endef
|
||||||
|
$(info $(mem_help))
|
||||||
|
MEM=DEFAULT
|
||||||
|
endif
|
||||||
|
|
||||||
|
MEM_MANAGED= -DMANAGED
|
||||||
|
MEM_PAGEFAULT= -DPAGEFAULT
|
||||||
|
MEM_MODE = $(MEM_$(MEM))
|
||||||
|
|
||||||
|
|
||||||
cuda-stream: main.cpp CUDAStream.cu
|
cuda-stream: main.cpp CUDAStream.cu
|
||||||
$(CUDA_CXX) -std=c++11 $(CXXFLAGS) -DCUDA $^ $(EXTRA_FLAGS) -o $@
|
$(CUDA_CXX) -std=c++11 $(CXXFLAGS) -arch=$(NVARCH) $(MEM_MODE) -DCUDA $^ $(EXTRA_FLAGS) -o $@
|
||||||
|
|
||||||
.PHONY: clean
|
.PHONY: clean
|
||||||
clean:
|
clean:
|
||||||
|
|||||||
@ -186,6 +186,7 @@ OCLStream<T>::~OCLStream()
|
|||||||
delete mul_kernel;
|
delete mul_kernel;
|
||||||
delete add_kernel;
|
delete add_kernel;
|
||||||
delete triad_kernel;
|
delete triad_kernel;
|
||||||
|
delete dot_kernel;
|
||||||
|
|
||||||
devices.clear();
|
devices.clear();
|
||||||
}
|
}
|
||||||
|
|||||||
@ -80,7 +80,7 @@ OMP_CLANG_NVIDIA = -DOMP_TARGET_GPU -fopenmp=libomp -fopenmp-targets=nvptx64-nvi
|
|||||||
OMP_GNU_NVIDIA = -DOMP_TARGET_GPU -fopenmp -foffload=nvptx-none
|
OMP_GNU_NVIDIA = -DOMP_TARGET_GPU -fopenmp -foffload=nvptx-none
|
||||||
OMP_GNU_AMD = -DOMP_TARGET_GPU -fopenmp -foffload=amdgcn-amdhsa
|
OMP_GNU_AMD = -DOMP_TARGET_GPU -fopenmp -foffload=amdgcn-amdhsa
|
||||||
|
|
||||||
OMP_INTEL_CPU = -xHOST -qopt-streaming-stores=always
|
OMP_INTEL_CPU = -xHOST -qopt-streaming-stores=always -qopenmp
|
||||||
OMP_INTEL_INTEL_GPU = -DOMP_TARGET_GPU -qnextgen -fiopenmp -fopenmp-targets=spir64
|
OMP_INTEL_INTEL_GPU = -DOMP_TARGET_GPU -qnextgen -fiopenmp -fopenmp-targets=spir64
|
||||||
|
|
||||||
OMP_AOMP_GPU = -DOMP_TARGET_GPU -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx906
|
OMP_AOMP_GPU = -DOMP_TARGET_GPU -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx906
|
||||||
|
|||||||
@ -91,6 +91,14 @@ Results
|
|||||||
|
|
||||||
Sample results can be found in the `results` subdirectory. If you would like to submit updated results, please submit a Pull Request.
|
Sample results can be found in the `results` subdirectory. If you would like to submit updated results, please submit a Pull Request.
|
||||||
|
|
||||||
|
Contributing
|
||||||
|
------------
|
||||||
|
|
||||||
|
As of v4.0, the `main` branch of this repository will hold the latest released version.
|
||||||
|
|
||||||
|
The `develop` branch will contain unreleased features due for the next (major and/or minor) release of BabelStream.
|
||||||
|
Pull Requests should be made against the `develop` branch.
|
||||||
|
|
||||||
Citing
|
Citing
|
||||||
------
|
------
|
||||||
|
|
||||||
|
|||||||
284
main.cpp
284
main.cpp
@ -48,7 +48,6 @@ int ARRAY_SIZE = 33554432;
|
|||||||
unsigned int num_times = 100;
|
unsigned int num_times = 100;
|
||||||
unsigned int deviceIndex = 0;
|
unsigned int deviceIndex = 0;
|
||||||
bool use_float = false;
|
bool use_float = false;
|
||||||
bool triad_only = false;
|
|
||||||
bool output_as_csv = false;
|
bool output_as_csv = false;
|
||||||
bool mibibytes = false;
|
bool mibibytes = false;
|
||||||
std::string csv_separator = ",";
|
std::string csv_separator = ",";
|
||||||
@ -62,6 +61,14 @@ void run();
|
|||||||
template <typename T>
|
template <typename T>
|
||||||
void run_triad();
|
void run_triad();
|
||||||
|
|
||||||
|
// Options for running the benchmark:
|
||||||
|
// - All 5 kernels (Copy, Add, Mul, Triad, Dot).
|
||||||
|
// - Triad only.
|
||||||
|
enum class Benchmark {All, Triad};
|
||||||
|
|
||||||
|
// Selected run options.
|
||||||
|
Benchmark selection = Benchmark::All;
|
||||||
|
|
||||||
void parseArguments(int argc, char *argv[]);
|
void parseArguments(int argc, char *argv[]);
|
||||||
|
|
||||||
int main(int argc, char *argv[])
|
int main(int argc, char *argv[])
|
||||||
@ -77,24 +84,91 @@ int main(int argc, char *argv[])
|
|||||||
<< "Implementation: " << IMPLEMENTATION_STRING << std::endl;
|
<< "Implementation: " << IMPLEMENTATION_STRING << std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO: Fix Kokkos to allow multiple template specializations
|
|
||||||
if (triad_only)
|
|
||||||
{
|
|
||||||
if (use_float)
|
|
||||||
run_triad<float>();
|
|
||||||
else
|
|
||||||
run_triad<double>();
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
if (use_float)
|
if (use_float)
|
||||||
run<float>();
|
run<float>();
|
||||||
else
|
else
|
||||||
run<double>();
|
run<double>();
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// Run the 5 main kernels
|
||||||
|
template <typename T>
|
||||||
|
std::vector<std::vector<double>> run_all(Stream<T> *stream, T& sum)
|
||||||
|
{
|
||||||
|
|
||||||
|
// List of times
|
||||||
|
std::vector<std::vector<double>> timings(5);
|
||||||
|
|
||||||
|
// Declare timers
|
||||||
|
std::chrono::high_resolution_clock::time_point t1, t2;
|
||||||
|
|
||||||
|
// Main loop
|
||||||
|
for (unsigned int k = 0; k < num_times; k++)
|
||||||
|
{
|
||||||
|
// Execute Copy
|
||||||
|
t1 = std::chrono::high_resolution_clock::now();
|
||||||
|
stream->copy();
|
||||||
|
t2 = std::chrono::high_resolution_clock::now();
|
||||||
|
timings[0].push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
|
||||||
|
|
||||||
|
// Execute Mul
|
||||||
|
t1 = std::chrono::high_resolution_clock::now();
|
||||||
|
stream->mul();
|
||||||
|
t2 = std::chrono::high_resolution_clock::now();
|
||||||
|
timings[1].push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
|
||||||
|
|
||||||
|
// Execute Add
|
||||||
|
t1 = std::chrono::high_resolution_clock::now();
|
||||||
|
stream->add();
|
||||||
|
t2 = std::chrono::high_resolution_clock::now();
|
||||||
|
timings[2].push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
|
||||||
|
|
||||||
|
// Execute Triad
|
||||||
|
t1 = std::chrono::high_resolution_clock::now();
|
||||||
|
stream->triad();
|
||||||
|
t2 = std::chrono::high_resolution_clock::now();
|
||||||
|
timings[3].push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
|
||||||
|
|
||||||
|
// Execute Dot
|
||||||
|
t1 = std::chrono::high_resolution_clock::now();
|
||||||
|
sum = stream->dot();
|
||||||
|
t2 = std::chrono::high_resolution_clock::now();
|
||||||
|
timings[4].push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
// Compiler should use a move
|
||||||
|
return timings;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Run the Triad kernel
|
||||||
|
template <typename T>
|
||||||
|
std::vector<std::vector<double>> run_triad(Stream<T> *stream)
|
||||||
|
{
|
||||||
|
|
||||||
|
std::vector<std::vector<double>> timings(1);
|
||||||
|
|
||||||
|
// Declare timers
|
||||||
|
std::chrono::high_resolution_clock::time_point t1, t2;
|
||||||
|
|
||||||
|
// Run triad in loop
|
||||||
|
t1 = std::chrono::high_resolution_clock::now();
|
||||||
|
for (unsigned int k = 0; k < num_times; k++)
|
||||||
|
{
|
||||||
|
stream->triad();
|
||||||
|
}
|
||||||
|
t2 = std::chrono::high_resolution_clock::now();
|
||||||
|
|
||||||
|
double runtime = std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count();
|
||||||
|
timings[0].push_back(runtime);
|
||||||
|
|
||||||
|
return timings;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// Generic run routine
|
||||||
|
// Runs the kernel(s) and prints output.
|
||||||
template <typename T>
|
template <typename T>
|
||||||
void run()
|
void run()
|
||||||
{
|
{
|
||||||
@ -102,7 +176,14 @@ void run()
|
|||||||
|
|
||||||
if (!output_as_csv)
|
if (!output_as_csv)
|
||||||
{
|
{
|
||||||
|
if (selection == Benchmark::All)
|
||||||
std::cout << "Running kernels " << num_times << " times" << std::endl;
|
std::cout << "Running kernels " << num_times << " times" << std::endl;
|
||||||
|
else if (selection == Benchmark::Triad)
|
||||||
|
{
|
||||||
|
std::cout << "Running triad " << num_times << " times" << std::endl;
|
||||||
|
std::cout << "Number of elements: " << ARRAY_SIZE << std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
if (sizeof(T) == sizeof(float))
|
if (sizeof(T) == sizeof(float))
|
||||||
std::cout << "Precision: float" << std::endl;
|
std::cout << "Precision: float" << std::endl;
|
||||||
@ -182,49 +263,19 @@ void run()
|
|||||||
|
|
||||||
stream->init_arrays(startA, startB, startC);
|
stream->init_arrays(startA, startB, startC);
|
||||||
|
|
||||||
// Result of the Dot kernel
|
// Result of the Dot kernel, if used.
|
||||||
T sum;
|
T sum = 0.0;
|
||||||
|
|
||||||
// List of times
|
std::vector<std::vector<double>> timings;
|
||||||
std::vector<std::vector<double>> timings(5);
|
|
||||||
|
|
||||||
// Declare timers
|
switch (selection)
|
||||||
std::chrono::high_resolution_clock::time_point t1, t2;
|
|
||||||
|
|
||||||
// Main loop
|
|
||||||
for (unsigned int k = 0; k < num_times; k++)
|
|
||||||
{
|
{
|
||||||
// Execute Copy
|
case Benchmark::All:
|
||||||
t1 = std::chrono::high_resolution_clock::now();
|
timings = run_all<T>(stream, sum);
|
||||||
stream->copy();
|
break;
|
||||||
t2 = std::chrono::high_resolution_clock::now();
|
case Benchmark::Triad:
|
||||||
timings[0].push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
|
timings = run_triad<T>(stream);
|
||||||
|
};
|
||||||
// Execute Mul
|
|
||||||
t1 = std::chrono::high_resolution_clock::now();
|
|
||||||
stream->mul();
|
|
||||||
t2 = std::chrono::high_resolution_clock::now();
|
|
||||||
timings[1].push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
|
|
||||||
|
|
||||||
// Execute Add
|
|
||||||
t1 = std::chrono::high_resolution_clock::now();
|
|
||||||
stream->add();
|
|
||||||
t2 = std::chrono::high_resolution_clock::now();
|
|
||||||
timings[2].push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
|
|
||||||
|
|
||||||
// Execute Triad
|
|
||||||
t1 = std::chrono::high_resolution_clock::now();
|
|
||||||
stream->triad();
|
|
||||||
t2 = std::chrono::high_resolution_clock::now();
|
|
||||||
timings[3].push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
|
|
||||||
|
|
||||||
// Execute Dot
|
|
||||||
t1 = std::chrono::high_resolution_clock::now();
|
|
||||||
sum = stream->dot();
|
|
||||||
t2 = std::chrono::high_resolution_clock::now();
|
|
||||||
timings[4].push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
// Check solutions
|
// Check solutions
|
||||||
// Create host vectors
|
// Create host vectors
|
||||||
@ -232,6 +283,7 @@ void run()
|
|||||||
std::vector<T> b(ARRAY_SIZE);
|
std::vector<T> b(ARRAY_SIZE);
|
||||||
std::vector<T> c(ARRAY_SIZE);
|
std::vector<T> c(ARRAY_SIZE);
|
||||||
|
|
||||||
|
|
||||||
stream->read_arrays(a, b, c);
|
stream->read_arrays(a, b, c);
|
||||||
check_solution<T>(num_times, a, b, c, sum);
|
check_solution<T>(num_times, a, b, c, sum);
|
||||||
|
|
||||||
@ -261,6 +313,8 @@ void run()
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
if (selection == Benchmark::All)
|
||||||
|
{
|
||||||
|
|
||||||
std::string labels[5] = {"Copy", "Mul", "Add", "Triad", "Dot"};
|
std::string labels[5] = {"Copy", "Mul", "Add", "Triad", "Dot"};
|
||||||
size_t sizes[5] = {
|
size_t sizes[5] = {
|
||||||
@ -271,7 +325,7 @@ void run()
|
|||||||
2 * sizeof(T) * ARRAY_SIZE
|
2 * sizeof(T) * ARRAY_SIZE
|
||||||
};
|
};
|
||||||
|
|
||||||
for (int i = 0; i < 5; i++)
|
for (int i = 0; i < timings.size(); ++i)
|
||||||
{
|
{
|
||||||
// Get min/max; ignore the first result
|
// Get min/max; ignore the first result
|
||||||
auto minmax = std::minmax_element(timings[i].begin()+1, timings[i].end());
|
auto minmax = std::minmax_element(timings[i].begin()+1, timings[i].end());
|
||||||
@ -305,118 +359,11 @@ void run()
|
|||||||
<< std::endl;
|
<< std::endl;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
} else if (selection == Benchmark::Triad)
|
||||||
delete stream;
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
template <typename T>
|
|
||||||
void run_triad()
|
|
||||||
{
|
|
||||||
|
|
||||||
if (!output_as_csv)
|
|
||||||
{
|
{
|
||||||
std::cout << "Running triad " << num_times << " times" << std::endl;
|
|
||||||
std::cout << "Number of elements: " << ARRAY_SIZE << std::endl;
|
|
||||||
|
|
||||||
if (sizeof(T) == sizeof(float))
|
|
||||||
std::cout << "Precision: float" << std::endl;
|
|
||||||
else
|
|
||||||
std::cout << "Precision: double" << std::endl;
|
|
||||||
|
|
||||||
std::streamsize ss = std::cout.precision();
|
|
||||||
if (mibibytes)
|
|
||||||
{
|
|
||||||
std::cout << std::setprecision(1) << std::fixed
|
|
||||||
<< "Array size: " << ARRAY_SIZE*sizeof(T)*pow(2.0, -10.0) << " KiB"
|
|
||||||
<< " (=" << ARRAY_SIZE*sizeof(T)*pow(2.0, -20.0) << " MiB)" << std::endl;
|
|
||||||
std::cout << "Total size: " << 3.0*ARRAY_SIZE*sizeof(T)*pow(2.0, -10.0) << " KiB"
|
|
||||||
<< " (=" << 3.0*ARRAY_SIZE*sizeof(T)*pow(2.0, -20.0) << " MiB)" << std::endl;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
std::cout << std::setprecision(1) << std::fixed
|
|
||||||
<< "Array size: " << ARRAY_SIZE*sizeof(T)*1.0E-3 << " KB"
|
|
||||||
<< " (=" << ARRAY_SIZE*sizeof(T)*1.0E-6 << " MB)" << std::endl;
|
|
||||||
std::cout << "Total size: " << 3.0*ARRAY_SIZE*sizeof(T)*1.0E-3 << " KB"
|
|
||||||
<< " (=" << 3.0*ARRAY_SIZE*sizeof(T)*1.0E-6 << " MB)" << std::endl;
|
|
||||||
}
|
|
||||||
std::cout.precision(ss);
|
|
||||||
}
|
|
||||||
|
|
||||||
Stream<T> *stream;
|
|
||||||
|
|
||||||
#if defined(CUDA)
|
|
||||||
// Use the CUDA implementation
|
|
||||||
stream = new CUDAStream<T>(ARRAY_SIZE, deviceIndex);
|
|
||||||
|
|
||||||
#elif defined(HIP)
|
|
||||||
// Use the HIP implementation
|
|
||||||
stream = new HIPStream<T>(ARRAY_SIZE, deviceIndex);
|
|
||||||
|
|
||||||
#elif defined(OCL)
|
|
||||||
// Use the OpenCL implementation
|
|
||||||
stream = new OCLStream<T>(ARRAY_SIZE, deviceIndex);
|
|
||||||
|
|
||||||
#elif defined(USE_RAJA)
|
|
||||||
// Use the RAJA implementation
|
|
||||||
stream = new RAJAStream<T>(ARRAY_SIZE, deviceIndex);
|
|
||||||
|
|
||||||
#elif defined(KOKKOS)
|
|
||||||
// Use the Kokkos implementation
|
|
||||||
stream = new KokkosStream<T>(ARRAY_SIZE, deviceIndex);
|
|
||||||
|
|
||||||
#elif defined(ACC)
|
|
||||||
// Use the OpenACC implementation
|
|
||||||
stream = new ACCStream<T>(ARRAY_SIZE, deviceIndex);
|
|
||||||
|
|
||||||
#elif defined(STD)
|
|
||||||
// Use the STD implementation
|
|
||||||
stream = new STDStream<T>(ARRAY_SIZE, deviceIndex);
|
|
||||||
|
|
||||||
#elif defined(STD20)
|
|
||||||
// Use the C++20 implementation
|
|
||||||
stream = new STD20Stream<T>(ARRAY_SIZE, deviceIndex);
|
|
||||||
|
|
||||||
#elif defined(SYCL)
|
|
||||||
// Use the SYCL implementation
|
|
||||||
stream = new SYCLStream<T>(ARRAY_SIZE, deviceIndex);
|
|
||||||
|
|
||||||
#elif defined(OMP)
|
|
||||||
// Use the OpenMP implementation
|
|
||||||
stream = new OMPStream<T>(ARRAY_SIZE, deviceIndex);
|
|
||||||
|
|
||||||
#endif
|
|
||||||
|
|
||||||
stream->init_arrays(startA, startB, startC);
|
|
||||||
|
|
||||||
// Declare timers
|
|
||||||
std::chrono::high_resolution_clock::time_point t1, t2;
|
|
||||||
|
|
||||||
// Run triad in loop
|
|
||||||
t1 = std::chrono::high_resolution_clock::now();
|
|
||||||
for (unsigned int k = 0; k < num_times; k++)
|
|
||||||
{
|
|
||||||
stream->triad();
|
|
||||||
}
|
|
||||||
t2 = std::chrono::high_resolution_clock::now();
|
|
||||||
|
|
||||||
double runtime = std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count();
|
|
||||||
|
|
||||||
// Check solutions
|
|
||||||
// Create host vectors
|
|
||||||
std::vector<T> a(ARRAY_SIZE);
|
|
||||||
std::vector<T> b(ARRAY_SIZE);
|
|
||||||
std::vector<T> c(ARRAY_SIZE);
|
|
||||||
|
|
||||||
T sum = 0.0;
|
|
||||||
|
|
||||||
stream->read_arrays(a, b, c);
|
|
||||||
check_solution<T>(num_times, a, b, c, sum);
|
|
||||||
|
|
||||||
// Display timing results
|
// Display timing results
|
||||||
double total_bytes = 3 * sizeof(T) * ARRAY_SIZE * num_times;
|
double total_bytes = 3 * sizeof(T) * ARRAY_SIZE * num_times;
|
||||||
double bandwidth = ((mibibytes) ? pow(2.0, -30.0) : 1.0E-9) * (total_bytes / runtime);
|
double bandwidth = ((mibibytes) ? pow(2.0, -30.0) : 1.0E-9) * (total_bytes / timings[0][0]);
|
||||||
|
|
||||||
if (output_as_csv)
|
if (output_as_csv)
|
||||||
{
|
{
|
||||||
@ -434,7 +381,7 @@ void run_triad()
|
|||||||
<< ARRAY_SIZE << csv_separator
|
<< ARRAY_SIZE << csv_separator
|
||||||
<< sizeof(T) << csv_separator
|
<< sizeof(T) << csv_separator
|
||||||
<< bandwidth << csv_separator
|
<< bandwidth << csv_separator
|
||||||
<< runtime
|
<< timings[0][0]
|
||||||
<< std::endl;
|
<< std::endl;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
@ -443,15 +390,18 @@ void run_triad()
|
|||||||
<< "--------------------------------"
|
<< "--------------------------------"
|
||||||
<< std::endl << std::fixed
|
<< std::endl << std::fixed
|
||||||
<< "Runtime (seconds): " << std::left << std::setprecision(5)
|
<< "Runtime (seconds): " << std::left << std::setprecision(5)
|
||||||
<< runtime << std::endl
|
<< timings[0][0] << std::endl
|
||||||
<< "Bandwidth (" << ((mibibytes) ? "GiB/s" : "GB/s") << "): "
|
<< "Bandwidth (" << ((mibibytes) ? "GiB/s" : "GB/s") << "): "
|
||||||
<< std::left << std::setprecision(3)
|
<< std::left << std::setprecision(3)
|
||||||
<< bandwidth << std::endl;
|
<< bandwidth << std::endl;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
delete stream;
|
delete stream;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
void check_solution(const unsigned int ntimes, std::vector<T>& a, std::vector<T>& b, std::vector<T>& c, T& sum)
|
void check_solution(const unsigned int ntimes, std::vector<T>& a, std::vector<T>& b, std::vector<T>& c, T& sum)
|
||||||
{
|
{
|
||||||
@ -466,7 +416,7 @@ void check_solution(const unsigned int ntimes, std::vector<T>& a, std::vector<T>
|
|||||||
for (unsigned int i = 0; i < ntimes; i++)
|
for (unsigned int i = 0; i < ntimes; i++)
|
||||||
{
|
{
|
||||||
// Do STREAM!
|
// Do STREAM!
|
||||||
if (!triad_only)
|
if (! (selection == Benchmark::Triad))
|
||||||
{
|
{
|
||||||
goldC = goldA;
|
goldC = goldA;
|
||||||
goldB = scalar * goldC;
|
goldB = scalar * goldC;
|
||||||
@ -502,7 +452,7 @@ void check_solution(const unsigned int ntimes, std::vector<T>& a, std::vector<T>
|
|||||||
<< "Validation failed on c[]. Average error " << errC
|
<< "Validation failed on c[]. Average error " << errC
|
||||||
<< std::endl;
|
<< std::endl;
|
||||||
// Check sum to 8 decimal places
|
// Check sum to 8 decimal places
|
||||||
if (!triad_only && errSum > 1.0E-8)
|
if (!(selection == Benchmark::Triad) && errSum > 1.0E-8)
|
||||||
std::cerr
|
std::cerr
|
||||||
<< "Validation failed on sum. Error " << errSum
|
<< "Validation failed on sum. Error " << errSum
|
||||||
<< std::endl << std::setprecision(15)
|
<< std::endl << std::setprecision(15)
|
||||||
@ -571,7 +521,7 @@ void parseArguments(int argc, char *argv[])
|
|||||||
}
|
}
|
||||||
else if (!std::string("--triad-only").compare(argv[i]))
|
else if (!std::string("--triad-only").compare(argv[i]))
|
||||||
{
|
{
|
||||||
triad_only = true;
|
selection = Benchmark::Triad;
|
||||||
}
|
}
|
||||||
else if (!std::string("--csv").compare(argv[i]))
|
else if (!std::string("--csv").compare(argv[i]))
|
||||||
{
|
{
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user