Merge branch 'main' into sycl-2020

This commit is contained in:
Tom Deakin 2021-02-22 15:15:32 +00:00
commit 89c676ac91
6 changed files with 221 additions and 230 deletions

View File

@ -23,6 +23,8 @@ All notable changes to this project will be documented in this file.
- Cray compiler OpenMP flags updated. - Cray compiler OpenMP flags updated.
- Clang compiler OpenMP flags corrected for NVIDIA target. - Clang compiler OpenMP flags corrected for NVIDIA target.
- Reorder OpenCL objects in class so destructors are called in safe order. - Reorder OpenCL objects in class so destructors are called in safe order.
- Ensure all OpenCL kernels are present in destructor.
- Unified run function in driver code to reduce code duplication; output should be unaffected.
### Removed ### Removed
- Pre-building of kernels in SYCL version to ensure compatibility with SYCL 1.2.1. - Pre-building of kernels in SYCL version to ensure compatibility with SYCL 1.2.1.

View File

@ -1,8 +1,38 @@
CXXFLAGS=-O3 CXXFLAGS=-O3
CUDA_CXX=nvcc CUDA_CXX=nvcc
ifndef NVARCH
define nvarch_help
Set NVARCH to select sm_?? version.
Default: sm_60
endef
$(info $(nvarch_help))
NVARCH=sm_60
endif
ifndef MEM
define mem_help
Set MEM to select memory mode.
Available options:
DEFAULT - allocate host and device memory pointers.
MANAGED - use CUDA Managed Memory.
PAGEFAULT - shared memory, only host pointers allocated.
endef
$(info $(mem_help))
MEM=DEFAULT
endif
MEM_MANAGED= -DMANAGED
MEM_PAGEFAULT= -DPAGEFAULT
MEM_MODE = $(MEM_$(MEM))
cuda-stream: main.cpp CUDAStream.cu cuda-stream: main.cpp CUDAStream.cu
$(CUDA_CXX) -std=c++11 $(CXXFLAGS) -DCUDA $^ $(EXTRA_FLAGS) -o $@ $(CUDA_CXX) -std=c++11 $(CXXFLAGS) -arch=$(NVARCH) $(MEM_MODE) -DCUDA $^ $(EXTRA_FLAGS) -o $@
.PHONY: clean .PHONY: clean
clean: clean:

View File

@ -186,6 +186,7 @@ OCLStream<T>::~OCLStream()
delete mul_kernel; delete mul_kernel;
delete add_kernel; delete add_kernel;
delete triad_kernel; delete triad_kernel;
delete dot_kernel;
devices.clear(); devices.clear();
} }

View File

@ -80,7 +80,7 @@ OMP_CLANG_NVIDIA = -DOMP_TARGET_GPU -fopenmp=libomp -fopenmp-targets=nvptx64-nvi
OMP_GNU_NVIDIA = -DOMP_TARGET_GPU -fopenmp -foffload=nvptx-none OMP_GNU_NVIDIA = -DOMP_TARGET_GPU -fopenmp -foffload=nvptx-none
OMP_GNU_AMD = -DOMP_TARGET_GPU -fopenmp -foffload=amdgcn-amdhsa OMP_GNU_AMD = -DOMP_TARGET_GPU -fopenmp -foffload=amdgcn-amdhsa
OMP_INTEL_CPU = -xHOST -qopt-streaming-stores=always OMP_INTEL_CPU = -xHOST -qopt-streaming-stores=always -qopenmp
OMP_INTEL_INTEL_GPU = -DOMP_TARGET_GPU -qnextgen -fiopenmp -fopenmp-targets=spir64 OMP_INTEL_INTEL_GPU = -DOMP_TARGET_GPU -qnextgen -fiopenmp -fopenmp-targets=spir64
OMP_AOMP_GPU = -DOMP_TARGET_GPU -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx906 OMP_AOMP_GPU = -DOMP_TARGET_GPU -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx906

View File

@ -91,6 +91,14 @@ Results
Sample results can be found in the `results` subdirectory. If you would like to submit updated results, please submit a Pull Request. Sample results can be found in the `results` subdirectory. If you would like to submit updated results, please submit a Pull Request.
Contributing
------------
As of v4.0, the `main` branch of this repository will hold the latest released version.
The `develop` branch will contain unreleased features due for the next (major and/or minor) release of BabelStream.
Pull Requests should be made against the `develop` branch.
Citing Citing
------ ------

284
main.cpp
View File

@ -48,7 +48,6 @@ int ARRAY_SIZE = 33554432;
unsigned int num_times = 100; unsigned int num_times = 100;
unsigned int deviceIndex = 0; unsigned int deviceIndex = 0;
bool use_float = false; bool use_float = false;
bool triad_only = false;
bool output_as_csv = false; bool output_as_csv = false;
bool mibibytes = false; bool mibibytes = false;
std::string csv_separator = ","; std::string csv_separator = ",";
@ -62,6 +61,14 @@ void run();
template <typename T> template <typename T>
void run_triad(); void run_triad();
// Options for running the benchmark:
// - All:   run all 5 kernels (Copy, Add, Mul, Triad, Dot).
// - Triad: run the Triad kernel only.
enum class Benchmark {All, Triad};
// Selected run option. Defaults to running all kernels; parseArguments()
// switches it to Benchmark::Triad when `--triad-only` is passed.
Benchmark selection = Benchmark::All;
void parseArguments(int argc, char *argv[]); void parseArguments(int argc, char *argv[]);
int main(int argc, char *argv[]) int main(int argc, char *argv[])
@ -77,24 +84,91 @@ int main(int argc, char *argv[])
<< "Implementation: " << IMPLEMENTATION_STRING << std::endl; << "Implementation: " << IMPLEMENTATION_STRING << std::endl;
} }
// TODO: Fix Kokkos to allow multiple template specializations
if (triad_only)
{
if (use_float)
run_triad<float>();
else
run_triad<double>();
}
else
{
if (use_float) if (use_float)
run<float>(); run<float>();
else else
run<double>(); run<double>();
}
} }
// Run the five main kernels (Copy, Mul, Add, Triad, Dot), timing each call
// individually with std::chrono::high_resolution_clock.
//
// stream - benchmark implementation whose kernels are invoked.
// sum    - receives the value returned by each Dot call, so after the loop
//          it holds the result of the last iteration.
//
// Returns five rows of timings in seconds, one entry per iteration, in the
// order Copy, Mul, Add, Triad, Dot. The number of iterations is num_times.
template <typename T>
std::vector<std::vector<double>> run_all(Stream<T> *stream, T& sum)
{
  typedef std::chrono::high_resolution_clock hr_clock;

  // Converts the interval between two time points into seconds.
  auto elapsed_seconds = [](const hr_clock::time_point& begin, const hr_clock::time_point& end)
  {
    return std::chrono::duration_cast<std::chrono::duration<double> >(end - begin).count();
  };

  // One row of per-iteration timings for each kernel
  std::vector<std::vector<double>> timings(5);
  std::vector<double>& copy_times  = timings[0];
  std::vector<double>& mul_times   = timings[1];
  std::vector<double>& add_times   = timings[2];
  std::vector<double>& triad_times = timings[3];
  std::vector<double>& dot_times   = timings[4];

  hr_clock::time_point start, stop;

  for (unsigned int iter = 0; iter < num_times; ++iter)
  {
    // Copy
    start = hr_clock::now();
    stream->copy();
    stop = hr_clock::now();
    copy_times.push_back(elapsed_seconds(start, stop));

    // Mul
    start = hr_clock::now();
    stream->mul();
    stop = hr_clock::now();
    mul_times.push_back(elapsed_seconds(start, stop));

    // Add
    start = hr_clock::now();
    stream->add();
    stop = hr_clock::now();
    add_times.push_back(elapsed_seconds(start, stop));

    // Triad
    start = hr_clock::now();
    stream->triad();
    stop = hr_clock::now();
    triad_times.push_back(elapsed_seconds(start, stop));

    // Dot: the reduction result is handed back to the caller through sum
    start = hr_clock::now();
    sum = stream->dot();
    stop = hr_clock::now();
    dot_times.push_back(elapsed_seconds(start, stop));
  }

  // Returned by value; the compiler is expected to move rather than copy
  return timings;
}
// Run only the Triad kernel.
//
// The kernel is invoked num_times times back to back and the whole loop is
// timed as one interval, so the result holds a single row with a single
// entry: the total elapsed time in seconds for all iterations.
template <typename T>
std::vector<std::vector<double>> run_triad(Stream<T> *stream)
{
  typedef std::chrono::high_resolution_clock hr_clock;

  // Time the complete loop rather than each individual call
  const hr_clock::time_point start = hr_clock::now();
  for (unsigned int remaining = num_times; remaining > 0; --remaining)
  {
    stream->triad();
  }
  const hr_clock::time_point stop = hr_clock::now();

  const double total_seconds =
    std::chrono::duration_cast<std::chrono::duration<double> >(stop - start).count();

  // Single row, single entry: the caller reads it as timings[0][0]
  std::vector<std::vector<double>> timings(1);
  timings.front().push_back(total_seconds);
  return timings;
}
// Generic run routine
// Runs the kernel(s) and prints output.
template <typename T> template <typename T>
void run() void run()
{ {
@ -102,7 +176,14 @@ void run()
if (!output_as_csv) if (!output_as_csv)
{ {
if (selection == Benchmark::All)
std::cout << "Running kernels " << num_times << " times" << std::endl; std::cout << "Running kernels " << num_times << " times" << std::endl;
else if (selection == Benchmark::Triad)
{
std::cout << "Running triad " << num_times << " times" << std::endl;
std::cout << "Number of elements: " << ARRAY_SIZE << std::endl;
}
if (sizeof(T) == sizeof(float)) if (sizeof(T) == sizeof(float))
std::cout << "Precision: float" << std::endl; std::cout << "Precision: float" << std::endl;
@ -182,49 +263,19 @@ void run()
stream->init_arrays(startA, startB, startC); stream->init_arrays(startA, startB, startC);
// Result of the Dot kernel // Result of the Dot kernel, if used.
T sum; T sum = 0.0;
// List of times std::vector<std::vector<double>> timings;
std::vector<std::vector<double>> timings(5);
// Declare timers switch (selection)
std::chrono::high_resolution_clock::time_point t1, t2;
// Main loop
for (unsigned int k = 0; k < num_times; k++)
{ {
// Execute Copy case Benchmark::All:
t1 = std::chrono::high_resolution_clock::now(); timings = run_all<T>(stream, sum);
stream->copy(); break;
t2 = std::chrono::high_resolution_clock::now(); case Benchmark::Triad:
timings[0].push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count()); timings = run_triad<T>(stream);
};
// Execute Mul
t1 = std::chrono::high_resolution_clock::now();
stream->mul();
t2 = std::chrono::high_resolution_clock::now();
timings[1].push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
// Execute Add
t1 = std::chrono::high_resolution_clock::now();
stream->add();
t2 = std::chrono::high_resolution_clock::now();
timings[2].push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
// Execute Triad
t1 = std::chrono::high_resolution_clock::now();
stream->triad();
t2 = std::chrono::high_resolution_clock::now();
timings[3].push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
// Execute Dot
t1 = std::chrono::high_resolution_clock::now();
sum = stream->dot();
t2 = std::chrono::high_resolution_clock::now();
timings[4].push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
}
// Check solutions // Check solutions
// Create host vectors // Create host vectors
@ -232,6 +283,7 @@ void run()
std::vector<T> b(ARRAY_SIZE); std::vector<T> b(ARRAY_SIZE);
std::vector<T> c(ARRAY_SIZE); std::vector<T> c(ARRAY_SIZE);
stream->read_arrays(a, b, c); stream->read_arrays(a, b, c);
check_solution<T>(num_times, a, b, c, sum); check_solution<T>(num_times, a, b, c, sum);
@ -261,6 +313,8 @@ void run()
} }
if (selection == Benchmark::All)
{
std::string labels[5] = {"Copy", "Mul", "Add", "Triad", "Dot"}; std::string labels[5] = {"Copy", "Mul", "Add", "Triad", "Dot"};
size_t sizes[5] = { size_t sizes[5] = {
@ -271,7 +325,7 @@ void run()
2 * sizeof(T) * ARRAY_SIZE 2 * sizeof(T) * ARRAY_SIZE
}; };
for (int i = 0; i < 5; i++) for (int i = 0; i < timings.size(); ++i)
{ {
// Get min/max; ignore the first result // Get min/max; ignore the first result
auto minmax = std::minmax_element(timings[i].begin()+1, timings[i].end()); auto minmax = std::minmax_element(timings[i].begin()+1, timings[i].end());
@ -305,118 +359,11 @@ void run()
<< std::endl; << std::endl;
} }
} }
} else if (selection == Benchmark::Triad)
delete stream;
}
template <typename T>
void run_triad()
{ {
if (!output_as_csv)
{
std::cout << "Running triad " << num_times << " times" << std::endl;
std::cout << "Number of elements: " << ARRAY_SIZE << std::endl;
if (sizeof(T) == sizeof(float))
std::cout << "Precision: float" << std::endl;
else
std::cout << "Precision: double" << std::endl;
std::streamsize ss = std::cout.precision();
if (mibibytes)
{
std::cout << std::setprecision(1) << std::fixed
<< "Array size: " << ARRAY_SIZE*sizeof(T)*pow(2.0, -10.0) << " KiB"
<< " (=" << ARRAY_SIZE*sizeof(T)*pow(2.0, -20.0) << " MiB)" << std::endl;
std::cout << "Total size: " << 3.0*ARRAY_SIZE*sizeof(T)*pow(2.0, -10.0) << " KiB"
<< " (=" << 3.0*ARRAY_SIZE*sizeof(T)*pow(2.0, -20.0) << " MiB)" << std::endl;
}
else
{
std::cout << std::setprecision(1) << std::fixed
<< "Array size: " << ARRAY_SIZE*sizeof(T)*1.0E-3 << " KB"
<< " (=" << ARRAY_SIZE*sizeof(T)*1.0E-6 << " MB)" << std::endl;
std::cout << "Total size: " << 3.0*ARRAY_SIZE*sizeof(T)*1.0E-3 << " KB"
<< " (=" << 3.0*ARRAY_SIZE*sizeof(T)*1.0E-6 << " MB)" << std::endl;
}
std::cout.precision(ss);
}
Stream<T> *stream;
#if defined(CUDA)
// Use the CUDA implementation
stream = new CUDAStream<T>(ARRAY_SIZE, deviceIndex);
#elif defined(HIP)
// Use the HIP implementation
stream = new HIPStream<T>(ARRAY_SIZE, deviceIndex);
#elif defined(OCL)
// Use the OpenCL implementation
stream = new OCLStream<T>(ARRAY_SIZE, deviceIndex);
#elif defined(USE_RAJA)
// Use the RAJA implementation
stream = new RAJAStream<T>(ARRAY_SIZE, deviceIndex);
#elif defined(KOKKOS)
// Use the Kokkos implementation
stream = new KokkosStream<T>(ARRAY_SIZE, deviceIndex);
#elif defined(ACC)
// Use the OpenACC implementation
stream = new ACCStream<T>(ARRAY_SIZE, deviceIndex);
#elif defined(STD)
// Use the STD implementation
stream = new STDStream<T>(ARRAY_SIZE, deviceIndex);
#elif defined(STD20)
// Use the C++20 implementation
stream = new STD20Stream<T>(ARRAY_SIZE, deviceIndex);
#elif defined(SYCL)
// Use the SYCL implementation
stream = new SYCLStream<T>(ARRAY_SIZE, deviceIndex);
#elif defined(OMP)
// Use the OpenMP implementation
stream = new OMPStream<T>(ARRAY_SIZE, deviceIndex);
#endif
stream->init_arrays(startA, startB, startC);
// Declare timers
std::chrono::high_resolution_clock::time_point t1, t2;
// Run triad in loop
t1 = std::chrono::high_resolution_clock::now();
for (unsigned int k = 0; k < num_times; k++)
{
stream->triad();
}
t2 = std::chrono::high_resolution_clock::now();
double runtime = std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count();
// Check solutions
// Create host vectors
std::vector<T> a(ARRAY_SIZE);
std::vector<T> b(ARRAY_SIZE);
std::vector<T> c(ARRAY_SIZE);
T sum = 0.0;
stream->read_arrays(a, b, c);
check_solution<T>(num_times, a, b, c, sum);
// Display timing results // Display timing results
double total_bytes = 3 * sizeof(T) * ARRAY_SIZE * num_times; double total_bytes = 3 * sizeof(T) * ARRAY_SIZE * num_times;
double bandwidth = ((mibibytes) ? pow(2.0, -30.0) : 1.0E-9) * (total_bytes / runtime); double bandwidth = ((mibibytes) ? pow(2.0, -30.0) : 1.0E-9) * (total_bytes / timings[0][0]);
if (output_as_csv) if (output_as_csv)
{ {
@ -434,7 +381,7 @@ void run_triad()
<< ARRAY_SIZE << csv_separator << ARRAY_SIZE << csv_separator
<< sizeof(T) << csv_separator << sizeof(T) << csv_separator
<< bandwidth << csv_separator << bandwidth << csv_separator
<< runtime << timings[0][0]
<< std::endl; << std::endl;
} }
else else
@ -443,15 +390,18 @@ void run_triad()
<< "--------------------------------" << "--------------------------------"
<< std::endl << std::fixed << std::endl << std::fixed
<< "Runtime (seconds): " << std::left << std::setprecision(5) << "Runtime (seconds): " << std::left << std::setprecision(5)
<< runtime << std::endl << timings[0][0] << std::endl
<< "Bandwidth (" << ((mibibytes) ? "GiB/s" : "GB/s") << "): " << "Bandwidth (" << ((mibibytes) ? "GiB/s" : "GB/s") << "): "
<< std::left << std::setprecision(3) << std::left << std::setprecision(3)
<< bandwidth << std::endl; << bandwidth << std::endl;
} }
}
delete stream; delete stream;
} }
template <typename T> template <typename T>
void check_solution(const unsigned int ntimes, std::vector<T>& a, std::vector<T>& b, std::vector<T>& c, T& sum) void check_solution(const unsigned int ntimes, std::vector<T>& a, std::vector<T>& b, std::vector<T>& c, T& sum)
{ {
@ -466,7 +416,7 @@ void check_solution(const unsigned int ntimes, std::vector<T>& a, std::vector<T>
for (unsigned int i = 0; i < ntimes; i++) for (unsigned int i = 0; i < ntimes; i++)
{ {
// Do STREAM! // Do STREAM!
if (!triad_only) if (! (selection == Benchmark::Triad))
{ {
goldC = goldA; goldC = goldA;
goldB = scalar * goldC; goldB = scalar * goldC;
@ -502,7 +452,7 @@ void check_solution(const unsigned int ntimes, std::vector<T>& a, std::vector<T>
<< "Validation failed on c[]. Average error " << errC << "Validation failed on c[]. Average error " << errC
<< std::endl; << std::endl;
// Check sum to 8 decimal places // Check sum to 8 decimal places
if (!triad_only && errSum > 1.0E-8) if (!(selection == Benchmark::Triad) && errSum > 1.0E-8)
std::cerr std::cerr
<< "Validation failed on sum. Error " << errSum << "Validation failed on sum. Error " << errSum
<< std::endl << std::setprecision(15) << std::endl << std::setprecision(15)
@ -571,7 +521,7 @@ void parseArguments(int argc, char *argv[])
} }
else if (!std::string("--triad-only").compare(argv[i])) else if (!std::string("--triad-only").compare(argv[i]))
{ {
triad_only = true; selection = Benchmark::Triad;
} }
else if (!std::string("--csv").compare(argv[i])) else if (!std::string("--csv").compare(argv[i]))
{ {