Merge branch 'main' into sycl-2020

2021-02-22 15:15:32 +00:00 · 2021-02-22 15:15:32 +00:00 · 89c676ac91
commit 89c676ac91
parent ae8bd6081b 683b8fcf88
6 changed files with 221 additions and 230 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -23,6 +23,8 @@ All notable changes to this project will be documented in this file.
 - Cray compiler OpenMP flags updated.
 - Clang compiler OpenMP flags corrected for NVIDIA target.
 - Reorder OpenCL objects in class so destructors are called in safe order.
 - Ensure all OpenCL kernels are present in destructor.
 - Unified run function in driver code to reduce code duplication; output should be unaffected.
 ### Removed
 - Pre-building of kernels in SYCL version to ensure compatibility with SYCL 1.2.1.
--- a/CUDA.make
+++ b/CUDA.make
@ -1,8 +1,38 @@
 CXXFLAGS=-O3
 CUDA_CXX=nvcc
 ifndef NVARCH
 define nvarch_help
 Set NVARCH to select sm_?? version.
 Default: sm_60
 endef
 $(info $(nvarch_help))
 NVARCH=sm_60
 endif
 ifndef MEM
 define mem_help
 Set MEM to select memory mode.
 Available options:
  DEFAULT   - allocate host and device memory pointers.
  MANAGED   - use CUDA Managed Memory.
  PAGEFAULT - shared memory, only host pointers allocated.
 endef
 $(info $(mem_help))
 MEM=DEFAULT
 endif
 MEM_MANAGED= -DMANAGED
 MEM_PAGEFAULT= -DPAGEFAULT
 MEM_MODE = $(MEM_$(MEM))
 cuda-stream: main.cpp CUDAStream.cu
-	$(CUDA_CXX) -std=c++11 $(CXXFLAGS) -DCUDA $^ $(EXTRA_FLAGS) -o $@
+	$(CUDA_CXX) -std=c++11 $(CXXFLAGS) -arch=$(NVARCH) $(MEM_MODE) -DCUDA $^ $(EXTRA_FLAGS) -o $@
 .PHONY: clean
 clean:
--- a/OCLStream.cpp
+++ b/OCLStream.cpp
@ -186,6 +186,7 @@ OCLStream<T>::~OCLStream()
  delete mul_kernel;
  delete add_kernel;
  delete triad_kernel;
  delete dot_kernel;
  devices.clear();
 }
--- a/OpenMP.make
+++ b/OpenMP.make
@ -80,7 +80,7 @@ OMP_CLANG_NVIDIA = -DOMP_TARGET_GPU -fopenmp=libomp -fopenmp-targets=nvptx64-nvi
 OMP_GNU_NVIDIA = -DOMP_TARGET_GPU -fopenmp -foffload=nvptx-none
 OMP_GNU_AMD = -DOMP_TARGET_GPU -fopenmp -foffload=amdgcn-amdhsa
-OMP_INTEL_CPU = -xHOST -qopt-streaming-stores=always
+OMP_INTEL_CPU = -xHOST -qopt-streaming-stores=always -qopenmp
 OMP_INTEL_INTEL_GPU = -DOMP_TARGET_GPU -qnextgen -fiopenmp -fopenmp-targets=spir64
 OMP_AOMP_GPU = -DOMP_TARGET_GPU -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx906
--- a/README.md
+++ b/README.md
@ -91,6 +91,14 @@ Results
 Sample results can be found in the `results` subdirectory. If you would like to submit updated results, please submit a Pull Request.
 Contributing
 ------------
 As of v4.0, the `main` branch of this repository will hold the latest released version.
 The `develop` branch will contain unreleased features due for the next (major and/or minor) release of BabelStream.
 Pull Requests should be made against the `develop` branch.
 Citing
 ------
--- a/main.cpp
+++ b/main.cpp
@ -48,7 +48,6 @@ int ARRAY_SIZE = 33554432;
 unsigned int num_times = 100;
 unsigned int deviceIndex = 0;
 bool use_float = false;
 bool triad_only = false;
 bool output_as_csv = false;
 bool mibibytes = false;
 std::string csv_separator = ",";
@ -62,6 +61,14 @@ void run();
 template <typename T>
 void run_triad();
 // Options for running the benchmark:
 // - All 5 kernels (Copy, Add, Mul, Triad, Dot).
 // - Triad only.
 enum class Benchmark {All, Triad};
 // Selected run options.
 Benchmark selection = Benchmark::All;
 void parseArguments(int argc, char *argv[]);
 int main(int argc, char *argv[])
@ -77,24 +84,91 @@ int main(int argc, char *argv[])
      << "Implementation: " << IMPLEMENTATION_STRING << std::endl;
  }
  // TODO: Fix Kokkos to allow multiple template specializations
  if (triad_only)
  {
    if (use_float)
      run_triad<float>();
    else
      run_triad<double>();
  }
  else
  {
  if (use_float)
    run<float>();
  else
    run<double>(); 
  }
 }
 // Run the 5 main kernels (Copy, Mul, Add, Triad, Dot), timing every call.
 // Returns five lists of per-iteration times in seconds, stored in the order
 // Copy, Mul, Add, Triad, Dot. Each Dot call overwrites sum with its result.
 template <typename T>
 std::vector<std::vector<double>> run_all(Stream<T> *stream, T& sum)
 {
  using hr_clock = std::chrono::high_resolution_clock;
  using seconds_d = std::chrono::duration<double>;

  // One list of samples per kernel
  std::vector<std::vector<double>> timings(5);

  // Start/stop stamps, reused for every measurement
  hr_clock::time_point begin, end;

  // Each iteration runs the kernels back-to-back in a fixed order
  for (unsigned int iter = 0; iter < num_times; ++iter)
  {
    // Copy
    begin = hr_clock::now();
    stream->copy();
    end = hr_clock::now();
    timings[0].push_back(std::chrono::duration_cast<seconds_d>(end - begin).count());

    // Mul
    begin = hr_clock::now();
    stream->mul();
    end = hr_clock::now();
    timings[1].push_back(std::chrono::duration_cast<seconds_d>(end - begin).count());

    // Add
    begin = hr_clock::now();
    stream->add();
    end = hr_clock::now();
    timings[2].push_back(std::chrono::duration_cast<seconds_d>(end - begin).count());

    // Triad
    begin = hr_clock::now();
    stream->triad();
    end = hr_clock::now();
    timings[3].push_back(std::chrono::duration_cast<seconds_d>(end - begin).count());

    // Dot (its return value is handed back to the caller through sum)
    begin = hr_clock::now();
    sum = stream->dot();
    end = hr_clock::now();
    timings[4].push_back(std::chrono::duration_cast<seconds_d>(end - begin).count());
  }

  // Returned by value; the local is eligible for a move
  return timings;
 }
 // Run the Triad kernel num_times times, timing the loop as a whole.
 // Returns a single list holding one value: the total loop runtime in seconds.
 template <typename T>
 std::vector<std::vector<double>> run_triad(Stream<T> *stream)
 {
  using hr_clock = std::chrono::high_resolution_clock;
  using seconds_d = std::chrono::duration<double>;

  // The stamps bracket the entire loop, not the individual calls
  const hr_clock::time_point begin = hr_clock::now();
  for (unsigned int iter = 0; iter < num_times; ++iter)
  {
    stream->triad();
  }
  const hr_clock::time_point end = hr_clock::now();

  const double elapsed = std::chrono::duration_cast<seconds_d>(end - begin).count();

  // Same shape as the multi-kernel result: one list with one sample
  std::vector<std::vector<double>> timings(1, std::vector<double>(1, elapsed));
  return timings;
 }
 // Generic run routine
 // Runs the kernel(s) and prints output.
 template <typename T>
 void run()
 {
@ -102,7 +176,14 @@ void run()
  if (!output_as_csv)
  {
    if (selection == Benchmark::All)
      std::cout << "Running kernels " << num_times << " times" << std::endl;
    else if (selection == Benchmark::Triad)
    {
      std::cout << "Running triad " << num_times << " times" << std::endl;
      std::cout << "Number of elements: " << ARRAY_SIZE << std::endl;
    }
    if (sizeof(T) == sizeof(float))
      std::cout << "Precision: float" << std::endl;
@ -182,49 +263,19 @@ void run()
  stream->init_arrays(startA, startB, startC);
-  // Result of the Dot kernel
+  // Result of the Dot kernel, if used.
-  T sum;
+  T sum = 0.0;
-  // List of times
+  std::vector<std::vector<double>> timings;
  std::vector<std::vector<double>> timings(5);
-  // Declare timers
+  switch (selection)
  std::chrono::high_resolution_clock::time_point t1, t2;
  // Main loop
  for (unsigned int k = 0; k < num_times; k++)
  {
-    // Execute Copy
+    case Benchmark::All:
-    t1 = std::chrono::high_resolution_clock::now();
+      timings = run_all<T>(stream, sum);
-    stream->copy();
+      break;
-    t2 = std::chrono::high_resolution_clock::now();
+    case Benchmark::Triad:
-    timings[0].push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
+      timings = run_triad<T>(stream);
-
+  };
    // Execute Mul
    t1 = std::chrono::high_resolution_clock::now();
    stream->mul();
    t2 = std::chrono::high_resolution_clock::now();
    timings[1].push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
    // Execute Add
    t1 = std::chrono::high_resolution_clock::now();
    stream->add();
    t2 = std::chrono::high_resolution_clock::now();
    timings[2].push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
    // Execute Triad
    t1 = std::chrono::high_resolution_clock::now();
    stream->triad();
    t2 = std::chrono::high_resolution_clock::now();
    timings[3].push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
    // Execute Dot
    t1 = std::chrono::high_resolution_clock::now();
    sum = stream->dot();
    t2 = std::chrono::high_resolution_clock::now();
    timings[4].push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
  }
  // Check solutions
  // Create host vectors
@ -232,6 +283,7 @@ void run()
  std::vector<T> b(ARRAY_SIZE);
  std::vector<T> c(ARRAY_SIZE);
  stream->read_arrays(a, b, c);
  check_solution<T>(num_times, a, b, c, sum);
@ -261,6 +313,8 @@ void run()
  }
  if (selection == Benchmark::All)
  {
    std::string labels[5] = {"Copy", "Mul", "Add", "Triad", "Dot"};
    size_t sizes[5] = {
@ -271,7 +325,7 @@ void run()
      2 * sizeof(T) * ARRAY_SIZE
    };
-  for (int i = 0; i < 5; i++)
+    for (int i = 0; i < timings.size(); ++i)
    {
      // Get min/max; ignore the first result
      auto minmax = std::minmax_element(timings[i].begin()+1, timings[i].end());
@ -305,118 +359,11 @@ void run()
          << std::endl;
      }
    }
-
+  } else if (selection == Benchmark::Triad)
  delete stream;
 }
 template <typename T>
 void run_triad()
 {
  if (!output_as_csv)
  {
    std::cout << "Running triad " << num_times << " times" << std::endl;
    std::cout << "Number of elements: " << ARRAY_SIZE << std::endl;
    if (sizeof(T) == sizeof(float))
      std::cout << "Precision: float" << std::endl;
    else
      std::cout << "Precision: double" << std::endl;
    std::streamsize ss = std::cout.precision();
    if (mibibytes)
    {
      std::cout << std::setprecision(1) << std::fixed
        << "Array size: " << ARRAY_SIZE*sizeof(T)*pow(2.0, -10.0) << " KiB"
        << " (=" << ARRAY_SIZE*sizeof(T)*pow(2.0, -20.0) << " MiB)" << std::endl;
      std::cout << "Total size: " << 3.0*ARRAY_SIZE*sizeof(T)*pow(2.0, -10.0) << " KiB"
        << " (=" << 3.0*ARRAY_SIZE*sizeof(T)*pow(2.0, -20.0) << " MiB)" << std::endl;
    }
    else
    {
      std::cout << std::setprecision(1) << std::fixed
        << "Array size: " << ARRAY_SIZE*sizeof(T)*1.0E-3 << " KB"
        << " (=" << ARRAY_SIZE*sizeof(T)*1.0E-6 << " MB)" << std::endl;
      std::cout << "Total size: " << 3.0*ARRAY_SIZE*sizeof(T)*1.0E-3 << " KB"
        << " (=" << 3.0*ARRAY_SIZE*sizeof(T)*1.0E-6 << " MB)" << std::endl;
    }
    std::cout.precision(ss);
  }
  Stream<T> *stream;
 #if defined(CUDA)
  // Use the CUDA implementation
  stream = new CUDAStream<T>(ARRAY_SIZE, deviceIndex);
 #elif defined(HIP)
  // Use the HIP implementation
  stream = new HIPStream<T>(ARRAY_SIZE, deviceIndex);
 #elif defined(OCL)
  // Use the OpenCL implementation
  stream = new OCLStream<T>(ARRAY_SIZE, deviceIndex);
 #elif defined(USE_RAJA)
  // Use the RAJA implementation
  stream = new RAJAStream<T>(ARRAY_SIZE, deviceIndex);
 #elif defined(KOKKOS)
  // Use the Kokkos implementation
  stream = new KokkosStream<T>(ARRAY_SIZE, deviceIndex);
 #elif defined(ACC)
  // Use the OpenACC implementation
  stream = new ACCStream<T>(ARRAY_SIZE, deviceIndex);
 #elif defined(STD)
  // Use the STD implementation
  stream = new STDStream<T>(ARRAY_SIZE, deviceIndex);
 #elif defined(STD20)
  // Use the C++20 implementation
  stream = new STD20Stream<T>(ARRAY_SIZE, deviceIndex);
 #elif defined(SYCL)
  // Use the SYCL implementation
  stream = new SYCLStream<T>(ARRAY_SIZE, deviceIndex);
 #elif defined(OMP)
  // Use the OpenMP implementation
  stream = new OMPStream<T>(ARRAY_SIZE, deviceIndex);
 #endif
  stream->init_arrays(startA, startB, startC);
  // Declare timers
  std::chrono::high_resolution_clock::time_point t1, t2;
  // Run triad in loop
  t1 = std::chrono::high_resolution_clock::now();
  for (unsigned int k = 0; k < num_times; k++)
  {
    stream->triad();
  }
  t2 = std::chrono::high_resolution_clock::now();
  double runtime = std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count();
  // Check solutions
  // Create host vectors
  std::vector<T> a(ARRAY_SIZE);
  std::vector<T> b(ARRAY_SIZE);
  std::vector<T> c(ARRAY_SIZE);
  T sum = 0.0;
  stream->read_arrays(a, b, c);
  check_solution<T>(num_times, a, b, c, sum);
    // Display timing results
    double total_bytes = 3 * sizeof(T) * ARRAY_SIZE * num_times;
-  double bandwidth = ((mibibytes) ? pow(2.0, -30.0) : 1.0E-9) * (total_bytes / runtime);
+    double bandwidth = ((mibibytes) ? pow(2.0, -30.0) : 1.0E-9) * (total_bytes / timings[0][0]);
    if (output_as_csv)
    {
@ -434,7 +381,7 @@ void run_triad()
        << ARRAY_SIZE << csv_separator
        << sizeof(T) << csv_separator
        << bandwidth << csv_separator
-      << runtime
+        << timings[0][0]
        << std::endl;
    }
    else
@ -443,15 +390,18 @@ void run_triad()
        << "--------------------------------"
        << std::endl << std::fixed
        << "Runtime (seconds): " << std::left << std::setprecision(5)
-      << runtime << std::endl
+        << timings[0][0] << std::endl
        << "Bandwidth (" << ((mibibytes) ? "GiB/s" : "GB/s") << "):  "
        << std::left << std::setprecision(3)
        << bandwidth << std::endl;
    }
  }
  delete stream;
 }
 template <typename T>
 void check_solution(const unsigned int ntimes, std::vector<T>& a, std::vector<T>& b, std::vector<T>& c, T& sum)
 {
@ -466,7 +416,7 @@ void check_solution(const unsigned int ntimes, std::vector<T>& a, std::vector<T>
  for (unsigned int i = 0; i < ntimes; i++)
  {
    // Do STREAM!
-    if (!triad_only)
+    if (! (selection == Benchmark::Triad))
    {
      goldC = goldA;
      goldB = scalar * goldC;
@ -502,7 +452,7 @@ void check_solution(const unsigned int ntimes, std::vector<T>& a, std::vector<T>
      << "Validation failed on c[]. Average error " << errC
      << std::endl;
  // Check sum to 8 decimal places
-  if (!triad_only && errSum > 1.0E-8)
+  if (!(selection == Benchmark::Triad) && errSum > 1.0E-8)
    std::cerr
      << "Validation failed on sum. Error " << errSum
      << std::endl << std::setprecision(15)
@ -571,7 +521,7 @@ void parseArguments(int argc, char *argv[])
    }
    else if (!std::string("--triad-only").compare(argv[i]))
    {
-      triad_only = true;
+      selection = Benchmark::Triad;
    }
    else if (!std::string("--csv").compare(argv[i]))
    {