From 7621f867017e1c4e4a503af56744fb186698f360 Mon Sep 17 00:00:00 2001 From: Peter Steinbach Date: Tue, 3 Jan 2017 11:43:12 +0100 Subject: [PATCH] added pure HC gpu stream implementation --- CMakeLists.txt | 29 +++++++ HCStream.cpp | 213 +++++++++++++++++++++++++++++++++++++++++++++++++ HCStream.h | 44 ++++++++++ main.cpp | 6 ++ 4 files changed, 292 insertions(+) create mode 100644 HCStream.cpp create mode 100644 HCStream.h diff --git a/CMakeLists.txt b/CMakeLists.txt index efee733..5551c54 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -10,6 +10,15 @@ if(NOT DEFINED HIP_PATH) endif() set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH}) +if(NOT DEFINED HCC_PATH) + if(NOT DEFINED ENV{HCC_PATH}) + set(HCC_PATH "/opt/rocm/" CACHE PATH "Path to which HCC has been installed") + else() + set(HCC_PATH $ENV{HCC_PATH} CACHE PATH "Path to which HCC has been installed") + endif() +endif() + + set(CMAKE_CXX_STANDARD 11) set(CMAKE_CXX_STANDARD_REQUIRED ON) @@ -49,6 +58,26 @@ else() message("Skipping HIP...") endif() +#------------------------------------------------------------------------------- +# HCC +#------------------------------------------------------------------------------- +find_program(HCC_BINARY hcc HINTS ${HCC_PATH}/bin PATHS ${HCC_PATH}/bin) +if(EXISTS ${HCC_BINARY}) + #can the following be tied to the target only? 
+ set(CMAKE_CXX_COMPILER ${HCC_BINARY}) + # list(APPEND CMAKE_CXX_FLAGS -hc -I /usr/include/c++/v1 -std=c++11 -stdlib=libc++ -I/opt/rocm/hcc-lc/include) + # list(APPEND CMAKE_EXE_LINKER_FLAGS -hc -std=c++11 -L/opt/rocm/hcc-lc/lib -Wl,--rpath=/opt/rocm/hcc-lc/lib -lc++ -lc++abi -ldl -lpthread -Wl,--whole-archive -lmcwamp -Wl,--no-whole-archive -mcmodel=small ) + add_executable(gpu-stream-hc main.cpp HCStream.cpp) + set_target_properties(gpu-stream-hc PROPERTIES + COMPILE_FLAGS "-hc -I /usr/include/c++/v1 -std=c++11 -stdlib=libc++ -I/opt/rocm/hcc-lc/include" + LINK_FLAGS "-hc -std=c++11 -L/opt/rocm/hcc-lc/lib -Wl,--rpath=/opt/rocm/hcc-lc/lib -lc++ -lc++abi -ldl -lpthread -Wl,--whole-archive -lmcwamp -Wl,--no-whole-archive -mcmodel=small" + ) + target_compile_definitions(gpu-stream-hc PUBLIC HC) +else() + message("Skipping HC...") +endif() + + #------------------------------------------------------------------------------- # CUDA #------------------------------------------------------------------------------- diff --git a/HCStream.cpp b/HCStream.cpp new file mode 100644 index 0000000..0c55af3 --- /dev/null +++ b/HCStream.cpp @@ -0,0 +1,213 @@ +// Copyright (c) 2015-16 Peter Steinbach, MPI CBG Scientific Computing Facility +// +// For full license terms please see the LICENSE file distributed with this +// source code + + +#include +#include +#include + + +#include "HCStream.h" +//#include "hc.hpp" + +#define TBSIZE 1024 + +std::string getDeviceName(const hc::accelerator& _acc) +{ + std::wstring_convert, wchar_t> converter; + std::string value = converter.to_bytes(_acc.get_description()); + return value; +} + +void listDevices(void) +{ + // Get number of devices + std::vector accs = hc::accelerator::get_all(); + + // Print device names + if (accs.empty()) + { + std::cerr << "No devices found." 
<< std::endl; + } + else + { + std::cout << std::endl; + std::cout << "Devices:" << std::endl; + for (int i = 0; i < accs.size(); i++) + { + std::cout << i << ": " << getDeviceName(accs[i]) << std::endl; + } + std::cout << std::endl; + } +} + +// void check_error(void) +// { +// hipError_t err = hipGetLastError(); +// if (err != hipSuccess) +// { +// std::cerr << "Error: " << hipGetErrorString(err) << std::endl; +// exit(err); +// } +// } + +template +HCStream::HCStream(const unsigned int ARRAY_SIZE, const int device_index): + array_size(ARRAY_SIZE), + d_a(ARRAY_SIZE), + d_b(ARRAY_SIZE), + d_c(ARRAY_SIZE) +{ + + // The array size must be divisible by TBSIZE for kernel launches + if (ARRAY_SIZE % TBSIZE != 0) + { + std::stringstream ss; + ss << "Array size must be a multiple of " << TBSIZE; + throw std::runtime_error(ss.str()); + } + + // // Set device + std::vector accs = hc::accelerator::get_all(); + auto current = accs[device_index]; + + std::cout << "Using HC device " << getDeviceName(current) << std::endl; + + // // The array size must be divisible by TBSIZE for kernel launches + // if (ARRAY_SIZE % TBSIZE != 0) + // { + // std::stringstream ss; + // ss << "Array size must be a multiple of " << TBSIZE; + // throw std::runtime_error(ss.str()); + // } + + // // Set device + // int count; + // hipGetDeviceCount(&count); + // check_error(); + // if (device_index >= count) + // throw std::runtime_error("Invalid device index"); + // hipSetDevice(device_index); + // check_error(); + + // // Print out device information + // std::cout << "Using HIP device " << getDeviceName(device_index) << std::endl; + // std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl; + + // array_size = ARRAY_SIZE; + + // // Check buffers fit on the device + // hipDeviceProp_t props; + // hipGetDeviceProperties(&props, 0); + // if (props.totalGlobalMem < 3*ARRAY_SIZE*sizeof(T)) + // throw std::runtime_error("Device does not have enough memory for all 3 buffers"); + + // // 
Create device buffers + // hipMalloc(&d_a, ARRAY_SIZE*sizeof(T)); + // check_error(); + // hipMalloc(&d_b, ARRAY_SIZE*sizeof(T)); + // check_error(); + // hipMalloc(&d_c, ARRAY_SIZE*sizeof(T)); + // check_error(); + + +} + + +template +HCStream::~HCStream() +{ +} + +template +void HCStream::write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) +{ + hc::copy(a.cbegin(),a.cend(),d_a); + hc::copy(b.cbegin(),b.cend(),d_b); + hc::copy(c.cbegin(),c.cend(),d_c); +} + +template +void HCStream::read_arrays(std::vector& a, std::vector& b, std::vector& c) +{ + // Copy device memory to host + hc::copy(d_a,a.begin()); + hc::copy(d_b,b.begin()); + hc::copy(d_c,c.begin()); +} + + +template +void HCStream::copy() +{ + try{ + // launch a GPU kernel to compute the saxpy in parallel + hc::completion_future future_kernel = hc::parallel_for_each(hc::extent<1>(array_size) + , [&](hc::index<1> i) [[hc]] { + d_c[i] = d_a[i]; + }); + future_kernel.wait(); + } + catch(std::exception& e){ + std::cout << e.what() << std::endl; + throw; + } +} + +template +void HCStream::mul() +{ + const T scalar = 0.3; + try{ + // launch a GPU kernel to compute the saxpy in parallel + hc::completion_future future_kernel = hc::parallel_for_each(hc::extent<1>(array_size) + , [&](hc::index<1> i) [[hc]] { + d_b[i] = scalar*d_c[i]; + }); + future_kernel.wait(); + } + catch(std::exception& e){ + std::cout << e.what() << std::endl; + throw; + } +} + +template +void HCStream::add() +{ + try{ + // launch a GPU kernel to compute the saxpy in parallel + hc::completion_future future_kernel = hc::parallel_for_each(hc::extent<1>(array_size) + , [&](hc::index<1> i) [[hc]] { + d_c[i] = d_a[i]+d_b[i]; + }); + future_kernel.wait(); + } + catch(std::exception& e){ + std::cout << e.what() << std::endl; + throw; + } +} + +template +void HCStream::triad() +{ + const T scalar = 0.3; + try{ + // launch a GPU kernel to compute the saxpy in parallel + hc::completion_future future_kernel = 
hc::parallel_for_each(hc::extent<1>(array_size) + , [&](hc::index<1> i) [[hc]] { + d_a[i] = d_b[i] + scalar*d_c[i]; + }); + future_kernel.wait(); + } + catch(std::exception& e){ + std::cout << e.what() << std::endl; + throw; + } +} + +template class HCStream; +template class HCStream; diff --git a/HCStream.h b/HCStream.h new file mode 100644 index 0000000..a625be0 --- /dev/null +++ b/HCStream.h @@ -0,0 +1,44 @@ + +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#pragma once + +#include +#include +#include + +#include "Stream.h" +#include "hc.hpp" + +#define IMPLEMENTATION_STRING "HC" + +template +class HCStream : public Stream +{ +protected: + // Size of arrays + unsigned int array_size; + // Device side pointers to arrays + hc::array d_a; + hc::array d_b; + hc::array d_c; + + +public: + + HCStream(const unsigned int, const int); + ~HCStream(); + + virtual void copy() override; + virtual void add() override; + virtual void mul() override; + virtual void triad() override; + + virtual void write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) override; + virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; + +}; diff --git a/main.cpp b/main.cpp index 6a15aa7..50ab648 100644 --- a/main.cpp +++ b/main.cpp @@ -22,6 +22,8 @@ #include "CUDAStream.h" #elif defined(HIP) #include "HIPStream.h" +#elif defined(HC) +#include "HCStream.h" #elif defined(OCL) #include "OCLStream.h" #elif defined(USE_RAJA) @@ -105,6 +107,10 @@ void run() // Use the HIP implementation stream = new HIPStream(ARRAY_SIZE, deviceIndex); +#elif defined(HC) + // Use the HC implementation + stream = new HCStream(ARRAY_SIZE, deviceIndex); + #elif defined(OCL) // Use the OpenCL implementation stream = new OCLStream(ARRAY_SIZE, deviceIndex);