From fdeb20601fc33d76383b2f833cb902f80225d025 Mon Sep 17 00:00:00 2001
From: sunway513 <peng.sun@amd.com>
Date: Mon, 14 Mar 2016 11:44:30 -0500
Subject: [PATCH] Pull request for HIP version

---
 Makefile                            |  18 +-
 README.md                           |   7 +
 hip-stream.cpp                      | 398 ++++++++++++++++++++++++++++
 results/cuda/nvidia-gtx-titan_x.txt |  15 ++
 results/hip/amd-fiji-nano.txt       |  15 ++
 results/hip/nvidia-gtx-titan_x.txt  |  22 ++
 6 files changed, 473 insertions(+), 2 deletions(-)
 create mode 100644 hip-stream.cpp
 create mode 100644 results/cuda/nvidia-gtx-titan_x.txt
 create mode 100644 results/hip/amd-fiji-nano.txt
 create mode 100644 results/hip/nvidia-gtx-titan_x.txt

diff --git a/Makefile b/Makefile
index 4fb5f7a..c2bc9b2 100644
--- a/Makefile
+++ b/Makefile
@@ -6,7 +6,8 @@ ifeq ($(PLATFORM), Darwin)
 	LDLIBS = -framework OpenCL
 endif
 
-all: gpu-stream-ocl gpu-stream-cuda
+all: gpu-stream-ocl gpu-stream-cuda gpu-stream-hip
+
 
 gpu-stream-ocl: ocl-stream.cpp common.o Makefile
 	$(CXX) $(CXXFLAGS) -Wno-deprecated-declarations common.o $< -o $@ $(LDLIBS)
@@ -19,9 +20,22 @@ ifeq ($(shell which nvcc > /dev/null; echo $$?), 0)
 else
 	$(error "Cannot find nvcc, please install CUDA toolkit")
 endif
+HIP_PATH?=../../..
+HIPCC=$(HIP_PATH)/bin/hipcc
+
+hip-stream.o : hip-stream.cpp
+	$(HIPCC) $(CXXFLAGS) -c $< -o $@
+
+gpu-stream-hip: hip-stream.o common.o Makefile
+ifeq ($(shell which $(HIPCC) > /dev/null; echo $$?), 0)
+	$(HIPCC) $(CXXFLAGS) common.o $< -lm -o $@
+else
+	$(error "Cannot find $(HIPCC), please install HIP toolkit")
+endif
+
 
 .PHONY: clean
 
 clean:
-	rm -f gpu-stream-ocl gpu-stream-cuda *.o
+	rm -f gpu-stream-ocl gpu-stream-cuda gpu-stream-hip *.o
 
diff --git a/README.md b/README.md
index 61907e1..a2f6ab6 100644
--- a/README.md
+++ b/README.md
@@ -13,6 +13,13 @@ Build the OpenCL and CUDA binaries with `make` (CUDA version requires CUDA >= v6
 
 Run the OpenCL version with `./gpu-stream-ocl` and the CUDA version with `./gpu-stream-cuda`
 
+For HIP version, follow the instructions on the following blog to properly install ROCK and ROCR drivers:
+http://gpuopen.com/getting-started-with-boltzmann-components-platforms-installation/
+Clone from the HIP repository in the following link:
+https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP
+
+Build the HIP binaries with make gpu-stream-hip, run it with './gpu-stream-hip'
+
 Android
 -------
 
diff --git a/hip-stream.cpp b/hip-stream.cpp
new file mode 100644
index 0000000..ddbcae4
--- /dev/null
+++ b/hip-stream.cpp
@@ -0,0 +1,398 @@
+#include "hip_runtime.h"
+/*=============================================================================
+*------------------------------------------------------------------------------
+* Copyright 2015: Tom Deakin, Simon McIntosh-Smith, University of Bristol HPC
+* Based on John D. McCalpin’s original STREAM benchmark for CPUs
+*------------------------------------------------------------------------------
+* License:
+*  1. You are free to use this program and/or to redistribute
+*     this program.
+*  2. You are free to modify this program for your own use,
+*     including commercial use, subject to the publication
+*     restrictions in item 3.
+*  3. You are free to publish results obtained from running this
+*     program, or from works that you derive from this program,
+*     with the following limitations:
+*     3a. In order to be referred to as "GPU-STREAM benchmark results",
+*         published results must be in conformance to the GPU-STREAM
+*         Run Rules published at
+*         http://github.com/UoB-HPC/GPU-STREAM/wiki/Run-Rules
+*         and incorporated herein by reference.
+*         The copyright holders retain the
+*         right to determine conformity with the Run Rules.
+*     3b. Results based on modified source code or on runs not in
+*         accordance with the GPU-STREAM Run Rules must be clearly
+*         labelled whenever they are published.  Examples of
+*         proper labelling include:
+*         "tuned GPU-STREAM benchmark results"
+*         "based on a variant of the GPU-STREAM benchmark code"
+*         Other comparable, clear and reasonable labelling is
+*         acceptable.
+*     3c. Submission of results to the GPU-STREAM benchmark web site
+*         is encouraged, but not required.
+*  4. Use of this program or creation of derived works based on this
+*     program constitutes acceptance of these licensing restrictions.
+*  5. Absolutely no warranty is expressed or implied.
+*———————————————————————————————————-----------------------------------------*/
+
+
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <chrono>
+#include <cfloat>
+#include <cmath>
+
+#include <cuda.h>
+#include "common.h"
+
+std::string getDeviceName(int device);
+int getDriver(void);
+
+// Code to check CUDA errors
+void check_cuda_error(void)
+{
+    hipError_t err = hipGetLastError();
+    if (err != hipSuccess)
+    {
+        std::cerr
+            << "Error: "
+            << hipGetErrorString(err)
+            << std::endl;
+            exit(err);
+    }
+}
+
+template <typename T>
+__global__ void copy(hipLaunchParm lp, const T * a, T * c)
+{
+    const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
+    c[i] = a[i];
+}
+
+template <typename T>
+__global__ void mul(hipLaunchParm lp, T * b, const T * c)
+{
+    const T scalar = 3.0;
+    const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
+    b[i] = scalar * c[i];
+}
+
+template <typename T>
+__global__ void add(hipLaunchParm lp, const T * a, const T * b, T * c)
+{
+    const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
+    c[i] = a[i] + b[i];
+}
+
+template <typename T>
+__global__ void triad(hipLaunchParm lp, T * a, const T * b, const T * c)
+{
+    const T scalar = 3.0;
+    const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
+    a[i] = b[i] + scalar * c[i];
+}
+
+int main(int argc, char *argv[])
+{
+
+    // Print out run information
+    std::cout
+        << "GPU-STREAM" << std::endl
+        << "Version: " << VERSION_STRING << std::endl
+        << "Implementation: CUDA" << std::endl;
+
+    parseArguments(argc, argv);
+
+    if (NTIMES < 2)
+        throw std::runtime_error("Chosen number of times is invalid, must be >= 2");
+
+    std::cout << "Precision: ";
+    if (useFloat) std::cout << "float";
+    else std::cout << "double";
+    std::cout << std::endl << std::endl;
+
+    std::cout << "Running kernels " << NTIMES << " times" << std::endl;
+
+    if (ARRAY_SIZE % 1024 != 0)
+    {
+        unsigned int OLD_ARRAY_SIZE = ARRAY_SIZE;
+        ARRAY_SIZE -= ARRAY_SIZE % 1024;
+        std::cout
+            << "Warning: array size must divide 1024" << std::endl
+            << "Resizing array from " << OLD_ARRAY_SIZE
+            << " to " << ARRAY_SIZE << std::endl;
+        if (ARRAY_SIZE == 0)
+            throw std::runtime_error("Array size must be >= 1024");
+    }
+
+    // Get precision (used to reset later)
+    std::streamsize ss = std::cout.precision();
+
+    size_t DATATYPE_SIZE;
+
+    if (useFloat)
+    {
+        DATATYPE_SIZE = sizeof(float);
+    }
+    else
+    {
+        DATATYPE_SIZE = sizeof(double);
+    }
+
+    // Display number of bytes in array
+    std::cout << std::setprecision(1) << std::fixed
+        << "Array size: " << ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0 << " MB"
+        << " (=" << ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0/1024.0 << " GB)"
+        << std::endl;
+    std::cout << "Total size: " << 3.0*ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0 << " MB"
+        << " (=" << 3.0*ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0/1024.0 << " GB)"
+        << std::endl;
+
+    // Reset precision
+    std::cout.precision(ss);
+
+    // Check device index is in range
+    int count;
+    hipGetDeviceCount(&count);
+    check_cuda_error();
+    if (deviceIndex >= count)
+        throw std::runtime_error("Chosen device index is invalid");
+    hipSetDevice(deviceIndex);
+    check_cuda_error();
+
+    // Print out device name
+    std::cout << "Using CUDA device " << getDeviceName(deviceIndex) << std::endl;
+
+    // Print out device CUDA driver version
+    std::cout << "Driver: " << getDriver() << std::endl;
+
+    // Check buffers fit on the device
+    hipDeviceProp_t props;
+    hipGetDeviceProperties(&props, deviceIndex);
+    if (props.totalGlobalMem < 3*DATATYPE_SIZE*ARRAY_SIZE)
+        throw std::runtime_error("Device does not have enough memory for all 3 buffers");
+
+    // Create host vectors
+    void * h_a = malloc(ARRAY_SIZE*DATATYPE_SIZE);
+    void * h_b = malloc(ARRAY_SIZE*DATATYPE_SIZE);
+    void * h_c = malloc(ARRAY_SIZE*DATATYPE_SIZE);
+
+    // Initilise arrays
+    for (unsigned int i = 0; i < ARRAY_SIZE; i++)
+    {
+        if (useFloat)
+        {
+            ((float*)h_a)[i] = 1.0f;
+            ((float*)h_b)[i] = 2.0f;
+            ((float*)h_c)[i] = 0.0f;
+        }
+        else
+        {
+            ((double*)h_a)[i] = 1.0;
+            ((double*)h_b)[i] = 2.0;
+            ((double*)h_c)[i] = 0.0;
+        }
+    }
+
+    // Create device buffers
+    void * d_a, * d_b, *d_c;
+    hipMalloc(&d_a, ARRAY_SIZE*DATATYPE_SIZE);
+    check_cuda_error();
+    hipMalloc(&d_b, ARRAY_SIZE*DATATYPE_SIZE);
+    check_cuda_error();
+    hipMalloc(&d_c, ARRAY_SIZE*DATATYPE_SIZE);
+    check_cuda_error();
+
+    // Copy host memory to device
+    hipMemcpy(d_a, h_a, ARRAY_SIZE*DATATYPE_SIZE, hipMemcpyHostToDevice);
+    check_cuda_error();
+    hipMemcpy(d_b, h_b, ARRAY_SIZE*DATATYPE_SIZE, hipMemcpyHostToDevice);
+    check_cuda_error();
+    hipMemcpy(d_c, h_c, ARRAY_SIZE*DATATYPE_SIZE, hipMemcpyHostToDevice);
+    check_cuda_error();
+
+    // Make sure the copies are finished
+    hipDeviceSynchronize();
+    check_cuda_error();
+
+    // List of times
+    std::vector< std::vector<double> > timings;
+
+    // Declare timers
+    std::chrono::high_resolution_clock::time_point t1, t2;
+
+    // Main loop
+    for (unsigned int k = 0; k < NTIMES; k++)
+    {
+        std::vector<double> times;
+        t1 = std::chrono::high_resolution_clock::now();
+        if (useFloat)
+            hipLaunchKernel(HIP_KERNEL_NAME(copy), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (float*)d_a, (float*)d_c);
+        else
+            hipLaunchKernel(HIP_KERNEL_NAME(copy), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (double*)d_a, (double*)d_c);
+        check_cuda_error();
+        hipDeviceSynchronize();
+        check_cuda_error();
+        t2 = std::chrono::high_resolution_clock::now();
+        times.push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
+
+
+        t1 = std::chrono::high_resolution_clock::now();
+        if (useFloat)
+            hipLaunchKernel(HIP_KERNEL_NAME(mul), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (float*)d_b, (float*)d_c);
+        else
+            hipLaunchKernel(HIP_KERNEL_NAME(mul), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (double*)d_b, (double*)d_c);
+        check_cuda_error();
+        hipDeviceSynchronize();
+        check_cuda_error();
+        t2 = std::chrono::high_resolution_clock::now();
+        times.push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
+
+
+        t1 = std::chrono::high_resolution_clock::now();
+        if (useFloat)
+            hipLaunchKernel(HIP_KERNEL_NAME(add), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (float*)d_a, (float*)d_b, (float*)d_c);
+        else
+            hipLaunchKernel(HIP_KERNEL_NAME(add), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (double*)d_a, (double*)d_b, (double*)d_c);
+        check_cuda_error();
+        hipDeviceSynchronize();
+        check_cuda_error();
+        t2 = std::chrono::high_resolution_clock::now();
+        times.push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
+
+
+        t1 = std::chrono::high_resolution_clock::now();
+        if (useFloat)
+            hipLaunchKernel(HIP_KERNEL_NAME(triad), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (float*)d_a, (float*)d_b, (float*)d_c);
+        else
+            hipLaunchKernel(HIP_KERNEL_NAME(triad), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (double*)d_a, (double*)d_b, (double*)d_c);
+        check_cuda_error();
+        hipDeviceSynchronize();
+        check_cuda_error();
+        t2 = std::chrono::high_resolution_clock::now();
+        times.push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
+
+        timings.push_back(times);
+
+    }
+
+    // Check solutions
+    hipMemcpy(h_a, d_a, ARRAY_SIZE*DATATYPE_SIZE, hipMemcpyDeviceToHost);
+    check_cuda_error();
+    hipMemcpy(h_b, d_b, ARRAY_SIZE*DATATYPE_SIZE, hipMemcpyDeviceToHost);
+    check_cuda_error();
+    hipMemcpy(h_c, d_c, ARRAY_SIZE*DATATYPE_SIZE, hipMemcpyDeviceToHost);
+    check_cuda_error();
+
+    if (useFloat)
+    {
+        check_solution<float>(h_a, h_b, h_c);
+    }
+    else
+    {
+        check_solution<double>(h_a, h_b, h_c);
+    }
+
+    // Crunch results
+    size_t sizes[4] = {
+        2 * DATATYPE_SIZE * ARRAY_SIZE,
+        2 * DATATYPE_SIZE * ARRAY_SIZE,
+        3 * DATATYPE_SIZE * ARRAY_SIZE,
+        3 * DATATYPE_SIZE * ARRAY_SIZE
+    };
+    double min[4] = {DBL_MAX, DBL_MAX, DBL_MAX, DBL_MAX};
+    double max[4] = {0.0, 0.0, 0.0, 0.0};
+    double avg[4] = {0.0, 0.0, 0.0, 0.0};
+
+    // Ignore first result
+    for (unsigned int i = 1; i < NTIMES; i++)
+    {
+        for (int j = 0; j < 4; j++)
+        {
+            avg[j] += timings[i][j];
+            min[j] = std::min(min[j], timings[i][j]);
+            max[j] = std::max(max[j], timings[i][j]);
+        }
+    }
+
+    for (int j = 0; j < 4; j++)
+        avg[j] /= (double)(NTIMES-1);
+
+    // Display results
+    std::string labels[] = {"Copy", "Mul", "Add", "Triad"};
+    std::cout
+        << std::left << std::setw(12) << "Function"
+        << std::left << std::setw(12) << "MBytes/sec"
+        << std::left << std::setw(12) << "Min (sec)"
+        << std::left << std::setw(12) << "Max"
+        << std::left << std::setw(12) << "Average"
+        << std::endl;
+
+    for (int j = 0; j < 4; j++)
+    {
+        std::cout
+            << std::left << std::setw(12) << labels[j]
+            << std::left << std::setw(12) << std::setprecision(3) << 1.0E-06 * sizes[j]/min[j]
+            << std::left << std::setw(12) << std::setprecision(5) << min[j]
+            << std::left << std::setw(12) << std::setprecision(5) << max[j]
+            << std::left << std::setw(12) << std::setprecision(5) << avg[j]
+            << std::endl;
+    }
+
+    // Free host vectors
+    free(h_a);
+    free(h_b);
+    free(h_c);
+
+    // Free cuda buffers
+    hipFree(d_a);
+    check_cuda_error();
+    hipFree(d_b);
+    check_cuda_error();
+    hipFree(d_c);
+    check_cuda_error();
+
+}
+
+std::string getDeviceName(int device)
+{
+    struct hipDeviceProp_t prop;
+    hipGetDeviceProperties(&prop, device);
+    check_cuda_error();
+    return std::string(prop.name);
+}
+
+int getDriver(void)
+{
+    int driver;
+    hipDriverGetVersion(&driver);
+    check_cuda_error();
+    return driver;
+}
+
+void listDevices(void)
+{
+    // Get number of devices
+    int count;
+    hipGetDeviceCount(&count);
+    check_cuda_error();
+
+    // Print device names
+    if (count == 0)
+    {
+        std::cout << "No devices found." << std::endl;
+    }
+    else
+    {
+        std::cout << std::endl;
+        std::cout << "Devices:" << std::endl;
+        for (int i = 0; i < count; i++)
+        {
+            std::cout << i << ": " << getDeviceName(i) << std::endl;
+            check_cuda_error();
+        }
+        std::cout << std::endl;
+    }
+}
+
diff --git a/results/cuda/nvidia-gtx-titan_x.txt b/results/cuda/nvidia-gtx-titan_x.txt
new file mode 100644
index 0000000..2d3b004
--- /dev/null
+++ b/results/cuda/nvidia-gtx-titan_x.txt
@@ -0,0 +1,15 @@
+GPU-STREAM
+Version: 1.0
+Implementation: CUDA
+Precision: double
+
+Running kernels 10 times
+Array size: 400.0 MB (=0.4 GB)
+Total size: 1200.0 MB (=1.2 GB)
+Using CUDA device GeForce GTX TITAN X
+Driver: 7050
+Function    MBytes/sec  Min (sec)   Max         Average     
+Copy        263155.587  0.00319     0.00319     0.00319     
+Mul         262943.430  0.00319     0.00319     0.00319     
+Add         268710.444  0.00468     0.00469     0.00469     
+Triad       268957.305  0.00468     0.00469     0.00468    
diff --git a/results/hip/amd-fiji-nano.txt b/results/hip/amd-fiji-nano.txt
new file mode 100644
index 0000000..df40ac8
--- /dev/null
+++ b/results/hip/amd-fiji-nano.txt
@@ -0,0 +1,15 @@
+GPU-STREAM
+Version: 1.0
+Implementation: CUDA
+Precision: double
+
+Running kernels 10 times
+Array size: 400.0 MB (=0.4 GB)
+Total size: 1200.0 MB (=1.2 GB)
+Using CUDA device Fiji
+Driver: 4
+Function    MBytes/sec  Min (sec)   Max         Average     
+Copy        375822.410  0.00223     0.00225     0.00224     
+Mul         375086.879  0.00224     0.00227     0.00224     
+Add         425650.718  0.00296     0.00298     0.00297     
+Triad       424710.113  0.00296     0.00298     0.00298
diff --git a/results/hip/nvidia-gtx-titan_x.txt b/results/hip/nvidia-gtx-titan_x.txt
new file mode 100644
index 0000000..6bb94a3
--- /dev/null
+++ b/results/hip/nvidia-gtx-titan_x.txt
@@ -0,0 +1,22 @@
+GPU-STREAM
+Version: 1.0
+Implementation: HIP
+GridSize: 52428800 work-items
+GroupSize: 1024 work-items
+Operations/Work-item: 1
+Precision: double
+
+Running kernels 10 times
+Array size: 400.0 MB (=0.4 GB) 0 bytes padding
+Total size: 1200.0 MB (=1.2 GB)
+Using HIP device GeForce GTX TITAN X (compute_units=24)
+Driver: 4
+d_a=0x1306d80000
+d_b=0x131fd80000
+d_c=0x1338d80000
+Function    MBytes/sec  Min (sec)   Max         Average     
+Copy        263042.207  0.00319     0.00320     0.00319     
+Mul         262972.033  0.00319     0.00320     0.00319     
+Add         268732.653  0.00468     0.00469     0.00469     
+Triad       268706.197  0.00468     0.00469     0.00469     
+GEOMEAN     265847.929