From fdeb20601fc33d76383b2f833cb902f80225d025 Mon Sep 17 00:00:00 2001 From: sunway513 Date: Mon, 14 Mar 2016 11:44:30 -0500 Subject: [PATCH 01/10] Pull request for HIP version --- Makefile | 18 +- README.md | 7 + hip-stream.cpp | 398 ++++++++++++++++++++++++++++ results/cuda/nvidia-gtx-titan_x.txt | 15 ++ results/hip/amd-fiji-nano.txt | 15 ++ results/hip/nvidia-gtx-titan_x.txt | 22 ++ 6 files changed, 473 insertions(+), 2 deletions(-) create mode 100644 hip-stream.cpp create mode 100644 results/cuda/nvidia-gtx-titan_x.txt create mode 100644 results/hip/amd-fiji-nano.txt create mode 100644 results/hip/nvidia-gtx-titan_x.txt diff --git a/Makefile b/Makefile index 4fb5f7a..c2bc9b2 100644 --- a/Makefile +++ b/Makefile @@ -6,7 +6,8 @@ ifeq ($(PLATFORM), Darwin) LDLIBS = -framework OpenCL endif -all: gpu-stream-ocl gpu-stream-cuda +all: gpu-stream-ocl gpu-stream-cuda gpu-stream-hip + gpu-stream-ocl: ocl-stream.cpp common.o Makefile $(CXX) $(CXXFLAGS) -Wno-deprecated-declarations common.o $< -o $@ $(LDLIBS) @@ -19,9 +20,22 @@ ifeq ($(shell which nvcc > /dev/null; echo $$?), 0) else $(error "Cannot find nvcc, please install CUDA toolkit") endif +HIP_PATH?=../../.. +HIPCC=$(HIP_PATH)/bin/hipcc + +hip-stream.o : hip-stream.cpp + $(HIPCC) $(CXXFLAGS) -c $< -o $@ + +gpu-stream-hip: hip-stream.o common.o Makefile +ifeq ($(shell which $(HIPCC) > /dev/null; echo $$?), 0) + $(HIPCC) $(CXXFLAGS) common.o $< -lm -o $@ +else + $(error "Cannot find $(HIPCC), please install HIP toolkit") +endif + .PHONY: clean clean: - rm -f gpu-stream-ocl gpu-stream-cuda *.o + rm -f gpu-stream-ocl gpu-stream-cuda gpu-stream-hip *.o diff --git a/README.md b/README.md index 61907e1..a2f6ab6 100644 --- a/README.md +++ b/README.md @@ -13,6 +13,13 @@ Build the OpenCL and CUDA binaries with `make` (CUDA version requires CUDA >= v6 Run the OpenCL version with `./gpu-stream-ocl` and the CUDA version with `./gpu-stream-cuda` +For HIP version, follow the instructions on the following blog to properly install ROCK and ROCR drivers: +http://gpuopen.com/getting-started-with-boltzmann-components-platforms-installation/ +Clone from the HIP repository in the following link: +https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP + +Build the HIP binaries with make gpu-stream-hip, run it with './gpu-stream-hip' + Android ------- diff --git a/hip-stream.cpp b/hip-stream.cpp new file mode 100644 index 0000000..ddbcae4 --- /dev/null +++ b/hip-stream.cpp @@ -0,0 +1,398 @@ +#include "hip_runtime.h" +/*============================================================================= +*------------------------------------------------------------------------------ +* Copyright 2015: Tom Deakin, Simon McIntosh-Smith, University of Bristol HPC +* Based on John D. McCalpin’s original STREAM benchmark for CPUs +*------------------------------------------------------------------------------ +* License: +* 1. You are free to use this program and/or to redistribute +* this program. +* 2. You are free to modify this program for your own use, +* including commercial use, subject to the publication +* restrictions in item 3. +* 3. You are free to publish results obtained from running this +* program, or from works that you derive from this program, +* with the following limitations: +* 3a. In order to be referred to as "GPU-STREAM benchmark results", +* published results must be in conformance to the GPU-STREAM +* Run Rules published at +* http://github.com/UoB-HPC/GPU-STREAM/wiki/Run-Rules +* and incorporated herein by reference. 
+* The copyright holders retain the +* right to determine conformity with the Run Rules. +* 3b. Results based on modified source code or on runs not in +* accordance with the GPU-STREAM Run Rules must be clearly +* labelled whenever they are published. Examples of +* proper labelling include: +* "tuned GPU-STREAM benchmark results" +* "based on a variant of the GPU-STREAM benchmark code" +* Other comparable, clear and reasonable labelling is +* acceptable. +* 3c. Submission of results to the GPU-STREAM benchmark web site +* is encouraged, but not required. +* 4. Use of this program or creation of derived works based on this +* program constitutes acceptance of these licensing restrictions. +* 5. Absolutely no warranty is expressed or implied. +*———————————————————————————————————-----------------------------------------*/ + + +#include +#include +#include +#include +#include +#include + +#include +#include "common.h" + +std::string getDeviceName(int device); +int getDriver(void); + +// Code to check CUDA errors +void check_cuda_error(void) +{ + hipError_t err = hipGetLastError(); + if (err != hipSuccess) + { + std::cerr + << "Error: " + << hipGetErrorString(err) + << std::endl; + exit(err); + } +} + +template +__global__ void copy(hipLaunchParm lp, const T * a, T * c) +{ + const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + c[i] = a[i]; +} + +template +__global__ void mul(hipLaunchParm lp, T * b, const T * c) +{ + const T scalar = 3.0; + const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + b[i] = scalar * c[i]; +} + +template +__global__ void add(hipLaunchParm lp, const T * a, const T * b, T * c) +{ + const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + c[i] = a[i] + b[i]; +} + +template +__global__ void triad(hipLaunchParm lp, T * a, const T * b, const T * c) +{ + const T scalar = 3.0; + const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + a[i] = b[i] + scalar * c[i]; +} + +int main(int argc, char *argv[]) +{ + + // Print out run information + std::cout + << "GPU-STREAM" << std::endl + << "Version: " << VERSION_STRING << std::endl + << "Implementation: CUDA" << std::endl; + + parseArguments(argc, argv); + + if (NTIMES < 2) + throw std::runtime_error("Chosen number of times is invalid, must be >= 2"); + + std::cout << "Precision: "; + if (useFloat) std::cout << "float"; + else std::cout << "double"; + std::cout << std::endl << std::endl; + + std::cout << "Running kernels " << NTIMES << " times" << std::endl; + + if (ARRAY_SIZE % 1024 != 0) + { + unsigned int OLD_ARRAY_SIZE = ARRAY_SIZE; + ARRAY_SIZE -= ARRAY_SIZE % 1024; + std::cout + << "Warning: array size must divide 1024" << std::endl + << "Resizing array from " << OLD_ARRAY_SIZE + << " to " << ARRAY_SIZE << std::endl; + if (ARRAY_SIZE == 0) + throw std::runtime_error("Array size must be >= 1024"); + } + + // Get precision (used to reset later) + std::streamsize ss = std::cout.precision(); + + size_t DATATYPE_SIZE; + + if (useFloat) + { + DATATYPE_SIZE = sizeof(float); + } + else + { + DATATYPE_SIZE = sizeof(double); + } + + // Display number of bytes in array + std::cout << std::setprecision(1) << std::fixed + << "Array size: " << ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0 << " MB" + << " (=" << ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0/1024.0 << " GB)" + << std::endl; + std::cout << "Total size: " << 3.0*ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0 << " MB" + << " (=" << 3.0*ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0/1024.0 << " GB)" + << std::endl; + + // Reset precision + std::cout.precision(ss); + + // 
Check device index is in range + int count; + hipGetDeviceCount(&count); + check_cuda_error(); + if (deviceIndex >= count) + throw std::runtime_error("Chosen device index is invalid"); + hipSetDevice(deviceIndex); + check_cuda_error(); + + // Print out device name + std::cout << "Using CUDA device " << getDeviceName(deviceIndex) << std::endl; + + // Print out device CUDA driver version + std::cout << "Driver: " << getDriver() << std::endl; + + // Check buffers fit on the device + hipDeviceProp_t props; + hipGetDeviceProperties(&props, deviceIndex); + if (props.totalGlobalMem < 3*DATATYPE_SIZE*ARRAY_SIZE) + throw std::runtime_error("Device does not have enough memory for all 3 buffers"); + + // Create host vectors + void * h_a = malloc(ARRAY_SIZE*DATATYPE_SIZE); + void * h_b = malloc(ARRAY_SIZE*DATATYPE_SIZE); + void * h_c = malloc(ARRAY_SIZE*DATATYPE_SIZE); + + // Initilise arrays + for (unsigned int i = 0; i < ARRAY_SIZE; i++) + { + if (useFloat) + { + ((float*)h_a)[i] = 1.0f; + ((float*)h_b)[i] = 2.0f; + ((float*)h_c)[i] = 0.0f; + } + else + { + ((double*)h_a)[i] = 1.0; + ((double*)h_b)[i] = 2.0; + ((double*)h_c)[i] = 0.0; + } + } + + // Create device buffers + void * d_a, * d_b, *d_c; + hipMalloc(&d_a, ARRAY_SIZE*DATATYPE_SIZE); + check_cuda_error(); + hipMalloc(&d_b, ARRAY_SIZE*DATATYPE_SIZE); + check_cuda_error(); + hipMalloc(&d_c, ARRAY_SIZE*DATATYPE_SIZE); + check_cuda_error(); + + // Copy host memory to device + hipMemcpy(d_a, h_a, ARRAY_SIZE*DATATYPE_SIZE, hipMemcpyHostToDevice); + check_cuda_error(); + hipMemcpy(d_b, h_b, ARRAY_SIZE*DATATYPE_SIZE, hipMemcpyHostToDevice); + check_cuda_error(); + hipMemcpy(d_c, h_c, ARRAY_SIZE*DATATYPE_SIZE, hipMemcpyHostToDevice); + check_cuda_error(); + + // Make sure the copies are finished + hipDeviceSynchronize(); + check_cuda_error(); + + // List of times + std::vector< std::vector > timings; + + // Declare timers + std::chrono::high_resolution_clock::time_point t1, t2; + + // Main loop + for (unsigned int k = 0; k < NTIMES; k++) + { + std::vector times; + t1 = std::chrono::high_resolution_clock::now(); + if (useFloat) + hipLaunchKernel(HIP_KERNEL_NAME(copy), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (float*)d_a, (float*)d_c); + else + hipLaunchKernel(HIP_KERNEL_NAME(copy), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (double*)d_a, (double*)d_c); + check_cuda_error(); + hipDeviceSynchronize(); + check_cuda_error(); + t2 = std::chrono::high_resolution_clock::now(); + times.push_back(std::chrono::duration_cast >(t2 - t1).count()); + + + t1 = std::chrono::high_resolution_clock::now(); + if (useFloat) + hipLaunchKernel(HIP_KERNEL_NAME(mul), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (float*)d_b, (float*)d_c); + else + hipLaunchKernel(HIP_KERNEL_NAME(mul), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (double*)d_b, (double*)d_c); + check_cuda_error(); + hipDeviceSynchronize(); + check_cuda_error(); + t2 = std::chrono::high_resolution_clock::now(); + times.push_back(std::chrono::duration_cast >(t2 - t1).count()); + + + t1 = std::chrono::high_resolution_clock::now(); + if (useFloat) + hipLaunchKernel(HIP_KERNEL_NAME(add), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (float*)d_a, (float*)d_b, (float*)d_c); + else + hipLaunchKernel(HIP_KERNEL_NAME(add), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (double*)d_a, (double*)d_b, (double*)d_c); + check_cuda_error(); + hipDeviceSynchronize(); + check_cuda_error(); + t2 = std::chrono::high_resolution_clock::now(); + times.push_back(std::chrono::duration_cast >(t2 - t1).count()); + + + t1 = 
std::chrono::high_resolution_clock::now(); + if (useFloat) + hipLaunchKernel(HIP_KERNEL_NAME(triad), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (float*)d_a, (float*)d_b, (float*)d_c); + else + hipLaunchKernel(HIP_KERNEL_NAME(triad), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (double*)d_a, (double*)d_b, (double*)d_c); + check_cuda_error(); + hipDeviceSynchronize(); + check_cuda_error(); + t2 = std::chrono::high_resolution_clock::now(); + times.push_back(std::chrono::duration_cast >(t2 - t1).count()); + + timings.push_back(times); + + } + + // Check solutions + hipMemcpy(h_a, d_a, ARRAY_SIZE*DATATYPE_SIZE, hipMemcpyDeviceToHost); + check_cuda_error(); + hipMemcpy(h_b, d_b, ARRAY_SIZE*DATATYPE_SIZE, hipMemcpyDeviceToHost); + check_cuda_error(); + hipMemcpy(h_c, d_c, ARRAY_SIZE*DATATYPE_SIZE, hipMemcpyDeviceToHost); + check_cuda_error(); + + if (useFloat) + { + check_solution(h_a, h_b, h_c); + } + else + { + check_solution(h_a, h_b, h_c); + } + + // Crunch results + size_t sizes[4] = { + 2 * DATATYPE_SIZE * ARRAY_SIZE, + 2 * DATATYPE_SIZE * ARRAY_SIZE, + 3 * DATATYPE_SIZE * ARRAY_SIZE, + 3 * DATATYPE_SIZE * ARRAY_SIZE + }; + double min[4] = {DBL_MAX, DBL_MAX, DBL_MAX, DBL_MAX}; + double max[4] = {0.0, 0.0, 0.0, 0.0}; + double avg[4] = {0.0, 0.0, 0.0, 0.0}; + + // Ignore first result + for (unsigned int i = 1; i < NTIMES; i++) + { + for (int j = 0; j < 4; j++) + { + avg[j] += timings[i][j]; + min[j] = std::min(min[j], timings[i][j]); + max[j] = std::max(max[j], timings[i][j]); + } + } + + for (int j = 0; j < 4; j++) + avg[j] /= (double)(NTIMES-1); + + // Display results + std::string labels[] = {"Copy", "Mul", "Add", "Triad"}; + std::cout + << std::left << std::setw(12) << "Function" + << std::left << std::setw(12) << "MBytes/sec" + << std::left << std::setw(12) << "Min (sec)" + << std::left << std::setw(12) << "Max" + << std::left << std::setw(12) << "Average" + << std::endl; + + for (int j = 0; j < 4; j++) + { + std::cout + << std::left << std::setw(12) << labels[j] + << std::left << std::setw(12) << std::setprecision(3) << 1.0E-06 * sizes[j]/min[j] + << std::left << std::setw(12) << std::setprecision(5) << min[j] + << std::left << std::setw(12) << std::setprecision(5) << max[j] + << std::left << std::setw(12) << std::setprecision(5) << avg[j] + << std::endl; + } + + // Free host vectors + free(h_a); + free(h_b); + free(h_c); + + // Free cuda buffers + hipFree(d_a); + check_cuda_error(); + hipFree(d_b); + check_cuda_error(); + hipFree(d_c); + check_cuda_error(); + +} + +std::string getDeviceName(int device) +{ + struct hipDeviceProp_t prop; + hipGetDeviceProperties(&prop, device); + check_cuda_error(); + return std::string(prop.name); +} + +int getDriver(void) +{ + int driver; + hipDriverGetVersion(&driver); + check_cuda_error(); + return driver; +} + +void listDevices(void) +{ + // Get number of devices + int count; + hipGetDeviceCount(&count); + check_cuda_error(); + + // Print device names + if (count == 0) + { + std::cout << "No devices found." 
<< std::endl;
+  }
+  else
+  {
+    std::cout << std::endl;
+    std::cout << "Devices:" << std::endl;
+    for (int i = 0; i < count; i++)
+    {
+      std::cout << i << ": " << getDeviceName(i) << std::endl;
+      check_cuda_error();
+    }
+    std::cout << std::endl;
+  }
+}
+
diff --git a/results/cuda/nvidia-gtx-titan_x.txt b/results/cuda/nvidia-gtx-titan_x.txt
new file mode 100644
index 0000000..2d3b004
--- /dev/null
+++ b/results/cuda/nvidia-gtx-titan_x.txt
@@ -0,0 +1,15 @@
+GPU-STREAM
+Version: 1.0
+Implementation: CUDA
+Precision: double
+
+Running kernels 10 times
+Array size: 400.0 MB (=0.4 GB)
+Total size: 1200.0 MB (=1.2 GB)
+Using CUDA device GeForce GTX TITAN X
+Driver: 7050
+Function    MBytes/sec  Min (sec)   Max         Average
+Copy        263155.587  0.00319     0.00319     0.00319
+Mul         262943.430  0.00319     0.00319     0.00319
+Add         268710.444  0.00468     0.00469     0.00469
+Triad       268957.305  0.00468     0.00469     0.00468
diff --git a/results/hip/amd-fiji-nano.txt b/results/hip/amd-fiji-nano.txt
new file mode 100644
index 0000000..df40ac8
--- /dev/null
+++ b/results/hip/amd-fiji-nano.txt
@@ -0,0 +1,15 @@
+GPU-STREAM
+Version: 1.0
+Implementation: CUDA
+Precision: double
+
+Running kernels 10 times
+Array size: 400.0 MB (=0.4 GB)
+Total size: 1200.0 MB (=1.2 GB)
+Using CUDA device Fiji
+Driver: 4
+Function    MBytes/sec  Min (sec)   Max         Average
+Copy        375822.410  0.00223     0.00225     0.00224
+Mul         375086.879  0.00224     0.00227     0.00224
+Add         425650.718  0.00296     0.00298     0.00297
+Triad       424710.113  0.00296     0.00298     0.00298
diff --git a/results/hip/nvidia-gtx-titan_x.txt b/results/hip/nvidia-gtx-titan_x.txt
new file mode 100644
index 0000000..6bb94a3
--- /dev/null
+++ b/results/hip/nvidia-gtx-titan_x.txt
@@ -0,0 +1,22 @@
+GPU-STREAM
+Version: 1.0
+Implementation: HIP
+GridSize: 52428800 work-items
+GroupSize: 1024 work-items
+Operations/Work-item: 1
+Precision: double
+
+Running kernels 10 times
+Array size: 400.0 MB (=0.4 GB) 0 bytes padding
+Total size: 1200.0 MB (=1.2 GB)
+Using HIP device GeForce GTX TITAN X (compute_units=24)
+Driver: 4
+d_a=0x1306d80000
+d_b=0x131fd80000
+d_c=0x1338d80000
+Function    MBytes/sec  Min (sec)   Max         Average
+Copy        263042.207  0.00319     0.00320     0.00319
+Mul         262972.033  0.00319     0.00320     0.00319
+Add         268732.653  0.00468     0.00469     0.00469
+Triad       268706.197  0.00468     0.00469     0.00469
+GEOMEAN     265847.929

From 11053798ff7dca2bd8eb7a8e5bacac10758b5159 Mon Sep 17 00:00:00 2001
From: sunway513
Date: Tue, 15 Mar 2016 07:56:32 -0500
Subject: [PATCH 02/10] Improved GPU-STREAM benchmark for HIP version:

1. Add optional looper kernels to take command line input for the number
   of groups and the groupSize
2. Add GEOMEAN value calculation of the kernels
3. Instructions on configuring the HIP environment in the README.md
4. Add results for HIP on FIJI Nano, TITAN X; CUDA on TITAN X
5. Run script to optionally run the HIP version with the groups and
   groupSize options
---
 README.md      |   4 +-
 common.cpp     |  50 ++++++++++++
 common.h       |   3 +
 hip-stream.cpp | 213 +++++++++++++++++++++++++++++++++++++++----------
 runhip.sh      |   4 +
 5 files changed, 233 insertions(+), 41 deletions(-)
 create mode 100755 runhip.sh

diff --git a/README.md b/README.md
index a2f6ab6..c776347 100644
--- a/README.md
+++ b/README.md
@@ -15,7 +15,9 @@ Run the OpenCL version with `./gpu-stream-ocl` and the CUDA version with `./gpu-
 For HIP version, follow the instructions on the following blog to properly install ROCK and ROCR drivers:
 http://gpuopen.com/getting-started-with-boltzmann-components-platforms-installation/
-Clone from the HIP repository in the following link:
+Install the HCC compiler:
+https://bitbucket.org/multicoreware/hcc/wiki/Home
+Install HIP:
 https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP
 
 Build the HIP binaries with make gpu-stream-hip, run it with './gpu-stream-hip'

diff --git a/common.cpp b/common.cpp
index 781d70e..122376f 100644
--- a/common.cpp
+++ b/common.cpp
@@ -39,10 +39,13 @@
 // Default array size 50 * 2^20 (50*8 Mebibytes double precision)
 // Use binary powers of two so divides 1024
 unsigned int ARRAY_SIZE = 52428800;
+size_t ARRAY_PAD_BYTES = 0;
 unsigned int NTIMES = 10;
 
 bool useFloat = false;
+unsigned int groups = 0;
+unsigned int groupSize = 1024;
 
 unsigned int deviceIndex = 0;
 
@@ -53,6 +56,25 @@ int parseUInt(const char *str, unsigned int *output)
   return !strlen(next);
 }
 
+int parseSize(const char *str, size_t *output)
+{
+  char *next;
+  *output = strtoull(str, &next, 0);
+  int l = strlen(str);
+  if (l) {
+    char c = str[l-1]; // last char.
+    if ((c == 'k') || (c == 'K')) {
+      *output *= 1024;
+    }
+    if ((c == 'm') || (c == 'M')) {
+      *output *= (1024*1024);
+    }
+
+  }
+  return !strlen(next);
+}
+
+
 void parseArguments(int argc, char *argv[])
 {
   for (int i = 1; i < argc; i++)
@@ -86,6 +108,31 @@ void parseArguments(int argc, char *argv[])
         exit(1);
       }
     }
+    else if (!strcmp(argv[i], "--groups"))
+    {
+      if (++i >= argc || !parseUInt(argv[i], &groups))
+      {
+        std::cout << "Invalid group number" << std::endl;
+        exit(1);
+      }
+    }
+    else if (!strcmp(argv[i], "--groupSize"))
+    {
+      if (++i >= argc || !parseUInt(argv[i], &groupSize))
+      {
+        std::cout << "Invalid group size" << std::endl;
+        exit(1);
+      }
+    }
+    else if (!strcmp(argv[i], "--pad"))
+    {
+      if (++i >= argc || !parseSize(argv[i], &ARRAY_PAD_BYTES))
+      {
+        std::cout << "Invalid size" << std::endl;
+        exit(1);
+      }
+
+    }
     else if (!strcmp(argv[i], "--float"))
     {
       useFloat = true;
@@ -101,6 +148,9 @@ void parseArguments(int argc, char *argv[])
       std::cout << " --device INDEX Select device at INDEX" << std::endl;
       std::cout << " -s --arraysize SIZE Use SIZE elements in the array" << std::endl;
       std::cout << " -n --numtimes NUM Run the test NUM times (NUM >= 2)" << std::endl;
+      std::cout << " --groups Set number of groups to launch - each work-item processes multiple array items" << std::endl;
+      std::cout << " --groupSize Set size of each group (default 1024)" << std::endl;
+      std::cout << " --pad Add additional array padding. 
Can use trailing K (KB) or M (MB)" << std::endl; std::cout << " --float Use floats (rather than doubles)" << std::endl; std::cout << std::endl; exit(0); diff --git a/common.h b/common.h index a4dd886..9cf61d7 100644 --- a/common.h +++ b/common.h @@ -48,8 +48,11 @@ extern void parseArguments(int argc, char *argv[]); extern void listDevices(void); extern unsigned int ARRAY_SIZE; +extern size_t ARRAY_PAD_BYTES; extern unsigned int NTIMES; +extern unsigned int groups; +extern unsigned int groupSize; extern bool useFloat; extern unsigned int deviceIndex; diff --git a/hip-stream.cpp b/hip-stream.cpp index ddbcae4..6984b44 100644 --- a/hip-stream.cpp +++ b/hip-stream.cpp @@ -43,7 +43,7 @@ #include #include -#include +//#include #include "common.h" std::string getDeviceName(int device); @@ -63,15 +63,75 @@ void check_cuda_error(void) } } + + +// looper function place more work inside each work item. +// Goal is reduce the dispatch overhead for each group, and also give more controlover the order of memory operations +template +__global__ void +copy_looper(hipLaunchParm lp, const T * a, T * c, int ARRAY_SIZE) +{ + int offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x)*CLUMP_SIZE; + int stride = hipBlockDim_x * hipGridDim_x * CLUMP_SIZE; + + for (int i=offset; i -__global__ void copy(hipLaunchParm lp, const T * a, T * c) +__global__ void +mul_looper(hipLaunchParm lp, T * b, const T * c, int ARRAY_SIZE) +{ + int offset = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + int stride = hipBlockDim_x * hipGridDim_x; + const T scalar = 3.0; + + for (int i=offset; i +__global__ void +add_looper(hipLaunchParm lp, const T * a, const T * b, T * c, int ARRAY_SIZE) +{ + int offset = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + int stride = hipBlockDim_x * hipGridDim_x; + + for (int i=offset; i +__global__ void +triad_looper(hipLaunchParm lp, T * a, const T * b, const T * c, int ARRAY_SIZE) +{ + int offset = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + int stride = hipBlockDim_x * hipGridDim_x; + const T scalar = 3.0; + + for (int i=offset; i +__global__ void +copy(hipLaunchParm lp, const T * a, T * c) { const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; c[i] = a[i]; } + template -__global__ void mul(hipLaunchParm lp, T * b, const T * c) +__global__ void +mul(hipLaunchParm lp, T * b, const T * c) { const T scalar = 3.0; const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; @@ -79,14 +139,16 @@ __global__ void mul(hipLaunchParm lp, T * b, const T * c) } template -__global__ void add(hipLaunchParm lp, const T * a, const T * b, T * c) +__global__ void +add(hipLaunchParm lp, const T * a, const T * b, T * c) { const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; c[i] = a[i] + b[i]; } template -__global__ void triad(hipLaunchParm lp, T * a, const T * b, const T * c) +__global__ void +triad(hipLaunchParm lp, T * a, const T * b, const T * c) { const T scalar = 3.0; const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; @@ -100,13 +162,27 @@ int main(int argc, char *argv[]) std::cout << "GPU-STREAM" << std::endl << "Version: " << VERSION_STRING << std::endl - << "Implementation: CUDA" << std::endl; + << "Implementation: HIP" << std::endl; parseArguments(argc, argv); if (NTIMES < 2) throw std::runtime_error("Chosen number of times is invalid, must be >= 2"); + // Config grid size and group size for kernel launching + int gridSize; + if (groups) { + gridSize = groups * groupSize; + } else { + gridSize = ARRAY_SIZE; + } + + float operationsPerWorkitem = (float)ARRAY_SIZE / 
(float)gridSize; + std::cout << "GridSize: " << gridSize << " work-items" << std::endl; + std::cout << "GroupSize: " << groupSize << " work-items" << std::endl; + std::cout << "Operations/Work-item: " << operationsPerWorkitem << std::endl; + if (groups) std::cout << "Using looper kernels:" << std::endl; + std::cout << "Precision: "; if (useFloat) std::cout << "float"; else std::cout << "double"; @@ -144,9 +220,10 @@ int main(int argc, char *argv[]) std::cout << std::setprecision(1) << std::fixed << "Array size: " << ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0 << " MB" << " (=" << ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0/1024.0 << " GB)" + << " " << ARRAY_PAD_BYTES << " bytes padding" << std::endl; - std::cout << "Total size: " << 3.0*ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0 << " MB" - << " (=" << 3.0*ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0/1024.0 << " GB)" + std::cout << "Total size: " << 3.0*(ARRAY_SIZE*DATATYPE_SIZE + ARRAY_PAD_BYTES) /1024.0/1024.0 << " MB" + << " (=" << 3.0*(ARRAY_SIZE*DATATYPE_SIZE + ARRAY_PAD_BYTES) /1024.0/1024.0/1024.0 << " GB)" << std::endl; // Reset precision @@ -161,24 +238,31 @@ int main(int argc, char *argv[]) hipSetDevice(deviceIndex); check_cuda_error(); - // Print out device name - std::cout << "Using CUDA device " << getDeviceName(deviceIndex) << std::endl; - // Print out device CUDA driver version - std::cout << "Driver: " << getDriver() << std::endl; - - // Check buffers fit on the device hipDeviceProp_t props; hipGetDeviceProperties(&props, deviceIndex); + + // Print out device name + std::cout << "Using HIP device " << getDeviceName(deviceIndex) << " (compute_units=" << props.multiProcessorCount << ")" << std::endl; + + // Print out device HIP driver version + std::cout << "Driver: " << getDriver() << std::endl; + + + + + // Check buffers fit on the device if (props.totalGlobalMem < 3*DATATYPE_SIZE*ARRAY_SIZE) throw std::runtime_error("Device does not have enough memory for all 3 buffers"); - // Create host vectors - void * h_a = malloc(ARRAY_SIZE*DATATYPE_SIZE); - void * h_b = malloc(ARRAY_SIZE*DATATYPE_SIZE); - void * h_c = malloc(ARRAY_SIZE*DATATYPE_SIZE); + //int cus = props.multiProcessorCount; - // Initilise arrays + // Create host vectors + void * h_a = malloc(ARRAY_SIZE*DATATYPE_SIZE ); + void * h_b = malloc(ARRAY_SIZE*DATATYPE_SIZE ); + void * h_c = malloc(ARRAY_SIZE*DATATYPE_SIZE ); + + // Initialise arrays for (unsigned int i = 0; i < ARRAY_SIZE; i++) { if (useFloat) @@ -196,12 +280,14 @@ int main(int argc, char *argv[]) } // Create device buffers - void * d_a, * d_b, *d_c; - hipMalloc(&d_a, ARRAY_SIZE*DATATYPE_SIZE); + char * d_a, * d_b, *d_c; + hipMalloc(&d_a, ARRAY_SIZE*DATATYPE_SIZE + ARRAY_PAD_BYTES); check_cuda_error(); - hipMalloc(&d_b, ARRAY_SIZE*DATATYPE_SIZE); + hipMalloc(&d_b, ARRAY_SIZE*DATATYPE_SIZE + ARRAY_PAD_BYTES); + d_b += ARRAY_PAD_BYTES; check_cuda_error(); - hipMalloc(&d_c, ARRAY_SIZE*DATATYPE_SIZE); + hipMalloc(&d_c, ARRAY_SIZE*DATATYPE_SIZE + ARRAY_PAD_BYTES); + d_c += ARRAY_PAD_BYTES; check_cuda_error(); // Copy host memory to device @@ -212,10 +298,17 @@ int main(int argc, char *argv[]) hipMemcpy(d_c, h_c, ARRAY_SIZE*DATATYPE_SIZE, hipMemcpyHostToDevice); check_cuda_error(); + + std::cout << "d_a=" << (void*)d_a << std::endl; + std::cout << "d_b=" << (void*)d_b << std::endl; + std::cout << "d_c=" << (void*)d_c << std::endl; + // Make sure the copies are finished hipDeviceSynchronize(); check_cuda_error(); + + // List of times std::vector< std::vector > timings; @@ -227,10 +320,17 @@ int main(int argc, char *argv[]) { 
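    // Each pass of this loop times the four kernels separately: the host
    // clock (std::chrono) is read before a launch, hipDeviceSynchronize()
    // forces the asynchronous kernel to complete inside the measured
    // interval, and the elapsed seconds are appended to this iteration's
    // "times" vector in kernel order (copy, mul, add, triad).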
std::vector times; t1 = std::chrono::high_resolution_clock::now(); - if (useFloat) - hipLaunchKernel(HIP_KERNEL_NAME(copy), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (float*)d_a, (float*)d_c); - else - hipLaunchKernel(HIP_KERNEL_NAME(copy), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (double*)d_a, (double*)d_c); + if (groups) { + if (useFloat) + hipLaunchKernel(HIP_KERNEL_NAME(copy_looper), dim3(gridSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_c, ARRAY_SIZE); + else + hipLaunchKernel(HIP_KERNEL_NAME(copy_looper), dim3(gridSize), dim3(groupSize), 0, 0, (double*)d_a, (double*)d_c, ARRAY_SIZE); + } else { + if (useFloat) + hipLaunchKernel(HIP_KERNEL_NAME(copy), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_c); + else + hipLaunchKernel(HIP_KERNEL_NAME(copy), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (double*)d_a, (double*)d_c); + } check_cuda_error(); hipDeviceSynchronize(); check_cuda_error(); @@ -239,10 +339,17 @@ int main(int argc, char *argv[]) t1 = std::chrono::high_resolution_clock::now(); - if (useFloat) - hipLaunchKernel(HIP_KERNEL_NAME(mul), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (float*)d_b, (float*)d_c); - else - hipLaunchKernel(HIP_KERNEL_NAME(mul), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (double*)d_b, (double*)d_c); + if (groups) { + if (useFloat) + hipLaunchKernel(HIP_KERNEL_NAME(mul_looper), dim3(gridSize), dim3(groupSize), 0, 0, (float*)d_b, (float*)d_c, ARRAY_SIZE); + else + hipLaunchKernel(HIP_KERNEL_NAME(mul_looper), dim3(gridSize), dim3(groupSize), 0, 0, (double*)d_b, (double*)d_c, ARRAY_SIZE); + } else { + if (useFloat) + hipLaunchKernel(HIP_KERNEL_NAME(mul), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (float*)d_b, (float*)d_c); + else + hipLaunchKernel(HIP_KERNEL_NAME(mul), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (double*)d_b, (double*)d_c); + } check_cuda_error(); hipDeviceSynchronize(); check_cuda_error(); @@ -251,10 +358,17 @@ int main(int argc, char *argv[]) t1 = std::chrono::high_resolution_clock::now(); - if (useFloat) - hipLaunchKernel(HIP_KERNEL_NAME(add), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (float*)d_a, (float*)d_b, (float*)d_c); - else - hipLaunchKernel(HIP_KERNEL_NAME(add), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (double*)d_a, (double*)d_b, (double*)d_c); + if (groups) { + if (useFloat) + hipLaunchKernel(HIP_KERNEL_NAME(add_looper), dim3(gridSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_b, (float*)d_c, ARRAY_SIZE); + else + hipLaunchKernel(HIP_KERNEL_NAME(add_looper), dim3(gridSize), dim3(groupSize), 0, 0, (double*)d_a, (double*)d_b, (double*)d_c, ARRAY_SIZE); + } else { + if (useFloat) + hipLaunchKernel(HIP_KERNEL_NAME(add), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_b, (float*)d_c); + else + hipLaunchKernel(HIP_KERNEL_NAME(add), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (double*)d_a, (double*)d_b, (double*)d_c); + } check_cuda_error(); hipDeviceSynchronize(); check_cuda_error(); @@ -263,10 +377,18 @@ int main(int argc, char *argv[]) t1 = std::chrono::high_resolution_clock::now(); - if (useFloat) - hipLaunchKernel(HIP_KERNEL_NAME(triad), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (float*)d_a, (float*)d_b, (float*)d_c); - else - hipLaunchKernel(HIP_KERNEL_NAME(triad), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (double*)d_a, (double*)d_b, (double*)d_c); + if (groups) { + if (useFloat) + hipLaunchKernel(HIP_KERNEL_NAME(triad_looper), dim3(gridSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_b, (float*)d_c, ARRAY_SIZE); + else + 
hipLaunchKernel(HIP_KERNEL_NAME(triad_looper), dim3(gridSize), dim3(groupSize), 0, 0, (double*)d_a, (double*)d_b, (double*)d_c, ARRAY_SIZE);
+    } else {
+      if (useFloat)
+        hipLaunchKernel(HIP_KERNEL_NAME(triad), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_b, (float*)d_c);
+      else
+        hipLaunchKernel(HIP_KERNEL_NAME(triad), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (double*)d_a, (double*)d_b, (double*)d_c);
+    }
+
     check_cuda_error();
     hipDeviceSynchronize();
     check_cuda_error();
@@ -316,8 +438,15 @@ int main(int argc, char *argv[])
     }
   }
 
-  for (int j = 0; j < 4; j++)
+  for (int j = 0; j < 4; j++) {
     avg[j] /= (double)(NTIMES-1);
+  }
+
+  double geomean = 1.0;
+  for (int j = 0; j < 4; j++) {
+    geomean *= (sizes[j]/min[j]);
+  }
+  geomean = pow(geomean, 0.25);
 
   // Display results
   std::string labels[] = {"Copy", "Mul", "Add", "Triad"};
@@ -339,6 +468,10 @@ int main(int argc, char *argv[])
       << std::left << std::setw(12) << std::setprecision(5) << avg[j]
       << std::endl;
   }
+  std::cout
+    << std::left << std::setw(12) << "GEOMEAN"
+    << std::left << std::setw(12) << std::setprecision(3) << 1.0E-06 * geomean
+    << std::endl;
 
   // Free host vectors
   free(h_a);

diff --git a/runhip.sh b/runhip.sh
new file mode 100755
index 0000000..b84e970
--- /dev/null
+++ b/runhip.sh
@@ -0,0 +1,4 @@
+./gpu-stream-hip
+./gpu-stream-hip --groups 64 --groupSize 256
+./gpu-stream-hip --float
+./gpu-stream-hip --float --groups 64 --groupSize 256

From 89fec9c8d2186c698018ba07bdea5d3875a28d0d Mon Sep 17 00:00:00 2001
From: pensun
Date: Wed, 23 Mar 2016 05:26:34 -0500
Subject: [PATCH 03/10] Remove results submission for separate commits

---
 results/cuda/nvidia-gtx-titan_x.txt | 15 ---------------
 results/hip/amd-fiji-nano.txt       | 15 ---------------
 results/hip/nvidia-gtx-titan_x.txt  | 22 ----------------------
 3 files changed, 52 deletions(-)
 delete mode 100644 results/cuda/nvidia-gtx-titan_x.txt
 delete mode 100644 results/hip/amd-fiji-nano.txt
 delete mode 100644 results/hip/nvidia-gtx-titan_x.txt

diff --git a/results/cuda/nvidia-gtx-titan_x.txt b/results/cuda/nvidia-gtx-titan_x.txt
deleted file mode 100644
index 2d3b004..0000000
--- a/results/cuda/nvidia-gtx-titan_x.txt
+++ /dev/null
@@ -1,15 +0,0 @@
-GPU-STREAM
-Version: 1.0
-Implementation: CUDA
-Precision: double
-
-Running kernels 10 times
-Array size: 400.0 MB (=0.4 GB)
-Total size: 1200.0 MB (=1.2 GB)
-Using CUDA device GeForce GTX TITAN X
-Driver: 7050
-Function    MBytes/sec  Min (sec)   Max         Average
-Copy        263155.587  0.00319     0.00319     0.00319
-Mul         262943.430  0.00319     0.00319     0.00319
-Add         268710.444  0.00468     0.00469     0.00469
-Triad       268957.305  0.00468     0.00469     0.00468
diff --git a/results/hip/amd-fiji-nano.txt b/results/hip/amd-fiji-nano.txt
deleted file mode 100644
index df40ac8..0000000
--- a/results/hip/amd-fiji-nano.txt
+++ /dev/null
@@ -1,15 +0,0 @@
-GPU-STREAM
-Version: 1.0
-Implementation: CUDA
-Precision: double
-
-Running kernels 10 times
-Array size: 400.0 MB (=0.4 GB)
-Total size: 1200.0 MB (=1.2 GB)
-Using CUDA device Fiji
-Driver: 4
-Function    MBytes/sec  Min (sec)   Max         Average
-Copy        375822.410  0.00223     0.00225     0.00224
-Mul         375086.879  0.00224     0.00227     0.00224
-Add         425650.718  0.00296     0.00298     0.00297
-Triad       424710.113  0.00296     0.00298     0.00298
diff --git a/results/hip/nvidia-gtx-titan_x.txt b/results/hip/nvidia-gtx-titan_x.txt
deleted file mode 100644
index 6bb94a3..0000000
--- a/results/hip/nvidia-gtx-titan_x.txt
+++ /dev/null
@@ -1,22 +0,0 @@
-GPU-STREAM
-Version: 1.0
-Implementation: HIP
-GridSize: 52428800 work-items
-GroupSize: 1024 work-items
-Operations/Work-item: 1 -Precision: double - -Running kernels 10 times -Array size: 400.0 MB (=0.4 GB) 0 bytes padding -Total size: 1200.0 MB (=1.2 GB) -Using HIP device GeForce GTX TITAN X (compute_units=24) -Driver: 4 -d_a=0x1306d80000 -d_b=0x131fd80000 -d_c=0x1338d80000 -Function MBytes/sec Min (sec) Max Average -Copy 263042.207 0.00319 0.00320 0.00319 -Mul 262972.033 0.00319 0.00320 0.00319 -Add 268732.653 0.00468 0.00469 0.00469 -Triad 268706.197 0.00468 0.00469 0.00469 -GEOMEAN 265847.929 From 8e9ab4d20a20a31d483eb583d9bd038756d37cad Mon Sep 17 00:00:00 2001 From: pensun Date: Wed, 23 Mar 2016 05:29:10 -0500 Subject: [PATCH 04/10] Submit results for NV Titan X with CUDA, AMD FIJI NANO and NV Titan X with HIP --- results/cuda/nvidia-gtx-titan_x.txt | 15 +++++++++++++++ results/hip/amd-fiji-nano.txt | 15 +++++++++++++++ results/hip/nvidia-gtx-titan_x.txt | 22 ++++++++++++++++++++++ 3 files changed, 52 insertions(+) create mode 100644 results/cuda/nvidia-gtx-titan_x.txt create mode 100644 results/hip/amd-fiji-nano.txt create mode 100644 results/hip/nvidia-gtx-titan_x.txt diff --git a/results/cuda/nvidia-gtx-titan_x.txt b/results/cuda/nvidia-gtx-titan_x.txt new file mode 100644 index 0000000..2d3b004 --- /dev/null +++ b/results/cuda/nvidia-gtx-titan_x.txt @@ -0,0 +1,15 @@ +GPU-STREAM +Version: 1.0 +Implementation: CUDA +Precision: double + +Running kernels 10 times +Array size: 400.0 MB (=0.4 GB) +Total size: 1200.0 MB (=1.2 GB) +Using CUDA device GeForce GTX TITAN X +Driver: 7050 +Function MBytes/sec Min (sec) Max Average +Copy 263155.587 0.00319 0.00319 0.00319 +Mul 262943.430 0.00319 0.00319 0.00319 +Add 268710.444 0.00468 0.00469 0.00469 +Triad 268957.305 0.00468 0.00469 0.00468 diff --git a/results/hip/amd-fiji-nano.txt b/results/hip/amd-fiji-nano.txt new file mode 100644 index 0000000..df40ac8 --- /dev/null +++ b/results/hip/amd-fiji-nano.txt @@ -0,0 +1,15 @@ +GPU-STREAM +Version: 1.0 +Implementation: CUDA +Precision: double + +Running kernels 10 times +Array size: 400.0 MB (=0.4 GB) +Total size: 1200.0 MB (=1.2 GB) +Using CUDA device Fiji +Driver: 4 +Function MBytes/sec Min (sec) Max Average +Copy 375822.410 0.00223 0.00225 0.00224 +Mul 375086.879 0.00224 0.00227 0.00224 +Add 425650.718 0.00296 0.00298 0.00297 +Triad 424710.113 0.00296 0.00298 0.00298 diff --git a/results/hip/nvidia-gtx-titan_x.txt b/results/hip/nvidia-gtx-titan_x.txt new file mode 100644 index 0000000..6bb94a3 --- /dev/null +++ b/results/hip/nvidia-gtx-titan_x.txt @@ -0,0 +1,22 @@ +GPU-STREAM +Version: 1.0 +Implementation: HIP +GridSize: 52428800 work-items +GroupSize: 1024 work-items +Operations/Work-item: 1 +Precision: double + +Running kernels 10 times +Array size: 400.0 MB (=0.4 GB) 0 bytes padding +Total size: 1200.0 MB (=1.2 GB) +Using HIP device GeForce GTX TITAN X (compute_units=24) +Driver: 4 +d_a=0x1306d80000 +d_b=0x131fd80000 +d_c=0x1338d80000 +Function MBytes/sec Min (sec) Max Average +Copy 263042.207 0.00319 0.00320 0.00319 +Mul 262972.033 0.00319 0.00320 0.00319 +Add 268732.653 0.00468 0.00469 0.00469 +Triad 268706.197 0.00468 0.00469 0.00469 +GEOMEAN 265847.929 From 207701219a6346bf7f833f9f463fc9f93a6c3971 Mon Sep 17 00:00:00 2001 From: pensun Date: Sun, 3 Apr 2016 06:49:56 -0500 Subject: [PATCH 05/10] Add looper optimization for cuda-stream.cu, remove result files --- cuda-stream.cu | 142 ++++++++++++++++++++++++---- results/cuda/nvidia-gtx-titan_x.txt | 15 --- results/hip/amd-fiji-nano.txt | 15 --- results/hip/nvidia-gtx-titan_x.txt | 22 ----- runcuda.sh | 4 + 5 files changed, 130 insertions(+), 
68 deletions(-) delete mode 100644 results/cuda/nvidia-gtx-titan_x.txt delete mode 100644 results/hip/amd-fiji-nano.txt delete mode 100644 results/hip/nvidia-gtx-titan_x.txt create mode 100755 runcuda.sh diff --git a/cuda-stream.cu b/cuda-stream.cu index 2ab3adb..2049eb0 100644 --- a/cuda-stream.cu +++ b/cuda-stream.cu @@ -62,6 +62,59 @@ void check_cuda_error(void) } } +// looper function place more work inside each work item. +// Goal is reduce the dispatch overhead for each group, and also give more controlover the order of memory operations +template +__global__ void +copy_looper(const T * a, T * c, int ARRAY_SIZE) +{ + int offset = (blockDim.x * blockIdx.x + threadIdx.x)*CLUMP_SIZE; + int stride = blockDim.x * gridDim.x * CLUMP_SIZE; + + for (int i=offset; i +__global__ void +mul_looper(T * b, const T * c, int ARRAY_SIZE) +{ + int offset = blockDim.x * blockIdx.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + const T scalar = 3.0; + + for (int i=offset; i +__global__ void +add_looper(const T * a, const T * b, T * c, int ARRAY_SIZE) +{ + int offset = blockDim.x * blockIdx.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + + for (int i=offset; i +__global__ void +triad_looper( T * a, const T * b, const T * c, int ARRAY_SIZE) +{ + int offset = blockDim.x * blockIdx.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + const T scalar = 3.0; + + for (int i=offset; i __global__ void copy(const T * a, T * c) { @@ -106,6 +159,20 @@ int main(int argc, char *argv[]) if (NTIMES < 2) throw std::runtime_error("Chosen number of times is invalid, must be >= 2"); + // Config grid size and group size for kernel launching + int gridSize; + if (groups) { + gridSize = groups * groupSize; + } else { + gridSize = ARRAY_SIZE; + } + + float operationsPerWorkitem = (float)ARRAY_SIZE / (float)gridSize; + std::cout << "GridSize: " << gridSize << " work-items" << std::endl; + std::cout << "GroupSize: " << groupSize << " work-items" << std::endl; + std::cout << "Operations/Work-item: " << operationsPerWorkitem << std::endl; + if (groups) std::cout << "Using looper kernels:" << std::endl; + std::cout << "Precision: "; if (useFloat) std::cout << "float"; else std::cout << "double"; @@ -211,6 +278,10 @@ int main(int argc, char *argv[]) cudaMemcpy(d_c, h_c, ARRAY_SIZE*DATATYPE_SIZE, cudaMemcpyHostToDevice); check_cuda_error(); + std::cout << "d_a=" << (void*)d_a << std::endl; + std::cout << "d_b=" << (void*)d_b << std::endl; + std::cout << "d_c=" << (void*)d_c << std::endl; + // Make sure the copies are finished cudaDeviceSynchronize(); check_cuda_error(); @@ -226,10 +297,18 @@ int main(int argc, char *argv[]) { std::vector times; t1 = std::chrono::high_resolution_clock::now(); - if (useFloat) - copy<<>>((float*)d_a, (float*)d_c); - else - copy<<>>((double*)d_a, (double*)d_c); + if (groups) { + if (useFloat) + copy_looper<<>>((float*)d_a, (float*)d_c, ARRAY_SIZE); + else + copy_looper<<>>((double*)d_a, (double*)d_c, ARRAY_SIZE); + } else { + if (useFloat) + copy<<>>((float*)d_a, (float*)d_c); + else + copy<<>>((double*)d_a, (double*)d_c); + } + check_cuda_error(); cudaDeviceSynchronize(); check_cuda_error(); @@ -238,10 +317,17 @@ int main(int argc, char *argv[]) t1 = std::chrono::high_resolution_clock::now(); - if (useFloat) - mul<<>>((float*)d_b, (float*)d_c); - else - mul<<>>((double*)d_b, (double*)d_c); + if (groups) { + if (useFloat) + mul_looper<<>>((float*)d_b, (float*)d_c, ARRAY_SIZE); + else + mul_looper<<>>((double*)d_b, (double*)d_c, ARRAY_SIZE); + } else { + if (useFloat) + 
mul<<>>((float*)d_b, (float*)d_c); + else + mul<<>>((double*)d_b, (double*)d_c); + } check_cuda_error(); cudaDeviceSynchronize(); check_cuda_error(); @@ -250,10 +336,17 @@ int main(int argc, char *argv[]) t1 = std::chrono::high_resolution_clock::now(); - if (useFloat) - add<<>>((float*)d_a, (float*)d_b, (float*)d_c); - else - add<<>>((double*)d_a, (double*)d_b, (double*)d_c); + if (groups) { + if (useFloat) + add_looper<<>>((float*)d_a, (float*)d_b, (float*)d_c, ARRAY_SIZE); + else + add_looper<<>>((double*)d_a, (double*)d_b, (double*)d_c, ARRAY_SIZE); + } else { + if (useFloat) + add<<>>((float*)d_a, (float*)d_b, (float*)d_c); + else + add<<>>((double*)d_a, (double*)d_b, (double*)d_c); + } check_cuda_error(); cudaDeviceSynchronize(); check_cuda_error(); @@ -262,10 +355,17 @@ int main(int argc, char *argv[]) t1 = std::chrono::high_resolution_clock::now(); - if (useFloat) - triad<<>>((float*)d_a, (float*)d_b, (float*)d_c); - else - triad<<>>((double*)d_a, (double*)d_b, (double*)d_c); + if (groups) { + if (useFloat) + triad_looper<<>>((float*)d_a, (float*)d_b, (float*)d_c, ARRAY_SIZE); + else + triad_looper<<>>((double*)d_a, (double*)d_b, (double*)d_c, ARRAY_SIZE); + } else { + if (useFloat) + triad<<>>((float*)d_a, (float*)d_b, (float*)d_c); + else + triad<<>>((double*)d_a, (double*)d_b, (double*)d_c); + } check_cuda_error(); cudaDeviceSynchronize(); check_cuda_error(); @@ -318,6 +418,12 @@ int main(int argc, char *argv[]) for (int j = 0; j < 4; j++) avg[j] /= (double)(NTIMES-1); + double geomean = 1.0; + for (int j = 0; j < 4; j++) { + geomean *= (sizes[j]/min[j]); + } + geomean = pow(geomean, 0.25); + // Display results std::string labels[] = {"Copy", "Mul", "Add", "Triad"}; std::cout @@ -338,6 +444,10 @@ int main(int argc, char *argv[]) << std::left << std::setw(12) << std::setprecision(5) << avg[j] << std::endl; } + std::cout + << std::left << std::setw(12) << "GEOMEAN" + << std::left << std::setw(12) << std::setprecision(3) << 1.0E-06 * geomean + << std::endl; // Free host vectors free(h_a); diff --git a/results/cuda/nvidia-gtx-titan_x.txt b/results/cuda/nvidia-gtx-titan_x.txt deleted file mode 100644 index 2d3b004..0000000 --- a/results/cuda/nvidia-gtx-titan_x.txt +++ /dev/null @@ -1,15 +0,0 @@ -GPU-STREAM -Version: 1.0 -Implementation: CUDA -Precision: double - -Running kernels 10 times -Array size: 400.0 MB (=0.4 GB) -Total size: 1200.0 MB (=1.2 GB) -Using CUDA device GeForce GTX TITAN X -Driver: 7050 -Function MBytes/sec Min (sec) Max Average -Copy 263155.587 0.00319 0.00319 0.00319 -Mul 262943.430 0.00319 0.00319 0.00319 -Add 268710.444 0.00468 0.00469 0.00469 -Triad 268957.305 0.00468 0.00469 0.00468 diff --git a/results/hip/amd-fiji-nano.txt b/results/hip/amd-fiji-nano.txt deleted file mode 100644 index df40ac8..0000000 --- a/results/hip/amd-fiji-nano.txt +++ /dev/null @@ -1,15 +0,0 @@ -GPU-STREAM -Version: 1.0 -Implementation: CUDA -Precision: double - -Running kernels 10 times -Array size: 400.0 MB (=0.4 GB) -Total size: 1200.0 MB (=1.2 GB) -Using CUDA device Fiji -Driver: 4 -Function MBytes/sec Min (sec) Max Average -Copy 375822.410 0.00223 0.00225 0.00224 -Mul 375086.879 0.00224 0.00227 0.00224 -Add 425650.718 0.00296 0.00298 0.00297 -Triad 424710.113 0.00296 0.00298 0.00298 diff --git a/results/hip/nvidia-gtx-titan_x.txt b/results/hip/nvidia-gtx-titan_x.txt deleted file mode 100644 index 6bb94a3..0000000 --- a/results/hip/nvidia-gtx-titan_x.txt +++ /dev/null @@ -1,22 +0,0 @@ -GPU-STREAM -Version: 1.0 -Implementation: HIP -GridSize: 52428800 work-items -GroupSize: 1024 
work-items -Operations/Work-item: 1 -Precision: double - -Running kernels 10 times -Array size: 400.0 MB (=0.4 GB) 0 bytes padding -Total size: 1200.0 MB (=1.2 GB) -Using HIP device GeForce GTX TITAN X (compute_units=24) -Driver: 4 -d_a=0x1306d80000 -d_b=0x131fd80000 -d_c=0x1338d80000 -Function MBytes/sec Min (sec) Max Average -Copy 263042.207 0.00319 0.00320 0.00319 -Mul 262972.033 0.00319 0.00320 0.00319 -Add 268732.653 0.00468 0.00469 0.00469 -Triad 268706.197 0.00468 0.00469 0.00469 -GEOMEAN 265847.929 diff --git a/runcuda.sh b/runcuda.sh new file mode 100755 index 0000000..7acf5c1 --- /dev/null +++ b/runcuda.sh @@ -0,0 +1,4 @@ +./gpu-stream-cuda +./gpu-stream-cuda --groups 64 --groupSize 256 +./gpu-stream-cuda --float +./gpu-stream-cuda --float --groups 64 --groupSize 256 From d73917ec85f8ef4a0231c8278adc9fc6fdec9f6c Mon Sep 17 00:00:00 2001 From: pensun Date: Sun, 3 Apr 2016 06:50:53 -0500 Subject: [PATCH 06/10] Add cuda results for titan x device. --- results/cuda/nvidia-gtx-titan_x.txt | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 results/cuda/nvidia-gtx-titan_x.txt diff --git a/results/cuda/nvidia-gtx-titan_x.txt b/results/cuda/nvidia-gtx-titan_x.txt new file mode 100644 index 0000000..6bb94a3 --- /dev/null +++ b/results/cuda/nvidia-gtx-titan_x.txt @@ -0,0 +1,22 @@ +GPU-STREAM +Version: 1.0 +Implementation: HIP +GridSize: 52428800 work-items +GroupSize: 1024 work-items +Operations/Work-item: 1 +Precision: double + +Running kernels 10 times +Array size: 400.0 MB (=0.4 GB) 0 bytes padding +Total size: 1200.0 MB (=1.2 GB) +Using HIP device GeForce GTX TITAN X (compute_units=24) +Driver: 4 +d_a=0x1306d80000 +d_b=0x131fd80000 +d_c=0x1338d80000 +Function MBytes/sec Min (sec) Max Average +Copy 263042.207 0.00319 0.00320 0.00319 +Mul 262972.033 0.00319 0.00320 0.00319 +Add 268732.653 0.00468 0.00469 0.00469 +Triad 268706.197 0.00468 0.00469 0.00469 +GEOMEAN 265847.929 From ef48e0448ae88920779c99e2d841bf3e4b10abdc Mon Sep 17 00:00:00 2001 From: pensun Date: Sun, 3 Apr 2016 06:51:51 -0500 Subject: [PATCH 07/10] Add result of hip on amd FIJI Nano. --- results/hip/amd-fiji-nano.txt | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 results/hip/amd-fiji-nano.txt diff --git a/results/hip/amd-fiji-nano.txt b/results/hip/amd-fiji-nano.txt new file mode 100644 index 0000000..df40ac8 --- /dev/null +++ b/results/hip/amd-fiji-nano.txt @@ -0,0 +1,15 @@ +GPU-STREAM +Version: 1.0 +Implementation: CUDA +Precision: double + +Running kernels 10 times +Array size: 400.0 MB (=0.4 GB) +Total size: 1200.0 MB (=1.2 GB) +Using CUDA device Fiji +Driver: 4 +Function MBytes/sec Min (sec) Max Average +Copy 375822.410 0.00223 0.00225 0.00224 +Mul 375086.879 0.00224 0.00227 0.00224 +Add 425650.718 0.00296 0.00298 0.00297 +Triad 424710.113 0.00296 0.00298 0.00298 From e16123222d1bc98e918ca2ed109766b6c0e5fe7d Mon Sep 17 00:00:00 2001 From: pensun Date: Sun, 3 Apr 2016 06:52:31 -0500 Subject: [PATCH 08/10] Add results of HIP on Nvidia Titan X device. 
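
The MBytes/sec column in these result files follows the STREAM accounting:
bytes moved divided by the fastest kernel time, scaled by 1.0E-06. Copy and
Mul read one array and write one, while Add and Triad read two arrays and
write one. A minimal sketch of that arithmetic, using the array size and the
Copy/Add minimum times from the table below (the variable names here are
illustrative, not part of the benchmark source):

    #include <cstdio>
    int main()
    {
        const double N = 52428800;  // elements, as reported in the results
        const double B = 8;         // sizeof(double)
        // Copy moves 2 arrays per pass; Add moves 3.
        std::printf("Copy: %.0f MBytes/sec\n", 1.0E-06 * 2 * N * B / 0.00319);
        std::printf("Add:  %.0f MBytes/sec\n", 1.0E-06 * 3 * N * B / 0.00468);
        return 0;
    }

This prints roughly 262966 and 268866, matching the Copy and Add rows below
to within rounding of the reported minimum times.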
---
 results/hip/nvidia-gtx-titan_x.txt | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)
 create mode 100644 results/hip/nvidia-gtx-titan_x.txt

diff --git a/results/hip/nvidia-gtx-titan_x.txt b/results/hip/nvidia-gtx-titan_x.txt
new file mode 100644
index 0000000..6bb94a3
--- /dev/null
+++ b/results/hip/nvidia-gtx-titan_x.txt
@@ -0,0 +1,22 @@
+GPU-STREAM
+Version: 1.0
+Implementation: HIP
+GridSize: 52428800 work-items
+GroupSize: 1024 work-items
+Operations/Work-item: 1
+Precision: double
+
+Running kernels 10 times
+Array size: 400.0 MB (=0.4 GB) 0 bytes padding
+Total size: 1200.0 MB (=1.2 GB)
+Using HIP device GeForce GTX TITAN X (compute_units=24)
+Driver: 4
+d_a=0x1306d80000
+d_b=0x131fd80000
+d_c=0x1338d80000
+Function    MBytes/sec  Min (sec)   Max         Average
+Copy        263042.207  0.00319     0.00320     0.00319
+Mul         262972.033  0.00319     0.00320     0.00319
+Add         268732.653  0.00468     0.00469     0.00469
+Triad       268706.197  0.00468     0.00469     0.00469
+GEOMEAN     265847.929

From 998985240110f5d402d8bee81dc16bd7a3935eee Mon Sep 17 00:00:00 2001
From: pensun
Date: Tue, 26 Apr 2016 14:10:32 -0500
Subject: [PATCH 09/10] Remove CLUMP_SIZE options; update warning message
 regarding rounding errors on float that do not apply to the HIP version

---
 common.cpp     |  4 ++--
 cuda-stream.cu | 10 +++++-----
 hip-stream.cpp | 10 +++++-----
 3 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/common.cpp b/common.cpp
index 122376f..cb94e9e 100644
--- a/common.cpp
+++ b/common.cpp
@@ -131,12 +131,12 @@ void parseArguments(int argc, char *argv[])
         std::cout << "Invalid size" << std::endl;
         exit(1);
       }
-      
+
     }
     else if (!strcmp(argv[i], "--float"))
     {
       useFloat = true;
-      std::cout << "Warning: If number of iterations set >= 8, expect rounding errors with single precision" << std::endl;
+      std::cout << "Warning: If number of iterations set >= 8, expect rounding errors with single precision on CUDA version" << std::endl;
     }
     else if (!strcmp(argv[i], "--help") || !strcmp(argv[i], "-h"))
     {

diff --git a/cuda-stream.cu b/cuda-stream.cu
index 2049eb0..ea067fb 100644
--- a/cuda-stream.cu
+++ b/cuda-stream.cu
@@ -64,12 +64,12 @@ void check_cuda_error(void)
 
 // looper function place more work inside each work item.
 // Goal is reduce the dispatch overhead for each group, and also give more controlover the order of memory operations
-template <typename T, int CLUMP_SIZE>
+template <typename T>
 __global__ void
 copy_looper(const T * a, T * c, int ARRAY_SIZE)
 {
-    int offset = (blockDim.x * blockIdx.x + threadIdx.x)*CLUMP_SIZE;
-    int stride = blockDim.x * gridDim.x * CLUMP_SIZE;
+    int offset = (blockDim.x * blockIdx.x + threadIdx.x);
+    int stride = blockDim.x * gridDim.x;
 
     for (int i=offset; i<ARRAY_SIZE; i+=stride) {
         c[i] = a[i];
     }
 }
@@ -297,10 +297,10 @@ int main(int argc, char *argv[])
     if (groups) {
       if (useFloat)
-        copy_looper<float, 1><<<gridSize, groupSize>>>((float*)d_a, (float*)d_c, ARRAY_SIZE);
+        copy_looper<<<gridSize, groupSize>>>((float*)d_a, (float*)d_c, ARRAY_SIZE);
       else
-        copy_looper<double, 1><<<gridSize, groupSize>>>((double*)d_a, (double*)d_c, ARRAY_SIZE);
+        copy_looper<<<gridSize, groupSize>>>((double*)d_a, (double*)d_c, ARRAY_SIZE);
     } else {
       if (useFloat)
         copy<<<ARRAY_SIZE/groupSize, groupSize>>>((float*)d_a, (float*)d_c);

diff --git a/hip-stream.cpp b/hip-stream.cpp
index 6984b44..d5fc133 100644
--- a/hip-stream.cpp
+++ b/hip-stream.cpp
@@ -67,12 +67,12 @@ void check_cuda_error(void)
 
 // looper function place more work inside each work item.
 // Goal is reduce the dispatch overhead for each group, and also give more controlover the order of memory operations
-template <typename T, int CLUMP_SIZE>
+template <typename T>
 __global__ void
 copy_looper(hipLaunchParm lp, const T * a, T * c, int ARRAY_SIZE)
 {
-    int offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x)*CLUMP_SIZE;
-    int stride = hipBlockDim_x * hipGridDim_x * CLUMP_SIZE;
+    int offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
+    int stride = hipBlockDim_x * hipGridDim_x;
 
     for (int i=offset; i<ARRAY_SIZE; i+=stride) {
         c[i] = a[i];
     }
 }
@@ -320,10 +320,10 @@ int main(int argc, char *argv[])
     if (groups) {
       if (useFloat)
-        hipLaunchKernel(HIP_KERNEL_NAME(copy_looper<float, 1>), dim3(gridSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_c, ARRAY_SIZE);
+        hipLaunchKernel(HIP_KERNEL_NAME(copy_looper<float>), dim3(gridSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_c, ARRAY_SIZE);
       else
-        hipLaunchKernel(HIP_KERNEL_NAME(copy_looper<double, 1>), dim3(gridSize), dim3(groupSize), 0, 0, (double*)d_a, (double*)d_c, ARRAY_SIZE);
+        hipLaunchKernel(HIP_KERNEL_NAME(copy_looper<double>), dim3(gridSize), dim3(groupSize), 0, 0, (double*)d_a, (double*)d_c, ARRAY_SIZE);
     } else {
       if (useFloat)
         hipLaunchKernel(HIP_KERNEL_NAME(copy), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_c);

From a8ebdc143882b618c5c99f993b0db5b12fdf615f Mon Sep 17 00:00:00 2001
From: pensun
Date: Tue, 26 Apr 2016 14:21:52 -0500
Subject: [PATCH 10/10] Change the warning, stating that the rounding error on
 float does not apply to AMD devices

---
 common.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/common.cpp b/common.cpp
index cb94e9e..3a6c56f 100644
--- a/common.cpp
+++ b/common.cpp
@@ -136,7 +136,7 @@ void parseArguments(int argc, char *argv[])
     else if (!strcmp(argv[i], "--float"))
     {
       useFloat = true;
-      std::cout << "Warning: If number of iterations set >= 8, expect rounding errors with single precision on CUDA version" << std::endl;
+      std::cout << "Warning: If number of iterations set >= 8, expect rounding errors with single precision; this does not apply to AMD devices" << std::endl;
     }
     else if (!strcmp(argv[i], "--help") || !strcmp(argv[i], "-h"))
     {
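
The looper kernels added in patches 02 and 05 all share one indexing scheme:
work-item tid starts at offset tid and strides by the total number of
work-items until it passes ARRAY_SIZE, so a groups x groupSize launch covers
every element exactly once with no overlap. A host-side sketch of that
property, assuming the launch yields groups * groupSize work-items in total
(the sizes below are illustrative, not the benchmark defaults):

    #include <cassert>
    #include <vector>
    int main()
    {
        const int ARRAY_SIZE = 1 << 20;
        const int groups = 64, groupSize = 256;  // as in runhip.sh/runcuda.sh
        const int workItems = groups * groupSize;
        std::vector<int> hits(ARRAY_SIZE, 0);
        for (int tid = 0; tid < workItems; ++tid)      // simulate each work-item
            for (int i = tid; i < ARRAY_SIZE; i += workItems)
                ++hits[i];                             // one strided "looper" access
        for (int h : hits) assert(h == 1);             // full, non-overlapping coverage
        return 0;
    }

Because each index i maps to the unique work-item tid = i % workItems, the
stride loop partitions the array regardless of whether ARRAY_SIZE divides
evenly, which is what lets the looper path drop the divisibility requirement
of the one-element-per-thread kernels.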