From 11053798ff7dca2bd8eb7a8e5bacac10758b5159 Mon Sep 17 00:00:00 2001
From: sunway513
Date: Tue, 15 Mar 2016 07:56:32 -0500
Subject: [PATCH] Improve the GPU-STREAM benchmark for the HIP version:

1. Add optional looper kernels that take the number of groups and the group size from the command line
2. Add a GEOMEAN value computed across the kernels
3. Add instructions on configuring the HIP environment to README.md
4. Add results for HIP on FIJI Nano and TITAN X; CUDA on TITAN X
5. Add a run script to optionally run the HIP version with the --groups and --groupSize options
---
 README.md      |   4 +-
 common.cpp     |  50 ++++++++++++
 common.h       |   3 +
 hip-stream.cpp | 213 +++++++++++++++++++++++++++++++++++++++----------
 runhip.sh      |   4 +
 5 files changed, 233 insertions(+), 41 deletions(-)
 create mode 100755 runhip.sh

diff --git a/README.md b/README.md
index a2f6ab6..c776347 100644
--- a/README.md
+++ b/README.md
@@ -15,7 +15,9 @@ Run the OpenCL version with `./gpu-stream-ocl` and the CUDA version with `./gpu-
 
 For HIP version, follow the instructions on the following blog to properly install ROCK and ROCR drivers:
 http://gpuopen.com/getting-started-with-boltzmann-components-platforms-installation/
-Clone from the HIP repository in the following link:
+Install the HCC compiler:
+https://bitbucket.org/multicoreware/hcc/wiki/Home
+Install HIP:
 https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP
 
 Build the HIP binaries with make gpu-stream-hip, run it with './gpu-stream-hip'
diff --git a/common.cpp b/common.cpp
index 781d70e..122376f 100644
--- a/common.cpp
+++ b/common.cpp
@@ -39,10 +39,13 @@
 // Default array size 50 * 2^20 (50*8 Mebibytes double precision)
 // Use binary powers of two so divides 1024
 unsigned int ARRAY_SIZE = 52428800;
+size_t ARRAY_PAD_BYTES = 0;
 unsigned int NTIMES = 10;
 
 bool useFloat = false;
+unsigned int groups = 0;
+unsigned int groupSize = 1024;
 
 unsigned int deviceIndex = 0;
 
@@ -53,6 +56,25 @@ int parseUInt(const char *str, unsigned int *output)
   return !strlen(next);
 }
 
+int parseSize(const char *str, size_t *output)
+{
+  char *next;
+  *output = strtoull(str, &next, 0);
+  int l = strlen(str);
+  if (l) {
+    char c = str[l-1]; // last char.
+    if ((c == 'k') || (c == 'K')) {
+      *output *= 1024;
+    }
+    if ((c == 'm') || (c == 'M')) {
+      *output *= (1024*1024);
+    }
+
+  }
+  return !strlen(next);
+}
+
+
 void parseArguments(int argc, char *argv[])
 {
   for (int i = 1; i < argc; i++)
@@ -86,6 +108,31 @@ void parseArguments(int argc, char *argv[])
         exit(1);
       }
     }
+    else if (!strcmp(argv[i], "--groups"))
+    {
+      if (++i >= argc || !parseUInt(argv[i], &groups))
+      {
+        std::cout << "Invalid group number" << std::endl;
+        exit(1);
+      }
+    }
+    else if (!strcmp(argv[i], "--groupSize"))
+    {
+      if (++i >= argc || !parseUInt(argv[i], &groupSize))
+      {
+        std::cout << "Invalid group size" << std::endl;
+        exit(1);
+      }
+    }
+    else if (!strcmp(argv[i], "--pad"))
+    {
+      if (++i >= argc || !parseSize(argv[i], &ARRAY_PAD_BYTES))
+      {
+        std::cout << "Invalid size" << std::endl;
+        exit(1);
+      }
+
+    }
     else if (!strcmp(argv[i], "--float"))
     {
       useFloat = true;
@@ -101,6 +148,9 @@ void parseArguments(int argc, char *argv[])
       std::cout << " --device INDEX Select device at INDEX" << std::endl;
       std::cout << " -s --arraysize SIZE Use SIZE elements in the array" << std::endl;
       std::cout << " -n --numtimes NUM Run the test NUM times (NUM >= 2)" << std::endl;
+      std::cout << " --groups Set the number of groups to launch - each work-item processes multiple array items" << std::endl;
+      std::cout << " --groupSize Set the size of each group (default 1024)" << std::endl;
+      std::cout << " --pad Add additional array padding. Can use a trailing K (KiB) or M (MiB)" << std::endl;
       std::cout << " --float Use floats (rather than doubles)" << std::endl;
       std::cout << std::endl;
       exit(0);
diff --git a/common.h b/common.h
index a4dd886..9cf61d7 100644
--- a/common.h
+++ b/common.h
@@ -48,8 +48,11 @@ extern void parseArguments(int argc, char *argv[]);
 extern void listDevices(void);
 
 extern unsigned int ARRAY_SIZE;
+extern size_t ARRAY_PAD_BYTES;
 extern unsigned int NTIMES;
 
+extern unsigned int groups;
+extern unsigned int groupSize;
 extern bool useFloat;
 
 extern unsigned int deviceIndex;
diff --git a/hip-stream.cpp b/hip-stream.cpp
index ddbcae4..6984b44 100644
--- a/hip-stream.cpp
+++ b/hip-stream.cpp
@@ -43,7 +43,7 @@
 #include
 #include
-#include
+//#include
 #include "common.h"
 
 std::string getDeviceName(int device);
@@ -63,15 +63,75 @@ void check_cuda_error(void)
   }
 }
 
+
+
+// Looper kernels place more work inside each work-item.
+// The goal is to reduce the dispatch overhead for each group and to give more control over the ordering of memory operations.
+template
+__global__ void
+copy_looper(hipLaunchParm lp, const T * a, T * c, int ARRAY_SIZE)
+{
+  int offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x)*CLUMP_SIZE;
+  int stride = hipBlockDim_x * hipGridDim_x * CLUMP_SIZE;
+
+  for (int i=offset; i
-__global__ void copy(hipLaunchParm lp, const T * a, T * c)
+__global__ void
+mul_looper(hipLaunchParm lp, T * b, const T * c, int ARRAY_SIZE)
+{
+  int offset = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
+  int stride = hipBlockDim_x * hipGridDim_x;
+  const T scalar = 3.0;
+
+  for (int i=offset; i
+__global__ void
+add_looper(hipLaunchParm lp, const T * a, const T * b, T * c, int ARRAY_SIZE)
+{
+  int offset = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
+  int stride = hipBlockDim_x * hipGridDim_x;
+
+  for (int i=offset; i
+__global__ void
+triad_looper(hipLaunchParm lp, T * a, const T * b, const T * c, int ARRAY_SIZE)
+{
+  int offset = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
+  int stride = hipBlockDim_x * hipGridDim_x;
+  const T scalar = 3.0;
+
+  for (int i=offset; i
+__global__ void
+copy(hipLaunchParm lp, const T * a, T * c)
 {
   const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
   c[i] = a[i];
 }
+
 template
-__global__ void mul(hipLaunchParm lp, T * b, const T * c)
+__global__ void
+mul(hipLaunchParm lp, T * b, const T * c)
 {
   const T scalar = 3.0;
   const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
@@ -79,14 +139,16 @@ __global__ void mul(hipLaunchParm lp, T * b, const T * c)
 }
 
 template
-__global__ void add(hipLaunchParm lp, const T * a, const T * b, T * c)
+__global__ void
+add(hipLaunchParm lp, const T * a, const T * b, T * c)
 {
   const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
   c[i] = a[i] + b[i];
 }
 
 template
-__global__ void triad(hipLaunchParm lp, T * a, const T * b, const T * c)
+__global__ void
+triad(hipLaunchParm lp, T * a, const T * b, const T * c)
 {
   const T scalar = 3.0;
   const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
@@ -100,13 +162,27 @@ int main(int argc, char *argv[])
 
   std::cout
     << "GPU-STREAM" << std::endl
     << "Version: " << VERSION_STRING << std::endl
-    << "Implementation: CUDA" << std::endl;
+    << "Implementation: HIP" << std::endl;
 
   parseArguments(argc, argv);
 
   if (NTIMES < 2)
     throw std::runtime_error("Chosen number of times is invalid, must be >= 2");
 
+  // Configure the grid size and group size for kernel launches
+  int gridSize;
+  if (groups) {
+    gridSize = groups * groupSize;
+  } else {
+    gridSize = ARRAY_SIZE;
+  }
+
+  float operationsPerWorkitem = (float)ARRAY_SIZE / (float)gridSize;
+  std::cout << "GridSize: " << gridSize << " work-items" << std::endl;
+  std::cout << "GroupSize: " << groupSize << " work-items" << std::endl;
+  std::cout << "Operations/Work-item: " << operationsPerWorkitem << std::endl;
+  if (groups) std::cout << "Using looper kernels:" << std::endl;
+
   std::cout << "Precision: ";
   if (useFloat) std::cout << "float";
   else std::cout << "double";
@@ -144,9 +220,10 @@ int main(int argc, char *argv[])
   std::cout << std::setprecision(1) << std::fixed
             << "Array size: " << ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0 << " MB"
             << " (=" << ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0/1024.0 << " GB)"
+            << " " << ARRAY_PAD_BYTES << " bytes padding"
             << std::endl;
-  std::cout << "Total size: " << 3.0*ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0 << " MB"
-            << " (=" << 3.0*ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0/1024.0 << " GB)"
+  std::cout << "Total size: " << 3.0*(ARRAY_SIZE*DATATYPE_SIZE + ARRAY_PAD_BYTES) /1024.0/1024.0 << " MB"
+            << " (=" << 3.0*(ARRAY_SIZE*DATATYPE_SIZE + ARRAY_PAD_BYTES) /1024.0/1024.0/1024.0 << " GB)"
             << std::endl;
 
   // Reset precision
@@ -161,24 +238,31 @@ int main(int argc, char *argv[])
   hipSetDevice(deviceIndex);
   check_cuda_error();
 
-  // Print out device name
-  std::cout << "Using CUDA device " << getDeviceName(deviceIndex) << std::endl;
-  // Print out device CUDA driver version
-  std::cout << "Driver: " << getDriver() << std::endl;
-
-  // Check buffers fit on the device
   hipDeviceProp_t props;
   hipGetDeviceProperties(&props, deviceIndex);
+
+  // Print out device name
+  std::cout << "Using HIP device " << getDeviceName(deviceIndex) << " (compute_units=" << props.multiProcessorCount << ")" << std::endl;
+
+  // Print out device HIP driver version
+  std::cout << "Driver: " << getDriver() << std::endl;
+
+
+
+
+  // Check buffers fit on the device
   if (props.totalGlobalMem < 3*DATATYPE_SIZE*ARRAY_SIZE)
     throw std::runtime_error("Device does not have enough memory for all 3 buffers");
 
-  // Create host vectors
-  void * h_a = malloc(ARRAY_SIZE*DATATYPE_SIZE);
-  void * h_b = malloc(ARRAY_SIZE*DATATYPE_SIZE);
-  void * h_c = malloc(ARRAY_SIZE*DATATYPE_SIZE);
+  //int cus = props.multiProcessorCount;
 
-  // Initilise arrays
+  // Create host vectors
+  void * h_a = malloc(ARRAY_SIZE*DATATYPE_SIZE );
+  void * h_b = malloc(ARRAY_SIZE*DATATYPE_SIZE );
+  void * h_c = malloc(ARRAY_SIZE*DATATYPE_SIZE );
+
+  // Initialise arrays
   for (unsigned int i = 0; i < ARRAY_SIZE; i++)
   {
     if (useFloat)
@@ -196,12 +280,14 @@ int main(int argc, char *argv[])
   }
 
   // Create device buffers
-  void * d_a, * d_b, *d_c;
-  hipMalloc(&d_a, ARRAY_SIZE*DATATYPE_SIZE);
+  char * d_a, * d_b, *d_c;
+  hipMalloc(&d_a, ARRAY_SIZE*DATATYPE_SIZE + ARRAY_PAD_BYTES);
   check_cuda_error();
-  hipMalloc(&d_b, ARRAY_SIZE*DATATYPE_SIZE);
+  hipMalloc(&d_b, ARRAY_SIZE*DATATYPE_SIZE + ARRAY_PAD_BYTES);
+  d_b += ARRAY_PAD_BYTES;
   check_cuda_error();
-  hipMalloc(&d_c, ARRAY_SIZE*DATATYPE_SIZE);
+  hipMalloc(&d_c, ARRAY_SIZE*DATATYPE_SIZE + ARRAY_PAD_BYTES);
+  d_c += ARRAY_PAD_BYTES;
   check_cuda_error();
 
   // Copy host memory to device
@@ -212,10 +298,17 @@ int main(int argc, char *argv[])
   hipMemcpy(d_c, h_c, ARRAY_SIZE*DATATYPE_SIZE, hipMemcpyHostToDevice);
   check_cuda_error();
 
+
+  std::cout << "d_a=" << (void*)d_a << std::endl;
+  std::cout << "d_b=" << (void*)d_b << std::endl;
+  std::cout << "d_c=" << (void*)d_c << std::endl;
+
   // Make sure the copies are finished
   hipDeviceSynchronize();
   check_cuda_error();
 
+
+
   // List of times
   std::vector< std::vector<double> > timings;
@@ -227,10 +320,17 @@
   {
     std::vector<double> times;
     t1 = std::chrono::high_resolution_clock::now();
-    if (useFloat)
-      hipLaunchKernel(HIP_KERNEL_NAME(copy), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (float*)d_a, (float*)d_c);
-    else
-      hipLaunchKernel(HIP_KERNEL_NAME(copy), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (double*)d_a, (double*)d_c);
+    if (groups) {
+      if (useFloat)
+        hipLaunchKernel(HIP_KERNEL_NAME(copy_looper), dim3(gridSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_c, ARRAY_SIZE);
+      else
+        hipLaunchKernel(HIP_KERNEL_NAME(copy_looper), dim3(gridSize), dim3(groupSize), 0, 0, (double*)d_a, (double*)d_c, ARRAY_SIZE);
+    } else {
+      if (useFloat)
+        hipLaunchKernel(HIP_KERNEL_NAME(copy), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_c);
+      else
+        hipLaunchKernel(HIP_KERNEL_NAME(copy), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (double*)d_a, (double*)d_c);
+    }
     check_cuda_error();
hipDeviceSynchronize(); check_cuda_error(); @@ -239,10 +339,17 @@ int main(int argc, char *argv[]) t1 = std::chrono::high_resolution_clock::now(); - if (useFloat) - hipLaunchKernel(HIP_KERNEL_NAME(mul), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (float*)d_b, (float*)d_c); - else - hipLaunchKernel(HIP_KERNEL_NAME(mul), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (double*)d_b, (double*)d_c); + if (groups) { + if (useFloat) + hipLaunchKernel(HIP_KERNEL_NAME(mul_looper), dim3(gridSize), dim3(groupSize), 0, 0, (float*)d_b, (float*)d_c, ARRAY_SIZE); + else + hipLaunchKernel(HIP_KERNEL_NAME(mul_looper), dim3(gridSize), dim3(groupSize), 0, 0, (double*)d_b, (double*)d_c, ARRAY_SIZE); + } else { + if (useFloat) + hipLaunchKernel(HIP_KERNEL_NAME(mul), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (float*)d_b, (float*)d_c); + else + hipLaunchKernel(HIP_KERNEL_NAME(mul), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (double*)d_b, (double*)d_c); + } check_cuda_error(); hipDeviceSynchronize(); check_cuda_error(); @@ -251,10 +358,17 @@ int main(int argc, char *argv[]) t1 = std::chrono::high_resolution_clock::now(); - if (useFloat) - hipLaunchKernel(HIP_KERNEL_NAME(add), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (float*)d_a, (float*)d_b, (float*)d_c); - else - hipLaunchKernel(HIP_KERNEL_NAME(add), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (double*)d_a, (double*)d_b, (double*)d_c); + if (groups) { + if (useFloat) + hipLaunchKernel(HIP_KERNEL_NAME(add_looper), dim3(gridSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_b, (float*)d_c, ARRAY_SIZE); + else + hipLaunchKernel(HIP_KERNEL_NAME(add_looper), dim3(gridSize), dim3(groupSize), 0, 0, (double*)d_a, (double*)d_b, (double*)d_c, ARRAY_SIZE); + } else { + if (useFloat) + hipLaunchKernel(HIP_KERNEL_NAME(add), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_b, (float*)d_c); + else + hipLaunchKernel(HIP_KERNEL_NAME(add), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (double*)d_a, (double*)d_b, (double*)d_c); + } check_cuda_error(); hipDeviceSynchronize(); check_cuda_error(); @@ -263,10 +377,18 @@ int main(int argc, char *argv[]) t1 = std::chrono::high_resolution_clock::now(); - if (useFloat) - hipLaunchKernel(HIP_KERNEL_NAME(triad), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (float*)d_a, (float*)d_b, (float*)d_c); - else - hipLaunchKernel(HIP_KERNEL_NAME(triad), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (double*)d_a, (double*)d_b, (double*)d_c); + if (groups) { + if (useFloat) + hipLaunchKernel(HIP_KERNEL_NAME(triad_looper), dim3(gridSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_b, (float*)d_c, ARRAY_SIZE); + else + hipLaunchKernel(HIP_KERNEL_NAME(triad_looper), dim3(gridSize), dim3(groupSize), 0, 0, (double*)d_a, (double*)d_b, (double*)d_c, ARRAY_SIZE); + } else { + if (useFloat) + hipLaunchKernel(HIP_KERNEL_NAME(triad), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_b, (float*)d_c); + else + hipLaunchKernel(HIP_KERNEL_NAME(triad), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (double*)d_a, (double*)d_b, (double*)d_c); + } + check_cuda_error(); hipDeviceSynchronize(); check_cuda_error(); @@ -316,8 +438,15 @@ int main(int argc, char *argv[]) } } - for (int j = 0; j < 4; j++) + for (int j = 0; j < 4; j++) { avg[j] /= (double)(NTIMES-1); + } + + double geomean = 1.0; + for (int j = 0; j < 4; j++) { + geomean *= (sizes[j]/min[j]); + } + geomean = pow(geomean, 0.25); // Display results std::string labels[] = {"Copy", "Mul", "Add", "Triad"}; @@ -339,6 +468,10 @@ int main(int argc, char *argv[]) 
<< std::left << std::setw(12) << std::setprecision(5) << avg[j] << std::endl; } + std::cout + << std::left << std::setw(12) << "GEOMEAN" + << std::left << std::setw(12) << std::setprecision(3) << 1.0E-06 * geomean + << std::endl; // Free host vectors free(h_a); diff --git a/runhip.sh b/runhip.sh new file mode 100755 index 0000000..b84e970 --- /dev/null +++ b/runhip.sh @@ -0,0 +1,4 @@ +./gpu-stream-hip +./gpu-stream-hip --groups 64 --groupSize 256 +./gpu-stream-hip --float +./gpu-stream-hip --float --groups 64 --groupSize 256
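
Supplementary sketch (not part of the patch): the standalone C++ program below mirrors the two host-side calculations the patch introduces - the dispatch shape derived from --groups/--groupSize (gridSize = groups * groupSize, with each work-item looping over ARRAY_SIZE/gridSize elements) and the GEOMEAN of the four kernel bandwidths. The file name and the min[] timings are hypothetical placeholders, the --groups/--groupSize values are the ones used in runhip.sh, and the sizes[] byte counts assume the standard STREAM definitions (two arrays for Copy/Mul, three for Add/Triad) from the unchanged part of hip-stream.cpp; none of these are measured results.

// geomean_sketch.cpp (hypothetical helper, not in the patch)
// Build: g++ -std=c++11 -o geomean_sketch geomean_sketch.cpp
#include <cmath>
#include <iostream>

int main()
{
  // Dispatch shape: with --groups, the benchmark launches groups*groupSize
  // work-items and each work-item strides through the array; without it,
  // one work-item is launched per array element.
  const unsigned int ARRAY_SIZE = 52428800;         // default from common.cpp
  const unsigned int groups = 64, groupSize = 256;  // sample values from runhip.sh
  const unsigned int gridSize = groups ? groups * groupSize : ARRAY_SIZE;
  std::cout << "GridSize: " << gridSize << " work-items, "
            << (double)ARRAY_SIZE / gridSize << " operations/work-item\n";

  // GEOMEAN as computed after the timing loop in hip-stream.cpp:
  // bandwidth_j = sizes[j]/min[j], GEOMEAN = (prod_j bandwidth_j)^(1/4).
  const double DATATYPE_SIZE = sizeof(double);
  const double sizes[4] = {                         // bytes moved per kernel (assumed standard STREAM counts)
      2.0 * DATATYPE_SIZE * ARRAY_SIZE,             // Copy
      2.0 * DATATYPE_SIZE * ARRAY_SIZE,             // Mul
      3.0 * DATATYPE_SIZE * ARRAY_SIZE,             // Add
      3.0 * DATATYPE_SIZE * ARRAY_SIZE };           // Triad
  const double min[4] = { 0.0030, 0.0031, 0.0045, 0.0046 };  // hypothetical best times in seconds

  double geomean = 1.0;
  for (int j = 0; j < 4; j++)
    geomean *= sizes[j] / min[j];
  geomean = std::pow(geomean, 0.25);

  std::cout << "GEOMEAN: " << 1.0E-06 * geomean << " MB/s\n";
  return 0;
}

Because each bandwidth is sizes[j]/min[j], the GEOMEAN row printed by the benchmark should be directly comparable to the per-kernel MBytes/sec column in the results table.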