From 998985240110f5d402d8bee81dc16bd7a3935eee Mon Sep 17 00:00:00 2001
From: pensun <Peng.Sun@amd.com>
Date: Tue, 26 Apr 2016 14:10:32 -0500
Subject: [PATCH] Remove CLUMP_SIZE options; update warning message regarding
 rounding errors on float that does not apply to HIP version

---
 common.cpp     |  4 ++--
 cuda-stream.cu | 10 +++++-----
 hip-stream.cpp | 10 +++++-----
 3 files changed, 12 insertions(+), 12 deletions(-)
diff --git a/common.cpp b/common.cpp
index 122376f..cb94e9e 100644
--- a/common.cpp
+++ b/common.cpp
@@ -131,12 +131,12 @@ void parseArguments(int argc, char *argv[])
                 std::cout << "Invalid size" << std::endl;
                 exit(1);
             }
-			
+
         }
         else if (!strcmp(argv[i], "--float"))
         {
             useFloat = true;
-            std::cout << "Warning: If number of iterations set >= 8, expect rounding errors with single precision" << std::endl;
+            std::cout << "Warning: If number of iterations set >= 8, expect rounding errors with single precision on CUDA version" << std::endl;
         }
         else if (!strcmp(argv[i], "--help") || !strcmp(argv[i], "-h"))
         {
diff --git a/cuda-stream.cu b/cuda-stream.cu
index 2049eb0..ea067fb 100644
--- a/cuda-stream.cu
+++ b/cuda-stream.cu
@@ -64,12 +64,12 @@ void check_cuda_error(void)
 
 // looper function place more work inside each work item.
 // Goal is reduce the dispatch overhead for each group, and also give more controlover the order of memory operations
-template <typename T, int CLUMP_SIZE>
+template <typename T>
 __global__ void
 copy_looper(const T * a, T * c, int ARRAY_SIZE)
 {
-    int offset = (blockDim.x * blockIdx.x + threadIdx.x)*CLUMP_SIZE;
-    int stride = blockDim.x * gridDim.x * CLUMP_SIZE;
+    int offset = (blockDim.x * blockIdx.x + threadIdx.x);
+    int stride = blockDim.x * gridDim.x;
 
     for (int i=offset; i<ARRAY_SIZE; i+=stride) {
         c[i] = a[i];
@@ -299,9 +299,9 @@ int main(int argc, char *argv[])
         t1 = std::chrono::high_resolution_clock::now();
         if (groups) {
             if (useFloat)
-                copy_looper<float,1><<<gridSize,groupSize>>>((float*)d_a, (float*)d_c, ARRAY_SIZE);
+                copy_looper<float><<<gridSize,groupSize>>>((float*)d_a, (float*)d_c, ARRAY_SIZE);
             else
-                copy_looper<double,1><<<gridSize,groupSize>>>((double*)d_a, (double*)d_c, ARRAY_SIZE);
+                copy_looper<double><<<gridSize,groupSize>>>((double*)d_a, (double*)d_c, ARRAY_SIZE);
         } else {
             if (useFloat)
                 copy<<<ARRAY_SIZE/1024, 1024>>>((float*)d_a, (float*)d_c);
diff --git a/hip-stream.cpp b/hip-stream.cpp
index 6984b44..d5fc133 100644
--- a/hip-stream.cpp
+++ b/hip-stream.cpp
@@ -67,12 +67,12 @@ void check_cuda_error(void)
 
 // looper function place more work inside each work item.
 // Goal is reduce the dispatch overhead for each group, and also give more controlover the order of memory operations
-template <typename T, int CLUMP_SIZE>
+template <typename T>
 __global__ void
 copy_looper(hipLaunchParm lp,  const T * a, T * c, int ARRAY_SIZE)
 {
-    int offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x)*CLUMP_SIZE;
-    int stride = hipBlockDim_x * hipGridDim_x * CLUMP_SIZE;
+    int offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
+    int stride = hipBlockDim_x * hipGridDim_x;
 
     for (int i=offset; i<ARRAY_SIZE; i+=stride) {
         c[i] = a[i];
@@ -322,9 +322,9 @@ int main(int argc, char *argv[])
         t1 = std::chrono::high_resolution_clock::now();
         if (groups) {
             if (useFloat)
-                hipLaunchKernel(HIP_KERNEL_NAME(copy_looper<float,1>), dim3(gridSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_c, ARRAY_SIZE);
+                hipLaunchKernel(HIP_KERNEL_NAME(copy_looper<float>), dim3(gridSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_c, ARRAY_SIZE);
             else
-                hipLaunchKernel(HIP_KERNEL_NAME(copy_looper<double,1>), dim3(gridSize), dim3(groupSize), 0, 0, (double*)d_a, (double*)d_c, ARRAY_SIZE);
+                hipLaunchKernel(HIP_KERNEL_NAME(copy_looper<double>), dim3(gridSize), dim3(groupSize), 0, 0, (double*)d_a, (double*)d_c, ARRAY_SIZE);
         } else {
             if (useFloat)
                 hipLaunchKernel(HIP_KERNEL_NAME(copy), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_c);