diff --git a/README.md b/README.md
index a2f6ab6..c776347 100644
--- a/README.md
+++ b/README.md
@@ -15,7 +15,9 @@ Run the OpenCL version with `./gpu-stream-ocl` and the CUDA version with `./gpu-
 For HIP version, follow the instructions on the following blog to properly install ROCK and ROCR drivers:
 http://gpuopen.com/getting-started-with-boltzmann-components-platforms-installation/
 
-Clone from the HIP repository in the following link:
+Install the HCC compiler:
+https://bitbucket.org/multicoreware/hcc/wiki/Home
+Install HIP:
 https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP
 
 Build the HIP binaries with make gpu-stream-hip, run it with './gpu-stream-hip'
diff --git a/common.cpp b/common.cpp
index 781d70e..122376f 100644
--- a/common.cpp
+++ b/common.cpp
@@ -39,10 +39,13 @@
 // Default array size 50 * 2^20 (50*8 Mebibytes double precision)
 // Use binary powers of two so divides 1024
 unsigned int ARRAY_SIZE = 52428800;
+size_t ARRAY_PAD_BYTES = 0;
 unsigned int NTIMES = 10;
 
 bool useFloat = false;
 
+unsigned int groups = 0;
+unsigned int groupSize = 1024;
+
 unsigned int deviceIndex = 0;
@@ -53,6 +56,25 @@ int parseUInt(const char *str, unsigned int *output)
   return !strlen(next);
 }
 
+int parseSize(const char *str, size_t *output)
+{
+  char *next;
+  *output = strtoull(str, &next, 0);
+  int l = strlen(str);
+  if (l) {
+    char c = str[l-1]; // last char.
+    if ((c == 'k') || (c == 'K')) {
+      *output *= 1024;
+    }
+    if ((c == 'm') || (c == 'M')) {
+      *output *= (1024*1024);
+    }
+
+  }
+  return !strlen(next) || (strlen(next) == 1 && strchr("kKmM", next[0]) != NULL); // accept a bare number or a single trailing K/M suffix
+}
+
+
 void parseArguments(int argc, char *argv[])
 {
   for (int i = 1; i < argc; i++)
@@ -86,6 +108,31 @@ void parseArguments(int argc, char *argv[])
         exit(1);
       }
     }
+    else if (!strcmp(argv[i], "--groups"))
+    {
+      if (++i >= argc || !parseUInt(argv[i], &groups))
+      {
+        std::cout << "Invalid group number" << std::endl;
+        exit(1);
+      }
+    }
+    else if (!strcmp(argv[i], "--groupSize"))
+    {
+      if (++i >= argc || !parseUInt(argv[i], &groupSize))
+      {
+        std::cout << "Invalid group size" << std::endl;
+        exit(1);
+      }
+    }
+    else if (!strcmp(argv[i], "--pad"))
+    {
+      if (++i >= argc || !parseSize(argv[i], &ARRAY_PAD_BYTES))
+      {
+        std::cout << "Invalid size" << std::endl;
+        exit(1);
+      }
+
+    }
     else if (!strcmp(argv[i], "--float"))
     {
       useFloat = true;
@@ -101,6 +148,9 @@
       std::cout << "      --device     INDEX   Select device at INDEX" << std::endl;
      std::cout << "  -s  --arraysize  SIZE    Use SIZE elements in the array" << std::endl;
      std::cout << "  -n  --numtimes   NUM     Run the test NUM times (NUM >= 2)" << std::endl;
+      std::cout << "      --groups             Set number of groups to launch - each work-item processes multiple array items" << std::endl;
+      std::cout << "      --groupSize          Set size of each group (default 1024)" << std::endl;
+      std::cout << "      --pad                Add additional array padding. Can use trailing K (KB) or M (MB)" << std::endl;
       std::cout << "      --float              Use floats (rather than doubles)" << std::endl;
       std::cout << std::endl;
       exit(0);
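(Aside, not part of the patch: a minimal host-only sketch of how the new --groups, --groupSize and --pad options are meant to combine, mirroring the arithmetic that hip-stream.cpp below performs; the helper name launchGeometryExample and the sample values are illustrative assumptions.)

    #include <cstddef>
    #include <cstdio>

    // Mirrors the patch: gridSize counts work-items; with --groups each
    // work-item loops over roughly ARRAY_SIZE/gridSize elements, and --pad
    // grows every device buffer by padBytes.
    static void launchGeometryExample(unsigned int arraySize, unsigned int groups,
                                      unsigned int groupSize, size_t padBytes,
                                      size_t datatypeSize)
    {
      unsigned int gridSize = groups ? groups * groupSize : arraySize;
      float operationsPerWorkitem = (float)arraySize / (float)gridSize;
      size_t bytesPerBuffer = (size_t)arraySize * datatypeSize + padBytes;

      printf("GridSize: %u work-items\n", gridSize);
      printf("GroupSize: %u work-items\n", groupSize);
      printf("Operations/Work-item: %.1f\n", operationsPerWorkitem);
      printf("Bytes per buffer (incl. padding): %zu\n", bytesPerBuffer);
    }

    int main()
    {
      // e.g. ./gpu-stream-hip --float --groups 64 --groupSize 256 --pad 1M
      launchGeometryExample(52428800, 64, 256, 1024 * 1024, sizeof(float));
      return 0;
    }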
diff --git a/common.h b/common.h
index a4dd886..9cf61d7 100644
--- a/common.h
+++ b/common.h
@@ -48,8 +48,11 @@ extern void parseArguments(int argc, char *argv[]);
 extern void listDevices(void);
 
 extern unsigned int ARRAY_SIZE;
+extern size_t ARRAY_PAD_BYTES;
 extern unsigned int NTIMES;
 
+extern unsigned int groups;
+extern unsigned int groupSize;
+
 extern bool useFloat;
 
 extern unsigned int deviceIndex;
diff --git a/hip-stream.cpp b/hip-stream.cpp
index ddbcae4..6984b44 100644
--- a/hip-stream.cpp
+++ b/hip-stream.cpp
@@ -43,7 +43,7 @@
 #include
 #include
-#include
+//#include
 
 #include "common.h"
 
 std::string getDeviceName(int device);
@@ -63,15 +63,75 @@ void check_cuda_error(void)
   }
 }
 
+
+// Looper functions place more work inside each work-item.
+// The goal is to reduce the dispatch overhead for each group, and also to give more control over the order of memory operations.
+template <typename T, int CLUMP_SIZE>
+__global__ void
+copy_looper(hipLaunchParm lp, const T * a, T * c, int ARRAY_SIZE)
+{
+  int offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x)*CLUMP_SIZE;
+  int stride = hipBlockDim_x * hipGridDim_x * CLUMP_SIZE;
+
+  for (int i=offset; i<ARRAY_SIZE; i+=stride) {
+    for (int j=0; j<CLUMP_SIZE; j++) {
+      c[i+j] = a[i+j];
+    }
+  }
+}
+
 template <typename T>
-__global__ void copy(hipLaunchParm lp, const T * a, T * c)
+__global__ void
+mul_looper(hipLaunchParm lp, T * b, const T * c, int ARRAY_SIZE)
+{
+  int offset = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
+  int stride = hipBlockDim_x * hipGridDim_x;
+  const T scalar = 3.0;
+
+  for (int i=offset; i<ARRAY_SIZE; i+=stride) {
+    b[i] = scalar * c[i];
+  }
+}
+
+template <typename T>
+__global__ void
+add_looper(hipLaunchParm lp, const T * a, const T * b, T * c, int ARRAY_SIZE)
+{
+  int offset = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
+  int stride = hipBlockDim_x * hipGridDim_x;
+
+  for (int i=offset; i<ARRAY_SIZE; i+=stride) {
+    c[i] = a[i] + b[i];
+  }
+}
+
+template <typename T>
+__global__ void
+triad_looper(hipLaunchParm lp, T * a, const T * b, const T * c, int ARRAY_SIZE)
+{
+  int offset = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
+  int stride = hipBlockDim_x * hipGridDim_x;
+  const T scalar = 3.0;
+
+  for (int i=offset; i<ARRAY_SIZE; i+=stride) {
+    a[i] = b[i] + scalar * c[i];
+  }
+}
+
+template <typename T>
+__global__ void
+copy(hipLaunchParm lp, const T * a, T * c)
 {
   const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
   c[i] = a[i];
 }
+
 template <typename T>
-__global__ void mul(hipLaunchParm lp, T * b, const T * c)
+__global__ void
+mul(hipLaunchParm lp, T * b, const T * c)
 {
   const T scalar = 3.0;
   const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
@@ -79,14 +139,16 @@ __global__ void mul(hipLaunchParm lp, T * b, const T * c)
 }
 
 template <typename T>
-__global__ void add(hipLaunchParm lp, const T * a, const T * b, T * c)
+__global__ void
+add(hipLaunchParm lp, const T * a, const T * b, T * c)
 {
   const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
   c[i] = a[i] + b[i];
 }
 
 template <typename T>
-__global__ void triad(hipLaunchParm lp, T * a, const T * b, const T * c)
+__global__ void
+triad(hipLaunchParm lp, T * a, const T * b, const T * c)
 {
   const T scalar = 3.0;
   const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
@@ -100,13 +162,27 @@ int main(int argc, char *argv[])
 
   std::cout << "GPU-STREAM" << std::endl
             << "Version: " << VERSION_STRING << std::endl
-            << "Implementation: CUDA" << std::endl;
+            << "Implementation: HIP" << std::endl;
 
   parseArguments(argc, argv);
 
   if (NTIMES < 2)
     throw std::runtime_error("Chosen number of times is invalid, must be >= 2");
 
+  // Config grid size and group size for kernel launching
+  int gridSize;
+  if (groups) {
+    gridSize = groups * groupSize;
+  } else {
+    gridSize = ARRAY_SIZE;
+  }
+
+  float operationsPerWorkitem = (float)ARRAY_SIZE / (float)gridSize;
+  std::cout << "GridSize: " << gridSize << " work-items" << std::endl;
+  std::cout << "GroupSize: " << groupSize << " work-items" << std::endl;
+  std::cout << "Operations/Work-item: " << operationsPerWorkitem << std::endl;
+  if (groups) std::cout << "Using looper kernels:" << std::endl;
+
   std::cout << "Precision: ";
   if (useFloat)
     std::cout << "float";
   else
     std::cout << "double";
@@ -144,9 +220,10 @@ int main(int argc, char *argv[])
   std::cout << std::setprecision(1) << std::fixed
             << "Array size: " << ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0 << " MB"
             << " (=" << ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0/1024.0 << " GB)"
+            << " " << ARRAY_PAD_BYTES << " bytes padding"
             << std::endl;
-  std::cout << "Total size: " << 3.0*ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0 << " MB"
-            << " (=" << 3.0*ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0/1024.0 << " GB)"
+  std::cout << "Total size: " << 3.0*(ARRAY_SIZE*DATATYPE_SIZE + ARRAY_PAD_BYTES) /1024.0/1024.0 << " MB"
+            << " (=" << 3.0*(ARRAY_SIZE*DATATYPE_SIZE + ARRAY_PAD_BYTES) /1024.0/1024.0/1024.0 << " GB)"
             << std::endl;
 
   // Reset precision
@@ -161,24 +238,31 @@ int main(int argc, char *argv[])
   hipSetDevice(deviceIndex);
   check_cuda_error();
 
-  // Print out device name
-  std::cout << "Using CUDA device " << getDeviceName(deviceIndex) << std::endl;
-  // Print out device CUDA driver version
-  std::cout << "Driver: " << getDriver() << std::endl;
-
-  // Check buffers fit on the device
   hipDeviceProp_t props;
   hipGetDeviceProperties(&props, deviceIndex);
+
+  // Print out device name
+  std::cout << "Using HIP device " << getDeviceName(deviceIndex) << " (compute_units=" << props.multiProcessorCount << ")" << std::endl;
+
+  // Print out device HIP driver version
+  std::cout << "Driver: " << getDriver() << std::endl;
+
+
+
+
+  // Check buffers fit on the device
   if (props.totalGlobalMem < 3*DATATYPE_SIZE*ARRAY_SIZE)
     throw std::runtime_error("Device does not have enough memory for all 3 buffers");
 
-  // Create host vectors
-  void * h_a = malloc(ARRAY_SIZE*DATATYPE_SIZE);
-  void * h_b = malloc(ARRAY_SIZE*DATATYPE_SIZE);
-  void * h_c = malloc(ARRAY_SIZE*DATATYPE_SIZE);
+  //int cus = props.multiProcessorCount;
 
-  // Initilise arrays
+  // Create host vectors
+  void * h_a = malloc(ARRAY_SIZE*DATATYPE_SIZE );
+  void * h_b = malloc(ARRAY_SIZE*DATATYPE_SIZE );
+  void * h_c = malloc(ARRAY_SIZE*DATATYPE_SIZE );
+
+  // Initialise arrays
   for (unsigned int i = 0; i < ARRAY_SIZE; i++)
   {
     if (useFloat)
@@ -196,12 +280,14 @@ int main(int argc, char *argv[])
   }
 
   // Create device buffers
-  void * d_a, * d_b, *d_c;
-  hipMalloc(&d_a, ARRAY_SIZE*DATATYPE_SIZE);
+  char * d_a, * d_b, *d_c;
+  hipMalloc(&d_a, ARRAY_SIZE*DATATYPE_SIZE + ARRAY_PAD_BYTES);
   check_cuda_error();
-  hipMalloc(&d_b, ARRAY_SIZE*DATATYPE_SIZE);
+  hipMalloc(&d_b, ARRAY_SIZE*DATATYPE_SIZE + ARRAY_PAD_BYTES);
+  d_b += ARRAY_PAD_BYTES;
   check_cuda_error();
-  hipMalloc(&d_c, ARRAY_SIZE*DATATYPE_SIZE);
+  hipMalloc(&d_c, ARRAY_SIZE*DATATYPE_SIZE + ARRAY_PAD_BYTES);
+  d_c += ARRAY_PAD_BYTES;
   check_cuda_error();
 
   // Copy host memory to device
@@ -212,10 +298,17 @@ int main(int argc, char *argv[])
   hipMemcpy(d_c, h_c, ARRAY_SIZE*DATATYPE_SIZE, hipMemcpyHostToDevice);
   check_cuda_error();
 
+
+  std::cout << "d_a=" << (void*)d_a << std::endl;
+  std::cout << "d_b=" << (void*)d_b << std::endl;
+  std::cout << "d_c=" << (void*)d_c << std::endl;
+
   // Make sure the copies are finished
   hipDeviceSynchronize();
   check_cuda_error();
 
+
+
   // List of times
   std::vector< std::vector<double> > timings;
@@ -227,10 +320,17 @@
   {
     std::vector<double> times;
     t1 = std::chrono::high_resolution_clock::now();
-    if (useFloat)
-      hipLaunchKernel(HIP_KERNEL_NAME(copy<float>), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (float*)d_a, (float*)d_c);
-    else
-      hipLaunchKernel(HIP_KERNEL_NAME(copy<double>), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (double*)d_a, (double*)d_c);
+    if (groups) {
+      if (useFloat)
+        hipLaunchKernel(HIP_KERNEL_NAME(copy_looper), dim3(gridSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_c, ARRAY_SIZE);
+      else
+        hipLaunchKernel(HIP_KERNEL_NAME(copy_looper), dim3(gridSize), dim3(groupSize), 0, 0, (double*)d_a, (double*)d_c, ARRAY_SIZE);
+    } else {
+      if (useFloat)
+        hipLaunchKernel(HIP_KERNEL_NAME(copy<float>), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_c);
+      else
+        hipLaunchKernel(HIP_KERNEL_NAME(copy<double>), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (double*)d_a, (double*)d_c);
+    }
     check_cuda_error();
     hipDeviceSynchronize();
     check_cuda_error();
@@ -239,10 +339,17 @@
 
     t1 = std::chrono::high_resolution_clock::now();
-    if (useFloat)
-      hipLaunchKernel(HIP_KERNEL_NAME(mul<float>), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (float*)d_b, (float*)d_c);
-    else
-      hipLaunchKernel(HIP_KERNEL_NAME(mul<double>), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (double*)d_b, (double*)d_c);
+    if (groups) {
+      if (useFloat)
+        hipLaunchKernel(HIP_KERNEL_NAME(mul_looper<float>), dim3(gridSize), dim3(groupSize), 0, 0, (float*)d_b, (float*)d_c, ARRAY_SIZE);
+      else
+        hipLaunchKernel(HIP_KERNEL_NAME(mul_looper<double>), dim3(gridSize), dim3(groupSize), 0, 0, (double*)d_b, (double*)d_c, ARRAY_SIZE);
+    } else {
+      if (useFloat)
+        hipLaunchKernel(HIP_KERNEL_NAME(mul<float>), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (float*)d_b, (float*)d_c);
+      else
+        hipLaunchKernel(HIP_KERNEL_NAME(mul<double>), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (double*)d_b, (double*)d_c);
+    }
     check_cuda_error();
     hipDeviceSynchronize();
     check_cuda_error();
@@ -251,10 +358,17 @@
 
     t1 = std::chrono::high_resolution_clock::now();
-    if (useFloat)
-      hipLaunchKernel(HIP_KERNEL_NAME(add<float>), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (float*)d_a, (float*)d_b, (float*)d_c);
-    else
-      hipLaunchKernel(HIP_KERNEL_NAME(add<double>), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (double*)d_a, (double*)d_b, (double*)d_c);
+    if (groups) {
+      if (useFloat)
+        hipLaunchKernel(HIP_KERNEL_NAME(add_looper<float>), dim3(gridSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_b, (float*)d_c, ARRAY_SIZE);
+      else
+        hipLaunchKernel(HIP_KERNEL_NAME(add_looper<double>), dim3(gridSize), dim3(groupSize), 0, 0, (double*)d_a, (double*)d_b, (double*)d_c, ARRAY_SIZE);
+    } else {
+      if (useFloat)
+        hipLaunchKernel(HIP_KERNEL_NAME(add<float>), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_b, (float*)d_c);
+      else
+        hipLaunchKernel(HIP_KERNEL_NAME(add<double>), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (double*)d_a, (double*)d_b, (double*)d_c);
+    }
     check_cuda_error();
     hipDeviceSynchronize();
     check_cuda_error();
@@ -263,10 +377,18 @@
 
     t1 = std::chrono::high_resolution_clock::now();
-    if (useFloat)
-      hipLaunchKernel(HIP_KERNEL_NAME(triad<float>), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (float*)d_a, (float*)d_b, (float*)d_c);
-    else
-      hipLaunchKernel(HIP_KERNEL_NAME(triad<double>), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (double*)d_a, (double*)d_b, (double*)d_c);
+    if (groups) {
+      if (useFloat)
+        hipLaunchKernel(HIP_KERNEL_NAME(triad_looper<float>), dim3(gridSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_b, (float*)d_c, ARRAY_SIZE);
+      else
+        hipLaunchKernel(HIP_KERNEL_NAME(triad_looper<double>), dim3(gridSize), dim3(groupSize), 0, 0, (double*)d_a, (double*)d_b, (double*)d_c, ARRAY_SIZE);
+    } else {
+      if (useFloat)
+        hipLaunchKernel(HIP_KERNEL_NAME(triad<float>), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_b, (float*)d_c);
+      else
+        hipLaunchKernel(HIP_KERNEL_NAME(triad<double>), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (double*)d_a, (double*)d_b, (double*)d_c);
+    }
+
     check_cuda_error();
     hipDeviceSynchronize();
     check_cuda_error();
@@ -316,8 +438,15 @@
     }
   }
 
-  for (int j = 0; j < 4; j++)
+  for (int j = 0; j < 4; j++) {
     avg[j] /= (double)(NTIMES-1);
+  }
+
+  double geomean = 1.0;
+  for (int j = 0; j < 4; j++) {
+    geomean *= (sizes[j]/min[j]);
+  }
+  geomean = pow(geomean, 0.25);
 
   // Display results
   std::string labels[] = {"Copy", "Mul", "Add", "Triad"};
@@ -339,6 +468,10 @@
       << std::left << std::setw(12) << std::setprecision(5) << avg[j]
       << std::endl;
   }
+  std::cout
+    << std::left << std::setw(12) << "GEOMEAN"
+    << std::left << std::setw(12) << std::setprecision(3) << 1.0E-06 * geomean
+    << std::endl;
 
   // Free host vectors
   free(h_a);
diff --git a/runhip.sh b/runhip.sh
new file mode 100755
index 0000000..b84e970
--- /dev/null
+++ b/runhip.sh
@@ -0,0 +1,4 @@
+./gpu-stream-hip
+./gpu-stream-hip --groups 64 --groupSize 256
+./gpu-stream-hip --float
+./gpu-stream-hip --float --groups 64 --groupSize 256
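(Aside, not part of the patch: a small host-only sketch of the grid-stride pattern the *_looper kernels rely on, treating gridSize as the total number of work-items as the patch's printout does; the array size, the visits bookkeeping and the assert harness are illustrative assumptions.)

    #include <cassert>
    #include <cstdio>
    #include <vector>

    int main()
    {
      // Illustrative values; mirror runhip.sh's --groups 64 --groupSize 256.
      const int ARRAY_SIZE = 1 << 20;
      const int groups = 64, groupSize = 256;
      const int gridSize = groups * groupSize;   // total work-items

      // Each simulated work-item starts at its global id and strides by the
      // total number of work-items, like mul/add/triad_looper above.
      std::vector<int> visits(ARRAY_SIZE, 0);
      for (int tid = 0; tid < gridSize; tid++)
        for (int i = tid; i < ARRAY_SIZE; i += gridSize)
          visits[i]++;                           // the kernel would touch b[i]/c[i] here

      // Every element is visited exactly once, whether or not gridSize divides ARRAY_SIZE.
      for (int i = 0; i < ARRAY_SIZE; i++)
        assert(visits[i] == 1);

      printf("Covered %d elements with %d work-items (%.1f per work-item)\n",
             ARRAY_SIZE, gridSize, (float)ARRAY_SIZE / gridSize);
      return 0;
    }

This coverage property is what lets --groups trade fewer, longer-lived work-items for lower dispatch overhead without changing the amount of memory traffic.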