Remove CLUMP_SIZE options; update the warning message about single-precision rounding errors, which do not apply to the HIP version

2016-04-26 14:10:32 -05:00 · 2016-04-26 14:10:32 -05:00 · 9989852401
commit 9989852401
parent 066f667e4a
3 changed files with 12 additions and 12 deletions
--- a/common.cpp
+++ b/common.cpp
@ -131,12 +131,12 @@ void parseArguments(int argc, char *argv[])
                std::cout << "Invalid size" << std::endl;
                exit(1);
            }
-			
+
        }
        else if (!strcmp(argv[i], "--float"))
        {
            useFloat = true;
-            std::cout << "Warning: If number of iterations set >= 8, expect rounding errors with single precision" << std::endl;
+            std::cout << "Warning: If number of iterations set >= 8, expect rounding errors with single precision on CUDA version" << std::endl;
        }
        else if (!strcmp(argv[i], "--help") || !strcmp(argv[i], "-h"))
        {
--- a/cuda-stream.cu
+++ b/cuda-stream.cu
@ -64,12 +64,12 @@ void check_cuda_error(void)
 // looper function place more work inside each work item.
 // Goal is reduce the dispatch overhead for each group, and also give more controlover the order of memory operations
-template <typename T, int CLUMP_SIZE>
+template <typename T>
 __global__ void
 copy_looper(const T * a, T * c, int ARRAY_SIZE)
 {
-    int offset = (blockDim.x * blockIdx.x + threadIdx.x)*CLUMP_SIZE;
+    int offset = (blockDim.x * blockIdx.x + threadIdx.x);
-    int stride = blockDim.x * gridDim.x * CLUMP_SIZE;
+    int stride = blockDim.x * gridDim.x;
    for (int i=offset; i<ARRAY_SIZE; i+=stride) {
        c[i] = a[i];
@ -299,9 +299,9 @@ int main(int argc, char *argv[])
        t1 = std::chrono::high_resolution_clock::now();
        if (groups) {
            if (useFloat)
-                copy_looper<float,1><<<gridSize,groupSize>>>((float*)d_a, (float*)d_c, ARRAY_SIZE);
+                copy_looper<float><<<gridSize,groupSize>>>((float*)d_a, (float*)d_c, ARRAY_SIZE);
            else
-                copy_looper<double,1><<<gridSize,groupSize>>>((double*)d_a, (double*)d_c, ARRAY_SIZE);
+                copy_looper<double><<<gridSize,groupSize>>>((double*)d_a, (double*)d_c, ARRAY_SIZE);
        } else {
            if (useFloat)
                copy<<<ARRAY_SIZE/1024, 1024>>>((float*)d_a, (float*)d_c);
--- a/hip-stream.cpp
+++ b/hip-stream.cpp
@ -67,12 +67,12 @@ void check_cuda_error(void)
 // looper function place more work inside each work item.
 // Goal is reduce the dispatch overhead for each group, and also give more controlover the order of memory operations
-template <typename T, int CLUMP_SIZE>
+template <typename T>
 __global__ void
 copy_looper(hipLaunchParm lp,  const T * a, T * c, int ARRAY_SIZE)
 {
-    int offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x)*CLUMP_SIZE;
+    int offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
-    int stride = hipBlockDim_x * hipGridDim_x * CLUMP_SIZE;
+    int stride = hipBlockDim_x * hipGridDim_x;
    for (int i=offset; i<ARRAY_SIZE; i+=stride) {
        c[i] = a[i];
@ -322,9 +322,9 @@ int main(int argc, char *argv[])
        t1 = std::chrono::high_resolution_clock::now();
        if (groups) {
            if (useFloat)
-                hipLaunchKernel(HIP_KERNEL_NAME(copy_looper<float,1>), dim3(gridSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_c, ARRAY_SIZE);
+                hipLaunchKernel(HIP_KERNEL_NAME(copy_looper<float>), dim3(gridSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_c, ARRAY_SIZE);
            else
-                hipLaunchKernel(HIP_KERNEL_NAME(copy_looper<double,1>), dim3(gridSize), dim3(groupSize), 0, 0, (double*)d_a, (double*)d_c, ARRAY_SIZE);
+                hipLaunchKernel(HIP_KERNEL_NAME(copy_looper<double>), dim3(gridSize), dim3(groupSize), 0, 0, (double*)d_a, (double*)d_c, ARRAY_SIZE);
        } else {
            if (useFloat)
                hipLaunchKernel(HIP_KERNEL_NAME(copy), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_c);