From 998985240110f5d402d8bee81dc16bd7a3935eee Mon Sep 17 00:00:00 2001 From: pensun Date: Tue, 26 Apr 2016 14:10:32 -0500 Subject: [PATCH] Remove CLUMP_SIZE options; update warning message regarding rounding errors on float, which do not apply to the HIP version --- common.cpp | 4 ++-- cuda-stream.cu | 10 +++++----- hip-stream.cpp | 10 +++++----- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/common.cpp b/common.cpp index 122376f..cb94e9e 100644 --- a/common.cpp +++ b/common.cpp @@ -131,12 +131,12 @@ void parseArguments(int argc, char *argv[]) std::cout << "Invalid size" << std::endl; exit(1); } - + } else if (!strcmp(argv[i], "--float")) { useFloat = true; - std::cout << "Warning: If number of iterations set >= 8, expect rounding errors with single precision" << std::endl; + std::cout << "Warning: If number of iterations set >= 8, expect rounding errors with single precision on CUDA version" << std::endl; } else if (!strcmp(argv[i], "--help") || !strcmp(argv[i], "-h")) { diff --git a/cuda-stream.cu b/cuda-stream.cu index 2049eb0..ea067fb 100644 --- a/cuda-stream.cu +++ b/cuda-stream.cu @@ -64,12 +64,12 @@ void check_cuda_error(void) // looper function place more work inside each work item. 
// Goal is reduce the dispatch overhead for each group, and also give more controlover the order of memory operations -template +template __global__ void copy_looper(const T * a, T * c, int ARRAY_SIZE) { - int offset = (blockDim.x * blockIdx.x + threadIdx.x)*CLUMP_SIZE; - int stride = blockDim.x * gridDim.x * CLUMP_SIZE; + int offset = (blockDim.x * blockIdx.x + threadIdx.x); + int stride = blockDim.x * gridDim.x; for (int i=offset; i<<>>((float*)d_a, (float*)d_c, ARRAY_SIZE); + copy_looper<<>>((float*)d_a, (float*)d_c, ARRAY_SIZE); else - copy_looper<<>>((double*)d_a, (double*)d_c, ARRAY_SIZE); + copy_looper<<>>((double*)d_a, (double*)d_c, ARRAY_SIZE); } else { if (useFloat) copy<<>>((float*)d_a, (float*)d_c); diff --git a/hip-stream.cpp b/hip-stream.cpp index 6984b44..d5fc133 100644 --- a/hip-stream.cpp +++ b/hip-stream.cpp @@ -67,12 +67,12 @@ void check_cuda_error(void) // looper function place more work inside each work item. // Goal is reduce the dispatch overhead for each group, and also give more controlover the order of memory operations -template +template __global__ void copy_looper(hipLaunchParm lp, const T * a, T * c, int ARRAY_SIZE) { - int offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x)*CLUMP_SIZE; - int stride = hipBlockDim_x * hipGridDim_x * CLUMP_SIZE; + int offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); + int stride = hipBlockDim_x * hipGridDim_x; for (int i=offset; i), dim3(gridSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_c, ARRAY_SIZE); + hipLaunchKernel(HIP_KERNEL_NAME(copy_looper), dim3(gridSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_c, ARRAY_SIZE); else - hipLaunchKernel(HIP_KERNEL_NAME(copy_looper), dim3(gridSize), dim3(groupSize), 0, 0, (double*)d_a, (double*)d_c, ARRAY_SIZE); + hipLaunchKernel(HIP_KERNEL_NAME(copy_looper), dim3(gridSize), dim3(groupSize), 0, 0, (double*)d_a, (double*)d_c, ARRAY_SIZE); } else { if (useFloat) hipLaunchKernel(HIP_KERNEL_NAME(copy), dim3(ARRAY_SIZE/groupSize), 
dim3(groupSize), 0, 0, (float*)d_a, (float*)d_c);