Remove CLUMP_SIZE options; update the warning message about float rounding errors, which do not apply to the HIP version

This commit is contained in:
pensun 2016-04-26 14:10:32 -05:00
parent 066f667e4a
commit 9989852401
3 changed files with 12 additions and 12 deletions

View File

@ -131,12 +131,12 @@ void parseArguments(int argc, char *argv[])
std::cout << "Invalid size" << std::endl; std::cout << "Invalid size" << std::endl;
exit(1); exit(1);
} }
} }
else if (!strcmp(argv[i], "--float")) else if (!strcmp(argv[i], "--float"))
{ {
useFloat = true; useFloat = true;
std::cout << "Warning: If number of iterations set >= 8, expect rounding errors with single precision" << std::endl; std::cout << "Warning: If number of iterations set >= 8, expect rounding errors with single precision on CUDA version" << std::endl;
} }
else if (!strcmp(argv[i], "--help") || !strcmp(argv[i], "-h")) else if (!strcmp(argv[i], "--help") || !strcmp(argv[i], "-h"))
{ {

View File

@ -64,12 +64,12 @@ void check_cuda_error(void)
// The looper function places more work inside each work item. // The looper function places more work inside each work item.
// The goal is to reduce the dispatch overhead for each group, and also to give more control over the order of memory operations. // The goal is to reduce the dispatch overhead for each group, and also to give more control over the order of memory operations.
template <typename T, int CLUMP_SIZE> template <typename T>
__global__ void __global__ void
copy_looper(const T * a, T * c, int ARRAY_SIZE) copy_looper(const T * a, T * c, int ARRAY_SIZE)
{ {
int offset = (blockDim.x * blockIdx.x + threadIdx.x)*CLUMP_SIZE; int offset = (blockDim.x * blockIdx.x + threadIdx.x);
int stride = blockDim.x * gridDim.x * CLUMP_SIZE; int stride = blockDim.x * gridDim.x;
for (int i=offset; i<ARRAY_SIZE; i+=stride) { for (int i=offset; i<ARRAY_SIZE; i+=stride) {
c[i] = a[i]; c[i] = a[i];
@ -299,9 +299,9 @@ int main(int argc, char *argv[])
t1 = std::chrono::high_resolution_clock::now(); t1 = std::chrono::high_resolution_clock::now();
if (groups) { if (groups) {
if (useFloat) if (useFloat)
copy_looper<float,1><<<gridSize,groupSize>>>((float*)d_a, (float*)d_c, ARRAY_SIZE); copy_looper<float><<<gridSize,groupSize>>>((float*)d_a, (float*)d_c, ARRAY_SIZE);
else else
copy_looper<double,1><<<gridSize,groupSize>>>((double*)d_a, (double*)d_c, ARRAY_SIZE); copy_looper<double><<<gridSize,groupSize>>>((double*)d_a, (double*)d_c, ARRAY_SIZE);
} else { } else {
if (useFloat) if (useFloat)
copy<<<ARRAY_SIZE/1024, 1024>>>((float*)d_a, (float*)d_c); copy<<<ARRAY_SIZE/1024, 1024>>>((float*)d_a, (float*)d_c);

View File

@ -67,12 +67,12 @@ void check_cuda_error(void)
// The looper function places more work inside each work item. // The looper function places more work inside each work item.
// The goal is to reduce the dispatch overhead for each group, and also to give more control over the order of memory operations. // The goal is to reduce the dispatch overhead for each group, and also to give more control over the order of memory operations.
template <typename T, int CLUMP_SIZE> template <typename T>
__global__ void __global__ void
copy_looper(hipLaunchParm lp, const T * a, T * c, int ARRAY_SIZE) copy_looper(hipLaunchParm lp, const T * a, T * c, int ARRAY_SIZE)
{ {
int offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x)*CLUMP_SIZE; int offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
int stride = hipBlockDim_x * hipGridDim_x * CLUMP_SIZE; int stride = hipBlockDim_x * hipGridDim_x;
for (int i=offset; i<ARRAY_SIZE; i+=stride) { for (int i=offset; i<ARRAY_SIZE; i+=stride) {
c[i] = a[i]; c[i] = a[i];
@ -322,9 +322,9 @@ int main(int argc, char *argv[])
t1 = std::chrono::high_resolution_clock::now(); t1 = std::chrono::high_resolution_clock::now();
if (groups) { if (groups) {
if (useFloat) if (useFloat)
hipLaunchKernel(HIP_KERNEL_NAME(copy_looper<float,1>), dim3(gridSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_c, ARRAY_SIZE); hipLaunchKernel(HIP_KERNEL_NAME(copy_looper<float>), dim3(gridSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_c, ARRAY_SIZE);
else else
hipLaunchKernel(HIP_KERNEL_NAME(copy_looper<double,1>), dim3(gridSize), dim3(groupSize), 0, 0, (double*)d_a, (double*)d_c, ARRAY_SIZE); hipLaunchKernel(HIP_KERNEL_NAME(copy_looper<double>), dim3(gridSize), dim3(groupSize), 0, 0, (double*)d_a, (double*)d_c, ARRAY_SIZE);
} else { } else {
if (useFloat) if (useFloat)
hipLaunchKernel(HIP_KERNEL_NAME(copy), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_c); hipLaunchKernel(HIP_KERNEL_NAME(copy), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_c);