diff --git a/cuda-stream.cu b/cuda-stream.cu
index 2ab3adb..2049eb0 100644
--- a/cuda-stream.cu
+++ b/cuda-stream.cu
@@ -62,6 +62,59 @@ void check_cuda_error(void)
   }
 }
 
+// The looper kernels place more work inside each work-item.
+// The goal is to reduce the dispatch overhead for each group, and also to give more control over the order of memory operations.
+template <typename T, int CLUMP_SIZE=1>
+__global__ void
+copy_looper(const T * a, T * c, int ARRAY_SIZE)
+{
+  int offset = (blockDim.x * blockIdx.x + threadIdx.x)*CLUMP_SIZE;
+  int stride = blockDim.x * gridDim.x * CLUMP_SIZE;
+
+  for (int i=offset; i<ARRAY_SIZE; i+=stride) {
+    for (int j=0; j<CLUMP_SIZE; j++)
+      c[i+j] = a[i+j];
+  }
+}
+
+template <typename T>
+__global__ void
+mul_looper(T * b, const T * c, int ARRAY_SIZE)
+{
+  int offset = blockDim.x * blockIdx.x + threadIdx.x;
+  int stride = blockDim.x * gridDim.x;
+  const T scalar = 3.0;
+
+  for (int i=offset; i<ARRAY_SIZE; i+=stride) {
+    b[i] = scalar * c[i];
+  }
+}
+
+template <typename T>
+__global__ void
+add_looper(const T * a, const T * b, T * c, int ARRAY_SIZE)
+{
+  int offset = blockDim.x * blockIdx.x + threadIdx.x;
+  int stride = blockDim.x * gridDim.x;
+
+  for (int i=offset; i<ARRAY_SIZE; i+=stride) {
+    c[i] = a[i] + b[i];
+  }
+}
+
+template <typename T>
+__global__ void
+triad_looper(T * a, const T * b, const T * c, int ARRAY_SIZE)
+{
+  int offset = blockDim.x * blockIdx.x + threadIdx.x;
+  int stride = blockDim.x * gridDim.x;
+  const T scalar = 3.0;
+
+  for (int i=offset; i<ARRAY_SIZE; i+=stride) {
+    a[i] = b[i] + scalar * c[i];
+  }
+}
+
 template <typename T>
 __global__ void copy(const T * a, T * c)
 {
@@ -106,6 +159,20 @@ int main(int argc, char *argv[])
   if (NTIMES < 2)
     throw std::runtime_error("Chosen number of times is invalid, must be >= 2");
 
+  // Configure the grid size and group size for kernel launches
+  int gridSize;
+  if (groups) {
+    gridSize = groups * groupSize;
+  } else {
+    gridSize = ARRAY_SIZE;
+  }
+
+  float operationsPerWorkitem = (float)ARRAY_SIZE / (float)gridSize;
+  std::cout << "GridSize: " << gridSize << " work-items" << std::endl;
+  std::cout << "GroupSize: " << groupSize << " work-items" << std::endl;
+  std::cout << "Operations/Work-item: " << operationsPerWorkitem << std::endl;
+  if (groups) std::cout << "Using looper kernels" << std::endl;
+
   std::cout << "Precision: ";
   if (useFloat) std::cout << "float";
   else std::cout << "double";
@@ -211,6 +278,10 @@ int main(int argc, char *argv[])
   cudaMemcpy(d_c, h_c, ARRAY_SIZE*DATATYPE_SIZE, cudaMemcpyHostToDevice);
   check_cuda_error();
 
+  std::cout << "d_a=" << (void*)d_a << std::endl;
+  std::cout << "d_b=" << (void*)d_b << std::endl;
+  std::cout << "d_c=" << (void*)d_c << std::endl;
+
   // Make sure the copies are finished
   cudaDeviceSynchronize();
   check_cuda_error();
@@ -226,10 +297,18 @@ int main(int argc, char *argv[])
   {
     std::vector<double> times;
     t1 = std::chrono::high_resolution_clock::now();
-    if (useFloat)
-      copy<<<ARRAY_SIZE/1024, 1024>>>((float*)d_a, (float*)d_c);
-    else
-      copy<<<ARRAY_SIZE/1024, 1024>>>((double*)d_a, (double*)d_c);
+    if (groups) {
+      if (useFloat)
+        copy_looper<<<groups, groupSize>>>((float*)d_a, (float*)d_c, ARRAY_SIZE);
+      else
+        copy_looper<<<groups, groupSize>>>((double*)d_a, (double*)d_c, ARRAY_SIZE);
+    } else {
+      if (useFloat)
+        copy<<<ARRAY_SIZE/1024, 1024>>>((float*)d_a, (float*)d_c);
+      else
+        copy<<<ARRAY_SIZE/1024, 1024>>>((double*)d_a, (double*)d_c);
+    }
+
     check_cuda_error();
     cudaDeviceSynchronize();
     check_cuda_error();
@@ -238,10 +317,17 @@ int main(int argc, char *argv[])
     times.push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
 
     t1 = std::chrono::high_resolution_clock::now();
-    if (useFloat)
-      mul<<<ARRAY_SIZE/1024, 1024>>>((float*)d_b, (float*)d_c);
-    else
-      mul<<<ARRAY_SIZE/1024, 1024>>>((double*)d_b, (double*)d_c);
+    if (groups) {
+      if (useFloat)
+        mul_looper<<<groups, groupSize>>>((float*)d_b, (float*)d_c, ARRAY_SIZE);
+      else
+        mul_looper<<<groups, groupSize>>>((double*)d_b, (double*)d_c, ARRAY_SIZE);
+    } else {
+      if (useFloat)
+        mul<<<ARRAY_SIZE/1024, 1024>>>((float*)d_b, (float*)d_c);
+      else
+        mul<<<ARRAY_SIZE/1024, 1024>>>((double*)d_b, (double*)d_c);
+    }
     check_cuda_error();
     cudaDeviceSynchronize();
     check_cuda_error();
@@ -250,10 +336,17 @@ int main(int argc, char *argv[])
     times.push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
 
     t1 = std::chrono::high_resolution_clock::now();
-    if (useFloat)
-      add<<<ARRAY_SIZE/1024, 1024>>>((float*)d_a, (float*)d_b, (float*)d_c);
-    else
-      add<<<ARRAY_SIZE/1024, 1024>>>((double*)d_a, (double*)d_b, (double*)d_c);
+    if (groups) {
+      if (useFloat)
+        add_looper<<<groups, groupSize>>>((float*)d_a, (float*)d_b, (float*)d_c, ARRAY_SIZE);
+      else
+        add_looper<<<groups, groupSize>>>((double*)d_a, (double*)d_b, (double*)d_c, ARRAY_SIZE);
+    } else {
+      if (useFloat)
+        add<<<ARRAY_SIZE/1024, 1024>>>((float*)d_a, (float*)d_b, (float*)d_c);
+      else
+        add<<<ARRAY_SIZE/1024, 1024>>>((double*)d_a, (double*)d_b, (double*)d_c);
+    }
     check_cuda_error();
     cudaDeviceSynchronize();
     check_cuda_error();
@@ -262,10 +355,17 @@ int main(int argc, char *argv[])
     times.push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
 
     t1 = std::chrono::high_resolution_clock::now();
-    if (useFloat)
-      triad<<<ARRAY_SIZE/1024, 1024>>>((float*)d_a, (float*)d_b, (float*)d_c);
-    else
-      triad<<<ARRAY_SIZE/1024, 1024>>>((double*)d_a, (double*)d_b, (double*)d_c);
+    if (groups) {
+      if (useFloat)
+        triad_looper<<<groups, groupSize>>>((float*)d_a, (float*)d_b, (float*)d_c, ARRAY_SIZE);
+      else
+        triad_looper<<<groups, groupSize>>>((double*)d_a, (double*)d_b, (double*)d_c, ARRAY_SIZE);
+    } else {
+      if (useFloat)
+        triad<<<ARRAY_SIZE/1024, 1024>>>((float*)d_a, (float*)d_b, (float*)d_c);
+      else
+        triad<<<ARRAY_SIZE/1024, 1024>>>((double*)d_a, (double*)d_b, (double*)d_c);
+    }
     check_cuda_error();
     cudaDeviceSynchronize();
     check_cuda_error();
@@ -318,6 +418,12 @@ int main(int argc, char *argv[])
   for (int j = 0; j < 4; j++)
     avg[j] /= (double)(NTIMES-1);
 
+  double geomean = 1.0;
+  for (int j = 0; j < 4; j++) {
+    geomean *= (sizes[j]/min[j]);
+  }
+  geomean = pow(geomean, 0.25);
+
   // Display results
   std::string labels[] = {"Copy", "Mul", "Add", "Triad"};
   std::cout
@@ -338,6 +444,10 @@ int main(int argc, char *argv[])
       << std::left << std::setw(12) << std::setprecision(5) << avg[j]
      << std::endl;
   }
+  std::cout
+    << std::left << std::setw(12) << "GEOMEAN"
+    << std::left << std::setw(12) << std::setprecision(3) << 1.0E-06 * geomean
+    << std::endl;
 
   // Free host vectors
   free(h_a);
diff --git a/results/cuda/nvidia-gtx-titan_x.txt b/results/cuda/nvidia-gtx-titan_x.txt
deleted file mode 100644
index 2d3b004..0000000
--- a/results/cuda/nvidia-gtx-titan_x.txt
+++ /dev/null
@@ -1,15 +0,0 @@
-GPU-STREAM
-Version: 1.0
-Implementation: CUDA
-Precision: double
-
-Running kernels 10 times
-Array size: 400.0 MB (=0.4 GB)
-Total size: 1200.0 MB (=1.2 GB)
-Using CUDA device GeForce GTX TITAN X
-Driver: 7050
-Function    MBytes/sec  Min (sec)   Max         Average
-Copy        263155.587  0.00319     0.00319     0.00319
-Mul         262943.430  0.00319     0.00319     0.00319
-Add         268710.444  0.00468     0.00469     0.00469
-Triad       268957.305  0.00468     0.00469     0.00468
diff --git a/results/hip/amd-fiji-nano.txt b/results/hip/amd-fiji-nano.txt
deleted file mode 100644
index df40ac8..0000000
--- a/results/hip/amd-fiji-nano.txt
+++ /dev/null
@@ -1,15 +0,0 @@
-GPU-STREAM
-Version: 1.0
-Implementation: CUDA
-Precision: double
-
-Running kernels 10 times
-Array size: 400.0 MB (=0.4 GB)
-Total size: 1200.0 MB (=1.2 GB)
-Using CUDA device Fiji
-Driver: 4
-Function    MBytes/sec  Min (sec)   Max         Average
-Copy        375822.410  0.00223     0.00225     0.00224
-Mul         375086.879  0.00224     0.00227     0.00224
-Add         425650.718  0.00296     0.00298     0.00297
-Triad       424710.113  0.00296     0.00298     0.00298
diff --git a/results/hip/nvidia-gtx-titan_x.txt b/results/hip/nvidia-gtx-titan_x.txt
deleted file mode 100644
index 6bb94a3..0000000
--- a/results/hip/nvidia-gtx-titan_x.txt
+++ /dev/null
@@ -1,22 +0,0 @@
-GPU-STREAM
-Version: 1.0
-Implementation: HIP
-GridSize: 52428800 work-items
-GroupSize: 1024 work-items
-Operations/Work-item: 1
-Precision: double
-
-Running kernels 10 times
-Array size: 400.0 MB (=0.4 GB) 0 bytes padding
-Total size: 1200.0 MB (=1.2 GB)
-Using HIP device GeForce GTX TITAN X (compute_units=24)
-Driver: 4
-d_a=0x1306d80000
-d_b=0x131fd80000
-d_c=0x1338d80000
-Function    MBytes/sec  Min (sec)   Max         Average
-Copy        263042.207  0.00319     0.00320     0.00319
-Mul         262972.033  0.00319     0.00320     0.00319
-Add         268732.653  0.00468     0.00469     0.00469
-Triad       268706.197  0.00468     0.00469     0.00469
-GEOMEAN     265847.929
diff --git a/runcuda.sh b/runcuda.sh
new file mode 100755
index 0000000..7acf5c1
--- /dev/null
+++ b/runcuda.sh
@@ -0,0 +1,4 @@
+./gpu-stream-cuda
+./gpu-stream-cuda --groups 64 --groupSize 256
+./gpu-stream-cuda --float
+./gpu-stream-cuda --float --groups 64 --groupSize 256
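
A note on the looper kernels: they are an instance of the standard CUDA grid-stride-loop pattern, in which a fixed grid of groups x groupSize work-items strides across the whole array instead of one work-item being launched per element, so one small dispatch covers any array size. The self-contained sketch below illustrates the pattern outside the benchmark; the kernel name copy_stride, the array length n, and the 64x256 launch are illustrative choices, not part of this patch.

#include <cstdio>
#include <cuda_runtime.h>

// Grid-stride copy: the whole grid strides over the array, so a fixed
// number of work-items can cover an array of any length.
__global__ void copy_stride(const float *a, float *c, int n)
{
  int offset = blockDim.x * blockIdx.x + threadIdx.x;
  int stride = blockDim.x * gridDim.x;  // total work-items in the grid
  for (int i = offset; i < n; i += stride)
    c[i] = a[i];
}

int main()
{
  const int n = 1 << 24;
  float *a, *c;
  cudaMalloc(&a, n * sizeof(float));
  cudaMalloc(&c, n * sizeof(float));
  // 64 groups of 256 work-items: each work-item copies n / (64*256) elements
  copy_stride<<<64, 256>>>(a, c, n);
  cudaDeviceSynchronize();
  printf("%s\n", cudaGetErrorString(cudaGetLastError()));
  cudaFree(a);
  cudaFree(c);
  return 0;
}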
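
On the launch configuration: with --groups 64 --groupSize 256, as exercised by runcuda.sh, gridSize is 64 * 256 = 16384 work-items, so each work-item of a looper kernel handles 52428800 / 16384 = 3200 elements of the 400 MB double array. Without --groups, gridSize equals ARRAY_SIZE and every work-item handles exactly one element, which is the configuration recorded in the deleted HIP Titan X log (GridSize: 52428800 work-items, Operations/Work-item: 1).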
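
The new GEOMEAN row is the geometric mean of the four kernel bandwidths, geomean = (product over j of sizes[j]/min[j])^(1/4), scaled by 1.0E-06 into MBytes/sec like the per-kernel rows. As a check against the deleted HIP Titan X log: (263042.207 * 262972.033 * 268732.653 * 268706.197)^(1/4) is approximately 265847.9, matching its GEOMEAN line.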