From 9954b7d38cd85d20927428425a9840a72a56c3e4 Mon Sep 17 00:00:00 2001
From: Tom Deakin <tom.deakin@bristol.ac.uk>
Date: Fri, 6 Oct 2023 17:56:42 +0100
Subject: [PATCH] Set CUDA dot kernel to use number of blocks relative to
 device property

This aligns with the approach implemented in other models (SYCL 1.2.1
and HIP)

Cherry-picks the CUDA updates from lmeadows in #122
---
 src/cuda/CUDAStream.cu | 26 ++++++++++++++++----------
 src/cuda/CUDAStream.h  |  3 ++-
 2 files changed, 18 insertions(+), 11 deletions(-)

diff --git a/src/cuda/CUDAStream.cu b/src/cuda/CUDAStream.cu
index 573d24e..c4a5e61 100644
--- a/src/cuda/CUDAStream.cu
+++ b/src/cuda/CUDAStream.cu
@@ -51,16 +51,22 @@ CUDAStream<T>::CUDAStream(const int ARRAY_SIZE, const int device_index)
 #endif
 
   array_size = ARRAY_SIZE;
+
+  // Query device for sensible dot kernel block count
+  cudaDeviceProp props;
+  cudaGetDeviceProperties(&props, device_index);
+  check_error();
+  dot_num_blocks = props.multiProcessorCount * 4;
+
   // Allocate the host array for partial sums for dot kernels
-  sums = (T*)malloc(sizeof(T) * DOT_NUM_BLOCKS);
+  sums = (T*)malloc(sizeof(T) * dot_num_blocks);
 
   size_t array_bytes = sizeof(T);
   array_bytes *= ARRAY_SIZE;
-  size_t total_bytes = array_bytes * 3;
+  size_t total_bytes = array_bytes * 4;
+  std::cout << "Reduction kernel config: " << dot_num_blocks << " groups of (fixed) size " << TBSIZE << std::endl;
 
   // Check buffers fit on the device
-  cudaDeviceProp props;
-  cudaGetDeviceProperties(&props, 0);
   if (props.totalGlobalMem < total_bytes)
     throw std::runtime_error("Device does not have enough memory for all 3 buffers");
 
@@ -72,13 +78,13 @@ CUDAStream<T>::CUDAStream(const int ARRAY_SIZE, const int device_index)
   check_error();
   cudaMallocManaged(&d_c, array_bytes);
   check_error();
-  cudaMallocManaged(&d_sum, DOT_NUM_BLOCKS*sizeof(T));
+  cudaMallocManaged(&d_sum, dot_num_blocks*sizeof(T));
   check_error();
 #elif defined(PAGEFAULT)
   d_a = (T*)malloc(array_bytes);
   d_b = (T*)malloc(array_bytes);
   d_c = (T*)malloc(array_bytes);
-  d_sum = (T*)malloc(sizeof(T)*DOT_NUM_BLOCKS);
+  d_sum = (T*)malloc(sizeof(T)*dot_num_blocks);
 #else
   cudaMalloc(&d_a, array_bytes);
   check_error();
@@ -86,7 +92,7 @@ CUDAStream<T>::CUDAStream(const int ARRAY_SIZE, const int device_index)
   check_error();
   cudaMalloc(&d_c, array_bytes);
   check_error();
-  cudaMalloc(&d_sum, DOT_NUM_BLOCKS*sizeof(T));
+  cudaMalloc(&d_sum, dot_num_blocks*sizeof(T));
   check_error();
 #endif
 }
@@ -267,19 +273,19 @@ __global__ void dot_kernel(const T * a, const T * b, T * sum, int array_size)
 template <class T>
 T CUDAStream<T>::dot()
 {
-  dot_kernel<<<DOT_NUM_BLOCKS, TBSIZE>>>(d_a, d_b, d_sum, array_size);
+  dot_kernel<<<dot_num_blocks, TBSIZE>>>(d_a, d_b, d_sum, array_size);
   check_error();
 
 #if defined(MANAGED) || defined(PAGEFAULT)
   cudaDeviceSynchronize();
   check_error();
 #else
-  cudaMemcpy(sums, d_sum, DOT_NUM_BLOCKS*sizeof(T), cudaMemcpyDeviceToHost);
+  cudaMemcpy(sums, d_sum, dot_num_blocks*sizeof(T), cudaMemcpyDeviceToHost);
   check_error();
 #endif
 
   T sum = 0.0;
-  for (int i = 0; i < DOT_NUM_BLOCKS; i++)
+  for (int i = 0; i < dot_num_blocks; i++)
   {
 #if defined(MANAGED) || defined(PAGEFAULT)
     sum += d_sum[i];
diff --git a/src/cuda/CUDAStream.h b/src/cuda/CUDAStream.h
index 5ff4f9b..d16511f 100644
--- a/src/cuda/CUDAStream.h
+++ b/src/cuda/CUDAStream.h
@@ -16,7 +16,6 @@
 
 #define IMPLEMENTATION_STRING "CUDA"
 #define TBSIZE 1024
-#define DOT_NUM_BLOCKS 1024
 
 template <class T>
 class CUDAStream : public Stream<T>
@@ -34,6 +33,8 @@ class CUDAStream : public Stream<T>
   T *d_c;
   T *d_sum;
 
+  // Number of blocks for dot kernel
+  int dot_num_blocks;
 
   public: