From 9954b7d38cd85d20927428425a9840a72a56c3e4 Mon Sep 17 00:00:00 2001
From: Tom Deakin <tom.deakin@bristol.ac.uk>
Date: Fri, 6 Oct 2023 17:56:42 +0100
Subject: [PATCH] Set CUDA dot kernel to use number of blocks relative to
 device property

This aligns with the approach implemented in other models (SYCL 1.2.1
and HIP)

Cherry-picks the CUDA updates from lmeadows in #122
---
 src/cuda/CUDAStream.cu | 26 ++++++++++++++++----------
 src/cuda/CUDAStream.h  |  3 ++-
 2 files changed, 18 insertions(+), 11 deletions(-)

diff --git a/src/cuda/CUDAStream.cu b/src/cuda/CUDAStream.cu
index 573d24e..c4a5e61 100644
--- a/src/cuda/CUDAStream.cu
+++ b/src/cuda/CUDAStream.cu
@@ -51,16 +51,22 @@ CUDAStream<T>::CUDAStream(const int ARRAY_SIZE, const int device_index)
 #endif
 
   array_size = ARRAY_SIZE;
+
+  // Query device for sensible dot kernel block count
+  cudaDeviceProp props;
+  cudaGetDeviceProperties(&props, device_index);
+  check_error();
+  dot_num_blocks = props.multiProcessorCount * 4;
+
   // Allocate the host array for partial sums for dot kernels
-  sums = (T*)malloc(sizeof(T) * DOT_NUM_BLOCKS);
+  sums = (T*)malloc(sizeof(T) * dot_num_blocks);
 
   size_t array_bytes = sizeof(T);
   array_bytes *= ARRAY_SIZE;
-  size_t total_bytes = array_bytes * 3;
+  size_t total_bytes = array_bytes * 4;
+  std::cout << "Reduction kernel config: " << dot_num_blocks << " groups of (fixed) size " << TBSIZE << std::endl;
 
   // Check buffers fit on the device
-  cudaDeviceProp props;
-  cudaGetDeviceProperties(&props, 0);
   if (props.totalGlobalMem < total_bytes)
     throw std::runtime_error("Device does not have enough memory for all 3 buffers");
 
@@ -72,13 +78,13 @@ CUDAStream<T>::CUDAStream(const int ARRAY_SIZE, const int device_index)
   check_error();
   cudaMallocManaged(&d_c, array_bytes);
   check_error();
-  cudaMallocManaged(&d_sum, DOT_NUM_BLOCKS*sizeof(T));
+  cudaMallocManaged(&d_sum, dot_num_blocks*sizeof(T));
   check_error();
 #elif defined(PAGEFAULT)
   d_a = (T*)malloc(array_bytes);
   d_b = (T*)malloc(array_bytes);
   d_c = (T*)malloc(array_bytes);
-  d_sum = (T*)malloc(sizeof(T)*DOT_NUM_BLOCKS);
+  d_sum = (T*)malloc(sizeof(T)*dot_num_blocks);
 #else
   cudaMalloc(&d_a, array_bytes);
   check_error();
@@ -86,7 +92,7 @@ CUDAStream<T>::CUDAStream(const int ARRAY_SIZE, const int device_index)
   check_error();
   cudaMalloc(&d_c, array_bytes);
   check_error();
-  cudaMalloc(&d_sum, DOT_NUM_BLOCKS*sizeof(T));
+  cudaMalloc(&d_sum, dot_num_blocks*sizeof(T));
   check_error();
 #endif
 }
@@ -267,19 +273,19 @@ __global__ void dot_kernel(const T * a, const T * b, T * sum, int array_size)
 template <class T>
 T CUDAStream<T>::dot()
 {
-  dot_kernel<<<DOT_NUM_BLOCKS, TBSIZE>>>(d_a, d_b, d_sum, array_size);
+  dot_kernel<<<dot_num_blocks, TBSIZE>>>(d_a, d_b, d_sum, array_size);
   check_error();
 
 #if defined(MANAGED) || defined(PAGEFAULT)
   cudaDeviceSynchronize();
   check_error();
 #else
-  cudaMemcpy(sums, d_sum, DOT_NUM_BLOCKS*sizeof(T), cudaMemcpyDeviceToHost);
+  cudaMemcpy(sums, d_sum, dot_num_blocks*sizeof(T), cudaMemcpyDeviceToHost);
   check_error();
 #endif
 
   T sum = 0.0;
-  for (int i = 0; i < DOT_NUM_BLOCKS; i++)
+  for (int i = 0; i < dot_num_blocks; i++)
   {
 #if defined(MANAGED) || defined(PAGEFAULT)
     sum += d_sum[i];
diff --git a/src/cuda/CUDAStream.h b/src/cuda/CUDAStream.h
index 5ff4f9b..d16511f 100644
--- a/src/cuda/CUDAStream.h
+++ b/src/cuda/CUDAStream.h
@@ -16,7 +16,6 @@
 
 #define IMPLEMENTATION_STRING "CUDA"
 #define TBSIZE 1024
-#define DOT_NUM_BLOCKS 1024
 
 template <class T>
 class CUDAStream : public Stream<T>
@@ -34,6 +33,8 @@ class CUDAStream : public Stream<T>
   T *d_c;
   T *d_sum;
 
+  // Number of blocks for dot kernel
+  int dot_num_blocks;
 
   public: