From a075455ad45f00be73b10cc54a8369397221c691 Mon Sep 17 00:00:00 2001 From: Thomas Gibson <14180421+thomasgibson@users.noreply.github.com> Date: Sat, 30 Apr 2022 21:59:45 -0500 Subject: [PATCH 1/7] Add tuned benchmark kernels Co-authored-by: Nick Curtis --- src/hip/HIPStream.cpp | 210 +++++++++++++++++++++++++++++++++--------- src/hip/HIPStream.h | 33 +++++++ src/hip/model.cmake | 15 ++- 3 files changed, 212 insertions(+), 46 deletions(-) diff --git a/src/hip/HIPStream.cpp b/src/hip/HIPStream.cpp index 6aed1ee..dcf634e 100644 --- a/src/hip/HIPStream.cpp +++ b/src/hip/HIPStream.cpp @@ -9,7 +9,32 @@ #include "hip/hip_runtime.h" #define TBSIZE 1024 -#define DOT_NUM_BLOCKS 256 + +#ifdef NONTEMPORAL +template +__device__ __forceinline__ T load(const T& ref) +{ + return __builtin_nontemporal_load(&ref); +} + +template +__device__ __forceinline__ void store(const T& value, T& ref) +{ + __builtin_nontemporal_store(value, &ref); +} +#else +template +__device__ __forceinline__ T load(const T& ref) +{ + return ref; +} + +template +__device__ __forceinline__ void store(const T& value, T& ref) +{ + ref = value; +} +#endif void check_error(void) { @@ -23,15 +48,27 @@ void check_error(void) template HIPStream::HIPStream(const int ARRAY_SIZE, const int device_index) + : array_size{ARRAY_SIZE}, + block_count(array_size / (TBSIZE * elements_per_lane * chunks_per_block)) { - // The array size must be divisible by TBSIZE for kernel launches - if (ARRAY_SIZE % TBSIZE != 0) + std::cerr << "Elements per lane: " << elements_per_lane << std::endl; + std::cerr << "Chunks per block: " << chunks_per_block << std::endl; + // The array size must be divisible by total number of elements + // moved per block for kernel launches + if (ARRAY_SIZE % (TBSIZE * elements_per_lane * chunks_per_block) != 0) { std::stringstream ss; - ss << "Array size must be a multiple of " << TBSIZE; + ss << "Array size must be a multiple of elements operated on per block (" + << TBSIZE * elements_per_lane * chunks_per_block + << ")."; throw std::runtime_error(ss.str()); } + std::cerr << "block count " << block_count << std::endl; + +#ifdef NONTEMPORAL + std::cerr << "Using non-temporal memory operations." << std::endl; +#endif // Set device int count; @@ -49,7 +86,7 @@ HIPStream::HIPStream(const int ARRAY_SIZE, const int device_index) array_size = ARRAY_SIZE; // Allocate the host array for partial sums for dot kernels - sums = (T*)malloc(sizeof(T) * DOT_NUM_BLOCKS); + sums = (T*)malloc(block_count*sizeof(T)); // Check buffers fit on the device hipDeviceProp_t props; @@ -64,7 +101,7 @@ HIPStream::HIPStream(const int ARRAY_SIZE, const int device_index) check_error(); hipMalloc(&d_c, ARRAY_SIZE*sizeof(T)); check_error(); - hipMalloc(&d_sum, DOT_NUM_BLOCKS*sizeof(T)); + hipMalloc(&d_sum, block_count*sizeof(T)); check_error(); } @@ -115,68 +152,115 @@ void HIPStream::read_arrays(std::vector& a, std::vector& b, std::vector check_error(); } - -template -__global__ void copy_kernel(const T * a, T * c) +template +__launch_bounds__(TBSIZE) +__global__ +void copy_kernel(const T * __restrict a, T * __restrict c) { - const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; - c[i] = a[i]; + const size_t dx = (blockDim.x * gridDim.x) * elements_per_lane; + const size_t gidx = (threadIdx.x + blockIdx.x * blockDim.x) * elements_per_lane; + for (size_t i = 0; i != chunks_per_block; ++i) + { + for (size_t j = 0; j != elements_per_lane; ++j) + { + store(load(a[gidx + i * dx + j]), c[gidx + i * dx + j]); + } + } } template void HIPStream::copy() { - hipLaunchKernelGGL(HIP_KERNEL_NAME(copy_kernel), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_a, d_c); + hipLaunchKernelGGL(HIP_KERNEL_NAME(copy_kernel), + dim3(block_count), + dim3(TBSIZE), + 0, 0, d_a, d_c); check_error(); hipDeviceSynchronize(); check_error(); } -template -__global__ void mul_kernel(T * b, const T * c) +template +__launch_bounds__(TBSIZE) +__global__ +void mul_kernel(T * __restrict b, const T * __restrict c) { const T scalar = startScalar; - const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; - b[i] = scalar * c[i]; + const size_t dx = (blockDim.x * gridDim.x) * elements_per_lane; + const size_t gidx = (threadIdx.x + blockIdx.x * blockDim.x) * elements_per_lane; + for (size_t i = 0; i != chunks_per_block; ++i) + { + for (size_t j = 0; j != elements_per_lane; ++j) + { + store(scalar * load(c[gidx + i * dx + j]), b[gidx + i * dx + j]); + } + } } template void HIPStream::mul() { - hipLaunchKernelGGL(HIP_KERNEL_NAME(mul_kernel), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_b, d_c); + hipLaunchKernelGGL(HIP_KERNEL_NAME(mul_kernel), + dim3(block_count), + dim3(TBSIZE), + 0, 0, d_b, d_c); check_error(); hipDeviceSynchronize(); check_error(); } -template -__global__ void add_kernel(const T * a, const T * b, T * c) +template +__launch_bounds__(TBSIZE) +__global__ +void add_kernel(const T * __restrict a, const T * __restrict b, T * __restrict c) { - const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; - c[i] = a[i] + b[i]; + const size_t dx = (blockDim.x * gridDim.x) * elements_per_lane; + const size_t gidx = (threadIdx.x + blockIdx.x * blockDim.x) * elements_per_lane; + for (size_t i = 0; i != chunks_per_block; ++i) + { + for (size_t j = 0; j != elements_per_lane; ++j) + { + store(load(a[gidx + i * dx + j]) + load(b[gidx + i * dx + j]), c[gidx + i * dx + j]); + } + } } template void HIPStream::add() { - hipLaunchKernelGGL(HIP_KERNEL_NAME(add_kernel), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_a, d_b, d_c); + hipLaunchKernelGGL(HIP_KERNEL_NAME(add_kernel), + dim3(block_count), + dim3(TBSIZE), + 0, 0, d_a, d_b, d_c); check_error(); hipDeviceSynchronize(); check_error(); } -template -__global__ void triad_kernel(T * a, const T * b, const T * c) +template +__launch_bounds__(TBSIZE) +__global__ +void triad_kernel(T * __restrict a, const T * __restrict b, const T * __restrict c) { const T scalar = startScalar; - const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; - a[i] = b[i] + scalar * c[i]; + const size_t dx = (blockDim.x * gridDim.x) * elements_per_lane; + const size_t gidx = (threadIdx.x + blockIdx.x * blockDim.x) * elements_per_lane; + for (size_t i = 0; i != chunks_per_block; ++i) + { + for (size_t j = 0; j != elements_per_lane; ++j) + { + store(load(b[gidx + i * dx + j]) + scalar * load(c[gidx + i * dx + j]), a[gidx + i * dx + j]); + } + } } template void HIPStream::triad() { - hipLaunchKernelGGL(HIP_KERNEL_NAME(triad_kernel), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_a, d_b, d_c); + hipLaunchKernelGGL(HIP_KERNEL_NAME(triad_kernel), + dim3(block_count), + dim3(TBSIZE), + 0, 0, d_a, d_b, d_c); check_error(); hipDeviceSynchronize(); check_error(); @@ -199,42 +283,78 @@ void HIPStream::nstream() check_error(); } -template -__global__ void dot_kernel(const T * a, const T * b, T * sum, int array_size) +template +struct Reducer +{ + template + __device__ + static + void reduce(I it) noexcept + { + if (n == 1) return; + +#if defined(__HIP_PLATFORM_NVCC__) + constexpr unsigned int warpSize = 32; +#endif + constexpr bool is_same_warp{n <= warpSize * 2}; + if (static_cast(threadIdx.x) < n/2) + { + it[threadIdx.x] += it[threadIdx.x + n/2]; + } + is_same_warp ? __threadfence_block() : __syncthreads(); + Reducer::reduce(it); + } +}; + +template<> +struct Reducer<1u> { + template + __device__ + static + void reduce(I) noexcept + {} +}; + +template +__launch_bounds__(TBSIZE) +__global__ +__global__ void dot_kernel(const T * __restrict a, const T * __restrict b, T * __restrict sum) { __shared__ T tb_sum[TBSIZE]; + const size_t tidx = threadIdx.x; + const size_t dx = (blockDim.x * gridDim.x) * elements_per_lane; + const size_t gidx = (tidx + blockIdx.x * blockDim.x) * elements_per_lane; - int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; - const size_t local_i = hipThreadIdx_x; - - tb_sum[local_i] = 0.0; - for (; i < array_size; i += hipBlockDim_x*hipGridDim_x) - tb_sum[local_i] += a[i] * b[i]; - - for (int offset = hipBlockDim_x / 2; offset > 0; offset /= 2) + T tmp{0}; + for (size_t i = 0; i != chunks_per_block; ++i) { - __syncthreads(); - if (local_i < offset) + for (size_t j = 0; j != elements_per_lane; ++j) { - tb_sum[local_i] += tb_sum[local_i+offset]; + tmp += load(a[gidx + i * dx + j]) * load(b[gidx + i * dx + j]); } } + tb_sum[tidx] = tmp; + __syncthreads(); - if (local_i == 0) - sum[hipBlockIdx_x] = tb_sum[local_i]; + Reducer<>::reduce(tb_sum); + if (tidx) return; + store(tb_sum[0], sum[blockIdx.x]); } template T HIPStream::dot() { - hipLaunchKernelGGL(HIP_KERNEL_NAME(dot_kernel), dim3(DOT_NUM_BLOCKS), dim3(TBSIZE), 0, 0, d_a, d_b, d_sum, array_size); + hipLaunchKernelGGL(HIP_KERNEL_NAME(dot_kernel), + dim3(block_count), + dim3(TBSIZE), + 0, 0, d_a, d_b, d_sum); check_error(); - hipMemcpy(sums, d_sum, DOT_NUM_BLOCKS*sizeof(T), hipMemcpyDeviceToHost); + hipMemcpy(sums, d_sum, block_count*sizeof(T), hipMemcpyDeviceToHost); check_error(); T sum = 0.0; - for (int i = 0; i < DOT_NUM_BLOCKS; i++) + for (int i = 0; i < block_count; i++) sum += sums[i]; return sum; diff --git a/src/hip/HIPStream.h b/src/hip/HIPStream.h index 44a2893..ecdf929 100644 --- a/src/hip/HIPStream.h +++ b/src/hip/HIPStream.h @@ -18,9 +18,42 @@ template class HIPStream : public Stream { +#ifdef __HIP_PLATFORM_NVCC__ + #ifndef DWORDS_PER_LANE + #define DWORDS_PER_LANE 1 + #endif + #ifndef CHUNKS_PER_BLOCK + #define CHUNKS_PER_BLOCK 8 + #endif +#else + #ifndef DWORDS_PER_LANE + #define DWORDS_PER_LANE 4 + #endif + #ifndef CHUNKS_PER_BLOCK + #define CHUNKS_PER_BLOCK 1 + #endif +#endif + // Make sure that either: + // DWORDS_PER_LANE is less than sizeof(T), in which case we default to 1 element + // or + // DWORDS_PER_LANE is divisible by sizeof(T) + static_assert((DWORDS_PER_LANE * sizeof(unsigned int) < sizeof(T)) || + (DWORDS_PER_LANE * sizeof(unsigned int) % sizeof(T) == 0), + "DWORDS_PER_LANE not divisible by sizeof(element_type)"); + + static constexpr unsigned int chunks_per_block{CHUNKS_PER_BLOCK}; + static constexpr unsigned int dwords_per_lane{DWORDS_PER_LANE}; + // Take into account the datatype size + // That is, if we specify 4 DWORDS_PER_LANE, this is 2 FP64 elements + // and 4 FP32 elements + static constexpr unsigned int elements_per_lane{ + (DWORDS_PER_LANE * sizeof(unsigned int)) < sizeof(T) ? 1 : ( + DWORDS_PER_LANE * sizeof(unsigned int) / sizeof(T))}; + protected: // Size of arrays int array_size; + int block_count; // Host array for partial sums for dot kernel T *sums; diff --git a/src/hip/model.cmake b/src/hip/model.cmake index 78150c4..3ffaf7a 100644 --- a/src/hip/model.cmake +++ b/src/hip/model.cmake @@ -2,6 +2,19 @@ register_flag_required(CMAKE_CXX_COMPILER "Absolute path to the AMD HIP C++ compiler") +register_flag_optional(USE_NONTEMPORAL_MEM + "Flag indicating to use non-temporal memory accesses to bypass cache." + "OFF") + +# TODO: Better flag descriptions +register_flag_optional(DWORDS_PER_LANE "Flag indicating the number of double data types per wavefront lane." 4) +register_flag_optional(CHUNKS_PER_BLOCK "Flag indicating the chunks per block." 1) + macro(setup) - # nothing to do here as hipcc does everything correctly, what a surprise! + # Ensure we set the proper preprocessor directives + if (USE_NONTEMPORAL_MEM) + add_definitions(-DNONTEMPORAL) + endif () + register_definitions(DWORDS_PER_LANE=${DWORDS_PER_LANE}) + register_definitions(CHUNKS_PER_BLOCK=${CHUNKS_PER_BLOCK}) endmacro() \ No newline at end of file From bcf8708f2c294187390e69d9b825b2e7dc709001 Mon Sep 17 00:00:00 2001 From: Thomas Gibson <14180421+thomasgibson@users.noreply.github.com> Date: Tue, 31 May 2022 11:29:42 -0500 Subject: [PATCH 2/7] Clean up kernels and drop unneeded modifications --- src/hip/HIPStream.cpp | 187 ++++++++++++------------------------------ src/hip/HIPStream.h | 7 -- src/hip/model.cmake | 12 +-- 3 files changed, 53 insertions(+), 153 deletions(-) diff --git a/src/hip/HIPStream.cpp b/src/hip/HIPStream.cpp index dcf634e..eac77b4 100644 --- a/src/hip/HIPStream.cpp +++ b/src/hip/HIPStream.cpp @@ -10,31 +10,6 @@ #define TBSIZE 1024 -#ifdef NONTEMPORAL -template -__device__ __forceinline__ T load(const T& ref) -{ - return __builtin_nontemporal_load(&ref); -} - -template -__device__ __forceinline__ void store(const T& value, T& ref) -{ - __builtin_nontemporal_store(value, &ref); -} -#else -template -__device__ __forceinline__ T load(const T& ref) -{ - return ref; -} - -template -__device__ __forceinline__ void store(const T& value, T& ref) -{ - ref = value; -} -#endif void check_error(void) { @@ -49,27 +24,23 @@ void check_error(void) template HIPStream::HIPStream(const int ARRAY_SIZE, const int device_index) : array_size{ARRAY_SIZE}, - block_count(array_size / (TBSIZE * elements_per_lane * chunks_per_block)) + block_count(array_size / (TBSIZE * elements_per_lane)) { std::cerr << "Elements per lane: " << elements_per_lane << std::endl; std::cerr << "Chunks per block: " << chunks_per_block << std::endl; // The array size must be divisible by total number of elements // moved per block for kernel launches - if (ARRAY_SIZE % (TBSIZE * elements_per_lane * chunks_per_block) != 0) + if (ARRAY_SIZE % (TBSIZE * elements_per_lane) != 0) { std::stringstream ss; ss << "Array size must be a multiple of elements operated on per block (" - << TBSIZE * elements_per_lane * chunks_per_block + << TBSIZE * elements_per_lane << ")."; throw std::runtime_error(ss.str()); } std::cerr << "block count " << block_count << std::endl; -#ifdef NONTEMPORAL - std::cerr << "Using non-temporal memory operations." << std::endl; -#endif - // Set device int count; hipGetDeviceCount(&count); @@ -86,7 +57,8 @@ HIPStream::HIPStream(const int ARRAY_SIZE, const int device_index) array_size = ARRAY_SIZE; // Allocate the host array for partial sums for dot kernels - sums = (T*)malloc(block_count*sizeof(T)); + hipHostMalloc(&sums, sizeof(T) * block_count, hipHostMallocNonCoherent); + check_error(); // Check buffers fit on the device hipDeviceProp_t props; @@ -101,15 +73,14 @@ HIPStream::HIPStream(const int ARRAY_SIZE, const int device_index) check_error(); hipMalloc(&d_c, ARRAY_SIZE*sizeof(T)); check_error(); - hipMalloc(&d_sum, block_count*sizeof(T)); - check_error(); } template HIPStream::~HIPStream() { - free(sums); + hipHostFree(sums); + check_error(); hipFree(d_a); check_error(); @@ -117,15 +88,13 @@ HIPStream::~HIPStream() check_error(); hipFree(d_c); check_error(); - hipFree(d_sum); - check_error(); } template __global__ void init_kernel(T * a, T * b, T * c, T initA, T initB, T initC) { - const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + const size_t i = blockDim.x * blockIdx.x + threadIdx.x; a[i] = initA; b[i] = initB; c[i] = initC; @@ -152,26 +121,20 @@ void HIPStream::read_arrays(std::vector& a, std::vector& b, std::vector check_error(); } -template +template __launch_bounds__(TBSIZE) __global__ void copy_kernel(const T * __restrict a, T * __restrict c) { - const size_t dx = (blockDim.x * gridDim.x) * elements_per_lane; const size_t gidx = (threadIdx.x + blockIdx.x * blockDim.x) * elements_per_lane; - for (size_t i = 0; i != chunks_per_block; ++i) - { - for (size_t j = 0; j != elements_per_lane; ++j) - { - store(load(a[gidx + i * dx + j]), c[gidx + i * dx + j]); - } - } + for (size_t j = 0; j < elements_per_lane; ++j) + c[gidx + j] = a[gidx + j]; } template void HIPStream::copy() { - hipLaunchKernelGGL(HIP_KERNEL_NAME(copy_kernel), + hipLaunchKernelGGL(HIP_KERNEL_NAME(copy_kernel), dim3(block_count), dim3(TBSIZE), 0, 0, d_a, d_c); @@ -180,27 +143,21 @@ void HIPStream::copy() check_error(); } -template +template __launch_bounds__(TBSIZE) __global__ void mul_kernel(T * __restrict b, const T * __restrict c) { const T scalar = startScalar; - const size_t dx = (blockDim.x * gridDim.x) * elements_per_lane; const size_t gidx = (threadIdx.x + blockIdx.x * blockDim.x) * elements_per_lane; - for (size_t i = 0; i != chunks_per_block; ++i) - { - for (size_t j = 0; j != elements_per_lane; ++j) - { - store(scalar * load(c[gidx + i * dx + j]), b[gidx + i * dx + j]); - } - } + for (size_t j = 0; j < elements_per_lane; ++j) + b[gidx + j] = scalar * c[gidx + j]; } template void HIPStream::mul() { - hipLaunchKernelGGL(HIP_KERNEL_NAME(mul_kernel), + hipLaunchKernelGGL(HIP_KERNEL_NAME(mul_kernel), dim3(block_count), dim3(TBSIZE), 0, 0, d_b, d_c); @@ -209,26 +166,20 @@ void HIPStream::mul() check_error(); } -template +template __launch_bounds__(TBSIZE) __global__ void add_kernel(const T * __restrict a, const T * __restrict b, T * __restrict c) { - const size_t dx = (blockDim.x * gridDim.x) * elements_per_lane; const size_t gidx = (threadIdx.x + blockIdx.x * blockDim.x) * elements_per_lane; - for (size_t i = 0; i != chunks_per_block; ++i) - { - for (size_t j = 0; j != elements_per_lane; ++j) - { - store(load(a[gidx + i * dx + j]) + load(b[gidx + i * dx + j]), c[gidx + i * dx + j]); - } - } + for (size_t j = 0; j < elements_per_lane; ++j) + c[gidx + j] = a[gidx + j] + b[gidx + j]; } template void HIPStream::add() { - hipLaunchKernelGGL(HIP_KERNEL_NAME(add_kernel), + hipLaunchKernelGGL(HIP_KERNEL_NAME(add_kernel), dim3(block_count), dim3(TBSIZE), 0, 0, d_a, d_b, d_c); @@ -237,27 +188,21 @@ void HIPStream::add() check_error(); } -template +template __launch_bounds__(TBSIZE) __global__ void triad_kernel(T * __restrict a, const T * __restrict b, const T * __restrict c) { const T scalar = startScalar; - const size_t dx = (blockDim.x * gridDim.x) * elements_per_lane; const size_t gidx = (threadIdx.x + blockIdx.x * blockDim.x) * elements_per_lane; - for (size_t i = 0; i != chunks_per_block; ++i) - { - for (size_t j = 0; j != elements_per_lane; ++j) - { - store(load(b[gidx + i * dx + j]) + scalar * load(c[gidx + i * dx + j]), a[gidx + i * dx + j]); - } - } + for (size_t j = 0; j < elements_per_lane; ++j) + a[gidx + j] = b[gidx + j] + scalar * c[gidx + j]; } template void HIPStream::triad() { - hipLaunchKernelGGL(HIP_KERNEL_NAME(triad_kernel), + hipLaunchKernelGGL(HIP_KERNEL_NAME(triad_kernel), dim3(block_count), dim3(TBSIZE), 0, 0, d_a, d_b, d_c); @@ -266,91 +211,63 @@ void HIPStream::triad() check_error(); } -template -__global__ void nstream_kernel(T * a, const T * b, const T * c) +template +__launch_bounds__(TBSIZE) +__global__ void nstream_kernel(T * __restrict a, const T * __restrict b, const T * __restrict c) { const T scalar = startScalar; - const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; - a[i] += b[i] + scalar * c[i]; + const size_t gidx = (threadIdx.x + blockIdx.x * blockDim.x) * elements_per_lane; + for (size_t j = 0; j < elements_per_lane; ++j) + a[gidx + j] += b[gidx + j] + scalar * c[gidx + j]; } template void HIPStream::nstream() { - hipLaunchKernelGGL(HIP_KERNEL_NAME(nstream_kernel), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_a, d_b, d_c); + hipLaunchKernelGGL(HIP_KERNEL_NAME(nstream_kernel), + dim3(block_count), + dim3(TBSIZE), + 0, 0, d_a, d_b, d_c); check_error(); hipDeviceSynchronize(); check_error(); } -template -struct Reducer -{ - template - __device__ - static - void reduce(I it) noexcept - { - if (n == 1) return; - -#if defined(__HIP_PLATFORM_NVCC__) - constexpr unsigned int warpSize = 32; -#endif - constexpr bool is_same_warp{n <= warpSize * 2}; - if (static_cast(threadIdx.x) < n/2) - { - it[threadIdx.x] += it[threadIdx.x + n/2]; - } - is_same_warp ? __threadfence_block() : __syncthreads(); - Reducer::reduce(it); - } -}; - -template<> -struct Reducer<1u> { - template - __device__ - static - void reduce(I) noexcept - {} -}; - -template +template __launch_bounds__(TBSIZE) -__global__ -__global__ void dot_kernel(const T * __restrict a, const T * __restrict b, T * __restrict sum) +__global__ void dot_kernel(const T * __restrict a, const T * __restrict b, T * __restrict sum, int array_size) { __shared__ T tb_sum[TBSIZE]; - const size_t tidx = threadIdx.x; - const size_t dx = (blockDim.x * gridDim.x) * elements_per_lane; - const size_t gidx = (tidx + blockIdx.x * blockDim.x) * elements_per_lane; - T tmp{0}; - for (size_t i = 0; i != chunks_per_block; ++i) + const size_t local_i = threadIdx.x; + size_t i = blockDim.x * blockIdx.x + local_i; + + tb_sum[local_i] = 0.0; + for (size_t j = 0; j < elements_per_lane && i < array_size; ++j, i += blockDim.x*gridDim.x) + tb_sum[local_i] += a[i] * b[i]; + + for (size_t offset = blockDim.x / 2; offset > 0; offset /= 2) { - for (size_t j = 0; j != elements_per_lane; ++j) + __syncthreads(); + if (local_i < offset) { - tmp += load(a[gidx + i * dx + j]) * load(b[gidx + i * dx + j]); + tb_sum[local_i] += tb_sum[local_i+offset]; } } - tb_sum[tidx] = tmp; - __syncthreads(); - Reducer<>::reduce(tb_sum); - if (tidx) return; - store(tb_sum[0], sum[blockIdx.x]); + if (local_i == 0) + sum[blockIdx.x] = tb_sum[local_i]; } template T HIPStream::dot() { - hipLaunchKernelGGL(HIP_KERNEL_NAME(dot_kernel), + hipLaunchKernelGGL(HIP_KERNEL_NAME(dot_kernel), dim3(block_count), dim3(TBSIZE), - 0, 0, d_a, d_b, d_sum); + 0, 0, d_a, d_b, sums, array_size); check_error(); - - hipMemcpy(sums, d_sum, block_count*sizeof(T), hipMemcpyDeviceToHost); + hipDeviceSynchronize(); check_error(); T sum = 0.0; diff --git a/src/hip/HIPStream.h b/src/hip/HIPStream.h index ecdf929..7bce0b5 100644 --- a/src/hip/HIPStream.h +++ b/src/hip/HIPStream.h @@ -22,16 +22,10 @@ class HIPStream : public Stream #ifndef DWORDS_PER_LANE #define DWORDS_PER_LANE 1 #endif - #ifndef CHUNKS_PER_BLOCK - #define CHUNKS_PER_BLOCK 8 - #endif #else #ifndef DWORDS_PER_LANE #define DWORDS_PER_LANE 4 #endif - #ifndef CHUNKS_PER_BLOCK - #define CHUNKS_PER_BLOCK 1 - #endif #endif // Make sure that either: // DWORDS_PER_LANE is less than sizeof(T), in which case we default to 1 element @@ -41,7 +35,6 @@ class HIPStream : public Stream (DWORDS_PER_LANE * sizeof(unsigned int) % sizeof(T) == 0), "DWORDS_PER_LANE not divisible by sizeof(element_type)"); - static constexpr unsigned int chunks_per_block{CHUNKS_PER_BLOCK}; static constexpr unsigned int dwords_per_lane{DWORDS_PER_LANE}; // Take into account the datatype size // That is, if we specify 4 DWORDS_PER_LANE, this is 2 FP64 elements diff --git a/src/hip/model.cmake b/src/hip/model.cmake index 3ffaf7a..2f7d69e 100644 --- a/src/hip/model.cmake +++ b/src/hip/model.cmake @@ -2,19 +2,9 @@ register_flag_required(CMAKE_CXX_COMPILER "Absolute path to the AMD HIP C++ compiler") -register_flag_optional(USE_NONTEMPORAL_MEM - "Flag indicating to use non-temporal memory accesses to bypass cache." - "OFF") - -# TODO: Better flag descriptions -register_flag_optional(DWORDS_PER_LANE "Flag indicating the number of double data types per wavefront lane." 4) -register_flag_optional(CHUNKS_PER_BLOCK "Flag indicating the chunks per block." 1) +register_flag_optional(DWORDS_PER_LANE "Flag indicating the number of dwords to process per wavefront lane." 4) macro(setup) # Ensure we set the proper preprocessor directives - if (USE_NONTEMPORAL_MEM) - add_definitions(-DNONTEMPORAL) - endif () register_definitions(DWORDS_PER_LANE=${DWORDS_PER_LANE}) - register_definitions(CHUNKS_PER_BLOCK=${CHUNKS_PER_BLOCK}) endmacro() \ No newline at end of file From f98aedf64d0f62764550d93ee0f1458be2146efd Mon Sep 17 00:00:00 2001 From: Thomas Gibson <14180421+thomasgibson@users.noreply.github.com> Date: Thu, 11 Aug 2022 10:09:57 -0500 Subject: [PATCH 3/7] Use triple-chevron syntax for hip kernel launching --- src/hip/HIPStream.cpp | 32 +++++++------------------------- 1 file changed, 7 insertions(+), 25 deletions(-) diff --git a/src/hip/HIPStream.cpp b/src/hip/HIPStream.cpp index eac77b4..ce69172 100644 --- a/src/hip/HIPStream.cpp +++ b/src/hip/HIPStream.cpp @@ -103,7 +103,7 @@ __global__ void init_kernel(T * a, T * b, T * c, T initA, T initB, T initC) template void HIPStream::init_arrays(T initA, T initB, T initC) { - hipLaunchKernelGGL(HIP_KERNEL_NAME(init_kernel), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_a, d_b, d_c, initA, initB, initC); + init_kernel<<>>(d_a, d_b, d_c, initA, initB, initC); check_error(); hipDeviceSynchronize(); check_error(); @@ -134,10 +134,7 @@ void copy_kernel(const T * __restrict a, T * __restrict c) template void HIPStream::copy() { - hipLaunchKernelGGL(HIP_KERNEL_NAME(copy_kernel), - dim3(block_count), - dim3(TBSIZE), - 0, 0, d_a, d_c); + copy_kernel<<>>(d_a, d_c); check_error(); hipDeviceSynchronize(); check_error(); @@ -157,10 +154,7 @@ void mul_kernel(T * __restrict b, const T * __restrict c) template void HIPStream::mul() { - hipLaunchKernelGGL(HIP_KERNEL_NAME(mul_kernel), - dim3(block_count), - dim3(TBSIZE), - 0, 0, d_b, d_c); + mul_kernel<<>>(d_b, d_c); check_error(); hipDeviceSynchronize(); check_error(); @@ -179,10 +173,7 @@ void add_kernel(const T * __restrict a, const T * __restrict b, T * __restrict c template void HIPStream::add() { - hipLaunchKernelGGL(HIP_KERNEL_NAME(add_kernel), - dim3(block_count), - dim3(TBSIZE), - 0, 0, d_a, d_b, d_c); + add_kernel<<>>(d_a, d_b, d_c); check_error(); hipDeviceSynchronize(); check_error(); @@ -202,10 +193,7 @@ void triad_kernel(T * __restrict a, const T * __restrict b, const T * __restrict template void HIPStream::triad() { - hipLaunchKernelGGL(HIP_KERNEL_NAME(triad_kernel), - dim3(block_count), - dim3(TBSIZE), - 0, 0, d_a, d_b, d_c); + triad_kernel<<>>(d_a, d_b, d_c); check_error(); hipDeviceSynchronize(); check_error(); @@ -224,10 +212,7 @@ __global__ void nstream_kernel(T * __restrict a, const T * __restrict b, const T template void HIPStream::nstream() { - hipLaunchKernelGGL(HIP_KERNEL_NAME(nstream_kernel), - dim3(block_count), - dim3(TBSIZE), - 0, 0, d_a, d_b, d_c); + nstream_kernel<<>>(d_a, d_b, d_c); check_error(); hipDeviceSynchronize(); check_error(); @@ -262,10 +247,7 @@ __global__ void dot_kernel(const T * __restrict a, const T * __restrict b, T * _ template T HIPStream::dot() { - hipLaunchKernelGGL(HIP_KERNEL_NAME(dot_kernel), - dim3(block_count), - dim3(TBSIZE), - 0, 0, d_a, d_b, sums, array_size); + dot_kernel<<>>(d_a, d_b, sums, array_size); check_error(); hipDeviceSynchronize(); check_error(); From de93c06e78a7051cfed4a44626ac6fc599f5c24d Mon Sep 17 00:00:00 2001 From: Thomas Gibson <14180421+thomasgibson@users.noreply.github.com> Date: Thu, 11 Aug 2022 10:32:20 -0500 Subject: [PATCH 4/7] Add clarifying comment and further clean-up --- src/hip/HIPStream.cpp | 8 ++++---- src/hip/HIPStream.h | 1 - src/hip/model.cmake | 1 - 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/src/hip/HIPStream.cpp b/src/hip/HIPStream.cpp index ce69172..37fce3b 100644 --- a/src/hip/HIPStream.cpp +++ b/src/hip/HIPStream.cpp @@ -27,8 +27,6 @@ HIPStream::HIPStream(const int ARRAY_SIZE, const int device_index) block_count(array_size / (TBSIZE * elements_per_lane)) { - std::cerr << "Elements per lane: " << elements_per_lane << std::endl; - std::cerr << "Chunks per block: " << chunks_per_block << std::endl; // The array size must be divisible by total number of elements // moved per block for kernel launches if (ARRAY_SIZE % (TBSIZE * elements_per_lane) != 0) @@ -39,7 +37,6 @@ HIPStream::HIPStream(const int ARRAY_SIZE, const int device_index) << ")."; throw std::runtime_error(ss.str()); } - std::cerr << "block count " << block_count << std::endl; // Set device int count; @@ -56,7 +53,10 @@ HIPStream::HIPStream(const int ARRAY_SIZE, const int device_index) array_size = ARRAY_SIZE; - // Allocate the host array for partial sums for dot kernels + // Allocate the host array for partial sums for dot kernels using hipHostMalloc. + // This creates an array on the host which is visible to the device. However, it requires + // synchronization (e.g. hipDeviceSynchronize) for the result to be available on the host + // after it has been passed through to a kernel. hipHostMalloc(&sums, sizeof(T) * block_count, hipHostMallocNonCoherent); check_error(); diff --git a/src/hip/HIPStream.h b/src/hip/HIPStream.h index 7bce0b5..305e937 100644 --- a/src/hip/HIPStream.h +++ b/src/hip/HIPStream.h @@ -55,7 +55,6 @@ class HIPStream : public Stream T *d_a; T *d_b; T *d_c; - T *d_sum; public: diff --git a/src/hip/model.cmake b/src/hip/model.cmake index 2f7d69e..19e6fd0 100644 --- a/src/hip/model.cmake +++ b/src/hip/model.cmake @@ -5,6 +5,5 @@ register_flag_required(CMAKE_CXX_COMPILER register_flag_optional(DWORDS_PER_LANE "Flag indicating the number of dwords to process per wavefront lane." 4) macro(setup) - # Ensure we set the proper preprocessor directives register_definitions(DWORDS_PER_LANE=${DWORDS_PER_LANE}) endmacro() \ No newline at end of file From f44cd6fdd2bf434b91e40e0b117af0e6f05b578a Mon Sep 17 00:00:00 2001 From: Thomas Gibson <14180421+thomasgibson@users.noreply.github.com> Date: Mon, 5 Sep 2022 15:43:37 -0700 Subject: [PATCH 5/7] Roll back modifications for copy, mul, add, and triad --- src/hip/HIPStream.cpp | 50 +++++++++++++++++++++++++------------------ 1 file changed, 29 insertions(+), 21 deletions(-) diff --git a/src/hip/HIPStream.cpp b/src/hip/HIPStream.cpp index 37fce3b..cc1d21f 100644 --- a/src/hip/HIPStream.cpp +++ b/src/hip/HIPStream.cpp @@ -124,17 +124,19 @@ void HIPStream::read_arrays(std::vector& a, std::vector& b, std::vector template __launch_bounds__(TBSIZE) __global__ -void copy_kernel(const T * __restrict a, T * __restrict c) +void copy_kernel(const T * a, T * c) { - const size_t gidx = (threadIdx.x + blockIdx.x * blockDim.x) * elements_per_lane; - for (size_t j = 0; j < elements_per_lane; ++j) - c[gidx + j] = a[gidx + j]; + const size_t i = threadIdx.x + blockIdx.x * blockDim.x; + c[i] = a[i]; + // const size_t gidx = (threadIdx.x + blockIdx.x * blockDim.x) * elements_per_lane; + // for (size_t j = 0; j < elements_per_lane; ++j) + // c[gidx + j] = a[gidx + j]; } template void HIPStream::copy() { - copy_kernel<<>>(d_a, d_c); + copy_kernel<<>>(d_a, d_c); check_error(); hipDeviceSynchronize(); check_error(); @@ -143,18 +145,20 @@ void HIPStream::copy() template __launch_bounds__(TBSIZE) __global__ -void mul_kernel(T * __restrict b, const T * __restrict c) +void mul_kernel(T * b, const T * c) { const T scalar = startScalar; - const size_t gidx = (threadIdx.x + blockIdx.x * blockDim.x) * elements_per_lane; - for (size_t j = 0; j < elements_per_lane; ++j) - b[gidx + j] = scalar * c[gidx + j]; + const size_t i = threadIdx.x + blockIdx.x * blockDim.x; + b[i] = scalar * c[i]; + // const size_t gidx = (threadIdx.x + blockIdx.x * blockDim.x) * elements_per_lane; + // for (size_t j = 0; j < elements_per_lane; ++j) + // b[gidx + j] = scalar * c[gidx + j]; } template void HIPStream::mul() { - mul_kernel<<>>(d_b, d_c); + mul_kernel<<>>(d_b, d_c); check_error(); hipDeviceSynchronize(); check_error(); @@ -163,17 +167,19 @@ void HIPStream::mul() template __launch_bounds__(TBSIZE) __global__ -void add_kernel(const T * __restrict a, const T * __restrict b, T * __restrict c) +void add_kernel(const T * a, const T * b, T * c) { - const size_t gidx = (threadIdx.x + blockIdx.x * blockDim.x) * elements_per_lane; - for (size_t j = 0; j < elements_per_lane; ++j) - c[gidx + j] = a[gidx + j] + b[gidx + j]; + const size_t i = threadIdx.x + blockIdx.x * blockDim.x; + c[i] = a[i] + b[i]; + // const size_t gidx = (threadIdx.x + blockIdx.x * blockDim.x) * elements_per_lane; + // for (size_t j = 0; j < elements_per_lane; ++j) + // c[gidx + j] = a[gidx + j] + b[gidx + j]; } template void HIPStream::add() { - add_kernel<<>>(d_a, d_b, d_c); + add_kernel<<>>(d_a, d_b, d_c); check_error(); hipDeviceSynchronize(); check_error(); @@ -182,18 +188,20 @@ void HIPStream::add() template __launch_bounds__(TBSIZE) __global__ -void triad_kernel(T * __restrict a, const T * __restrict b, const T * __restrict c) +void triad_kernel(T * a, const T * b, const T * c) { const T scalar = startScalar; - const size_t gidx = (threadIdx.x + blockIdx.x * blockDim.x) * elements_per_lane; - for (size_t j = 0; j < elements_per_lane; ++j) - a[gidx + j] = b[gidx + j] + scalar * c[gidx + j]; + const size_t i = threadIdx.x + blockIdx.x * blockDim.x; + a[i] = b[i] + scalar * c[i]; + // const size_t gidx = (threadIdx.x + blockIdx.x * blockDim.x) * elements_per_lane; + // for (size_t j = 0; j < elements_per_lane; ++j) + // a[gidx + j] = b[gidx + j] + scalar * c[gidx + j]; } template void HIPStream::triad() { - triad_kernel<<>>(d_a, d_b, d_c); + triad_kernel<<>>(d_a, d_b, d_c); check_error(); hipDeviceSynchronize(); check_error(); @@ -220,7 +228,7 @@ void HIPStream::nstream() template __launch_bounds__(TBSIZE) -__global__ void dot_kernel(const T * __restrict a, const T * __restrict b, T * __restrict sum, int array_size) +__global__ void dot_kernel(const T * a, const T * b, T * sum, int array_size) { __shared__ T tb_sum[TBSIZE]; From 85d80915f60272b08bd41d5baae561d392003ed3 Mon Sep 17 00:00:00 2001 From: Thomas Gibson <14180421+thomasgibson@users.noreply.github.com> Date: Thu, 8 Sep 2022 11:44:37 -0500 Subject: [PATCH 6/7] Simplify/roll back unneeded modifications --- src/hip/HIPStream.cpp | 88 ++++++++++++++++++------------------------- src/hip/HIPStream.h | 32 ++++++---------- src/hip/model.cmake | 4 +- 3 files changed, 49 insertions(+), 75 deletions(-) diff --git a/src/hip/HIPStream.cpp b/src/hip/HIPStream.cpp index cc1d21f..7fc732d 100644 --- a/src/hip/HIPStream.cpp +++ b/src/hip/HIPStream.cpp @@ -23,17 +23,23 @@ void check_error(void) template HIPStream::HIPStream(const int ARRAY_SIZE, const int device_index) - : array_size{ARRAY_SIZE}, - block_count(array_size / (TBSIZE * elements_per_lane)) { - // The array size must be divisible by total number of elements - // moved per block for kernel launches - if (ARRAY_SIZE % (TBSIZE * elements_per_lane) != 0) + // The array size must be divisible by TBSIZE for kernel launches + if (ARRAY_SIZE % TBSIZE != 0) { std::stringstream ss; - ss << "Array size must be a multiple of elements operated on per block (" - << TBSIZE * elements_per_lane + ss << "Array size must be a multiple of " << TBSIZE; + throw std::runtime_error(ss.str()); + } + + // The array size must be divisible by total number of elements + // moved per block for the dot kernel + if (ARRAY_SIZE % (TBSIZE * dot_elements_per_lane) != 0) + { + std::stringstream ss; + ss << "Array size for the dot kernel must be a multiple of elements operated on per block (" + << TBSIZE * dot_elements_per_lane << ")."; throw std::runtime_error(ss.str()); } @@ -52,12 +58,13 @@ HIPStream::HIPStream(const int ARRAY_SIZE, const int device_index) std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl; array_size = ARRAY_SIZE; + dot_num_blocks = array_size / (TBSIZE * dot_elements_per_lane); // Allocate the host array for partial sums for dot kernels using hipHostMalloc. // This creates an array on the host which is visible to the device. However, it requires // synchronization (e.g. hipDeviceSynchronize) for the result to be available on the host // after it has been passed through to a kernel. - hipHostMalloc(&sums, sizeof(T) * block_count, hipHostMallocNonCoherent); + hipHostMalloc(&sums, sizeof(T) * dot_num_blocks, hipHostMallocNonCoherent); check_error(); // Check buffers fit on the device @@ -121,113 +128,90 @@ void HIPStream::read_arrays(std::vector& a, std::vector& b, std::vector check_error(); } -template -__launch_bounds__(TBSIZE) -__global__ -void copy_kernel(const T * a, T * c) +template +__global__ void copy_kernel(const T * a, T * c) { const size_t i = threadIdx.x + blockIdx.x * blockDim.x; c[i] = a[i]; - // const size_t gidx = (threadIdx.x + blockIdx.x * blockDim.x) * elements_per_lane; - // for (size_t j = 0; j < elements_per_lane; ++j) - // c[gidx + j] = a[gidx + j]; } template void HIPStream::copy() { - copy_kernel<<>>(d_a, d_c); + copy_kernel<<>>(d_a, d_c); check_error(); hipDeviceSynchronize(); check_error(); } -template -__launch_bounds__(TBSIZE) -__global__ -void mul_kernel(T * b, const T * c) +template +__global__ void mul_kernel(T * b, const T * c) { const T scalar = startScalar; const size_t i = threadIdx.x + blockIdx.x * blockDim.x; b[i] = scalar * c[i]; - // const size_t gidx = (threadIdx.x + blockIdx.x * blockDim.x) * elements_per_lane; - // for (size_t j = 0; j < elements_per_lane; ++j) - // b[gidx + j] = scalar * c[gidx + j]; } template void HIPStream::mul() { - mul_kernel<<>>(d_b, d_c); + mul_kernel<<>>(d_b, d_c); check_error(); hipDeviceSynchronize(); check_error(); } -template -__launch_bounds__(TBSIZE) -__global__ -void add_kernel(const T * a, const T * b, T * c) +template +__global__ void add_kernel(const T * a, const T * b, T * c) { const size_t i = threadIdx.x + blockIdx.x * blockDim.x; c[i] = a[i] + b[i]; - // const size_t gidx = (threadIdx.x + blockIdx.x * blockDim.x) * elements_per_lane; - // for (size_t j = 0; j < elements_per_lane; ++j) - // c[gidx + j] = a[gidx + j] + b[gidx + j]; } template void HIPStream::add() { - add_kernel<<>>(d_a, d_b, d_c); + add_kernel<<>>(d_a, d_b, d_c); check_error(); hipDeviceSynchronize(); check_error(); } -template -__launch_bounds__(TBSIZE) -__global__ -void triad_kernel(T * a, const T * b, const T * c) +template +__global__ void triad_kernel(T * a, const T * b, const T * c) { const T scalar = startScalar; const size_t i = threadIdx.x + blockIdx.x * blockDim.x; a[i] = b[i] + scalar * c[i]; - // const size_t gidx = (threadIdx.x + blockIdx.x * blockDim.x) * elements_per_lane; - // for (size_t j = 0; j < elements_per_lane; ++j) - // a[gidx + j] = b[gidx + j] + scalar * c[gidx + j]; } template void HIPStream::triad() { - triad_kernel<<>>(d_a, d_b, d_c); + triad_kernel<<>>(d_a, d_b, d_c); check_error(); hipDeviceSynchronize(); check_error(); } -template -__launch_bounds__(TBSIZE) -__global__ void nstream_kernel(T * __restrict a, const T * __restrict b, const T * __restrict c) +template +__global__ void nstream_kernel(T * a, const T * b, const T * c) { const T scalar = startScalar; - const size_t gidx = (threadIdx.x + blockIdx.x * blockDim.x) * elements_per_lane; - for (size_t j = 0; j < elements_per_lane; ++j) - a[gidx + j] += b[gidx + j] + scalar * c[gidx + j]; + const size_t i = threadIdx.x + blockIdx.x * blockDim.x; + a[i] += b[i] + scalar * c[i]; } template void HIPStream::nstream() { - nstream_kernel<<>>(d_a, d_b, d_c); + nstream_kernel<<>>(d_a, d_b, d_c); check_error(); hipDeviceSynchronize(); check_error(); } -template -__launch_bounds__(TBSIZE) +template __global__ void dot_kernel(const T * a, const T * b, T * sum, int array_size) { __shared__ T tb_sum[TBSIZE]; @@ -236,7 +220,7 @@ __global__ void dot_kernel(const T * a, const T * b, T * sum, int array_size) size_t i = blockDim.x * blockIdx.x + local_i; tb_sum[local_i] = 0.0; - for (size_t j = 0; j < elements_per_lane && i < array_size; ++j, i += blockDim.x*gridDim.x) + for (; i < array_size; i += blockDim.x*gridDim.x) tb_sum[local_i] += a[i] * b[i]; for (size_t offset = blockDim.x / 2; offset > 0; offset /= 2) @@ -255,13 +239,13 @@ __global__ void dot_kernel(const T * a, const T * b, T * sum, int array_size) template T HIPStream::dot() { - dot_kernel<<>>(d_a, d_b, sums, array_size); + dot_kernel<<>>(d_a, d_b, sums, array_size); check_error(); hipDeviceSynchronize(); check_error(); T sum = 0.0; - for (int i = 0; i < block_count; i++) + for (int i = 0; i < dot_num_blocks; i++) sum += sums[i]; return sum; diff --git a/src/hip/HIPStream.h b/src/hip/HIPStream.h index 305e937..3c603e0 100644 --- a/src/hip/HIPStream.h +++ b/src/hip/HIPStream.h @@ -14,39 +14,31 @@ #include "Stream.h" #define IMPLEMENTATION_STRING "HIP" +#define DOT_READ_DWORDS_PER_LANE 4 + template class HIPStream : public Stream { -#ifdef __HIP_PLATFORM_NVCC__ - #ifndef DWORDS_PER_LANE - #define DWORDS_PER_LANE 1 - #endif -#else - #ifndef DWORDS_PER_LANE - #define DWORDS_PER_LANE 4 - #endif -#endif // Make sure that either: - // DWORDS_PER_LANE is less than sizeof(T), in which case we default to 1 element + // DOT_READ_DWORDS_PER_LANE is less than sizeof(T), in which case we default to 1 element // or - // DWORDS_PER_LANE is divisible by sizeof(T) - static_assert((DWORDS_PER_LANE * sizeof(unsigned int) < sizeof(T)) || - (DWORDS_PER_LANE * sizeof(unsigned int) % sizeof(T) == 0), - "DWORDS_PER_LANE not divisible by sizeof(element_type)"); + // DOT_READ_DWORDS_PER_LANE is divisible by sizeof(T) + static_assert((DOT_READ_DWORDS_PER_LANE * sizeof(unsigned int) < sizeof(T)) || + (DOT_READ_DWORDS_PER_LANE * sizeof(unsigned int) % sizeof(T) == 0), + "DOT_READ_DWORDS_PER_LANE not divisible by sizeof(element_type)"); - static constexpr unsigned int dwords_per_lane{DWORDS_PER_LANE}; // Take into account the datatype size - // That is, if we specify 4 DWORDS_PER_LANE, this is 2 FP64 elements + // That is, for 4 DOT_READ_DWORDS_PER_LANE, this is 2 FP64 elements // and 4 FP32 elements - static constexpr unsigned int elements_per_lane{ - (DWORDS_PER_LANE * sizeof(unsigned int)) < sizeof(T) ? 1 : ( - DWORDS_PER_LANE * sizeof(unsigned int) / sizeof(T))}; + static constexpr unsigned int dot_elements_per_lane{ + (DOT_READ_DWORDS_PER_LANE * sizeof(unsigned int)) < sizeof(T) ? 1 : ( + DOT_READ_DWORDS_PER_LANE * sizeof(unsigned int) / sizeof(T))}; protected: // Size of arrays int array_size; - int block_count; + int dot_num_blocks; // Host array for partial sums for dot kernel T *sums; diff --git a/src/hip/model.cmake b/src/hip/model.cmake index 19e6fd0..78150c4 100644 --- a/src/hip/model.cmake +++ b/src/hip/model.cmake @@ -2,8 +2,6 @@ register_flag_required(CMAKE_CXX_COMPILER "Absolute path to the AMD HIP C++ compiler") -register_flag_optional(DWORDS_PER_LANE "Flag indicating the number of dwords to process per wavefront lane." 4) - macro(setup) - register_definitions(DWORDS_PER_LANE=${DWORDS_PER_LANE}) + # nothing to do here as hipcc does everything correctly, what a surprise! endmacro() \ No newline at end of file From 696ff6a8179b4fbb070070709c595f6e1a2f02a4 Mon Sep 17 00:00:00 2001 From: Thomas Gibson Date: Mon, 13 Mar 2023 10:47:37 -0500 Subject: [PATCH 7/7] Round up dot_num_blocks and remove extra check --- src/hip/HIPStream.cpp | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/src/hip/HIPStream.cpp b/src/hip/HIPStream.cpp index 7fc732d..0db8485 100644 --- a/src/hip/HIPStream.cpp +++ b/src/hip/HIPStream.cpp @@ -33,17 +33,6 @@ HIPStream::HIPStream(const int ARRAY_SIZE, const int device_index) throw std::runtime_error(ss.str()); } - // The array size must be divisible by total number of elements - // moved per block for the dot kernel - if (ARRAY_SIZE % (TBSIZE * dot_elements_per_lane) != 0) - { - std::stringstream ss; - ss << "Array size for the dot kernel must be a multiple of elements operated on per block (" - << TBSIZE * dot_elements_per_lane - << ")."; - throw std::runtime_error(ss.str()); - } - // Set device int count; hipGetDeviceCount(&count); @@ -58,7 +47,8 @@ HIPStream::HIPStream(const int ARRAY_SIZE, const int device_index) std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl; array_size = ARRAY_SIZE; - dot_num_blocks = array_size / (TBSIZE * dot_elements_per_lane); + // Round dot_num_blocks up to next multiple of (TBSIZE * dot_elements_per_lane) + dot_num_blocks = (array_size + (TBSIZE * dot_elements_per_lane - 1)) / (TBSIZE * dot_elements_per_lane); // Allocate the host array for partial sums for dot kernels using hipHostMalloc. // This creates an array on the host which is visible to the device. However, it requires