From 85d80915f60272b08bd41d5baae561d392003ed3 Mon Sep 17 00:00:00 2001 From: Thomas Gibson <14180421+thomasgibson@users.noreply.github.com> Date: Thu, 8 Sep 2022 11:44:37 -0500 Subject: [PATCH] Simplify/roll back unneeded modifications --- src/hip/HIPStream.cpp | 88 ++++++++++++++++++------------------------- src/hip/HIPStream.h | 32 ++++++---------- src/hip/model.cmake | 4 +- 3 files changed, 49 insertions(+), 75 deletions(-) diff --git a/src/hip/HIPStream.cpp b/src/hip/HIPStream.cpp index cc1d21f..7fc732d 100644 --- a/src/hip/HIPStream.cpp +++ b/src/hip/HIPStream.cpp @@ -23,17 +23,23 @@ void check_error(void) template HIPStream::HIPStream(const int ARRAY_SIZE, const int device_index) - : array_size{ARRAY_SIZE}, - block_count(array_size / (TBSIZE * elements_per_lane)) { - // The array size must be divisible by total number of elements - // moved per block for kernel launches - if (ARRAY_SIZE % (TBSIZE * elements_per_lane) != 0) + // The array size must be divisible by TBSIZE for kernel launches + if (ARRAY_SIZE % TBSIZE != 0) { std::stringstream ss; - ss << "Array size must be a multiple of elements operated on per block (" - << TBSIZE * elements_per_lane + ss << "Array size must be a multiple of " << TBSIZE; + throw std::runtime_error(ss.str()); + } + + // The array size must be divisible by total number of elements + // moved per block for the dot kernel + if (ARRAY_SIZE % (TBSIZE * dot_elements_per_lane) != 0) + { + std::stringstream ss; + ss << "Array size for the dot kernel must be a multiple of elements operated on per block (" + << TBSIZE * dot_elements_per_lane << ")."; throw std::runtime_error(ss.str()); } @@ -52,12 +58,13 @@ HIPStream::HIPStream(const int ARRAY_SIZE, const int device_index) std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl; array_size = ARRAY_SIZE; + dot_num_blocks = array_size / (TBSIZE * dot_elements_per_lane); // Allocate the host array for partial sums for dot kernels using hipHostMalloc. // This creates an array on the host which is visible to the device. However, it requires // synchronization (e.g. hipDeviceSynchronize) for the result to be available on the host // after it has been passed through to a kernel. - hipHostMalloc(&sums, sizeof(T) * block_count, hipHostMallocNonCoherent); + hipHostMalloc(&sums, sizeof(T) * dot_num_blocks, hipHostMallocNonCoherent); check_error(); // Check buffers fit on the device @@ -121,113 +128,90 @@ void HIPStream::read_arrays(std::vector& a, std::vector& b, std::vector check_error(); } -template -__launch_bounds__(TBSIZE) -__global__ -void copy_kernel(const T * a, T * c) +template +__global__ void copy_kernel(const T * a, T * c) { const size_t i = threadIdx.x + blockIdx.x * blockDim.x; c[i] = a[i]; - // const size_t gidx = (threadIdx.x + blockIdx.x * blockDim.x) * elements_per_lane; - // for (size_t j = 0; j < elements_per_lane; ++j) - // c[gidx + j] = a[gidx + j]; } template void HIPStream::copy() { - copy_kernel<<>>(d_a, d_c); + copy_kernel<<>>(d_a, d_c); check_error(); hipDeviceSynchronize(); check_error(); } -template -__launch_bounds__(TBSIZE) -__global__ -void mul_kernel(T * b, const T * c) +template +__global__ void mul_kernel(T * b, const T * c) { const T scalar = startScalar; const size_t i = threadIdx.x + blockIdx.x * blockDim.x; b[i] = scalar * c[i]; - // const size_t gidx = (threadIdx.x + blockIdx.x * blockDim.x) * elements_per_lane; - // for (size_t j = 0; j < elements_per_lane; ++j) - // b[gidx + j] = scalar * c[gidx + j]; } template void HIPStream::mul() { - mul_kernel<<>>(d_b, d_c); + mul_kernel<<>>(d_b, d_c); check_error(); hipDeviceSynchronize(); check_error(); } -template -__launch_bounds__(TBSIZE) -__global__ -void add_kernel(const T * a, const T * b, T * c) +template +__global__ void add_kernel(const T * a, const T * b, T * c) { const size_t i = threadIdx.x + blockIdx.x * blockDim.x; c[i] = a[i] + b[i]; - // const size_t gidx = (threadIdx.x + blockIdx.x * blockDim.x) * elements_per_lane; - // for (size_t j = 0; j < elements_per_lane; ++j) - // c[gidx + j] = a[gidx + j] + b[gidx + j]; } template void HIPStream::add() { - add_kernel<<>>(d_a, d_b, d_c); + add_kernel<<>>(d_a, d_b, d_c); check_error(); hipDeviceSynchronize(); check_error(); } -template -__launch_bounds__(TBSIZE) -__global__ -void triad_kernel(T * a, const T * b, const T * c) +template +__global__ void triad_kernel(T * a, const T * b, const T * c) { const T scalar = startScalar; const size_t i = threadIdx.x + blockIdx.x * blockDim.x; a[i] = b[i] + scalar * c[i]; - // const size_t gidx = (threadIdx.x + blockIdx.x * blockDim.x) * elements_per_lane; - // for (size_t j = 0; j < elements_per_lane; ++j) - // a[gidx + j] = b[gidx + j] + scalar * c[gidx + j]; } template void HIPStream::triad() { - triad_kernel<<>>(d_a, d_b, d_c); + triad_kernel<<>>(d_a, d_b, d_c); check_error(); hipDeviceSynchronize(); check_error(); } -template -__launch_bounds__(TBSIZE) -__global__ void nstream_kernel(T * __restrict a, const T * __restrict b, const T * __restrict c) +template +__global__ void nstream_kernel(T * a, const T * b, const T * c) { const T scalar = startScalar; - const size_t gidx = (threadIdx.x + blockIdx.x * blockDim.x) * elements_per_lane; - for (size_t j = 0; j < elements_per_lane; ++j) - a[gidx + j] += b[gidx + j] + scalar * c[gidx + j]; + const size_t i = threadIdx.x + blockIdx.x * blockDim.x; + a[i] += b[i] + scalar * c[i]; } template void HIPStream::nstream() { - nstream_kernel<<>>(d_a, d_b, d_c); + nstream_kernel<<>>(d_a, d_b, d_c); check_error(); hipDeviceSynchronize(); check_error(); } -template -__launch_bounds__(TBSIZE) +template __global__ void dot_kernel(const T * a, const T * b, T * sum, int array_size) { __shared__ T tb_sum[TBSIZE]; @@ -236,7 +220,7 @@ __global__ void dot_kernel(const T * a, const T * b, T * sum, int array_size) size_t i = blockDim.x * blockIdx.x + local_i; tb_sum[local_i] = 0.0; - for (size_t j = 0; j < elements_per_lane && i < array_size; ++j, i += blockDim.x*gridDim.x) + for (; i < array_size; i += blockDim.x*gridDim.x) tb_sum[local_i] += a[i] * b[i]; for (size_t offset = blockDim.x / 2; offset > 0; offset /= 2) @@ -255,13 +239,13 @@ __global__ void dot_kernel(const T * a, const T * b, T * sum, int array_size) template T HIPStream::dot() { - dot_kernel<<>>(d_a, d_b, sums, array_size); + dot_kernel<<>>(d_a, d_b, sums, array_size); check_error(); hipDeviceSynchronize(); check_error(); T sum = 0.0; - for (int i = 0; i < block_count; i++) + for (int i = 0; i < dot_num_blocks; i++) sum += sums[i]; return sum; diff --git a/src/hip/HIPStream.h b/src/hip/HIPStream.h index 305e937..3c603e0 100644 --- a/src/hip/HIPStream.h +++ b/src/hip/HIPStream.h @@ -14,39 +14,31 @@ #include "Stream.h" #define IMPLEMENTATION_STRING "HIP" +#define DOT_READ_DWORDS_PER_LANE 4 + template class HIPStream : public Stream { -#ifdef __HIP_PLATFORM_NVCC__ - #ifndef DWORDS_PER_LANE - #define DWORDS_PER_LANE 1 - #endif -#else - #ifndef DWORDS_PER_LANE - #define DWORDS_PER_LANE 4 - #endif -#endif // Make sure that either: - // DWORDS_PER_LANE is less than sizeof(T), in which case we default to 1 element + // DOT_READ_DWORDS_PER_LANE is less than sizeof(T), in which case we default to 1 element // or - // DWORDS_PER_LANE is divisible by sizeof(T) - static_assert((DWORDS_PER_LANE * sizeof(unsigned int) < sizeof(T)) || - (DWORDS_PER_LANE * sizeof(unsigned int) % sizeof(T) == 0), - "DWORDS_PER_LANE not divisible by sizeof(element_type)"); + // DOT_READ_DWORDS_PER_LANE is divisible by sizeof(T) + static_assert((DOT_READ_DWORDS_PER_LANE * sizeof(unsigned int) < sizeof(T)) || + (DOT_READ_DWORDS_PER_LANE * sizeof(unsigned int) % sizeof(T) == 0), + "DOT_READ_DWORDS_PER_LANE not divisible by sizeof(element_type)"); - static constexpr unsigned int dwords_per_lane{DWORDS_PER_LANE}; // Take into account the datatype size - // That is, if we specify 4 DWORDS_PER_LANE, this is 2 FP64 elements + // That is, for 4 DOT_READ_DWORDS_PER_LANE, this is 2 FP64 elements // and 4 FP32 elements - static constexpr unsigned int elements_per_lane{ - (DWORDS_PER_LANE * sizeof(unsigned int)) < sizeof(T) ? 1 : ( - DWORDS_PER_LANE * sizeof(unsigned int) / sizeof(T))}; + static constexpr unsigned int dot_elements_per_lane{ + (DOT_READ_DWORDS_PER_LANE * sizeof(unsigned int)) < sizeof(T) ? 1 : ( + DOT_READ_DWORDS_PER_LANE * sizeof(unsigned int) / sizeof(T))}; protected: // Size of arrays int array_size; - int block_count; + int dot_num_blocks; // Host array for partial sums for dot kernel T *sums; diff --git a/src/hip/model.cmake b/src/hip/model.cmake index 19e6fd0..78150c4 100644 --- a/src/hip/model.cmake +++ b/src/hip/model.cmake @@ -2,8 +2,6 @@ register_flag_required(CMAKE_CXX_COMPILER "Absolute path to the AMD HIP C++ compiler") -register_flag_optional(DWORDS_PER_LANE "Flag indicating the number of dwords to process per wavefront lane." 4) - macro(setup) - register_definitions(DWORDS_PER_LANE=${DWORDS_PER_LANE}) + # nothing to do here as hipcc does everything correctly, what a surprise! endmacro() \ No newline at end of file