From bcf8708f2c294187390e69d9b825b2e7dc709001 Mon Sep 17 00:00:00 2001
From: Thomas Gibson <14180421+thomasgibson@users.noreply.github.com>
Date: Tue, 31 May 2022 11:29:42 -0500
Subject: [PATCH] Clean up kernels and drop unneeded modifications

---
 src/hip/HIPStream.cpp | 187 ++++++++++++------------------------
 src/hip/HIPStream.h   |   7 --
 src/hip/model.cmake   |  12 +--
 3 files changed, 53 insertions(+), 153 deletions(-)

diff --git a/src/hip/HIPStream.cpp b/src/hip/HIPStream.cpp
index dcf634e..eac77b4 100644
--- a/src/hip/HIPStream.cpp
+++ b/src/hip/HIPStream.cpp
@@ -10,31 +10,6 @@
 
 #define TBSIZE 1024
 
-#ifdef NONTEMPORAL
-template <typename T>
-__device__ __forceinline__ T load(const T& ref)
-{
-  return __builtin_nontemporal_load(&ref);
-}
-
-template <typename T>
-__device__ __forceinline__ void store(const T& value, T& ref)
-{
-  __builtin_nontemporal_store(value, &ref);
-}
-#else
-template <typename T>
-__device__ __forceinline__ T load(const T& ref)
-{
-  return ref;
-}
-
-template <typename T>
-__device__ __forceinline__ void store(const T& value, T& ref)
-{
-  ref = value;
-}
-#endif
 
 void check_error(void)
 {
@@ -49,27 +24,23 @@ void check_error(void)
 template <class T>
 HIPStream<T>::HIPStream(const int ARRAY_SIZE, const int device_index)
   : array_size{ARRAY_SIZE},
-    block_count(array_size / (TBSIZE * elements_per_lane * chunks_per_block))
+    block_count(array_size / (TBSIZE * elements_per_lane))
 {
   std::cerr << "Elements per lane: " << elements_per_lane << std::endl;
-  std::cerr << "Chunks per block: " << chunks_per_block << std::endl;
 
   // The array size must be divisible by total number of elements
   // moved per block for kernel launches
-  if (ARRAY_SIZE % (TBSIZE * elements_per_lane * chunks_per_block) != 0)
+  if (ARRAY_SIZE % (TBSIZE * elements_per_lane) != 0)
   {
     std::stringstream ss;
     ss << "Array size must be a multiple of elements operated on per block ("
-       << TBSIZE * elements_per_lane * chunks_per_block
+       << TBSIZE * elements_per_lane
        << ").";
     throw std::runtime_error(ss.str());
   }
   std::cerr << "block count " << block_count << std::endl;
-#ifdef NONTEMPORAL
-  std::cerr << "Using non-temporal memory operations."
-            << std::endl;
-#endif
-
   // Set device
   int count;
   hipGetDeviceCount(&count);
@@ -86,7 +57,8 @@ HIPStream<T>::HIPStream(const int ARRAY_SIZE, const int device_index)
   array_size = ARRAY_SIZE;
 
   // Allocate the host array for partial sums for dot kernels
-  sums = (T*)malloc(block_count*sizeof(T));
+  hipHostMalloc(&sums, sizeof(T) * block_count, hipHostMallocNonCoherent);
+  check_error();
 
   // Check buffers fit on the device
   hipDeviceProp_t props;
@@ -101,15 +73,14 @@ HIPStream<T>::HIPStream(const int ARRAY_SIZE, const int device_index)
   check_error();
   hipMalloc(&d_c, ARRAY_SIZE*sizeof(T));
   check_error();
-  hipMalloc(&d_sum, block_count*sizeof(T));
-  check_error();
 }
 
 template <class T>
 HIPStream<T>::~HIPStream()
 {
-  free(sums);
+  hipHostFree(sums);
+  check_error();
 
   hipFree(d_a);
   check_error();
@@ -117,15 +88,13 @@ HIPStream<T>::~HIPStream()
   check_error();
   hipFree(d_c);
   check_error();
-  hipFree(d_sum);
-  check_error();
 }
 
 template <typename T>
 __global__ void init_kernel(T * a, T * b, T * c, T initA, T initB, T initC)
 {
-  const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
+  const size_t i = blockDim.x * blockIdx.x + threadIdx.x;
   a[i] = initA;
   b[i] = initB;
   c[i] = initC;
@@ -152,26 +121,20 @@ void HIPStream<T>::read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c)
   check_error();
 }
 
-template <typename T, unsigned int elements_per_lane, unsigned int chunks_per_block>
+template <typename T, unsigned int elements_per_lane>
 __launch_bounds__(TBSIZE)
 __global__ void copy_kernel(const T * __restrict a, T * __restrict c)
 {
-  const size_t dx = (blockDim.x * gridDim.x) * elements_per_lane;
   const size_t gidx = (threadIdx.x + blockIdx.x * blockDim.x) * elements_per_lane;
-  for (size_t i = 0; i != chunks_per_block; ++i)
-  {
-    for (size_t j = 0; j != elements_per_lane; ++j)
-    {
-      store(load(a[gidx + i * dx + j]), c[gidx + i * dx + j]);
-    }
-  }
+  for (size_t j = 0; j < elements_per_lane; ++j)
+    c[gidx + j] = a[gidx + j];
 }
 
 template <class T>
 void HIPStream<T>::copy()
 {
-  hipLaunchKernelGGL(HIP_KERNEL_NAME(copy_kernel<T, elements_per_lane, chunks_per_block>),
+  hipLaunchKernelGGL(HIP_KERNEL_NAME(copy_kernel<T, elements_per_lane>),
                      dim3(block_count),
                      dim3(TBSIZE),
                      0, 0, d_a, d_c);
@@ -180,27 +143,21 @@ void HIPStream<T>::copy()
   check_error();
 }
 
-template <typename T, unsigned int elements_per_lane, unsigned int chunks_per_block>
+template <typename T, unsigned int elements_per_lane>
 __launch_bounds__(TBSIZE)
 __global__ void mul_kernel(T * __restrict b, const T * __restrict c)
 {
   const T scalar = startScalar;
-  const size_t dx = (blockDim.x * gridDim.x) * elements_per_lane;
   const size_t gidx = (threadIdx.x + blockIdx.x * blockDim.x) * elements_per_lane;
-  for (size_t i = 0; i != chunks_per_block; ++i)
-  {
-    for (size_t j = 0; j != elements_per_lane; ++j)
-    {
-      store(scalar * load(c[gidx + i * dx + j]), b[gidx + i * dx + j]);
-    }
-  }
+  for (size_t j = 0; j < elements_per_lane; ++j)
+    b[gidx + j] = scalar * c[gidx + j];
 }
 
 template <class T>
 void HIPStream<T>::mul()
 {
-  hipLaunchKernelGGL(HIP_KERNEL_NAME(mul_kernel<T, elements_per_lane, chunks_per_block>),
+  hipLaunchKernelGGL(HIP_KERNEL_NAME(mul_kernel<T, elements_per_lane>),
                      dim3(block_count),
                      dim3(TBSIZE),
                      0, 0, d_b, d_c);
@@ -209,26 +166,20 @@ void HIPStream<T>::mul()
   check_error();
 }
 
-template <typename T, unsigned int elements_per_lane, unsigned int chunks_per_block>
+template <typename T, unsigned int elements_per_lane>
 __launch_bounds__(TBSIZE)
 __global__ void add_kernel(const T * __restrict a, const T * __restrict b, T * __restrict c)
 {
-  const size_t dx = (blockDim.x * gridDim.x) * elements_per_lane;
   const size_t gidx = (threadIdx.x + blockIdx.x * blockDim.x) * elements_per_lane;
-  for (size_t i = 0; i != chunks_per_block; ++i)
-  {
-    for (size_t j = 0; j != elements_per_lane; ++j)
-    {
-      store(load(a[gidx + i * dx + j]) + load(b[gidx + i * dx + j]), c[gidx + i * dx + j]);
-    }
-  }
+  for (size_t j = 0; j < elements_per_lane; ++j)
+    c[gidx + j] = a[gidx + j] + b[gidx + j];
 }
 
 template <class T>
 void HIPStream<T>::add()
 {
-  hipLaunchKernelGGL(HIP_KERNEL_NAME(add_kernel<T, elements_per_lane, chunks_per_block>),
+  hipLaunchKernelGGL(HIP_KERNEL_NAME(add_kernel<T, elements_per_lane>),
                      dim3(block_count),
                      dim3(TBSIZE),
                      0, 0, d_a, d_b, d_c);
@@ -237,27 +188,21 @@ void HIPStream<T>::add()
   check_error();
 }
 
-template <typename T, unsigned int elements_per_lane, unsigned int chunks_per_block>
+template <typename T, unsigned int elements_per_lane>
 __launch_bounds__(TBSIZE)
 __global__ void triad_kernel(T * __restrict a, const T * __restrict b, const T * __restrict c)
 {
   const T scalar = startScalar;
-  const size_t dx = (blockDim.x * gridDim.x) * elements_per_lane;
   const size_t gidx = (threadIdx.x + blockIdx.x * blockDim.x) * elements_per_lane;
-  for (size_t i = 0; i != chunks_per_block; ++i)
-  {
-    for (size_t j = 0; j != elements_per_lane; ++j)
-    {
-      store(load(b[gidx + i * dx + j]) + scalar * load(c[gidx + i * dx + j]), a[gidx + i * dx + j]);
-    }
-  }
+  for (size_t j = 0; j < elements_per_lane; ++j)
+    a[gidx + j] = b[gidx + j] + scalar * c[gidx + j];
 }
 
 template <class T>
 void HIPStream<T>::triad()
 {
-  hipLaunchKernelGGL(HIP_KERNEL_NAME(triad_kernel<T, elements_per_lane, chunks_per_block>),
+  hipLaunchKernelGGL(HIP_KERNEL_NAME(triad_kernel<T, elements_per_lane>),
                      dim3(block_count),
                      dim3(TBSIZE),
                      0, 0, d_a, d_b, d_c);
@@ -266,91 +211,63 @@ void HIPStream<T>::triad()
   check_error();
   hipDeviceSynchronize();
   check_error();
 }
 
-template <typename T>
-__global__ void nstream_kernel(T * a, const T * b, const T * c)
+template <typename T, unsigned int elements_per_lane>
+__launch_bounds__(TBSIZE)
+__global__ void nstream_kernel(T * __restrict a, const T * __restrict b, const T * __restrict c)
 {
   const T scalar = startScalar;
-  const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
-  a[i] += b[i] + scalar * c[i];
+  const size_t gidx = (threadIdx.x + blockIdx.x * blockDim.x) * elements_per_lane;
+  for (size_t j = 0; j < elements_per_lane; ++j)
+    a[gidx + j] += b[gidx + j] + scalar * c[gidx + j];
 }
 
 template <class T>
 void HIPStream<T>::nstream()
 {
-  hipLaunchKernelGGL(HIP_KERNEL_NAME(nstream_kernel<T>), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_a, d_b, d_c);
+  hipLaunchKernelGGL(HIP_KERNEL_NAME(nstream_kernel<T, elements_per_lane>),
+                     dim3(block_count),
+                     dim3(TBSIZE),
+                     0, 0, d_a, d_b, d_c);
   check_error();
   hipDeviceSynchronize();
   check_error();
 }
 
-template <unsigned int n = TBSIZE>
-struct Reducer
-{
-  template <typename I>
-  __device__
-  static
-  void reduce(I it) noexcept
-  {
-    if (n == 1) return;
-
-#if defined(__HIP_PLATFORM_NVCC__)
-    constexpr unsigned int warpSize = 32;
-#endif
-    constexpr bool is_same_warp{n <= warpSize * 2};
-    if (static_cast<int>(threadIdx.x) < n/2)
-    {
-      it[threadIdx.x] += it[threadIdx.x + n/2];
-    }
-    is_same_warp ? __threadfence_block() : __syncthreads();
-    Reducer<n/2>::reduce(it);
-  }
-};
-
-template<>
-struct Reducer<1u> {
-  template <typename I>
-  __device__
-  static
-  void reduce(I) noexcept
-  {}
-};
-
-template <typename T, unsigned int elements_per_lane, unsigned int chunks_per_block>
+template <typename T, unsigned int elements_per_lane>
 __launch_bounds__(TBSIZE)
-__global__
-__global__ void dot_kernel(const T * __restrict a, const T * __restrict b, T * __restrict sum)
+__global__ void dot_kernel(const T * __restrict a, const T * __restrict b, T * __restrict sum, int array_size)
 {
   __shared__ T tb_sum[TBSIZE];
-  const size_t tidx = threadIdx.x;
-  const size_t dx = (blockDim.x * gridDim.x) * elements_per_lane;
-  const size_t gidx = (tidx + blockIdx.x * blockDim.x) * elements_per_lane;
-  T tmp{0};
-  for (size_t i = 0; i != chunks_per_block; ++i)
+
+  const size_t local_i = threadIdx.x;
+  size_t i = blockDim.x * blockIdx.x + local_i;
+
+  tb_sum[local_i] = 0.0;
+  for (size_t j = 0; j < elements_per_lane && i < array_size; ++j, i += blockDim.x*gridDim.x)
+    tb_sum[local_i] += a[i] * b[i];
+
+  for (size_t offset = blockDim.x / 2; offset > 0; offset /= 2)
   {
-    for (size_t j = 0; j != elements_per_lane; ++j)
+    __syncthreads();
+    if (local_i < offset)
     {
-      tmp += load(a[gidx + i * dx + j]) * load(b[gidx + i * dx + j]);
+      tb_sum[local_i] += tb_sum[local_i+offset];
     }
   }
-  tb_sum[tidx] = tmp;
-  __syncthreads();
-  Reducer<>::reduce(tb_sum);
-  if (tidx) return;
-  store(tb_sum[0], sum[blockIdx.x]);
+
+  if (local_i == 0)
+    sum[blockIdx.x] = tb_sum[local_i];
 }
 
 template <class T>
 T HIPStream<T>::dot()
 {
-  hipLaunchKernelGGL(HIP_KERNEL_NAME(dot_kernel<T, elements_per_lane, chunks_per_block>),
+  hipLaunchKernelGGL(HIP_KERNEL_NAME(dot_kernel<T, elements_per_lane>),
                      dim3(block_count),
                      dim3(TBSIZE),
-                     0, 0, d_a, d_b, d_sum);
+                     0, 0, d_a, d_b, sums, array_size);
   check_error();
-
-  hipMemcpy(sums, d_sum, block_count*sizeof(T), hipMemcpyDeviceToHost);
+  hipDeviceSynchronize();
   check_error();
 
   T sum = 0.0;
diff --git a/src/hip/HIPStream.h b/src/hip/HIPStream.h
index ecdf929..7bce0b5 100644
--- a/src/hip/HIPStream.h
+++ b/src/hip/HIPStream.h
@@ -22,16 +22,10 @@ class HIPStream : public Stream<T>
 #ifndef DWORDS_PER_LANE
 #define DWORDS_PER_LANE 1
 #endif
-  #ifndef CHUNKS_PER_BLOCK
-  #define CHUNKS_PER_BLOCK 8
-  #endif
 #else
 #ifndef DWORDS_PER_LANE
 #define DWORDS_PER_LANE 4
 #endif
-  #ifndef CHUNKS_PER_BLOCK
-  #define CHUNKS_PER_BLOCK 1
-  #endif
 #endif
   // Make sure that either:
   // DWORDS_PER_LANE is less than sizeof(T), in which case we default to 1 element
@@ -41,7 +35,6 @@ class HIPStream : public Stream<T>
     (DWORDS_PER_LANE * sizeof(unsigned int) % sizeof(T) == 0),
     "DWORDS_PER_LANE not divisible by sizeof(element_type)");
 
-  static constexpr unsigned int chunks_per_block{CHUNKS_PER_BLOCK};
   static constexpr unsigned int dwords_per_lane{DWORDS_PER_LANE};
 
   // Take into account the datatype size
   // That is, if we specify 4 DWORDS_PER_LANE, this is 2 FP64 elements
diff --git a/src/hip/model.cmake b/src/hip/model.cmake
index 3ffaf7a..2f7d69e 100644
--- a/src/hip/model.cmake
+++ b/src/hip/model.cmake
@@ -2,19 +2,9 @@
 register_flag_required(CMAKE_CXX_COMPILER
     "Absolute path to the AMD HIP C++ compiler")
 
-register_flag_optional(USE_NONTEMPORAL_MEM
-    "Flag indicating to use non-temporal memory accesses to bypass cache."
-    "OFF")
-
-# TODO: Better flag descriptions
-register_flag_optional(DWORDS_PER_LANE "Flag indicating the number of double data types per wavefront lane." 4)
-register_flag_optional(CHUNKS_PER_BLOCK "Flag indicating the chunks per block." 1)
+register_flag_optional(DWORDS_PER_LANE "Flag indicating the number of dwords to process per wavefront lane." 4)
 
 macro(setup)
     # Ensure we set the proper preprocessor directives
-    if (USE_NONTEMPORAL_MEM)
-        add_definitions(-DNONTEMPORAL)
-    endif ()
     register_definitions(DWORDS_PER_LANE=${DWORDS_PER_LANE})
-    register_definitions(CHUNKS_PER_BLOCK=${CHUNKS_PER_BLOCK})
 endmacro()
\ No newline at end of file
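
Note (not part of the patch): with chunks_per_block gone, every thread moves exactly
elements_per_lane contiguous elements, so the constructor requires array_size to divide
evenly by TBSIZE * elements_per_lane. A minimal host-only C++ sketch of that arithmetic,
assuming T = double and the DWORDS_PER_LANE default of 4 from model.cmake (the array size
below is illustrative, not taken from the patch):

#include <cstdio>

int main()
{
  constexpr unsigned int TBSIZE = 1024;
  constexpr unsigned int dwords_per_lane = 4; // DWORDS_PER_LANE default
  // A dword is sizeof(unsigned int) == 4 bytes, so 4 dwords hold 2 doubles,
  // matching the "4 DWORDS_PER_LANE is 2 FP64 elements" comment in HIPStream.h.
  constexpr unsigned int elements_per_lane =
      dwords_per_lane * sizeof(unsigned int) / sizeof(double); // == 2
  const int array_size = 33554432; // 2^25, an illustrative size
  // Mirrors the constructor check: each block moves TBSIZE * elements_per_lane elements.
  if (array_size % (TBSIZE * elements_per_lane) != 0)
    return 1; // HIPStream would throw std::runtime_error here
  const int block_count = array_size / (TBSIZE * elements_per_lane);
  std::printf("elements_per_lane=%u block_count=%d\n", elements_per_lane, block_count);
  return 0;
}

For these values: 33554432 / (1024 * 2) = 16384 blocks.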
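Likewise, the hipMemcpy in dot() is gone because the per-block partial sums now land
directly in the hipHostMalloc'd non-coherent host buffer, and the hipDeviceSynchronize()
is what makes those device writes safe to read from the host. A sketch of how the rest
of dot() presumably consumes them (the tail falls outside the hunk above, so this is an
assumption, not the patch's code):

  T sum = 0.0;
  for (int i = 0; i < block_count; i++)
    sum += sums[i];
  return sum;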