diff --git a/src/hip/HIPStream.cpp b/src/hip/HIPStream.cpp
index 37fce3b..cc1d21f 100644
--- a/src/hip/HIPStream.cpp
+++ b/src/hip/HIPStream.cpp
@@ -124,17 +124,19 @@ void HIPStream<T>::read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector
 template <typename T>
 __launch_bounds__(TBSIZE)
 __global__
-void copy_kernel(const T * __restrict a, T * __restrict c)
+void copy_kernel(const T * a, T * c)
 {
-  const size_t gidx = (threadIdx.x + blockIdx.x * blockDim.x) * elements_per_lane;
-  for (size_t j = 0; j < elements_per_lane; ++j)
-    c[gidx + j] = a[gidx + j];
+  const size_t i = threadIdx.x + blockIdx.x * blockDim.x;
+  c[i] = a[i];
+  // const size_t gidx = (threadIdx.x + blockIdx.x * blockDim.x) * elements_per_lane;
+  // for (size_t j = 0; j < elements_per_lane; ++j)
+  //   c[gidx + j] = a[gidx + j];
 }
 
 template <class T>
 void HIPStream<T>::copy()
 {
-  copy_kernel<<>>(d_a, d_c);
+  copy_kernel<<>>(d_a, d_c);
   check_error();
   hipDeviceSynchronize();
   check_error();
@@ -143,18 +145,20 @@ void HIPStream<T>::copy()
 template <typename T>
 __launch_bounds__(TBSIZE)
 __global__
-void mul_kernel(T * __restrict b, const T * __restrict c)
+void mul_kernel(T * b, const T * c)
 {
   const T scalar = startScalar;
-  const size_t gidx = (threadIdx.x + blockIdx.x * blockDim.x) * elements_per_lane;
-  for (size_t j = 0; j < elements_per_lane; ++j)
-    b[gidx + j] = scalar * c[gidx + j];
+  const size_t i = threadIdx.x + blockIdx.x * blockDim.x;
+  b[i] = scalar * c[i];
+  // const size_t gidx = (threadIdx.x + blockIdx.x * blockDim.x) * elements_per_lane;
+  // for (size_t j = 0; j < elements_per_lane; ++j)
+  //   b[gidx + j] = scalar * c[gidx + j];
 }
 
 template <class T>
 void HIPStream<T>::mul()
 {
-  mul_kernel<<>>(d_b, d_c);
+  mul_kernel<<>>(d_b, d_c);
   check_error();
   hipDeviceSynchronize();
   check_error();
@@ -163,17 +167,19 @@ void HIPStream<T>::mul()
 template <typename T>
 __launch_bounds__(TBSIZE)
 __global__
-void add_kernel(const T * __restrict a, const T * __restrict b, T * __restrict c)
+void add_kernel(const T * a, const T * b, T * c)
 {
-  const size_t gidx = (threadIdx.x + blockIdx.x * blockDim.x) * elements_per_lane;
-  for (size_t j = 0; j < elements_per_lane; ++j)
-    c[gidx + j] = a[gidx + j] + b[gidx + j];
+  const size_t i = threadIdx.x + blockIdx.x * blockDim.x;
+  c[i] = a[i] + b[i];
+  // const size_t gidx = (threadIdx.x + blockIdx.x * blockDim.x) * elements_per_lane;
+  // for (size_t j = 0; j < elements_per_lane; ++j)
+  //   c[gidx + j] = a[gidx + j] + b[gidx + j];
 }
 
 template <class T>
 void HIPStream<T>::add()
 {
-  add_kernel<<>>(d_a, d_b, d_c);
+  add_kernel<<>>(d_a, d_b, d_c);
   check_error();
   hipDeviceSynchronize();
   check_error();
@@ -182,18 +188,20 @@ void HIPStream<T>::add()
 template <typename T>
 __launch_bounds__(TBSIZE)
 __global__
-void triad_kernel(T * __restrict a, const T * __restrict b, const T * __restrict c)
+void triad_kernel(T * a, const T * b, const T * c)
 {
   const T scalar = startScalar;
-  const size_t gidx = (threadIdx.x + blockIdx.x * blockDim.x) * elements_per_lane;
-  for (size_t j = 0; j < elements_per_lane; ++j)
-    a[gidx + j] = b[gidx + j] + scalar * c[gidx + j];
+  const size_t i = threadIdx.x + blockIdx.x * blockDim.x;
+  a[i] = b[i] + scalar * c[i];
+  // const size_t gidx = (threadIdx.x + blockIdx.x * blockDim.x) * elements_per_lane;
+  // for (size_t j = 0; j < elements_per_lane; ++j)
+  //   a[gidx + j] = b[gidx + j] + scalar * c[gidx + j];
 }
 
 template <class T>
 void HIPStream<T>::triad()
 {
-  triad_kernel<<>>(d_a, d_b, d_c);
+  triad_kernel<<>>(d_a, d_b, d_c);
   check_error();
   hipDeviceSynchronize();
   check_error();
@@ -220,7 +228,7 @@ void HIPStream<T>::nstream()
 
 template <typename T>
 __launch_bounds__(TBSIZE)
-__global__ void dot_kernel(const T * __restrict a, const T * __restrict b, T * __restrict sum, int array_size)
+__global__ void dot_kernel(const T * a, const T * b, T * sum, int array_size)
 {
   __shared__ T tb_sum[TBSIZE];
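Note on the launch configuration: the <<<grid, block>>> arguments are not visible in the hunks above, so the sketch below is an illustration rather than the patch's actual launch code. Because the rewritten kernels process one element per thread instead of elements_per_lane contiguous elements, the grid must cover array_size / TBSIZE blocks rather than array_size / (TBSIZE * elements_per_lane). The names TBSIZE, array_size, d_a, and d_c mirror the diff; the TBSIZE value, the launch_copy helper, and the grid expression are assumptions.

// Minimal, self-contained sketch of a one-element-per-thread copy kernel and
// a matching launch. Not taken from HIPStream.cpp; grid sizing is assumed.
#include <hip/hip_runtime.h>
#include <cstddef>

constexpr unsigned int TBSIZE = 1024;  // assumed thread-block size

template <typename T>
__launch_bounds__(TBSIZE)
__global__
void copy_kernel(const T * a, T * c)
{
  // One element per thread: no per-lane loop, so the grid alone must span
  // the whole array.
  const size_t i = threadIdx.x + blockIdx.x * blockDim.x;
  c[i] = a[i];
}

template <typename T>
void launch_copy(const T * d_a, T * d_c, std::size_t array_size)
{
  // Assumes array_size is an exact multiple of TBSIZE, since the kernel has
  // no bounds check.
  const unsigned int blocks = static_cast<unsigned int>(array_size / TBSIZE);
  copy_kernel<<<dim3(blocks), dim3(TBSIZE), 0, nullptr>>>(d_a, d_c);
  hipDeviceSynchronize();
}

A caller would instantiate this per iteration, e.g. launch_copy<double>(d_a, d_c, array_size), which is the role HIPStream<T>::copy() plays in the file above.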