diff --git a/CUDAStream.cu b/CUDAStream.cu
index 32aae49..b467d00 100644
--- a/CUDAStream.cu
+++ b/CUDAStream.cu
@@ -212,6 +212,33 @@ void CUDAStream<T>::triad()
   check_error();
 }
 
+// nstream kernel: a[i] += b[i] + scalar * c[i], with one element per thread.
+// The index is taken from a 1D launch (blockDim.x * blockIdx.x + threadIdx.x)
+// and is not bounds-checked, so the launch must supply exactly one thread per
+// array element.
+template <class T>
+__global__ void nstream_kernel(T * a, const T * b, const T * c)
+{
+  const T scalar = startScalar;
+  const int i = blockDim.x * blockIdx.x + threadIdx.x;
+  a[i] += b[i] + scalar * c[i];
+}
+
+// Launches nstream_kernel over d_a, d_b and d_c, then blocks until the device
+// has finished. check_error() is called after the launch and again after the
+// synchronisation.
+template <class T>
+void CUDAStream<T>::nstream()
+{
+  // NOTE(review): the launch configuration was missing from this patch; the
+  // grid/block sizes below assume array_size is a multiple of TBSIZE, as the
+  // unguarded kernel requires - confirm against the other launches in this file.
+  nstream_kernel<<<array_size/TBSIZE, TBSIZE>>>(d_a, d_b, d_c);
+  check_error();
+  cudaDeviceSynchronize();
+  check_error();
+}
+
 template <class T>
 __global__ void dot_kernel(const T * a, const T * b, T * sum, int array_size)
 {
diff --git a/CUDAStream.h b/CUDAStream.h
index df85802..83b8c66 100644
--- a/CUDAStream.h
+++ b/CUDAStream.h
@@ -50,6 +50,7 @@ class CUDAStream : public Stream<T>
     virtual void add() override;
     virtual void mul() override;
     virtual void triad() override;
+    virtual void nstream() override;
     virtual T dot() override;
 
     virtual void init_arrays(T initA, T initB, T initC) override;