From 2e957d3f604b2aa3109baada0f132f024dca906e Mon Sep 17 00:00:00 2001
From: Tom Lin
Date: Wed, 30 Jun 2021 19:20:37 +0100
Subject: [PATCH] Inline blocks in CUDAStream

---
 JuliaStream.jl/src/CUDAStream.jl | 26 ++++++++++++++++----------
 1 file changed, 16 insertions(+), 10 deletions(-)

diff --git a/JuliaStream.jl/src/CUDAStream.jl b/JuliaStream.jl/src/CUDAStream.jl
index c4d0510..dbf21d7 100644
--- a/JuliaStream.jl/src/CUDAStream.jl
+++ b/JuliaStream.jl/src/CUDAStream.jl
@@ -10,10 +10,6 @@ function devices()
   map(d -> "$(CUDA.name(d)) ($(repr(d)))", CUDA.devices())
 end
 
-function blocks(data::CuData{T})::Int where {T}
-  return data.size ÷ TBSize
-end
-
 function make_stream(
   arraysize::Int,
   scalar::T,
@@ -42,7 +38,7 @@ function make_stream(
   )
   if !silent
     println("Using CUDA device: $(CUDA.name(selected)) ($(repr(selected)))")
-    println("Kernel parameters: <<<$(blocks(data)),$(TBSize)>>>")
+    println("Kernel parameters: <<<$(data.size ÷ TBSize),$(TBSize)>>>")
   end
   return data
 end
@@ -59,7 +55,7 @@ function copy!(data::CuData{T}) where {T}
     @inbounds c[i] = a[i]
     return
   end
-  @cuda blocks = blocks(data) threads = TBSize kernel(data.a, data.c)
+  @cuda blocks = data.size ÷ TBSize threads = TBSize kernel(data.a, data.c)
   CUDA.synchronize()
 end
 
@@ -69,7 +65,7 @@ function mul!(data::CuData{T}) where {T}
     @inbounds b[i] = scalar * c[i]
     return
   end
-  @cuda blocks = blocks(data) threads = TBSize kernel(data.b, data.c, data.scalar)
+  @cuda blocks = data.size ÷ TBSize threads = TBSize kernel(data.b, data.c, data.scalar)
   CUDA.synchronize()
 end
 
@@ -79,7 +75,7 @@ function add!(data::CuData{T}) where {T}
     @inbounds c[i] = a[i] + b[i]
     return
   end
-  @cuda blocks = blocks(data) threads = TBSize kernel(data.a, data.b, data.c)
+  @cuda blocks = data.size ÷ TBSize threads = TBSize kernel(data.a, data.b, data.c)
   CUDA.synchronize()
 end
 
@@ -89,7 +85,12 @@ function triad!(data::CuData{T}) where {T}
     @inbounds a[i] = b[i] + (scalar * c[i])
     return
   end
-  @cuda blocks = blocks(data) threads = TBSize kernel(data.a, data.b, data.c, data.scalar)
+  @cuda blocks = data.size ÷ TBSize threads = TBSize kernel(
+    data.a,
+    data.b,
+    data.c,
+    data.scalar,
+  )
   CUDA.synchronize()
 end
 
@@ -99,7 +100,12 @@ function nstream!(data::CuData{T}) where {T}
     @inbounds a[i] += b[i] + scalar * c[i]
     return
   end
-  @cuda blocks = blocks(data) threads = TBSize kernel(data.a, data.b, data.c, data.scalar)
+  @cuda blocks = data.size ÷ TBSize threads = TBSize kernel(
+    data.a,
+    data.b,
+    data.c,
+    data.scalar,
+  )
   CUDA.synchronize()
 end
 
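
Note (not part of the patch): a minimal, self-contained sketch of the launch configuration this change inlines. The kernel name scale_kernel!, the value chosen for TBSize, and the array length are illustrative assumptions, and the sketch assumes the length is an exact multiple of TBSize so that len ÷ TBSize blocks cover every element, mirroring the data.size ÷ TBSize expression used in the diff above.

# Standalone sketch under the assumptions stated above; not the patched file.
using CUDA

const TBSize = 1024           # threads per block (illustrative value)

function scale_kernel!(b, c, scalar)
  # 1-based global index, as CUDA.jl's blockIdx/threadIdx are 1-based
  i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
  @inbounds b[i] = scalar * c[i]
  return
end

len = 33554432                # 2^25, an exact multiple of TBSize
b = CUDA.zeros(Float32, len)
c = CUDA.ones(Float32, len)

# Inline grid calculation, as in the patch: one thread per element,
# len ÷ TBSize blocks of TBSize threads each.
@cuda blocks = len ÷ TBSize threads = TBSize scale_kernel!(b, c, 0.4f0)
CUDA.synchronize()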