Inline blocks in CUDAStream

This commit is contained in:
Tom Lin 2021-06-30 19:20:37 +01:00
parent 418315543c
commit 2e957d3f60

View File

@@ -10,10 +10,6 @@ function devices()
map(d -> "$(CUDA.name(d)) ($(repr(d)))", CUDA.devices()) map(d -> "$(CUDA.name(d)) ($(repr(d)))", CUDA.devices())
end end
function blocks(data::CuData{T})::Int where {T}
return data.size ÷ TBSize
end
function make_stream( function make_stream(
arraysize::Int, arraysize::Int,
scalar::T, scalar::T,
@@ -42,7 +38,7 @@ function make_stream(
) )
if !silent if !silent
println("Using CUDA device: $(CUDA.name(selected)) ($(repr(selected)))") println("Using CUDA device: $(CUDA.name(selected)) ($(repr(selected)))")
println("Kernel parameters: <<<$(blocks(data)),$(TBSize)>>>") println("Kernel parameters: <<<$(data.size ÷ TBSize),$(TBSize)>>>")
end end
return data return data
end end
@@ -59,7 +55,7 @@ function copy!(data::CuData{T}) where {T}
@inbounds c[i] = a[i] @inbounds c[i] = a[i]
return return
end end
@cuda blocks = blocks(data) threads = TBSize kernel(data.a, data.c) @cuda blocks = data.size ÷ TBSize threads = TBSize kernel(data.a, data.c)
CUDA.synchronize() CUDA.synchronize()
end end
@@ -69,7 +65,7 @@ function mul!(data::CuData{T}) where {T}
@inbounds b[i] = scalar * c[i] @inbounds b[i] = scalar * c[i]
return return
end end
@cuda blocks = blocks(data) threads = TBSize kernel(data.b, data.c, data.scalar) @cuda blocks = data.size ÷ TBSize threads = TBSize kernel(data.b, data.c, data.scalar)
CUDA.synchronize() CUDA.synchronize()
end end
@@ -79,7 +75,7 @@ function add!(data::CuData{T}) where {T}
@inbounds c[i] = a[i] + b[i] @inbounds c[i] = a[i] + b[i]
return return
end end
@cuda blocks = blocks(data) threads = TBSize kernel(data.a, data.b, data.c) @cuda blocks = data.size ÷ TBSize threads = TBSize kernel(data.a, data.b, data.c)
CUDA.synchronize() CUDA.synchronize()
end end
@@ -89,7 +85,12 @@ function triad!(data::CuData{T}) where {T}
@inbounds a[i] = b[i] + (scalar * c[i]) @inbounds a[i] = b[i] + (scalar * c[i])
return return
end end
@cuda blocks = blocks(data) threads = TBSize kernel(data.a, data.b, data.c, data.scalar) @cuda blocks = data.size ÷ TBSize threads = TBSize kernel(
data.a,
data.b,
data.c,
data.scalar,
)
CUDA.synchronize() CUDA.synchronize()
end end
@@ -99,7 +100,12 @@ function nstream!(data::CuData{T}) where {T}
@inbounds a[i] += b[i] + scalar * c[i] @inbounds a[i] += b[i] + scalar * c[i]
return return
end end
@cuda blocks = blocks(data) threads = TBSize kernel(data.a, data.b, data.c, data.scalar) @cuda blocks = data.size ÷ TBSize threads = TBSize kernel(
data.a,
data.b,
data.c,
data.scalar,
)
CUDA.synchronize() CUDA.synchronize()
end end