BabelStream/JuliaStream.jl/src/CUDAStream.jl

include("Stream.jl")
using CUDA
const CuData = StreamData{T,CuArray{T}} where {T}
const TBSize = 1024::Int
const DotBlocks = 256::Int
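# Kernel launch geometry used throughout this file: every element-wise kernel
# runs with `blocks = size ÷ TBSize` and `threads = TBSize`, and each thread
# computes its 1-based global index as
#   i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
# so the array size must be a multiple of TBSize (checked in make_stream).
# DotBlocks is the fixed number of blocks used by the dot-product reduction.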
function devices()::Vector{DeviceWithRepr}
  return !CUDA.functional(false) ? DeviceWithRepr[] :
         map(d -> (d, "$(CUDA.name(d)) ($(repr(d)))", "CUDA.jl"), CUDA.devices())
end
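# Select the requested device, check that the CUDA setup is usable, and
# allocate the three device arrays (a, b, c) that the kernels operate on.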
function make_stream(
  arraysize::Int,
  scalar::T,
  device::DeviceWithRepr,
  silent::Bool,
)::Tuple{CuData{T},Nothing} where {T}
  if arraysize % TBSize != 0
    error("arraysize ($(arraysize)) must be divisible by $(TBSize)!")
  end
  CUDA.device!(device[1])
  selected = CUDA.device()
  # show_reason is set to true here so it dumps CUDA info
  # for us regardless of whether it's functional
  if !CUDA.functional(true)
    error("Non-functional CUDA configuration")
  end
  if !silent
    println("Using CUDA device: $(CUDA.name(selected)) ($(repr(selected)))")
    println("Kernel parameters: <<<$(arraysize ÷ TBSize),$(TBSize)>>>")
  end
  return (
    CuData{T}(
      CuArray{T}(undef, arraysize),
      CuArray{T}(undef, arraysize),
      CuArray{T}(undef, arraysize),
      scalar,
      arraysize,
    ),
    nothing,
  )
end
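# Fill the three device arrays with their initial values (runs on the GPU via fill!).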
function init_arrays!(data::CuData{T}, _, init::Tuple{T,T,T}) where {T}
  fill!(data.a, init[1])
  fill!(data.b, init[2])
  fill!(data.c, init[3])
end
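# copy kernel: c[i] = a[i], one element per thread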
function copy!(data::CuData{T}, _) where {T}
  function kernel(a::AbstractArray{T}, c::AbstractArray{T})
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    @inbounds c[i] = a[i]
    return
  end
  @cuda blocks = data.size ÷ TBSize threads = TBSize kernel(data.a, data.c)
  CUDA.synchronize()
end
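# mul kernel: b[i] = scalar * c[i]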
function mul!(data::CuData{T}, _) where {T}
  function kernel(b::AbstractArray{T}, c::AbstractArray{T}, scalar::T)
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    @inbounds b[i] = scalar * c[i]
    return
  end
  @cuda blocks = data.size ÷ TBSize threads = TBSize kernel(data.b, data.c, data.scalar)
  CUDA.synchronize()
end
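# add kernel: c[i] = a[i] + b[i]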
function add!(data::CuData{T}, _) where {T}
  function kernel(a::AbstractArray{T}, b::AbstractArray{T}, c::AbstractArray{T})
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    @inbounds c[i] = a[i] + b[i]
    return
  end
  @cuda blocks = data.size ÷ TBSize threads = TBSize kernel(data.a, data.b, data.c)
  CUDA.synchronize()
end
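# triad kernel: a[i] = b[i] + scalar * c[i]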
function triad!(data::CuData{T}, _) where {T}
  function kernel(a::AbstractArray{T}, b::AbstractArray{T}, c::AbstractArray{T}, scalar::T)
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    @inbounds a[i] = b[i] + (scalar * c[i])
    return
  end
  @cuda blocks = data.size ÷ TBSize threads = TBSize kernel(
    data.a,
    data.b,
    data.c,
    data.scalar,
  )
  CUDA.synchronize()
end
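# nstream kernel: a[i] += b[i] + scalar * c[i]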
function nstream!(data::CuData{T}, _) where {T}
  function kernel(a::AbstractArray{T}, b::AbstractArray{T}, c::AbstractArray{T}, scalar::T)
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    @inbounds a[i] += b[i] + scalar * c[i]
    return
  end
  @cuda blocks = data.size ÷ TBSize threads = TBSize kernel(
    data.a,
    data.b,
    data.c,
    data.scalar,
  )
  CUDA.synchronize()
end
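# dot kernel: sum(a[i] * b[i]). Each of the DotBlocks blocks accumulates into
# static shared memory with a grid-stride loop, reduces it with a binary tree,
# and writes one partial sum; the partials are then reduced by a final `sum`.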
function dot(data::CuData{T}, _) where {T}
  # direct port of the reduction in CUDAStream.cu
  function kernel(a::AbstractArray{T}, b::AbstractArray{T}, size::Int, partial::AbstractArray{T})
    tb_sum = @cuStaticSharedMem(T, TBSize)
    local_i = threadIdx().x
    @inbounds tb_sum[local_i] = zero(T)
    # do dot first
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    while i <= size
      @inbounds tb_sum[local_i] += a[i] * b[i]
      i += blockDim().x * gridDim().x
    end
    # then tree reduction
    offset = blockDim().x ÷ 2
    while offset > 0
      sync_threads()
      if (local_i - 1) < offset
        @inbounds tb_sum[local_i] += tb_sum[local_i+offset]
      end
      offset ÷= 2
    end
    if (local_i == 1)
      @inbounds partial[blockIdx().x] = tb_sum[local_i]
    end
    return
  end
  partial_sum = CuArray{T}(undef, DotBlocks)
  @cuda blocks = DotBlocks threads = TBSize kernel(data.a, data.b, data.size, partial_sum)
  return sum(partial_sum)
end
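# Return the results as host-side VectorData (defined in Stream.jl) for
# verification; constructing it from the CuArrays copies them off the device.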
function read_data(data::CuData{T}, _)::VectorData{T} where {T}
  return VectorData{T}(data.a, data.b, data.c, data.scalar, data.size)
end
main()