include("Stream.jl")
using CUDA

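# Device-side mirror of StreamData plus fixed launch parameters: TBSize threads
# per block for every kernel, and DotBlocks partial sums for the dot reduction.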
const CuData = StreamData{T,CuArray{T}} where {T}
const TBSize = 1024::Int
const DotBlocks = 256::Int

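# Enumerate CUDA devices as (device, description, implementation) triples,
# or an empty list when CUDA is not functional.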
function devices()::Vector{DeviceWithRepr}
  return !CUDA.functional(false) ? DeviceWithRepr[] :
         map(d -> (d, "$(CUDA.name(d)) ($(repr(d)))", "CUDA.jl"), CUDA.devices())
end

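# Select the requested device, verify the CUDA setup is usable, and allocate
# the three device buffers a, b, and c. arraysize must be a whole multiple of
# TBSize so every kernel can launch exactly arraysize ÷ TBSize full blocks.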
function make_stream(
  arraysize::Int,
  scalar::T,
  device::DeviceWithRepr,
  silent::Bool,
)::Tuple{CuData{T},Nothing} where {T}

  if arraysize % TBSize != 0
    error("arraysize ($(arraysize)) must be divisible by $(TBSize)!")
  end

  CUDA.device!(device[1])
  selected = CUDA.device()
  # show_reason is set to true here so it dumps CUDA info
  # for us regardless of whether it's functional
  if !CUDA.functional(true)
    error("Non-functional CUDA configuration")
  end
  if !silent
    println("Using CUDA device: $(CUDA.name(selected)) ($(repr(selected)))")
    println("Kernel parameters: <<<$(arraysize ÷ TBSize),$(TBSize)>>>")
  end
  return (
    CuData{T}(
      CuArray{T}(undef, arraysize),
      CuArray{T}(undef, arraysize),
      CuArray{T}(undef, arraysize),
      scalar,
      arraysize,
    ),
    nothing,
  )
end

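# Initialise a, b, and c with their starting values; fill! on a CuArray
# runs on the device.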
function init_arrays!(data::CuData{T}, _, init::Tuple{T,T,T}) where {T}
  fill!(data.a, init[1])
  fill!(data.b, init[2])
  fill!(data.c, init[3])
end

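# Copy kernel: c[i] = a[i]. The grid covers exactly data.size threads
# (guaranteed by the divisibility check in make_stream), so the kernel
# body needs no bounds check.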
function copy!(data::CuData{T}, _) where {T}
  function kernel(a::AbstractArray{T}, c::AbstractArray{T})
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    @inbounds c[i] = a[i]
    return
  end
  @cuda blocks = data.size ÷ TBSize threads = TBSize kernel(data.a, data.c)
  CUDA.synchronize()
end

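# Mul kernel: b[i] = scalar * c[i].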
function mul!(data::CuData{T}, _) where {T}
  function kernel(b::AbstractArray{T}, c::AbstractArray{T}, scalar::T)
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    @inbounds b[i] = scalar * c[i]
    return
  end
  @cuda blocks = data.size ÷ TBSize threads = TBSize kernel(data.b, data.c, data.scalar)
  CUDA.synchronize()
end

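# Add kernel: c[i] = a[i] + b[i].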
function add!(data::CuData{T}, _) where {T}
  function kernel(a::AbstractArray{T}, b::AbstractArray{T}, c::AbstractArray{T})
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    @inbounds c[i] = a[i] + b[i]
    return
  end
  @cuda blocks = data.size ÷ TBSize threads = TBSize kernel(data.a, data.b, data.c)
  CUDA.synchronize()
end

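# Triad kernel: a[i] = b[i] + scalar * c[i].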
function triad!(data::CuData{T}, _) where {T}
  function kernel(a::AbstractArray{T}, b::AbstractArray{T}, c::AbstractArray{T}, scalar::T)
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    @inbounds a[i] = b[i] + (scalar * c[i])
    return
  end
  @cuda blocks = data.size ÷ TBSize threads = TBSize kernel(
    data.a,
    data.b,
    data.c,
    data.scalar,
  )
  CUDA.synchronize()
end

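# Nstream kernel: a[i] += b[i] + scalar * c[i].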
function nstream!(data::CuData{T}, _) where {T}
  function kernel(a::AbstractArray{T}, b::AbstractArray{T}, c::AbstractArray{T}, scalar::T)
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    @inbounds a[i] += b[i] + scalar * c[i]
    return
  end
  @cuda blocks = data.size ÷ TBSize threads = TBSize kernel(
    data.a,
    data.b,
    data.c,
    data.scalar,
  )
  CUDA.synchronize()
end

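# Dot kernel: each of the DotBlocks blocks walks the arrays with a grid-stride
# loop, accumulating a[i] * b[i] into shared memory, then tree-reduces its
# block to a single partial sum; the partials are reduced at the end via sum().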
function dot(data::CuData{T}, _) where {T}
  # direct port of the reduction in CUDAStream.cu
  function kernel(a::AbstractArray{T}, b::AbstractArray{T}, size::Int, partial::AbstractArray{T})
    tb_sum = @cuStaticSharedMem(T, TBSize)
    local_i = threadIdx().x
    @inbounds tb_sum[local_i] = zero(T)

    # do dot first
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    while i <= size
      @inbounds tb_sum[local_i] += a[i] * b[i]
      i += blockDim().x * gridDim().x
    end

    # then tree reduction
    offset = blockDim().x ÷ 2
    while offset > 0
      sync_threads()
      if (local_i - 1) < offset
        @inbounds tb_sum[local_i] += tb_sum[local_i+offset]
      end
      offset ÷= 2
    end

    if (local_i == 1)
      @inbounds partial[blockIdx().x] = tb_sum[local_i]
    end

    return
  end
  partial_sum = CuArray{T}(undef, DotBlocks)
  @cuda blocks = DotBlocks threads = TBSize kernel(data.a, data.b, data.size, partial_sum)
  return sum(partial_sum)
end

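# Read the device buffers back for host-side verification; constructing
# VectorData converts each CuArray to a host Vector.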
function read_data(data::CuData{T}, _)::VectorData{T} where {T}
  return VectorData{T}(data.a, data.b, data.c, data.scalar, data.size)
end

main()