using ROCKernels, CUDAKernels, KernelAbstractions, CUDA, AMDGPU

include("Stream.jl")
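
# Backend-specific aliases of StreamData (defined in Stream.jl), pinning the
# array container type for each GPU backend.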
const CuData = StreamData{T,CUDA.CuArray{T}} where {T}
const ROCData = StreamData{T,AMDGPU.ROCArray{T}} where {T}
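
# Launch parameters shared by all kernels: TBSize is the workgroup (thread
# block) size, and DotBlocks is the number of workgroups used by dot().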
const TBSize = 1024::Int
const DotBlocks = 256::Int

@enum Backend cuda rocm cpu

struct Context
  backend::Backend
  device::Device
end
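
# List the visible ROCm GPU agents as DeviceWithRepr tuples, sorted by repr so
# the ordering is deterministic; an empty list is returned on unsupported setups.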
function list_rocm_devices()::Vector{DeviceWithRepr}
  try
    # AMDGPU's agent enumeration order isn't stable, so sort by repr
    sorted = sort(AMDGPU.get_agents(:gpu), by = repr)
    map(x -> (x, repr(x), rocm), sorted)
  catch
    # probably unsupported
    DeviceWithRepr[]
  end
end
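
# List the visible CUDA devices, or return an empty list when no functional
# CUDA setup exists (functional(false) probes quietly, without reporting why).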
function list_cuda_devices()::Vector{DeviceWithRepr}
  return !CUDA.functional(false) ? DeviceWithRepr[] :
         map(d -> (d, "$(CUDA.name(d)) ($(repr(d)))", cuda), CUDA.devices())
end
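
# All candidate devices: the host CPU first (its device slot is unused, hence
# the undef placeholder), then any CUDA devices, then any ROCm devices.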
function devices()::Vector{DeviceWithRepr}
  cudas = list_cuda_devices()
  rocms = list_rocm_devices()
  cpus = [(undef, "$(Sys.cpu_info()[1].model) ($(Threads.nthreads())T)", cpu)]
  vcat(cpus, cudas, rocms)
end
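
# Allocate the a/b/c arrays for the chosen backend, select and verify the
# device, and return (data, Context). A minimal usage sketch, assuming Float32
# and the BabelStream default problem size of 2^25 elements:
#
#   dev = first(devices())                            # the CPU entry
#   data, ctx = make_stream(2^25, 0.4f0, dev, false)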
function make_stream(
  arraysize::Int,
  scalar::T,
  device::DeviceWithRepr,
  silent::Bool,
) where {T}

  if arraysize % TBSize != 0
    error("arraysize ($(arraysize)) must be divisible by $(TBSize)!")
  end

  (selected, _, backend) = device
  if backend == cpu
    if !silent
      println("Using CPU with max $(Threads.nthreads()) threads")
    end
    partialsum = Vector{T}(undef, DotBlocks)
    data = VectorData{T}(
      Vector{T}(undef, arraysize),
      Vector{T}(undef, arraysize),
      Vector{T}(undef, arraysize),
      scalar,
      arraysize,
    )
    backenddevice = CPU()
  elseif backend == cuda
    CUDA.device!(selected)
    if CUDA.device() != selected
      error("Cannot select CUDA device, expecting $selected, but got $(CUDA.device())")
    end
    if !CUDA.functional(true)
      error("Non-functional CUDA configuration")
    end
    if !silent
      println("Using CUDA device: $(CUDA.name(selected)) ($(repr(selected)))")
    end
    partialsum = CuArray{T}(undef, DotBlocks)
    data = CuData{T}(
      CuArray{T}(undef, arraysize),
      CuArray{T}(undef, arraysize),
      CuArray{T}(undef, arraysize),
      scalar,
      arraysize,
    )
    backenddevice = CUDADevice()
  elseif backend == rocm
    AMDGPU.DEFAULT_AGENT[] = selected
    if AMDGPU.get_default_agent() != selected
      error(
        "Cannot select HSA device, expecting $selected, but got $(AMDGPU.get_default_agent())",
      )
    end
    if !silent
      println("Using GPU HSA device: $(AMDGPU.get_name(selected)) ($(repr(selected)))")
    end
    partialsum = ROCArray{T}(undef, DotBlocks)
    data = ROCData{T}(
      ROCArray{T}(undef, arraysize),
      ROCArray{T}(undef, arraysize),
      ROCArray{T}(undef, arraysize),
      scalar,
      arraysize,
    )
    backenddevice = ROCDevice()
  else
    error("unsupported backend $(backend)")
  end

  if !silent
    println("Kernel parameters : <<<$(data.size),$(TBSize)>>>")
  end
  return (data, Context(backend, backenddevice))
end
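
# Fill a, b, and c with the given initial values: a threaded loop on the CPU,
# the backend's fill! on the GPUs.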
function init_arrays!(
  data::StreamData{T,C},
  context::Context,
  init::Tuple{T,T,T},
) where {T,C}
  if context.backend == cpu
    Threads.@threads for i = 1:data.size
      @inbounds data.a[i] = init[1]
      @inbounds data.b[i] = init[2]
      @inbounds data.c[i] = init[3]
    end
  elseif context.backend == cuda
    CUDA.fill!(data.a, init[1])
    CUDA.fill!(data.b, init[2])
    CUDA.fill!(data.c, init[3])
  elseif context.backend == rocm
    AMDGPU.fill!(data.a, init[1])
    AMDGPU.fill!(data.b, init[2])
    AMDGPU.fill!(data.c, init[3])
  else
    error("unsupported backend $(context.backend)")
  end
end
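
# BabelStream "copy" kernel: c[i] = a[i]. This and the kernels below are
# defined with KernelAbstractions' @kernel, launched with a workgroup size of
# TBSize over an ndrange of data.size, and wait() blocks on the returned event.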
function copy!(data::StreamData{T,C}, context::Context) where {T,C}
  @kernel function kernel(@Const(a::AbstractArray{T}), c)
    i = @index(Global)
    @inbounds c[i] = a[i]
  end
  wait(kernel(context.device, TBSize)(data.a, data.c, ndrange = data.size))
end
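
# BabelStream "mul" kernel: b[i] = scalar * c[i].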
function mul!(data::StreamData{T,C}, context::Context) where {T,C}
  @kernel function kernel(b::AbstractArray{T}, @Const(c::AbstractArray{T}), scalar::T)
    i = @index(Global)
    @inbounds b[i] = scalar * c[i]
  end
  wait(kernel(context.device, TBSize)(data.b, data.c, data.scalar, ndrange = data.size))
end
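
# BabelStream "add" kernel: c[i] = a[i] + b[i].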
function add!(data::StreamData{T,C}, context::Context) where {T,C}
  @kernel function kernel(@Const(a::AbstractArray{T}), @Const(b::AbstractArray{T}), c)
    i = @index(Global)
    @inbounds c[i] = a[i] + b[i]
  end
  wait(kernel(context.device, TBSize)(data.a, data.b, data.c, ndrange = data.size))
end
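
# BabelStream "triad" kernel: a[i] = b[i] + scalar * c[i].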
function triad!(data::StreamData{T,C}, context::Context) where {T,C}
  @kernel function kernel(a::AbstractArray{T}, @Const(b::AbstractArray{T}), @Const(c), scalar::T)
    i = @index(Global)
    @inbounds a[i] = b[i] + (scalar * c[i])
  end
  wait(
    kernel(context.device, TBSize)(
      data.a,
      data.b,
      data.c,
      data.scalar,
      ndrange = data.size,
    ),
  )
end
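
# BabelStream "nstream" kernel: a[i] += b[i] + scalar * c[i].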
function nstream!(data::StreamData{T,C}, context::Context) where {T,C}
  @kernel function kernel(a::AbstractArray{T}, @Const(b::AbstractArray{T}), @Const(c), scalar::T)
    i = @index(Global)
    @inbounds a[i] += b[i] + scalar * c[i]
  end
  wait(
    kernel(context.device, TBSize)(
      data.a,
      data.b,
      data.c,
      data.scalar,
      ndrange = data.size,
    ),
  )
end
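
# BabelStream "dot" kernel: sum(a .* b) via a two-stage reduction. Each of the
# TBSize * DotBlocks work-items accumulates a grid-strided partial product into
# workgroup-local memory, each workgroup then tree-reduces its block, and the
# host sums the DotBlocks per-group results.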
function dot(data::StreamData{T,C}, context::Context) where {T,C}
  @kernel function kernel(@Const(a::AbstractArray{T}), @Const(b::AbstractArray{T}), size::Int, partial::AbstractArray{T})
    local_i = @index(Local)
    group_i = @index(Group)
    tb_sum = @localmem T TBSize
    @inbounds tb_sum[local_i] = zero(T)

    # do dot first
    i = @index(Global)
    while i <= size
      @inbounds tb_sum[local_i] += a[i] * b[i]
      i += TBSize * DotBlocks
    end

    # then tree reduction
    # FIXME this does not compile when targeting CPUs:
    # see https://github.com/JuliaGPU/KernelAbstractions.jl/issues/262
    offset = @private Int64 (1,)
    @inbounds begin
      offset[1] = @groupsize()[1] ÷ 2
      while offset[1] > 0
        @synchronize
        if (local_i - 1) < offset[1]
          tb_sum[local_i] += tb_sum[local_i+offset[1]]
        end
        offset[1] ÷= 2
      end
    end

    if (local_i == 1)
      @inbounds partial[group_i] = tb_sum[local_i]
    end
  end

  if context.backend == cpu
    partial_sum = Vector{T}(undef, DotBlocks)
  elseif context.backend == cuda
    partial_sum = CuArray{T}(undef, DotBlocks)
  elseif context.backend == rocm
    partial_sum = ROCArray{T}(undef, DotBlocks)
  else
    error("unsupported backend $(context.backend)")
  end

  wait(
    kernel(context.device, TBSize)(
      data.a,
      data.b,
      data.size,
      partial_sum,
      ndrange = TBSize * DotBlocks,
    ),
  )

  return sum(partial_sum)
end
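
# Copy the arrays back into host Vectors (the VectorData constructor converts
# each backend array) so the driver can verify results.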
function read_data(data::StreamData{T,C}, _::Context)::VectorData{T} where {T,C}
  return VectorData{T}(data.a, data.b, data.c, data.scalar, data.size)
end
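
# main() comes from the included Stream.jl and drives the benchmark using the
# functions defined above.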
main()