using ROCKernels, CUDAKernels, KernelAbstractions, CUDA, AMDGPU
include("Stream.jl")

const CuData = StreamData{T,CUDA.CuArray{T}} where {T}
const ROCData = StreamData{T,AMDGPU.ROCArray{T}} where {T}

const TBSize = 1024::Int
const DotBlocks = 256::Int

@enum Backend cuda rocm cpu

struct Context
  backend::Backend
  device::Device
end

function list_rocm_devices()::Vector{DeviceWithRepr}
  try
    # AMDGPU.agents()'s internal iteration order isn't stable
    sorted = sort(AMDGPU.get_agents(:gpu), by = repr)
    map(x -> (x, repr(x), rocm), sorted)
  catch
    # probably unsupported
    []
  end
end

function list_cuda_devices()::Vector{DeviceWithRepr}
  return !CUDA.functional(false) ? String[] :
         map(d -> (d, "$(CUDA.name(d)) ($(repr(d)))", cuda), CUDA.devices())
end

function devices()::Vector{DeviceWithRepr}
  cudas = list_cuda_devices()
  rocms = list_rocm_devices()
  cpus = [(undef, "$(Sys.cpu_info()[1].model) ($(Threads.nthreads())T)", cpu)]
  vcat(cpus, cudas, rocms)
end

function make_stream(
  arraysize::Int,
  scalar::T,
  device::DeviceWithRepr,
  silent::Bool,
) where {T}

  if arraysize % TBSize != 0
    error("arraysize ($(arraysize)) must be divisible by $(TBSize)!")
  end

  (selected, _, backend) = device
  if backend == cpu
    if !silent
      println("Using CPU with max $(Threads.nthreads()) threads")
    end
    partialsum = Vector{T}(undef, DotBlocks)
    data = VectorData{T}(
      Vector{T}(undef, arraysize),
      Vector{T}(undef, arraysize),
      Vector{T}(undef, arraysize),
      scalar,
      arraysize,
    )
    backenddevice = CPU()
  elseif backend == cuda
    CUDA.device!(selected)
    if CUDA.device() != selected
      error("Cannot select CUDA device, expecting $selected, but got $(CUDA.device())")
    end
    if !CUDA.functional(true)
      error("Non-functional CUDA configuration")
    end
    if !silent
      println("Using CUDA device: $(CUDA.name(selected)) ($(repr(selected)))")
    end
    partialsum = CuArray{T}(undef, DotBlocks)
    data = CuData{T}(
      CuArray{T}(undef, arraysize),
      CuArray{T}(undef, arraysize),
      CuArray{T}(undef, arraysize),
      scalar,
      arraysize,
    )
    backenddevice = CUDADevice()
  elseif backend == rocm
    AMDGPU.DEFAULT_AGENT[] = selected
    if AMDGPU.get_default_agent() != selected
      error(
        "Cannot select HSA device, expecting $selected, but got $(AMDGPU.get_default_agent())",
      )
    end
    if !silent
      println("Using GPU HSA device: $(AMDGPU.get_name(selected)) ($(repr(selected)))")
    end
    partialsum = ROCArray{T}(undef, DotBlocks)
    data = ROCData{T}(
      ROCArray{T}(undef, arraysize),
      ROCArray{T}(undef, arraysize),
      ROCArray{T}(undef, arraysize),
      scalar,
      arraysize,
    )
    backenddevice = ROCDevice()
  else
    error("unsupported backend $(backend)")
  end

  if !silent
    println("Kernel parameters   : <<<$(data.size),$(TBSize)>>>")
  end
  return (data, Context(backend, backenddevice))
end

function init_arrays!(
  data::StreamData{T,C},
  context::Context,
  init::Tuple{T,T,T},
) where {T,C}
  if context.backend == cpu
    Threads.@threads for i = 1:data.size
      @inbounds data.a[i] = init[1]
      @inbounds data.b[i] = init[2]
      @inbounds data.c[i] = init[3]
    end
  elseif context.backend == cuda
    CUDA.fill!(data.a, init[1])
    CUDA.fill!(data.b, init[2])
    CUDA.fill!(data.c, init[3])
  elseif context.backend == rocm
    AMDGPU.fill!(data.a, init[1])
    AMDGPU.fill!(data.b, init[2])
    AMDGPU.fill!(data.c, init[3])
  else
    error("unsupported backend $(backend)")
  end
end

function copy!(data::StreamData{T,C}, context::Context) where {T,C}
  @kernel function kernel(@Const(a::AbstractArray{T}), c)
    i = @index(Global)
    @inbounds c[i] = a[i]
  end
  wait(kernel(context.device, TBSize)(data.a, data.c, ndrange = data.size))
end

function mul!(data::StreamData{T,C}, context::Context) where {T,C}
  @kernel function kernel(b::AbstractArray{T}, @Const(c::AbstractArray{T}), scalar::T)
    i = @index(Global)
    @inbounds b[i] = scalar * c[i]
  end
  wait(kernel(context.device, TBSize)(data.b, data.c, data.scalar, ndrange = data.size))
end

function add!(data::StreamData{T,C}, context::Context) where {T,C}
  @kernel function kernel(@Const(a::AbstractArray{T}), @Const(b::AbstractArray{T}), c)
    i = @index(Global)
    @inbounds c[i] = a[i] + b[i]
  end
  wait(kernel(context.device, TBSize)(data.a, data.b, data.c, ndrange = data.size))
end

function triad!(data::StreamData{T,C}, context::Context) where {T,C}
  @kernel function kernel(a::AbstractArray{T}, @Const(b::AbstractArray{T}), @Const(c), scalar::T)
    i = @index(Global)
    @inbounds a[i] = b[i] + (scalar * c[i])
  end
  wait(
    kernel(context.device, TBSize)(
      data.a,
      data.b,
      data.c,
      data.scalar,
      ndrange = data.size,
    ),
  )
end

function nstream!(data::StreamData{T,C}, context::Context) where {T,C}
  @kernel function kernel(a::AbstractArray{T}, @Const(b::AbstractArray{T}), @Const(c), scalar::T)
    i = @index(Global)
    @inbounds a[i] += b[i] + scalar * c[i]
  end
  wait(
    kernel(context.device, TBSize)(
      data.a,
      data.b,
      data.c,
      data.scalar,
      ndrange = data.size,
    ),
  )
end

function dot(data::StreamData{T,C}, context::Context) where {T,C}
  @kernel function kernel(@Const(a::AbstractArray{T}), @Const(b::AbstractArray{T}), size::Int, partial::AbstractArray{T})
    local_i = @index(Local)
    group_i = @index(Group)
    tb_sum = @localmem T TBSize
    @inbounds tb_sum[local_i] = 0.0

    # do dot first
    i = @index(Global)
    while i <= size
      @inbounds tb_sum[local_i] += a[i] * b[i]
      i += TBSize * DotBlocks
    end

    # then tree reduction
    # FIXME this does not compile when targeting CPUs:
    # see https://github.com/JuliaGPU/KernelAbstractions.jl/issues/262
    offset = @private Int64 (1,)
    @inbounds begin
      offset[1] = @groupsize()[1] ÷ 2
      while offset[1] > 0
        @synchronize
        if (local_i - 1) < offset[1]
          tb_sum[local_i] += tb_sum[local_i+offset[1]]
        end
        offset[1] ÷= 2
      end
    end

    if (local_i == 1)
      @inbounds partial[group_i] = tb_sum[local_i]
    end
  end

  if context.backend == cpu
    partial_sum = Vector{T}(undef, DotBlocks)
  elseif context.backend == cuda
    partial_sum = CuArray{T}(undef, DotBlocks)
  elseif context.backend == rocm
    partial_sum = ROCArray{T}(undef, DotBlocks)
  else
    error("unsupported backend $(backend)")
  end

  wait(
    kernel(context.device, TBSize)(
      data.a,
      data.b,
      data.size,
      partial_sum,
      ndrange = TBSize * DotBlocks,
    ),
  )

  return sum(partial_sum)
end

function read_data(data::StreamData{T,C}, _::Context)::VectorData{T} where {T,C}
  return VectorData{T}(data.a, data.b, data.c, data.scalar, data.size)
end

main()