112 lines
3.0 KiB
Julia
112 lines
3.0 KiB
Julia
include("Stream.jl")
|
|
|
|
function devices()::Vector{DeviceWithRepr}
|
|
return [(undef, "$(Sys.cpu_info()[1].model) ($(Threads.nthreads())T)", "Threaded")]
|
|
end
|
|
|
|
function make_stream(
|
|
arraysize::Int,
|
|
scalar::T,
|
|
_::DeviceWithRepr,
|
|
silent::Bool,
|
|
)::Tuple{VectorData{T},Nothing} where {T}
|
|
if !silent
|
|
println("Using max $(Threads.nthreads()) threads")
|
|
end
|
|
return (
|
|
VectorData{T}(
|
|
Vector{T}(undef, arraysize),
|
|
Vector{T}(undef, arraysize),
|
|
Vector{T}(undef, arraysize),
|
|
scalar,
|
|
arraysize,
|
|
),
|
|
nothing
|
|
)
|
|
end
|
|
|
|
function init_arrays!(data::VectorData{T}, _, init::Tuple{T,T,T}) where {T}
|
|
Threads.@threads for i = 1:data.size
|
|
@inbounds data.a[i] = init[1]
|
|
@inbounds data.b[i] = init[2]
|
|
@inbounds data.c[i] = init[3]
|
|
end
|
|
end
|
|
|
|
function copy!(data::VectorData{T}, _) where {T}
|
|
Threads.@threads for i = 1:data.size
|
|
@inbounds data.c[i] = data.a[i]
|
|
end
|
|
end
|
|
|
|
function mul!(data::VectorData{T}, _) where {T}
|
|
Threads.@threads for i = 1:data.size
|
|
@inbounds data.b[i] = data.scalar * data.c[i]
|
|
end
|
|
end
|
|
|
|
function add!(data::VectorData{T}, _) where {T}
|
|
Threads.@threads for i = 1:data.size
|
|
@inbounds data.c[i] = data.a[i] + data.b[i]
|
|
end
|
|
end
|
|
|
|
function triad!(data::VectorData{T}, _) where {T}
|
|
Threads.@threads for i = 1:data.size
|
|
@inbounds data.a[i] = data.b[i] + (data.scalar * data.c[i])
|
|
end
|
|
end
|
|
|
|
function nstream!(data::VectorData{T}, _) where {T}
|
|
Threads.@threads for i = 1:data.size
|
|
@inbounds data.a[i] += data.b[i] + data.scalar * data.c[i]
|
|
end
|
|
end
|
|
|
|
# Threads.@threads/Threads.@spawn doesn't support OpenMP's firstprivate, etc
|
|
function static_par_ranged(f::Function, range::Int, n::Int)
|
|
stride = range ÷ n
|
|
rem = range % n
|
|
strides = map(0:n) do i
|
|
width = stride + (i < rem ? 1 : 0)
|
|
offset = i < rem ? (stride + 1) * i : ((stride + 1) * rem) + (stride * (i - rem))
|
|
(offset, width)
|
|
end
|
|
ccall(:jl_enter_threaded_region, Cvoid, ())
|
|
try
|
|
foreach(wait, map(1:n) do group
|
|
(offset, size) = strides[group]
|
|
task = Task(() -> f(group, offset+1, offset+size))
|
|
task.sticky = true
|
|
ccall(:jl_set_task_tid, Cvoid, (Any, Cint), task, group-1) # ccall, so 0-based for group
|
|
schedule(task)
|
|
end)
|
|
finally
|
|
ccall(:jl_exit_threaded_region, Cvoid, ())
|
|
end
|
|
end
|
|
|
|
function dot(data::VectorData{T}, _) where {T}
|
|
partial = Vector{T}(undef, Threads.nthreads())
|
|
static_par_ranged(data.size, Threads.nthreads()) do group, startidx, endidx
|
|
acc = zero(T)
|
|
@simd for i = startidx:endidx
|
|
@inbounds acc += data.a[i] * data.b[i]
|
|
end
|
|
@inbounds partial[group] = acc
|
|
end
|
|
return sum(partial)
|
|
# This doesn't do well on aarch64 because of the excessive Threads.threadid() ccall
|
|
# and inhibited vectorisation from the lack of @simd
|
|
# partial = zeros(T, Threads.nthreads())
|
|
# Threads.@threads for i = 1:data.size
|
|
# @inbounds partial[Threads.threadid()] += (data.a[i] * data.b[i])
|
|
# end
|
|
# return sum(partial)
|
|
end
|
|
|
|
function read_data(data::VectorData{T}, _)::VectorData{T} where {T}
|
|
return data
|
|
end
|
|
|
|
main() |