Use @simd instead of @fastmath for CPU reduction

Tom Lin 2021-08-28 11:39:08 +01:00
parent 41f1767365
commit 78b52a496c


@@ -90,14 +90,14 @@ function dot(data::VectorData{T}, _) where {T}
     partial = Vector{T}(undef, Threads.nthreads())
     static_par_ranged(data.size, Threads.nthreads()) do group, startidx, endidx
         acc = zero(T)
-        @fastmath for i = startidx:endidx
+        @simd for i = startidx:endidx
            @inbounds acc += data.a[i] * data.b[i]
        end
        @inbounds partial[group] = acc
    end
    return sum(partial)
    # This doesn't do well on aarch64 because of the excessive Threads.threadid() ccall
-   # and inhibited vectorisation from the lack of @fastmath
+   # and inhibited vectorisation from the lack of @simd
    # partial = zeros(T, Threads.nthreads())
    # Threads.@threads for i = 1:data.size
    #     @inbounds partial[Threads.threadid()] += (data.a[i] * data.b[i])
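For context on the change: @fastmath relaxes floating-point semantics broadly (reassociation, approximations, assuming no NaNs/Infs), whereas @simd only asserts that loop iterations are independent and that the reduction may be reordered for vectorisation, which is the narrower guarantee this dot product needs. Below is a minimal, self-contained sketch of the same pattern, assuming a plain Threads.@threads chunking loop in place of the repository's static_par_ranged helper; simd_dot and the chunk arithmetic are illustrative names, not code from this repo.

# Sketch only: chunked, threaded dot product where each thread reduces its own
# contiguous index range with @simd and writes a single partial sum.
function simd_dot(a::Vector{T}, b::Vector{T}) where {T}
    nt = Threads.nthreads()
    partial = Vector{T}(undef, nt)
    len, rem = divrem(length(a), nt)
    Threads.@threads for t in 1:nt
        # Split 1:length(a) into nt nearly equal contiguous chunks.
        lo = (t - 1) * len + min(t - 1, rem) + 1
        hi = t * len + min(t, rem)
        acc = zero(T)
        @simd for i in lo:hi
            @inbounds acc += a[i] * b[i]
        end
        @inbounds partial[t] = acc   # one write per chunk, no data race
    end
    return sum(partial)
end

# Usage check (illustrative):
# x, y = rand(Float32, 1_000_000), rand(Float32, 1_000_000)
# simd_dot(x, y) ≈ sum(x .* y)

Writing each thread's result into its own slot of partial (rather than accumulating into partial[Threads.threadid()] inside the loop, as in the commented-out version) avoids the repeated threadid() lookups and keeps the inner loop a pure @simd reduction.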