Address CUDA comments

Drop soft=false for AMDGPU, as this option was removed.
Update dependencies.
parent bb271dd046
commit c445b64690
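The AMDGPU change below removes the soft keyword from every AMDGPU.wait call, since AMDGPU.jl no longer accepts that option. A minimal before/after sketch of the launch-and-wait pattern, using the benchmark's own names (kernel, TBSize, data):

    # before: a hard wait was requested explicitly
    # AMDGPU.wait(
    #   soft = false, # soft wait causes HSA_REFCOUNT overflow issues
    #   @roc groupsize = TBSize gridsize = data.size kernel(data.a, data.c)
    # )

    # after: wait on the event returned by @roc with its default semantics
    AMDGPU.wait(
      @roc groupsize = TBSize gridsize = data.size kernel(data.a, data.c)
    )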
@@ -115,9 +115,9 @@ version = "4.2.0"
 
 [[LLVMExtra_jll]]
 deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
-git-tree-sha1 = "a9b1130c4728b0e462a1c28772954650039eb847"
+git-tree-sha1 = "2d5a0044d6505f4771b5c82de87393f0c9741154"
 uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab"
-version = "0.0.7+0"
+version = "0.0.8+0"
 
 [[LibCURL]]
 deps = ["LibCURL_jll", "MozillaCACerts_jll"]
@@ -40,9 +40,9 @@ version = "0.4.1"
 
 [[CUDA]]
 deps = ["AbstractFFTs", "Adapt", "BFloat16s", "CEnum", "CompilerSupportLibraries_jll", "ExprTools", "GPUArrays", "GPUCompiler", "LLVM", "LazyArtifacts", "Libdl", "LinearAlgebra", "Logging", "Printf", "Random", "Random123", "RandomNumbers", "Reexport", "Requires", "SparseArrays", "SpecialFunctions", "TimerOutputs"]
-git-tree-sha1 = "9303b20dfa74e4bcb4da425d351d551fbb5850be"
+git-tree-sha1 = "c583f3ccdce071b8a8bce9bf3d5d5409eaf36d2b"
 uuid = "052768ef-5323-5732-b1bb-66c8b64840ba"
-version = "3.4.0"
+version = "3.4.1"
 
 [[ChainRulesCore]]
 deps = ["Compat", "LinearAlgebra", "SparseArrays"]
@@ -122,9 +122,9 @@ version = "4.2.0"
 
 [[LLVMExtra_jll]]
 deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
-git-tree-sha1 = "a9b1130c4728b0e462a1c28772954650039eb847"
+git-tree-sha1 = "2d5a0044d6505f4771b5c82de87393f0c9741154"
 uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab"
-version = "0.0.7+0"
+version = "0.0.8+0"
 
 [[LazyArtifacts]]
 deps = ["Artifacts", "Pkg"]
@@ -185,9 +185,9 @@ version = "4.2.0"
 
 [[LLVMExtra_jll]]
 deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
-git-tree-sha1 = "a9b1130c4728b0e462a1c28772954650039eb847"
+git-tree-sha1 = "2d5a0044d6505f4771b5c82de87393f0c9741154"
 uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab"
-version = "0.0.7+0"
+version = "0.0.8+0"
 
 [[LazyArtifacts]]
 deps = ["Artifacts", "Pkg"]
@@ -104,9 +104,9 @@ version = "4.2.0"
 
 [[LLVMExtra_jll]]
 deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
-git-tree-sha1 = "a9b1130c4728b0e462a1c28772954650039eb847"
+git-tree-sha1 = "2d5a0044d6505f4771b5c82de87393f0c9741154"
 uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab"
-version = "0.0.7+0"
+version = "0.0.8+0"
 
 [[LibCURL]]
 deps = ["LibCURL_jll", "MozillaCACerts_jll"]
@@ -61,7 +61,6 @@ function copy!(data::ROCData{T}, _) where {T}
     return
   end
   AMDGPU.wait(
-    soft = false, # soft wait causes HSA_REFCOUNT overflow issues
     @roc groupsize = TBSize gridsize = data.size kernel(data.a, data.c)
   )
 end
@@ -73,7 +72,6 @@ function mul!(data::ROCData{T}, _) where {T}
     return
   end
   AMDGPU.wait(
-    soft = false, # soft wait causes HSA_REFCOUNT overflow issues
     @roc groupsize = TBSize gridsize = data.size kernel(data.b, data.c, data.scalar)
   )
 end
@@ -85,7 +83,6 @@ function add!(data::ROCData{T}, _) where {T}
     return
   end
   AMDGPU.wait(
-    soft = false, # soft wait causes HSA_REFCOUNT overflow issues
     @roc groupsize = TBSize gridsize = data.size kernel(data.a, data.b, data.c)
   )
 end
@@ -97,7 +94,6 @@ function triad!(data::ROCData{T}, _) where {T}
     return
   end
   AMDGPU.wait(
-    soft = false, # soft wait causes HSA_REFCOUNT overflow issues
     @roc groupsize = TBSize gridsize = data.size kernel(
       data.a,
       data.b,
@@ -114,7 +110,6 @@ function nstream!(data::ROCData{T}, _) where {T}
     return
   end
   AMDGPU.wait(
-    soft = false, # soft wait causes HSA_REFCOUNT overflow issues
     @roc groupsize = TBSize gridsize = data.size kernel(
       data.a,
       data.b,
@@ -155,7 +150,6 @@ function dot(data::ROCData{T}, _) where {T}
   end
   partial_sum = ROCArray{T}(undef, DotBlocks)
   AMDGPU.wait(
-    soft = false, # soft wait causes HSA_REFCOUNT overflow issues
     @roc groupsize = TBSize gridsize = TBSize * DotBlocks kernel(
       data.a,
       data.b,
@@ -21,7 +21,6 @@ function make_stream(
     error("arraysize ($(arraysize)) must be divisible by $(TBSize)!")
   end
 
-  # so CUDA's device is 0 indexed, so -1 from Julia
   CUDA.device!(device[1])
   selected = CUDA.device()
   # show_reason is set to true here so it dumps CUDA info
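The deleted comment referred to a -1 adjustment that the code does not actually perform, which is presumably why it was dropped. For reference, a short sketch of the CUDA.jl device-selection calls used above (the ordinal 0 is a hypothetical value for the first GPU):

    using CUDA

    CUDA.device!(0)           # CUDA numbers devices from 0; the ordinal is passed directly
    selected = CUDA.device()  # query the CuDevice that is now active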
@@ -46,14 +45,14 @@ function make_stream(
 end
 
 function init_arrays!(data::CuData{T}, _, init::Tuple{T,T,T}) where {T}
-  CUDA.fill!(data.a, init[1])
-  CUDA.fill!(data.b, init[2])
-  CUDA.fill!(data.c, init[3])
+  fill!(data.a, init[1])
+  fill!(data.b, init[2])
+  fill!(data.c, init[3])
 end
 
 function copy!(data::CuData{T}, _) where {T}
   function kernel(a, c)
-    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x # only blockIdx starts at 1
+    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
     @inbounds c[i] = a[i]
     return
   end
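Dropping the CUDA. qualifier on fill! works because Base.fill! dispatches to CUDA.jl's specialized method when given a CuArray; both spellings reach the same code. A runnable sketch with an arbitrary array:

    using CUDA

    a = CuArray{Float32}(undef, 1024)
    fill!(a, 0.1f0)  # dispatches to the CuArray method; no CUDA. prefix required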
@@ -63,7 +62,7 @@ end
 
 function mul!(data::CuData{T}, _) where {T}
   function kernel(b, c, scalar)
-    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x # only blockIdx starts at 1
+    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
     @inbounds b[i] = scalar * c[i]
     return
   end
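The trailing comments removed here were misleading: in CUDA.jl both blockIdx() and threadIdx() are 1-based (not only blockIdx), so subtracting 1 from blockIdx().x already yields a 1-based global index i. A host-side check of the formula with hypothetical launch dimensions:

    # with blockDim().x = 256: block 1 covers i = 1..256, block 2 starts at 257
    blockdim = 256
    for (blk, tid) in ((1, 1), (1, 256), (2, 1))
      println("block=", blk, " thread=", tid, " -> i=", (blk - 1) * blockdim + tid)
    end
    # prints i = 1, 256, 257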
@@ -73,7 +72,7 @@ end
 
 function add!(data::CuData{T}, _) where {T}
   function kernel(a, b, c)
-    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x # only blockIdx starts at 1
+    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
     @inbounds c[i] = a[i] + b[i]
     return
   end
@@ -83,7 +82,7 @@ end
 
 function triad!(data::CuData{T}, _) where {T}
   function kernel(a, b, c, scalar)
-    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x # only blockIdx starts at 1
+    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
     @inbounds a[i] = b[i] + (scalar * c[i])
     return
   end
@@ -98,7 +97,7 @@ end
 
 function nstream!(data::CuData{T}, _) where {T}
   function kernel(a, b, c, scalar)
-    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x # only blockIdx starts at 1
+    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
     @inbounds a[i] += b[i] + scalar * c[i]
     return
   end
@@ -119,7 +118,7 @@ function dot(data::CuData{T}, _) where {T}
     @inbounds tb_sum[local_i] = 0.0
 
     # do dot first
-    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x # only blockIdx starts at 1
+    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
     while i <= size
       @inbounds tb_sum[local_i] += a[i] * b[i]
       i += blockDim().x * gridDim().x
@@ -143,7 +142,6 @@ function dot(data::CuData{T}, _) where {T}
   end
   partial_sum = CuArray{T}(undef, DotBlocks)
   @cuda blocks = DotBlocks threads = TBSize kernel(data.a, data.b, data.size, partial_sum)
-  CUDA.synchronize()
   return sum(partial_sum)
 end
 
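Dropping the explicit CUDA.synchronize() before the host-side sum preserves behaviour: sum on a CuArray enqueues a device reduction on the same stream as the @cuda launch, and fetching the scalar result back to the host blocks until both have finished. A minimal sketch:

    using CUDA

    partial = CUDA.rand(Float32, 256)
    total = sum(partial)  # device reduction; copying the scalar back implicitly synchronizes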