diff --git a/JuliaStream.jl/AMDGPU/Manifest.toml b/JuliaStream.jl/AMDGPU/Manifest.toml
index 6e27f0a..5d1a8a7 100644
--- a/JuliaStream.jl/AMDGPU/Manifest.toml
+++ b/JuliaStream.jl/AMDGPU/Manifest.toml
@@ -115,9 +115,9 @@ version = "4.2.0"

 [[LLVMExtra_jll]]
 deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
-git-tree-sha1 = "a9b1130c4728b0e462a1c28772954650039eb847"
+git-tree-sha1 = "2d5a0044d6505f4771b5c82de87393f0c9741154"
 uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab"
-version = "0.0.7+0"
+version = "0.0.8+0"

 [[LibCURL]]
 deps = ["LibCURL_jll", "MozillaCACerts_jll"]
diff --git a/JuliaStream.jl/CUDA/Manifest.toml b/JuliaStream.jl/CUDA/Manifest.toml
index 7330228..af0acfc 100644
--- a/JuliaStream.jl/CUDA/Manifest.toml
+++ b/JuliaStream.jl/CUDA/Manifest.toml
@@ -40,9 +40,9 @@ version = "0.4.1"

 [[CUDA]]
 deps = ["AbstractFFTs", "Adapt", "BFloat16s", "CEnum", "CompilerSupportLibraries_jll", "ExprTools", "GPUArrays", "GPUCompiler", "LLVM", "LazyArtifacts", "Libdl", "LinearAlgebra", "Logging", "Printf", "Random", "Random123", "RandomNumbers", "Reexport", "Requires", "SparseArrays", "SpecialFunctions", "TimerOutputs"]
-git-tree-sha1 = "9303b20dfa74e4bcb4da425d351d551fbb5850be"
+git-tree-sha1 = "c583f3ccdce071b8a8bce9bf3d5d5409eaf36d2b"
 uuid = "052768ef-5323-5732-b1bb-66c8b64840ba"
-version = "3.4.0"
+version = "3.4.1"

 [[ChainRulesCore]]
 deps = ["Compat", "LinearAlgebra", "SparseArrays"]
@@ -122,9 +122,9 @@ version = "4.2.0"

 [[LLVMExtra_jll]]
 deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
-git-tree-sha1 = "a9b1130c4728b0e462a1c28772954650039eb847"
+git-tree-sha1 = "2d5a0044d6505f4771b5c82de87393f0c9741154"
 uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab"
-version = "0.0.7+0"
+version = "0.0.8+0"

 [[LazyArtifacts]]
 deps = ["Artifacts", "Pkg"]
diff --git a/JuliaStream.jl/KernelAbstractions/Manifest.toml b/JuliaStream.jl/KernelAbstractions/Manifest.toml
index 5c24cf5..25fd8d1 100644
--- a/JuliaStream.jl/KernelAbstractions/Manifest.toml
+++ b/JuliaStream.jl/KernelAbstractions/Manifest.toml
@@ -185,9 +185,9 @@ version = "4.2.0"

 [[LLVMExtra_jll]]
 deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
-git-tree-sha1 = "a9b1130c4728b0e462a1c28772954650039eb847"
+git-tree-sha1 = "2d5a0044d6505f4771b5c82de87393f0c9741154"
 uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab"
-version = "0.0.7+0"
+version = "0.0.8+0"

 [[LazyArtifacts]]
 deps = ["Artifacts", "Pkg"]
diff --git a/JuliaStream.jl/oneAPI/Manifest.toml b/JuliaStream.jl/oneAPI/Manifest.toml
index ca932aa..3aab94b 100644
--- a/JuliaStream.jl/oneAPI/Manifest.toml
+++ b/JuliaStream.jl/oneAPI/Manifest.toml
@@ -104,9 +104,9 @@ version = "4.2.0"

 [[LLVMExtra_jll]]
 deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
-git-tree-sha1 = "a9b1130c4728b0e462a1c28772954650039eb847"
+git-tree-sha1 = "2d5a0044d6505f4771b5c82de87393f0c9741154"
 uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab"
-version = "0.0.7+0"
+version = "0.0.8+0"

 [[LibCURL]]
 deps = ["LibCURL_jll", "MozillaCACerts_jll"]
diff --git a/JuliaStream.jl/src/AMDGPUStream.jl b/JuliaStream.jl/src/AMDGPUStream.jl
index 3ed9748..8347637 100644
--- a/JuliaStream.jl/src/AMDGPUStream.jl
+++ b/JuliaStream.jl/src/AMDGPUStream.jl
@@ -61,7 +61,6 @@ function copy!(data::ROCData{T}, _) where {T}
     return
   end
   AMDGPU.wait(
-    soft = false, # soft wait causes HSA_REFCOUNT overflow issues
     @roc groupsize = TBSize gridsize = data.size kernel(data.a, data.c)
   )
 end
@@ -73,7 +72,6 @@ function mul!(data::ROCData{T}, _) where {T}
     return
   end
   AMDGPU.wait(
-    soft = false, # soft wait causes HSA_REFCOUNT overflow issues
     @roc groupsize = TBSize gridsize = data.size kernel(data.b, data.c, data.scalar)
   )
 end
@@ -85,7 +83,6 @@ function add!(data::ROCData{T}, _) where {T}
     return
   end
   AMDGPU.wait(
-    soft = false, # soft wait causes HSA_REFCOUNT overflow issues
     @roc groupsize = TBSize gridsize = data.size kernel(data.a, data.b, data.c)
   )
 end
@@ -97,7 +94,6 @@ function triad!(data::ROCData{T}, _) where {T}
     return
   end
   AMDGPU.wait(
-    soft = false, # soft wait causes HSA_REFCOUNT overflow issues
     @roc groupsize = TBSize gridsize = data.size kernel(
       data.a,
       data.b,
@@ -114,7 +110,6 @@ function nstream!(data::ROCData{T}, _) where {T}
     return
   end
   AMDGPU.wait(
-    soft = false, # soft wait causes HSA_REFCOUNT overflow issues
     @roc groupsize = TBSize gridsize = data.size kernel(
       data.a,
       data.b,
@@ -155,7 +150,6 @@ function dot(data::ROCData{T}, _) where {T}
   end
   partial_sum = ROCArray{T}(undef, DotBlocks)
   AMDGPU.wait(
-    soft = false, # soft wait causes HSA_REFCOUNT overflow issues
     @roc groupsize = TBSize gridsize = TBSize * DotBlocks kernel(
       data.a,
       data.b,
diff --git a/JuliaStream.jl/src/CUDAStream.jl b/JuliaStream.jl/src/CUDAStream.jl
index dd4fc44..b46b3c9 100644
--- a/JuliaStream.jl/src/CUDAStream.jl
+++ b/JuliaStream.jl/src/CUDAStream.jl
@@ -21,7 +21,6 @@ function make_stream(
     error("arraysize ($(arraysize)) must be divisible by $(TBSize)!")
   end

-  # so CUDA's device is 0 indexed, so -1 from Julia
   CUDA.device!(device[1])
   selected = CUDA.device()
   # show_reason is set to true here so it dumps CUDA info
@@ -46,14 +45,14 @@ function make_stream(
 end

 function init_arrays!(data::CuData{T}, _, init::Tuple{T,T,T}) where {T}
-  CUDA.fill!(data.a, init[1])
-  CUDA.fill!(data.b, init[2])
-  CUDA.fill!(data.c, init[3])
+  fill!(data.a, init[1])
+  fill!(data.b, init[2])
+  fill!(data.c, init[3])
 end

 function copy!(data::CuData{T}, _) where {T}
   function kernel(a, c)
-    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x # only blockIdx starts at 1
+    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
     @inbounds c[i] = a[i]
     return
   end
@@ -63,7 +62,7 @@ end

 function mul!(data::CuData{T}, _) where {T}
   function kernel(b, c, scalar)
-    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x # only blockIdx starts at 1
+    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
     @inbounds b[i] = scalar * c[i]
     return
   end
@@ -73,7 +72,7 @@ end

 function add!(data::CuData{T}, _) where {T}
   function kernel(a, b, c)
-    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x # only blockIdx starts at 1
+    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
     @inbounds c[i] = a[i] + b[i]
     return
   end
@@ -83,7 +82,7 @@ end

 function triad!(data::CuData{T}, _) where {T}
   function kernel(a, b, c, scalar)
-    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x # only blockIdx starts at 1
+    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
     @inbounds a[i] = b[i] + (scalar * c[i])
     return
   end
@@ -98,7 +97,7 @@ end

 function nstream!(data::CuData{T}, _) where {T}
   function kernel(a, b, c, scalar)
-    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x # only blockIdx starts at 1
+    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
     @inbounds a[i] += b[i] + scalar * c[i]
     return
   end
@@ -119,7 +118,7 @@ function dot(data::CuData{T}, _) where {T}
     @inbounds tb_sum[local_i] = 0.0

     # do dot first
-    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x # only blockIdx starts at 1
+    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
     while i <= size
       @inbounds tb_sum[local_i] += a[i] * b[i]
       i += blockDim().x * gridDim().x
@@ -143,7 +142,6 @@ function dot(data::CuData{T}, _) where {T}
   end
   partial_sum = CuArray{T}(undef, DotBlocks)
   @cuda blocks = DotBlocks threads = TBSize kernel(data.a, data.b, data.size, partial_sum)
-  CUDA.synchronize()
   return sum(partial_sum)
 end
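
On the AMDGPUStream.jl side, every AMDGPU.wait call loses its soft = false keyword, along with the comment blaming soft waits for HSA_REFCOUNT overflow issues, presumably because the AMDGPU.jl version pinned by the updated Manifests no longer needs the workaround. The launch-and-wait shape itself is unchanged; after this patch each kernel runs as:

  # Pattern used throughout AMDGPUStream.jl after this change:
  AMDGPU.wait(
    @roc groupsize = TBSize gridsize = data.size kernel(data.a, data.c)
  )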
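
A note on the repeated index expression in CUDAStream.jl: in CUDA.jl both threadIdx() and blockIdx() are 1-based (unlike CUDA C, where both are 0-based), so subtracting 1 from the block index alone yields a 1-based global index that matches Julia array indexing; the deleted "only blockIdx starts at 1" comments were misleading, since both intrinsics start at 1. A minimal standalone sketch of the same pattern, assuming CUDA.jl is installed (scale_kernel! and x are illustrative names, not part of this change):

  using CUDA

  # Scale each element of x by s, one thread per element.
  function scale_kernel!(x, s)
    # threadIdx() and blockIdx() are both 1-based in CUDA.jl, so this
    # yields a 1-based global index in 1:(gridDim().x * blockDim().x).
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    if i <= length(x)
      @inbounds x[i] *= s
    end
    return
  end

  x = CUDA.fill(1.0f0, 1024)
  @cuda threads = 256 blocks = cld(length(x), 256) scale_kernel!(x, 2.0f0)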
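
The CUDA.fill! -> fill! change in init_arrays! works because CUDA.jl provides a CuArray method for Base.fill!, so the unqualified call dispatches to the GPU implementation and the module qualifier was redundant. A hedged two-line sketch of that dispatch:

  using CUDA

  a = CuArray{Float32}(undef, 16)
  fill!(a, 3.0f0)  # dispatches to the CuArray method of Base.fill!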
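
Finally, dropping CUDA.synchronize() before return sum(partial_sum) should be safe because the sum of a CuArray is ordered after the @cuda launch on the same task-local stream and returns a host scalar, which cannot be produced until the kernel has finished; the explicit synchronization was therefore redundant. An illustrative sketch of that implicit ordering (the arrays here are made up for the example):

  using CUDA

  a = CUDA.rand(Float32, 1024)
  b = CUDA.rand(Float32, 1024)
  partial = a .* b      # asynchronous: queued on the task-local stream
  total = sum(partial)  # runs after the kernel above and copies a scalar
                        # back to the host, blocking until complete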