Address CUDA comments

Drop soft=false for AMDGPU, as this option was removed.
Update dependencies.
parent bb271dd046
commit c445b64690
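The AMDGPU change below removes the soft keyword from every AMDGPU.wait call, since AMDGPU.jl no longer accepts that option. A minimal before/after sketch of the launch-and-wait pattern, using the benchmark's own names (kernel, TBSize, data):

    # before: a hard wait was requested explicitly
    # AMDGPU.wait(
    #   soft = false, # soft wait causes HSA_REFCOUNT overflow issues
    #   @roc groupsize = TBSize gridsize = data.size kernel(data.a, data.c)
    # )

    # after: wait on the event returned by @roc with its default semantics
    AMDGPU.wait(
      @roc groupsize = TBSize gridsize = data.size kernel(data.a, data.c)
    )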
@@ -115,9 +115,9 @@ version = "4.2.0"
 
 [[LLVMExtra_jll]]
 deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
-git-tree-sha1 = "a9b1130c4728b0e462a1c28772954650039eb847"
+git-tree-sha1 = "2d5a0044d6505f4771b5c82de87393f0c9741154"
 uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab"
-version = "0.0.7+0"
+version = "0.0.8+0"
 
 [[LibCURL]]
 deps = ["LibCURL_jll", "MozillaCACerts_jll"]
@@ -40,9 +40,9 @@ version = "0.4.1"
 
 [[CUDA]]
 deps = ["AbstractFFTs", "Adapt", "BFloat16s", "CEnum", "CompilerSupportLibraries_jll", "ExprTools", "GPUArrays", "GPUCompiler", "LLVM", "LazyArtifacts", "Libdl", "LinearAlgebra", "Logging", "Printf", "Random", "Random123", "RandomNumbers", "Reexport", "Requires", "SparseArrays", "SpecialFunctions", "TimerOutputs"]
-git-tree-sha1 = "9303b20dfa74e4bcb4da425d351d551fbb5850be"
+git-tree-sha1 = "c583f3ccdce071b8a8bce9bf3d5d5409eaf36d2b"
 uuid = "052768ef-5323-5732-b1bb-66c8b64840ba"
-version = "3.4.0"
+version = "3.4.1"
 
 [[ChainRulesCore]]
 deps = ["Compat", "LinearAlgebra", "SparseArrays"]
@@ -122,9 +122,9 @@ version = "4.2.0"
 
 [[LLVMExtra_jll]]
 deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
-git-tree-sha1 = "a9b1130c4728b0e462a1c28772954650039eb847"
+git-tree-sha1 = "2d5a0044d6505f4771b5c82de87393f0c9741154"
 uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab"
-version = "0.0.7+0"
+version = "0.0.8+0"
 
 [[LazyArtifacts]]
 deps = ["Artifacts", "Pkg"]
@@ -185,9 +185,9 @@ version = "4.2.0"
 
 [[LLVMExtra_jll]]
 deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
-git-tree-sha1 = "a9b1130c4728b0e462a1c28772954650039eb847"
+git-tree-sha1 = "2d5a0044d6505f4771b5c82de87393f0c9741154"
 uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab"
-version = "0.0.7+0"
+version = "0.0.8+0"
 
 [[LazyArtifacts]]
 deps = ["Artifacts", "Pkg"]
@@ -104,9 +104,9 @@ version = "4.2.0"
 
 [[LLVMExtra_jll]]
 deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
-git-tree-sha1 = "a9b1130c4728b0e462a1c28772954650039eb847"
+git-tree-sha1 = "2d5a0044d6505f4771b5c82de87393f0c9741154"
 uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab"
-version = "0.0.7+0"
+version = "0.0.8+0"
 
 [[LibCURL]]
 deps = ["LibCURL_jll", "MozillaCACerts_jll"]
@@ -61,7 +61,6 @@ function copy!(data::ROCData{T}, _) where {T}
     return
   end
   AMDGPU.wait(
-    soft = false, # soft wait causes HSA_REFCOUNT overflow issues
     @roc groupsize = TBSize gridsize = data.size kernel(data.a, data.c)
   )
 end
@@ -73,7 +72,6 @@ function mul!(data::ROCData{T}, _) where {T}
     return
   end
   AMDGPU.wait(
-    soft = false, # soft wait causes HSA_REFCOUNT overflow issues
     @roc groupsize = TBSize gridsize = data.size kernel(data.b, data.c, data.scalar)
   )
 end
@@ -85,7 +83,6 @@ function add!(data::ROCData{T}, _) where {T}
     return
   end
   AMDGPU.wait(
-    soft = false, # soft wait causes HSA_REFCOUNT overflow issues
     @roc groupsize = TBSize gridsize = data.size kernel(data.a, data.b, data.c)
   )
 end
@@ -97,7 +94,6 @@ function triad!(data::ROCData{T}, _) where {T}
     return
   end
   AMDGPU.wait(
-    soft = false, # soft wait causes HSA_REFCOUNT overflow issues
     @roc groupsize = TBSize gridsize = data.size kernel(
       data.a,
       data.b,
@@ -114,7 +110,6 @@ function nstream!(data::ROCData{T}, _) where {T}
     return
   end
   AMDGPU.wait(
-    soft = false, # soft wait causes HSA_REFCOUNT overflow issues
     @roc groupsize = TBSize gridsize = data.size kernel(
       data.a,
       data.b,
@@ -155,7 +150,6 @@ function dot(data::ROCData{T}, _) where {T}
   end
   partial_sum = ROCArray{T}(undef, DotBlocks)
   AMDGPU.wait(
-    soft = false, # soft wait causes HSA_REFCOUNT overflow issues
     @roc groupsize = TBSize gridsize = TBSize * DotBlocks kernel(
       data.a,
       data.b,
@@ -21,7 +21,6 @@ function make_stream(
     error("arraysize ($(arraysize)) must be divisible by $(TBSize)!")
   end
 
-  # so CUDA's device is 0 indexed, so -1 from Julia
   CUDA.device!(device[1])
   selected = CUDA.device()
   # show_reason is set to true here so it dumps CUDA info
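The deleted comment referred to a -1 adjustment that the code does not actually perform, which is presumably why it was dropped. For reference, a short sketch of the CUDA.jl device-selection calls used above (the ordinal 0 is a hypothetical value for the first GPU):

    using CUDA

    CUDA.device!(0)           # CUDA numbers devices from 0; the ordinal is passed directly
    selected = CUDA.device()  # query the CuDevice that is now active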
@@ -46,14 +45,14 @@ function make_stream(
 end
 
 function init_arrays!(data::CuData{T}, _, init::Tuple{T,T,T}) where {T}
-  CUDA.fill!(data.a, init[1])
-  CUDA.fill!(data.b, init[2])
-  CUDA.fill!(data.c, init[3])
+  fill!(data.a, init[1])
+  fill!(data.b, init[2])
+  fill!(data.c, init[3])
 end
 
 function copy!(data::CuData{T}, _) where {T}
   function kernel(a, c)
-    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x # only blockIdx starts at 1
+    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
     @inbounds c[i] = a[i]
     return
   end
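Dropping the CUDA. qualifier on fill! works because Base.fill! dispatches to CUDA.jl's specialized method when given a CuArray; both spellings reach the same code. A runnable sketch with an arbitrary array:

    using CUDA

    a = CuArray{Float32}(undef, 1024)
    fill!(a, 0.1f0)  # dispatches to the CuArray method; no CUDA. prefix required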
@@ -63,7 +62,7 @@ end
 
 function mul!(data::CuData{T}, _) where {T}
   function kernel(b, c, scalar)
-    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x # only blockIdx starts at 1
+    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
     @inbounds b[i] = scalar * c[i]
     return
   end
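The trailing comments removed here were misleading: in CUDA.jl both blockIdx() and threadIdx() are 1-based (not only blockIdx), so subtracting 1 from blockIdx().x already yields a 1-based global index i. A host-side check of the formula with hypothetical launch dimensions:

    # with blockDim().x = 256: block 1 covers i = 1..256, block 2 starts at 257
    blockdim = 256
    for (blk, tid) in ((1, 1), (1, 256), (2, 1))
      println("block=", blk, " thread=", tid, " -> i=", (blk - 1) * blockdim + tid)
    end
    # prints i = 1, 256, 257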
@@ -73,7 +72,7 @@ end
 
 function add!(data::CuData{T}, _) where {T}
   function kernel(a, b, c)
-    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x # only blockIdx starts at 1
+    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
     @inbounds c[i] = a[i] + b[i]
     return
   end
@@ -83,7 +82,7 @@ end
 
 function triad!(data::CuData{T}, _) where {T}
   function kernel(a, b, c, scalar)
-    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x # only blockIdx starts at 1
+    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
     @inbounds a[i] = b[i] + (scalar * c[i])
     return
   end
@@ -98,7 +97,7 @@ end
 
 function nstream!(data::CuData{T}, _) where {T}
   function kernel(a, b, c, scalar)
-    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x # only blockIdx starts at 1
+    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
     @inbounds a[i] += b[i] + scalar * c[i]
     return
   end
@@ -119,7 +118,7 @@ function dot(data::CuData{T}, _) where {T}
     @inbounds tb_sum[local_i] = 0.0
 
     # do dot first
-    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x # only blockIdx starts at 1
+    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
     while i <= size
       @inbounds tb_sum[local_i] += a[i] * b[i]
       i += blockDim().x * gridDim().x
@@ -143,7 +142,6 @@ function dot(data::CuData{T}, _) where {T}
   end
   partial_sum = CuArray{T}(undef, DotBlocks)
   @cuda blocks = DotBlocks threads = TBSize kernel(data.a, data.b, data.size, partial_sum)
-  CUDA.synchronize()
   return sum(partial_sum)
 end
 
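Dropping the explicit CUDA.synchronize() before the host-side sum preserves behaviour: sum on a CuArray enqueues a device reduction on the same stream as the @cuda launch, and fetching the scalar result back to the host blocks until both have finished. A minimal sketch:

    using CUDA

    partial = CUDA.rand(Float32, 256)
    total = sum(partial)  # device reduction; copying the scalar back implicitly synchronizes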