diff --git a/JuliaStream.jl/src/AMDGPUStream.jl b/JuliaStream.jl/src/AMDGPUStream.jl
index cb54904..9a9cd9a 100644
--- a/JuliaStream.jl/src/AMDGPUStream.jl
+++ b/JuliaStream.jl/src/AMDGPUStream.jl
@@ -64,7 +64,7 @@ end
 
 function copy!(data::ROCData{T}) where {T}
   function kernel(a, c)
-    i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x
+    i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x  # -1 makes the workgroup offset 0-based; workitemIdx is 1-based too, so i is 1-based
     @inbounds c[i] = a[i]
     return
   end
@@ -76,7 +76,7 @@ end
 
 function mul!(data::ROCData{T}) where {T}
   function kernel(b, c, scalar)
-    i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x
+    i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x  # -1 makes the workgroup offset 0-based; workitemIdx is 1-based too, so i is 1-based
     @inbounds b[i] = scalar * c[i]
     return
   end
@@ -88,7 +88,7 @@ end
 
 function add!(data::ROCData{T}) where {T}
   function kernel(a, b, c)
-    i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x
+    i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x  # -1 makes the workgroup offset 0-based; workitemIdx is 1-based too, so i is 1-based
     @inbounds c[i] = a[i] + b[i]
     return
   end
@@ -100,7 +100,7 @@ end
 
 function triad!(data::ROCData{T}) where {T}
   function kernel(a, b, c, scalar)
-    i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x
+    i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x  # -1 makes the workgroup offset 0-based; workitemIdx is 1-based too, so i is 1-based
     @inbounds a[i] = b[i] + (scalar * c[i])
     return
   end
@@ -117,7 +117,7 @@ end
 
 function nstream!(data::ROCData{T}) where {T}
   function kernel(a, b, c, scalar)
-    i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x
+    i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x  # -1 makes the workgroup offset 0-based; workitemIdx is 1-based too, so i is 1-based
     @inbounds a[i] += b[i] + scalar * c[i]
     return
   end
@@ -139,7 +139,7 @@ function dot(data::ROCData{T}) where {T}
     @inbounds tb_sum[local_i] = 0.0
 
     # do dot first
-    i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x
+    i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x  # -1 makes the workgroup offset 0-based; workitemIdx is 1-based too, so i is 1-based
     while i <= size
       @inbounds tb_sum[local_i] += a[i] * b[i]
       i += TBSize * DotBlocks # XXX don't use (workgroupDim().x * gridDimWG().x) here
diff --git a/JuliaStream.jl/src/CUDAStream.jl b/JuliaStream.jl/src/CUDAStream.jl
index dbf21d7..7d671a5 100644
--- a/JuliaStream.jl/src/CUDAStream.jl
+++ b/JuliaStream.jl/src/CUDAStream.jl
@@ -51,7 +51,7 @@ end
 
 function copy!(data::CuData{T}) where {T}
   function kernel(a, c)
-    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
+    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x # -1 makes the block offset 0-based; threadIdx is 1-based too, so i is 1-based
     @inbounds c[i] = a[i]
     return
   end
@@ -61,7 +61,7 @@ end
 
 function mul!(data::CuData{T}) where {T}
   function kernel(b, c, scalar)
-    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
+    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x # -1 makes the block offset 0-based; threadIdx is 1-based too, so i is 1-based
     @inbounds b[i] = scalar * c[i]
     return
   end
@@ -71,7 +71,7 @@ end
 
 function add!(data::CuData{T}) where {T}
   function kernel(a, b, c)
-    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
+    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x # -1 makes the block offset 0-based; threadIdx is 1-based too, so i is 1-based
     @inbounds c[i] = a[i] + b[i]
     return
   end
@@ -81,7 +81,7 @@ end
 
 function triad!(data::CuData{T}) where {T}
   function kernel(a, b, c, scalar)
-    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
+    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x # -1 makes the block offset 0-based; threadIdx is 1-based too, so i is 1-based
     @inbounds a[i] = b[i] + (scalar * c[i])
     return
   end
@@ -96,7 +96,7 @@ end
 
 function nstream!(data::CuData{T}) where {T}
   function kernel(a, b, c, scalar)
-    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
+    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x # -1 makes the block offset 0-based; threadIdx is 1-based too, so i is 1-based
     @inbounds a[i] += b[i] + scalar * c[i]
     return
   end
@@ -117,7 +117,7 @@ function dot(data::CuData{T}) where {T}
     @inbounds tb_sum[local_i] = 0.0
 
     # do dot first
-    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
+    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x # -1 makes the block offset 0-based; threadIdx is 1-based too, so i is 1-based
     while i <= size
       @inbounds tb_sum[local_i] += a[i] * b[i]
       i += blockDim().x * gridDim().x