Merge branch 'develop' into option_for_vec

# Conflicts:
#	src/kokkos/KokkosStream.hpp
#	src/kokkos/model.cmake
This commit is contained in:
Tom Lin 2023-06-28 19:02:24 +01:00
commit 50fe7c102a
40 changed files with 3783 additions and 165 deletions

6
.gitignore vendored
View File

@ -10,12 +10,18 @@ sycl-stream
hip-stream hip-stream
tbb-stream tbb-stream
src/fortran/BabelStream
src/fortran/BabelStream.*
*.o *.o
*.bc *.bc
*.sycl *.sycl
*.tar *.tar
*.gz *.gz
*.a *.a
*.mod
*.cub
*.ptx
KokkosCore_config.* KokkosCore_config.*

View File

@ -2,8 +2,16 @@
All notable changes to this project will be documented in this file. All notable changes to this project will be documented in this file.
## Unreleased ## Unreleased
### Added
- Ability to build Kokkos and RAJA versions against existing packages.
### Changed
- RAJA CUDA CMake build issues resolved.
- Fix CUDA memory limit check. - Fix CUDA memory limit check.
- Use long double for `check_solution` in case of large problem size. - Use long double for `check_solution` in case of large problem size.
- OneAPI DPCPP compiler is deprecated in favour of ICPX, so added new build option to SYCL 2020 version.
- Updates to the HIP kernels and API usage.
- Number of thread-blocks in CUDA dot kernel implementation changed to 1024.
## [v4.0] - 2021-12-22 ## [v4.0] - 2021-12-22

View File

@ -186,7 +186,7 @@ message(STATUS "Default ${CMAKE_BUILD_TYPE} flags are `${DEFAULT_${BUILD_TYPE}_F
# setup common build flag defaults if there are no overrides # setup common build flag defaults if there are no overrides
if (NOT DEFINED ${BUILD_TYPE}_FLAGS) if (NOT DEFINED ${BUILD_TYPE}_FLAGS)
set(ACTUAL_${BUILD_TYPE}_FLAGS ${DEFAULT_${BUILD_TYPE}_FLAGS}) set(ACTUAL_${BUILD_TYPE}_FLAGS ${DEFAULT_${BUILD_TYPE}_FLAGS})
elseif () else ()
set(ACTUAL_${BUILD_TYPE}_FLAGS ${${BUILD_TYPE}_FLAGS}) set(ACTUAL_${BUILD_TYPE}_FLAGS ${${BUILD_TYPE}_FLAGS})
endif () endif ()

View File

@ -100,7 +100,7 @@ The source for each model's implementations are located in `./src/<model>`.
Currently available models are: Currently available models are:
``` ```
omp;ocl;std;std20;hip;cuda;kokkos;sycl;sycl2020;acc;raja;tbb;thrust omp;ocl;std-data;std-indices;std-ranges;hip;cuda;kokkos;sycl;sycl2020;acc;raja;tbb;thrust
``` ```
#### Overriding default flags #### Overriding default flags
@ -163,11 +163,11 @@ Pull Requests should be made against the `develop` branch.
Please cite BabelStream via this reference: Please cite BabelStream via this reference:
Deakin T, Price J, Martineau M, McIntosh-Smith S. GPU-STREAM v2.0: Benchmarking the achievable memory bandwidth of many-core processors across diverse parallel programming models. 2016. Paper presented at P^3MA Workshop at ISC High Performance, Frankfurt, Germany. DOI: 10.1007/978- 3-319-46079-6_34 Deakin T, Price J, Martineau M, McIntosh-Smith S. Evaluating attainable memory bandwidth of parallel programming models via BabelStream. International Journal of Computational Science and Engineering. Special issue. Vol. 17, No. 3, pp. 247262. 2018. DOI: 10.1504/IJCSE.2018.095847
### Other BabelStream publications ### Other BabelStream publications
* Deakin T, Price J, Martineau M, McIntosh-Smith S. Evaluating attainable memory bandwidth of parallel programming models via BabelStream. International Journal of Computational Science and Engineering. Special issue. Vol. 17, No. 3, pp. 247262. 2018.DOI: 10.1504/IJCSE.2018.095847 * Deakin T, Price J, Martineau M, McIntosh-Smith S. GPU-STREAM v2.0: Benchmarking the achievable memory bandwidth of many-core processors across diverse parallel programming models. 2016. Paper presented at P^3MA Workshop at ISC High Performance, Frankfurt, Germany. DOI: 10.1007/978- 3-319-46079-6_34
* Deakin T, McIntosh-Smith S. GPU-STREAM: Benchmarking the achievable memory bandwidth of Graphics Processing Units. 2015. Poster session presented at IEEE/ACM SuperComputing, Austin, United States. * Deakin T, McIntosh-Smith S. GPU-STREAM: Benchmarking the achievable memory bandwidth of Graphics Processing Units. 2015. Poster session presented at IEEE/ACM SuperComputing, Austin, United States.
You can view the [Poster and Extended Abstract](http://sc15.supercomputing.org/sites/all/themes/SC15images/tech_poster/tech_poster_pages/post150.html). You can view the [Poster and Extended Abstract](http://sc15.supercomputing.org/sites/all/themes/SC15images/tech_poster/tech_poster_pages/post150.html).

2
src/.gitignore vendored
View File

@ -16,6 +16,8 @@
**/*.gz **/*.gz
**/*.a **/*.a
**/*.swp
**/KokkosCore_Config_* **/KokkosCore_Config_*
**/.DS_Store **/.DS_Store

View File

@ -22,7 +22,7 @@
#endif #endif
#define TBSIZE 1024 #define TBSIZE 1024
#define DOT_NUM_BLOCKS 256 #define DOT_NUM_BLOCKS 1024
template <class T> template <class T>
class CUDAStream : public Stream<T> class CUDAStream : public Stream<T>

105
src/fortran/ArrayStream.F90 Normal file
View File

@ -0,0 +1,105 @@
module ArrayStream
use, intrinsic :: ISO_Fortran_env
use BabelStreamTypes
implicit none
character(len=5), parameter :: implementation_name = "Array"
integer(kind=StreamIntKind) :: N
real(kind=REAL64), allocatable :: A(:), B(:), C(:)
contains
subroutine list_devices()
implicit none
integer :: num
write(*,'(a36,a5)') "Listing devices is not supported by ", implementation_name
end subroutine list_devices
subroutine set_device(dev)
implicit none
integer, intent(in) :: dev
write(*,'(a32,a5)') "Device != 0 is not supported by ", implementation_name
end subroutine set_device
subroutine alloc(array_size)
implicit none
integer(kind=StreamIntKind) :: array_size
integer :: err
N = array_size
allocate( A(1:N), B(1:N), C(1:N), stat=err)
if (err .ne. 0) then
write(*,'(a20,i3)') 'allocate returned ',err
stop 1
endif
end subroutine alloc
subroutine dealloc()
implicit none
integer :: err
deallocate( A, B, C, stat=err)
if (err .ne. 0) then
write(*,'(a20,i3)') 'deallocate returned ',err
stop 1
endif
end subroutine dealloc
subroutine init_arrays(initA, initB, initC)
implicit none
real(kind=REAL64), intent(in) :: initA, initB, initC
A = initA
B = initB
C = initC
end subroutine init_arrays
subroutine read_arrays(h_A, h_B, h_C)
implicit none
real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
h_A = A
h_B = B
h_C = C
end subroutine read_arrays
subroutine copy()
implicit none
C = A
end subroutine copy
subroutine add()
implicit none
C = A + B
end subroutine add
subroutine mul(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
scalar = startScalar
B = scalar * C
end subroutine mul
subroutine triad(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
scalar = startScalar
A = B + scalar * C
end subroutine triad
subroutine nstream(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
scalar = startScalar
A = A + B + scalar * C
end subroutine nstream
function dot() result(s)
implicit none
real(kind=REAL64) :: s
s = dot_product(A,B)
end function dot
end module ArrayStream

View File

@ -0,0 +1,21 @@
module BabelStreamTypes
use, intrinsic :: ISO_Fortran_env, only: REAL64,REAL32,INT64,INT32
implicit none
#ifdef USE_FLOAT
integer, parameter :: StreamRealKind = REAL32
character(len=6) :: StreamRealName = "REAL32"
#else
integer, parameter :: StreamRealKind = REAL64
character(len=6) :: StreamRealName = "REAL64"
#endif
#ifdef USE_INT32
#warning There is no checking for overflowing INT32, so be careful.
integer, parameter :: StreamIntKind = INT32
#else
integer, parameter :: StreamIntKind = INT64
#endif
end module BabelStreamTypes

View File

@ -0,0 +1,230 @@
module CUDAKernelStream
use, intrinsic :: ISO_Fortran_env
use BabelStreamTypes
implicit none
character(len=10), parameter :: implementation_name = "CUDAKernel"
integer(kind=StreamIntKind) :: N
#ifdef USE_MANAGED
real(kind=REAL64), allocatable, managed :: A(:), B(:), C(:)
#else
real(kind=REAL64), allocatable, device :: A(:), B(:), C(:)
#endif
contains
subroutine list_devices()
use cudafor
implicit none
integer :: num, err
err = cudaGetDeviceCount(num)
if (err.ne.0) then
write(*,'(a)') "cudaGetDeviceCount failed"
write(*,'(a)') cudaGetErrorString(err)
stop
else if (num.eq.0) then
write(*,'(a17)') "No devices found."
else
write(*,'(a10,i1,a8)') "There are ",num," devices."
end if
end subroutine list_devices
subroutine set_device(dev)
use cudafor
implicit none
integer, intent(in) :: dev
integer :: num, err
err = cudaGetDeviceCount(num)
if (err.ne.0) then
write(*,'(a)') "cudaGetDeviceCount failed"
write(*,'(a)') cudaGetErrorString(err)
stop
else if (num.eq.0) then
write(*,'(a17)') "No devices found."
stop
else if (dev.ge.num) then
write(*,'(a21)') "Invalid device index."
stop
else
err = cudaSetDevice(dev)
if (err.ne.0) then
write(*,'(a)') "cudaSetDevice failed"
write(*,'(a)') cudaGetErrorString(err)
stop
end if
end if
end subroutine set_device
subroutine alloc(array_size)
implicit none
integer(kind=StreamIntKind) :: array_size
integer :: err
N = array_size
allocate( A(1:N), B(1:N), C(1:N), stat=err)
if (err .ne. 0) then
write(*,'(a20,i3)') 'allocate returned ',err
stop 1
endif
end subroutine alloc
subroutine dealloc()
implicit none
integer :: err
deallocate( A, B, C, stat=err)
if (err .ne. 0) then
write(*,'(a20,i3)') 'deallocate returned ',err
stop 1
endif
end subroutine dealloc
subroutine init_arrays(initA, initB, initC)
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
implicit none
real(kind=REAL64), intent(in) :: initA, initB, initC
integer(kind=StreamIntKind) :: i
integer :: err
A = initA
B = initB
C = initC
err = cudaDeviceSynchronize()
if (err.ne.0) then
write(*,'(a)') "cudaDeviceSynchronize failed"
write(*,'(a)') cudaGetErrorString(err)
stop
endif
end subroutine init_arrays
subroutine read_arrays(h_A, h_B, h_C)
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
implicit none
real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
integer(kind=StreamIntKind) :: i
integer :: err
h_A = A
h_B = B
h_C = C
err = cudaDeviceSynchronize()
if (err.ne.0) then
write(*,'(a)') "cudaDeviceSynchronize failed"
write(*,'(a)') cudaGetErrorString(err)
stop
endif
end subroutine read_arrays
subroutine copy()
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
implicit none
integer(kind=StreamIntKind) :: i
integer :: err
!$cuf kernel do <<< *, * >>>
do i=1,N
C(i) = A(i)
end do
err = cudaDeviceSynchronize()
if (err.ne.0) then
write(*,'(a)') "cudaDeviceSynchronize failed"
write(*,'(a)') cudaGetErrorString(err)
stop
endif
end subroutine copy
subroutine add()
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
implicit none
integer(kind=StreamIntKind) :: i
integer :: err
!$cuf kernel do <<< *, * >>>
do i=1,N
C(i) = A(i) + B(i)
end do
err = cudaDeviceSynchronize()
if (err.ne.0) then
write(*,'(a)') "cudaDeviceSynchronize failed"
write(*,'(a)') cudaGetErrorString(err)
stop
endif
end subroutine add
subroutine mul(startScalar)
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
integer :: err
scalar = startScalar
!$cuf kernel do <<< *, * >>>
do i=1,N
B(i) = scalar * C(i)
end do
err = cudaDeviceSynchronize()
if (err.ne.0) then
write(*,'(a)') "cudaDeviceSynchronize failed"
write(*,'(a)') cudaGetErrorString(err)
stop
endif
end subroutine mul
subroutine triad(startScalar)
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
integer :: err
scalar = startScalar
!$cuf kernel do <<< *, * >>>
do i=1,N
A(i) = B(i) + scalar * C(i)
end do
err = cudaDeviceSynchronize()
if (err.ne.0) then
write(*,'(a)') "cudaDeviceSynchronize failed"
write(*,'(a)') cudaGetErrorString(err)
stop
endif
end subroutine triad
subroutine nstream(startScalar)
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
integer :: err
scalar = startScalar
!$cuf kernel do <<< *, * >>>
do i=1,N
A(i) = A(i) + B(i) + scalar * C(i)
end do
err = cudaDeviceSynchronize()
if (err.ne.0) then
write(*,'(a)') "cudaDeviceSynchronize failed"
write(*,'(a)') cudaGetErrorString(err)
stop
endif
end subroutine nstream
function dot() result(r)
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
implicit none
real(kind=REAL64) :: r
integer(kind=StreamIntKind) :: i
integer :: err
r = real(0,kind=REAL64)
!$cuf kernel do <<< *, * >>>
do i=1,N
r = r + A(i) * B(i)
end do
err = cudaDeviceSynchronize()
if (err.ne.0) then
write(*,'(a)') "cudaDeviceSynchronize failed"
write(*,'(a)') cudaGetErrorString(err)
stop
endif
end function dot
end module CUDAKernelStream

309
src/fortran/CUDAStream.F90 Normal file
View File

@ -0,0 +1,309 @@
module CUDAFortranKernels
use, intrinsic :: ISO_Fortran_env
use BabelStreamTypes
contains
attributes(global) subroutine do_copy(n,A,C)
implicit none
integer(kind=StreamIntKind), intent(in), value :: n
real(kind=REAL64), intent(in) :: A(n)
real(kind=REAL64), intent(out) :: C(n)
integer(kind=StreamIntKind) :: i
i = blockDim%x * (blockIdx%x - 1) + threadIdx%x
if (i <= N) then
C(i) = A(i)
endif
end subroutine do_copy
attributes(global) subroutine do_add(n,A,B,C)
implicit none
integer(kind=StreamIntKind), intent(in), value :: n
real(kind=REAL64), intent(in) :: A(n), B(n)
real(kind=REAL64), intent(out) :: C(n)
integer(kind=StreamIntKind) :: i
i = blockDim%x * (blockIdx%x - 1) + threadIdx%x
if (i <= N) then
C(i) = A(i) + B(i)
endif
end subroutine do_add
attributes(global) subroutine do_mul(n,scalar,B,C)
implicit none
integer(kind=StreamIntKind), intent(in), value :: n
real(kind=REAL64), intent(in), value :: scalar
real(kind=REAL64), intent(out) :: B(n)
real(kind=REAL64), intent(in) :: C(n)
integer(kind=StreamIntKind) :: i
i = blockDim%x * (blockIdx%x - 1) + threadIdx%x
if (i <= N) then
B(i) = scalar * C(i)
endif
end subroutine do_mul
attributes(global) subroutine do_triad(n,scalar,A,B,C)
implicit none
integer(kind=StreamIntKind), intent(in), value :: n
real(kind=REAL64), intent(in), value :: scalar
real(kind=REAL64), intent(out) :: A(n)
real(kind=REAL64), intent(in) :: B(n), C(n)
integer(kind=StreamIntKind) :: i
i = blockDim%x * (blockIdx%x - 1) + threadIdx%x
if (i <= N) then
A(i) = B(i) + scalar * C(i)
endif
end subroutine do_triad
attributes(global) subroutine do_nstream(n,scalar,A,B,C)
implicit none
integer(kind=StreamIntKind), intent(in), value :: n
real(kind=REAL64), intent(in), value :: scalar
real(kind=REAL64), intent(inout) :: A(n)
real(kind=REAL64), intent(in) :: B(n), C(n)
integer(kind=StreamIntKind) :: i
i = blockDim%x * (blockIdx%x - 1) + threadIdx%x
if (i <= N) then
A(i) = A(i) + B(i) + scalar * C(i)
endif
end subroutine do_nstream
#if 0
attributes(global) subroutine do_dot(n,A,B,r)
implicit none
integer(kind=StreamIntKind), intent(in), value :: n
real(kind=REAL64), intent(in) :: A(n), B(n)
real(kind=REAL64), intent(out) :: r
integer(kind=StreamIntKind) :: i
r = real(0,kind=REAL64)
!$cuf kernel do <<< *, * >>>
do i=1,N
r = r + A(i) * B(i)
end do
end subroutine do_dot
#endif
end module CUDAFortranKernels
module CUDAStream
use, intrinsic :: ISO_Fortran_env
use BabelStreamTypes
use cudafor, only: dim3
implicit none
character(len=4), parameter :: implementation_name = "CUDA"
integer(kind=StreamIntKind) :: N
#ifdef USE_MANAGED
real(kind=REAL64), allocatable, managed :: A(:), B(:), C(:)
#else
real(kind=REAL64), allocatable, device :: A(:), B(:), C(:)
#endif
type(dim3) :: grid, tblock
contains
subroutine list_devices()
use cudafor
implicit none
integer :: num, err
err = cudaGetDeviceCount(num)
if (err.ne.0) then
write(*,'(a)') "cudaGetDeviceCount failed"
write(*,'(a)') cudaGetErrorString(err)
stop
else if (num.eq.0) then
write(*,'(a17)') "No devices found."
else
write(*,'(a10,i1,a8)') "There are ",num," devices."
end if
end subroutine list_devices
subroutine set_device(dev)
use cudafor
implicit none
integer, intent(in) :: dev
integer :: num, err
err = cudaGetDeviceCount(num)
if (err.ne.0) then
write(*,'(a)') "cudaGetDeviceCount failed"
write(*,'(a)') cudaGetErrorString(err)
stop
else if (num.eq.0) then
write(*,'(a17)') "No devices found."
stop
else if (dev.ge.num) then
write(*,'(a21)') "Invalid device index."
stop
else
err = cudaSetDevice(dev)
if (err.ne.0) then
write(*,'(a)') "cudaSetDevice failed"
write(*,'(a)') cudaGetErrorString(err)
stop
end if
end if
end subroutine set_device
subroutine alloc(array_size)
implicit none
integer(kind=StreamIntKind) :: array_size
integer :: err
N = array_size
allocate( A(1:N), B(1:N), C(1:N), stat=err)
if (err .ne. 0) then
write(*,'(a20,i3)') 'allocate returned ',err
stop 1
endif
! move to separate subroutine later
tblock = dim3(128,1,1)
grid = dim3(ceiling(real(N)/tblock%x),1,1)
end subroutine alloc
subroutine dealloc()
implicit none
integer :: err
deallocate( A, B, C, stat=err)
if (err .ne. 0) then
write(*,'(a20,i3)') 'deallocate returned ',err
stop 1
endif
end subroutine dealloc
subroutine init_arrays(initA, initB, initC)
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
implicit none
real(kind=REAL64), intent(in) :: initA, initB, initC
integer(kind=StreamIntKind) :: i
integer :: err
A = initA
B = initB
C = initC
err = cudaDeviceSynchronize()
if (err.ne.0) then
write(*,'(a)') "cudaDeviceSynchronize failed"
write(*,'(a)') cudaGetErrorString(err)
stop
endif
end subroutine init_arrays
subroutine read_arrays(h_A, h_B, h_C)
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
implicit none
real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
integer(kind=StreamIntKind) :: i
integer :: err
h_A = A
h_B = B
h_C = C
err = cudaDeviceSynchronize()
if (err.ne.0) then
write(*,'(a)') "cudaDeviceSynchronize failed"
write(*,'(a)') cudaGetErrorString(err)
stop
endif
end subroutine read_arrays
subroutine copy()
use CUDAFortranKernels, only: do_copy
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
implicit none
integer :: err
call do_copy<<<grid, tblock>>>(N, A, C)
err = cudaDeviceSynchronize()
if (err.ne.0) then
write(*,'(a)') "cudaDeviceSynchronize failed"
write(*,'(a)') cudaGetErrorString(err)
stop
endif
end subroutine copy
subroutine add()
use CUDAFortranKernels, only: do_add
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
implicit none
integer :: err
call do_add<<<grid, tblock>>>(N, A, B, C)
err = cudaDeviceSynchronize()
if (err.ne.0) then
write(*,'(a)') "cudaDeviceSynchronize failed"
write(*,'(a)') cudaGetErrorString(err)
stop
endif
end subroutine add
subroutine mul(startScalar)
use CUDAFortranKernels, only: do_mul
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer :: err
scalar = startScalar
call do_mul<<<grid, tblock>>>(N, scalar, B, C)
err = cudaDeviceSynchronize()
if (err.ne.0) then
write(*,'(a)') "cudaDeviceSynchronize failed"
write(*,'(a)') cudaGetErrorString(err)
stop
endif
end subroutine mul
subroutine triad(startScalar)
use CUDAFortranKernels, only: do_triad
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer :: err
scalar = startScalar
call do_triad<<<grid, tblock>>>(N, scalar, A, B, C)
err = cudaDeviceSynchronize()
if (err.ne.0) then
write(*,'(a)') "cudaDeviceSynchronize failed"
write(*,'(a)') cudaGetErrorString(err)
stop
endif
end subroutine triad
subroutine nstream(startScalar)
use CUDAFortranKernels, only: do_nstream
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer :: err
scalar = startScalar
call do_nstream<<<grid, tblock>>>(N, scalar, A, B, C)
err = cudaDeviceSynchronize()
if (err.ne.0) then
write(*,'(a)') "cudaDeviceSynchronize failed"
write(*,'(a)') cudaGetErrorString(err)
stop
endif
end subroutine nstream
function dot() result(r)
!use CUDAFortranKernels, only: do_dot
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
implicit none
real(kind=REAL64) :: r
integer :: err
integer(kind=StreamIntKind) :: i
!call do_dot<<<grid, tblock>>>(N, B, C, r)
r = real(0,kind=REAL64)
!$cuf kernel do <<< *, * >>>
do i=1,N
r = r + A(i) * B(i)
end do
err = cudaDeviceSynchronize()
if (err.ne.0) then
write(*,'(a)') "cudaDeviceSynchronize failed"
write(*,'(a)') cudaGetErrorString(err)
stop
endif
end function dot
end module CUDAStream

View File

@ -0,0 +1,139 @@
module DoConcurrentStream
use, intrinsic :: ISO_Fortran_env
use BabelStreamTypes
implicit none
character(len=12), parameter :: implementation_name = "DoConcurrent"
integer(kind=StreamIntKind) :: N
#ifdef USE_DEVICE
real(kind=REAL64), allocatable, device :: A(:), B(:), C(:)
#else
real(kind=REAL64), allocatable :: A(:), B(:), C(:)
#endif
contains
subroutine list_devices()
implicit none
integer :: num
write(*,'(a36,a12)') "Listing devices is not supported by ", implementation_name
end subroutine list_devices
subroutine set_device(dev)
implicit none
integer, intent(in) :: dev
write(*,'(a32,a12)') "Device != 0 is not supported by ", implementation_name
end subroutine set_device
subroutine alloc(array_size)
implicit none
integer(kind=StreamIntKind) :: array_size
integer :: err
N = array_size
allocate( A(1:N), B(1:N), C(1:N), stat=err)
if (err .ne. 0) then
write(*,'(a20,i3)') 'allocate returned ',err
stop 1
endif
end subroutine alloc
subroutine dealloc()
implicit none
integer :: err
deallocate( A, B, C, stat=err)
if (err .ne. 0) then
write(*,'(a20,i3)') 'deallocate returned ',err
stop 1
endif
end subroutine dealloc
subroutine init_arrays(initA, initB, initC)
implicit none
real(kind=REAL64), intent(in) :: initA, initB, initC
integer(kind=StreamIntKind) :: i
do concurrent (i=1:N)
A(i) = initA
B(i) = initB
C(i) = initC
end do
end subroutine init_arrays
subroutine read_arrays(h_A, h_B, h_C)
implicit none
real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
integer(kind=StreamIntKind) :: i
do concurrent (i=1:N) !shared(A,B,C)
h_A(i) = A(i)
h_B(i) = B(i)
h_C(i) = C(i)
end do
end subroutine read_arrays
subroutine copy()
implicit none
integer(kind=StreamIntKind) :: i
do concurrent (i=1:N) !shared(A,C)
C(i) = A(i)
end do
end subroutine copy
subroutine add()
implicit none
integer(kind=StreamIntKind) :: i
do concurrent (i=1:N) !shared(A,B,C)
C(i) = A(i) + B(i)
end do
end subroutine add
subroutine mul(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
scalar = startScalar
do concurrent (i=1:N) !shared(B,C)
B(i) = scalar * C(i)
end do
end subroutine mul
subroutine triad(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
scalar = startScalar
do concurrent (i=1:N) !shared(A,B,C)
A(i) = B(i) + scalar * C(i)
end do
end subroutine triad
subroutine nstream(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
scalar = startScalar
do concurrent (i=1:N) !shared(A,B,C)
A(i) = A(i) + B(i) + scalar * C(i)
end do
end subroutine nstream
function dot() result(s)
implicit none
real(kind=REAL64) :: s
integer(kind=StreamIntKind) :: i
! reduction omitted because NVF infers it and other compilers do not support
s = real(0,kind=REAL64)
#ifdef CRAY_THREAD_DOCONCURRENT
do i=1,N
#else
do concurrent (i=1:N) !shared(A,B)
#endif
s = s + A(i) * B(i)
end do
end function dot
end module DoConcurrentStream

109
src/fortran/Makefile Normal file
View File

@ -0,0 +1,109 @@
ifeq ($(COMPILER),nvhpc)
include make.inc.nvhpc
else ifeq ($(COMPILER),oneapi)
include make.inc.oneapi
else ifeq ($(COMPILER),gcc)
include make.inc.gcc
else ifeq ($(COMPILER),amd)
include make.inc.amd
else ifeq ($(COMPILER),arm)
include make.inc.arm
else ifeq ($(COMPILER),cray)
include make.inc.cray
else ifeq ($(COMPILER),fj)
include make.inc.fj
else
$(info Set COMPILER={nvhpc,oneapi,amd,arm,cray,fj,gcc}. Default is gcc.)
include make.inc.gcc
COMPILER=gcc
endif
FCFLAGS += -DVERSION_STRING="4.0"
#FCFLAGS += -DUSE_INT32
ifeq ($(IMPLEMENTATION),DoConcurrent)
FCFLAGS += -DUSE_DOCONCURRENT $(DOCONCURRENT_FLAG)
IMPLEMENTATION_OBJECT = DoConcurrentStream.o
else ifeq ($(IMPLEMENTATION),Array)
FCFLAGS += -DUSE_ARRAY $(ARRAY_FLAG)
IMPLEMENTATION_OBJECT = ArrayStream.o
else ifeq ($(IMPLEMENTATION),OpenMP)
FCFLAGS += -DUSE_OPENMP $(OPENMP_FLAG)
IMPLEMENTATION_OBJECT = OpenMPStream.o
else ifeq ($(IMPLEMENTATION),OpenMPWorkshare)
FCFLAGS += -DUSE_OPENMPWORKSHARE $(OPENMP_FLAG)
IMPLEMENTATION_OBJECT = OpenMPWorkshareStream.o
else ifeq ($(IMPLEMENTATION),OpenMPTarget)
FCFLAGS += -DUSE_OPENMPTARGET $(OPENMP_FLAG)
IMPLEMENTATION_OBJECT = OpenMPTargetStream.o
else ifeq ($(IMPLEMENTATION),OpenMPTargetLoop)
FCFLAGS += -DUSE_OPENMPTARGETLOOP $(OPENMP_FLAG)
IMPLEMENTATION_OBJECT = OpenMPTargetLoopStream.o
else ifeq ($(IMPLEMENTATION),OpenMPTaskloop)
FCFLAGS += -DUSE_OPENMPTASKLOOP $(OPENMP_FLAG)
IMPLEMENTATION_OBJECT = OpenMPTaskloopStream.o
else ifeq ($(IMPLEMENTATION),OpenACC)
FCFLAGS += -DUSE_OPENACC $(OPENACC_FLAG)
IMPLEMENTATION_OBJECT = OpenACCStream.o
else ifeq ($(IMPLEMENTATION),OpenACCArray)
FCFLAGS += -DUSE_OPENACCARRAY $(OPENACC_FLAG)
IMPLEMENTATION_OBJECT = OpenACCArrayStream.o
else ifeq ($(IMPLEMENTATION),CUDA)
FCFLAGS += -DUSE_CUDA $(CUDA_FLAG)
IMPLEMENTATION_OBJECT = CUDAStream.o
else ifeq ($(IMPLEMENTATION),CUDAKernel)
FCFLAGS += -DUSE_CUDAKERNEL $(CUDA_FLAG)
IMPLEMENTATION_OBJECT = CUDAKernelStream.o
else ifeq ($(IMPLEMENTATION),Sequential)
FCFLAGS += -DUSE_SEQUENTIAL $(SEQUENTIAL_FLAG)
IMPLEMENTATION_OBJECT = SequentialStream.o
else
$(info Set IMPLEMENTATION={DoConcurrent,Array,OpenMP,OpenMPWorkshare,OpenMPTarget,OpenMPTargetLoop,OpenMPTaskloop,OpenACC,OpenACCArray,CUDA,CUDAKernel}.)
FCFLAGS += -DUSE_SEQUENTIAL $(SEQUENTIAL_FLAG)
IMPLEMENTATION=Sequential
IMPLEMENTATION_OBJECT = SequentialStream.o
endif
all: BabelStream.$(COMPILER).$(IMPLEMENTATION)
BabelStream.$(COMPILER).$(IMPLEMENTATION): main.F90 $(IMPLEMENTATION_OBJECT)
$(FC) $(FCFLAGS) $^ BabelStreamTypes.o -o $@
BabelStreamTypes.o BabelStreamTypes.mod: BabelStreamTypes.F90
$(FC) $(FCFLAGS) -c $<
%.o: %.F90 BabelStreamTypes.mod
$(FC) $(FCFLAGS) -c $<
clean:
-rm -f main.o BabelStreamUtil.mod babelstreamutil.mod
-rm -f BabelStreamTypes.o BabelStreamTypes.mod babelstreamtypes.mod
-rm -f DoConcurrentStream.o DoConcurrentStream.mod doconcurrentstream.mod
-rm -f ArrayStream.o ArrayStream.mod arraystream.mod
-rm -f SequentialStream.o SequentialStream.mod sequentialstream.mod
-rm -f OpenMPStream.o OpenMPStream.mod openmpstream.mod
-rm -f OpenMPWorkshareStream.o OpenMPWorkshareStream.mod openmpworksharestream.mod
-rm -f OpenMPTaskloopStream.o OpenMPTaskloopStream.mod openmptaskloopstream.mod
-rm -f OpenMPTargetStream.o OpenMPTargetStream.mod openmptargetstream.mod
-rm -f OpenMPTargetLoopStream.o OpenMPTargetLoopStream.mod openmptargetloopstream.mod
-rm -f OpenACCStream.o OpenACCStream.mod openaccstream.mod
-rm -f OpenACCArrayStream.o OpenACCArrayStream.mod openaccarraystream.mod
-rm -f CUDAStream.o CUDAStream.mod cudastream.mod CUDAFortranKernels.mod cudafortrankernels.mod
-rm -f CUDAKernelStream.o CUDAKernelStream.mod cudakernelstream.mod
-rm -f *.modmic *.mod *.o *.cub *.ptx
realclean: clean
-rm -f BabelStream.*

View File

@ -0,0 +1,144 @@
module OpenACCArrayStream
use, intrinsic :: ISO_Fortran_env
use BabelStreamTypes
implicit none
character(len=12), parameter :: implementation_name = "OpenACCArray"
integer(kind=StreamIntKind) :: N
real(kind=REAL64), allocatable :: A(:), B(:), C(:)
contains
subroutine list_devices()
use openacc
implicit none
integer :: num
num = acc_get_num_devices(acc_get_device_type())
if (num.eq.0) then
write(*,'(a17)') "No devices found."
else
write(*,'(a10,i1,a8)') "There are ",num," devices."
end if
end subroutine list_devices
subroutine set_device(dev)
use openacc
implicit none
integer, intent(in) :: dev
integer :: num
num = acc_get_num_devices(acc_get_device_type())
if (num.eq.0) then
write(*,'(a17)') "No devices found."
stop
else if (dev.gt.num) then
write(*,'(a21)') "Invalid device index."
stop
else
call acc_set_device_num(dev, acc_get_device_type())
end if
end subroutine set_device
subroutine alloc(array_size)
implicit none
integer(kind=StreamIntKind) :: array_size
integer :: err
N = array_size
allocate( A(1:N), B(1:N), C(1:N), stat=err)
#ifndef USE_MANAGED
!$acc enter data create(A,B,C)
#endif
if (err .ne. 0) then
write(*,'(a20,i3)') 'allocate returned ',err
stop 1
endif
end subroutine alloc
subroutine dealloc()
implicit none
integer :: err
#ifndef USE_MANAGED
!$acc exit data delete(A,B,C)
#endif
deallocate( A, B, C, stat=err)
if (err .ne. 0) then
write(*,'(a20,i3)') 'deallocate returned ',err
stop 1
endif
end subroutine dealloc
subroutine init_arrays(initA, initB, initC)
implicit none
real(kind=REAL64), intent(in) :: initA, initB, initC
!$acc kernels
A = initA
B = initB
C = initC
!$acc end kernels
end subroutine init_arrays
subroutine read_arrays(h_A, h_B, h_C)
implicit none
real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
!$acc kernels
h_A = A
h_B = B
h_C = C
!$acc end kernels
end subroutine read_arrays
subroutine copy()
implicit none
!$acc kernels
C = A
!$acc end kernels
end subroutine copy
subroutine add()
implicit none
!$acc kernels
C = A + B
!$acc end kernels
end subroutine add
subroutine mul(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
scalar = startScalar
!$acc kernels
B = scalar * C
!$acc end kernels
end subroutine mul
subroutine triad(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
scalar = startScalar
!$acc kernels
A = B + scalar * C
!$acc end kernels
end subroutine triad
subroutine nstream(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
scalar = startScalar
!$acc kernels
A = A + B + scalar * C
!$acc end kernels
end subroutine nstream
function dot() result(s)
implicit none
real(kind=REAL64) :: s
!$acc kernels
s = dot_product(A,B)
!$acc end kernels
end function dot
end module OpenACCArrayStream

View File

@ -0,0 +1,161 @@
module OpenACCStream
use, intrinsic :: ISO_Fortran_env
use BabelStreamTypes
implicit none
character(len=7), parameter :: implementation_name = "OpenACC"
integer(kind=StreamIntKind) :: N
real(kind=REAL64), allocatable :: A(:), B(:), C(:)
contains
subroutine list_devices()
use openacc
implicit none
integer :: num
num = acc_get_num_devices(acc_get_device_type())
if (num.eq.0) then
write(*,'(a17)') "No devices found."
else
write(*,'(a10,i1,a8)') "There are ",num," devices."
end if
end subroutine list_devices
subroutine set_device(dev)
use openacc
implicit none
integer, intent(in) :: dev
integer :: num
num = acc_get_num_devices(acc_get_device_type())
if (num.eq.0) then
write(*,'(a17)') "No devices found."
stop
else if (dev.gt.num) then
write(*,'(a21)') "Invalid device index."
stop
else
call acc_set_device_num(dev, acc_get_device_type())
end if
end subroutine set_device
subroutine alloc(array_size)
implicit none
integer(kind=StreamIntKind) :: array_size
integer :: err
N = array_size
allocate( A(1:N), B(1:N), C(1:N), stat=err)
#ifndef USE_MANAGED
!$acc enter data create(A,B,C)
#endif
if (err .ne. 0) then
write(*,'(a20,i3)') 'allocate returned ',err
stop 1
endif
end subroutine alloc
subroutine dealloc()
implicit none
integer :: err
#ifndef USE_MANAGED
!$acc exit data delete(A,B,C)
#endif
deallocate( A, B, C, stat=err)
if (err .ne. 0) then
write(*,'(a20,i3)') 'deallocate returned ',err
stop 1
endif
end subroutine dealloc
subroutine init_arrays(initA, initB, initC)
implicit none
real(kind=REAL64), intent(in) :: initA, initB, initC
integer(kind=StreamIntKind) :: i
!$acc parallel loop
do i=1,N
A(i) = initA
B(i) = initB
C(i) = initC
end do
end subroutine init_arrays
subroutine read_arrays(h_A, h_B, h_C)
implicit none
real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
integer(kind=StreamIntKind) :: i
!$acc parallel loop
do i=1,N
h_A(i) = A(i)
h_B(i) = B(i)
h_C(i) = C(i)
end do
end subroutine read_arrays
subroutine copy()
implicit none
integer(kind=StreamIntKind) :: i
!$acc parallel loop
do i=1,N
C(i) = A(i)
end do
end subroutine copy
subroutine add()
implicit none
integer(kind=StreamIntKind) :: i
!$acc parallel loop
do i=1,N
C(i) = A(i) + B(i)
end do
end subroutine add
subroutine mul(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
scalar = startScalar
!$acc parallel loop
do i=1,N
B(i) = scalar * C(i)
end do
end subroutine mul
subroutine triad(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
scalar = startScalar
!$acc parallel loop
do i=1,N
A(i) = B(i) + scalar * C(i)
end do
end subroutine triad
subroutine nstream(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
scalar = startScalar
!$acc parallel loop
do i=1,N
A(i) = A(i) + B(i) + scalar * C(i)
end do
end subroutine nstream
function dot() result(s)
implicit none
real(kind=REAL64) :: s
integer(kind=StreamIntKind) :: i
s = real(0,kind=REAL64)
!$acc parallel loop reduction(+:s)
do i=1,N
s = s + A(i) * B(i)
end do
end function dot
end module OpenACCStream

View File

@ -0,0 +1,137 @@
module OpenMPStream
use, intrinsic :: ISO_Fortran_env
use BabelStreamTypes
implicit none
character(len=6), parameter :: implementation_name = "OpenMP"
integer(kind=StreamIntKind) :: N
real(kind=REAL64), allocatable :: A(:), B(:), C(:)
contains
subroutine list_devices()
implicit none
write(*,'(a36,a12)') "Listing devices is not supported by ", implementation_name
end subroutine list_devices
subroutine set_device(dev)
implicit none
integer, intent(in) :: dev
write(*,'(a32,a12)') "Device != 0 is not supported by ", implementation_name
end subroutine set_device
subroutine alloc(array_size)
implicit none
integer(kind=StreamIntKind) :: array_size
integer :: err
N = array_size
allocate( A(1:N), B(1:N), C(1:N), stat=err)
if (err .ne. 0) then
write(*,'(a20,i3)') 'allocate returned ',err
stop 1
endif
end subroutine alloc
subroutine dealloc()
implicit none
integer :: err
deallocate( A, B, C, stat=err)
if (err .ne. 0) then
write(*,'(a20,i3)') 'deallocate returned ',err
stop 1
endif
end subroutine dealloc
subroutine init_arrays(initA, initB, initC)
implicit none
real(kind=REAL64), intent(in) :: initA, initB, initC
integer(kind=StreamIntKind) :: i
!$omp parallel do simd
do i=1,N
A(i) = initA
B(i) = initB
C(i) = initC
end do
end subroutine init_arrays
subroutine read_arrays(h_A, h_B, h_C)
implicit none
real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
integer(kind=StreamIntKind) :: i
!$omp parallel do simd
do i=1,N
h_A(i) = A(i)
h_B(i) = B(i)
h_C(i) = C(i)
end do
end subroutine read_arrays
subroutine copy()
implicit none
integer(kind=StreamIntKind) :: i
!$omp parallel do simd
do i=1,N
C(i) = A(i)
end do
end subroutine copy
subroutine add()
implicit none
integer(kind=StreamIntKind) :: i
!$omp parallel do simd
do i=1,N
C(i) = A(i) + B(i)
end do
end subroutine add
subroutine mul(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
scalar = startScalar
!$omp parallel do simd
do i=1,N
B(i) = scalar * C(i)
end do
end subroutine mul
subroutine triad(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
scalar = startScalar
!$omp parallel do simd
do i=1,N
A(i) = B(i) + scalar * C(i)
end do
end subroutine triad
subroutine nstream(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
scalar = startScalar
!$omp parallel do simd
do i=1,N
A(i) = A(i) + B(i) + scalar * C(i)
end do
end subroutine nstream
function dot() result(s)
implicit none
real(kind=REAL64) :: s
integer(kind=StreamIntKind) :: i
s = real(0,kind=REAL64)
!$omp parallel do simd reduction(+:s)
do i=1,N
s = s + A(i) * B(i)
end do
end function dot
end module OpenMPStream

View File

@ -0,0 +1,162 @@
module OpenMPTargetLoopStream
use, intrinsic :: ISO_Fortran_env
use BabelStreamTypes
implicit none
character(len=16), parameter :: implementation_name = "OpenMPTargetLoop"
integer(kind=StreamIntKind) :: N
real(kind=REAL64), allocatable :: A(:), B(:), C(:)
contains
subroutine list_devices()
use omp_lib
implicit none
integer :: num
num = omp_get_num_devices()
if (num.eq.0) then
write(*,'(a17)') "No devices found."
else
write(*,'(a10,i1,a8)') "There are ",num," devices."
end if
end subroutine list_devices
subroutine set_device(dev)
use omp_lib
implicit none
integer, intent(in) :: dev
integer :: num
num = omp_get_num_devices()
if (num.eq.0) then
write(*,'(a17)') "No devices found."
stop
else if (dev.gt.num) then
write(*,'(a21)') "Invalid device index."
stop
else
call omp_set_default_device(dev)
end if
end subroutine set_device
subroutine alloc(array_size)
implicit none
integer(kind=StreamIntKind) :: array_size
integer :: err
N = array_size
allocate( A(1:N), B(1:N), C(1:N), stat=err)
#ifndef USE_MANAGED
!$omp target enter data map(alloc: A,B,C)
#endif
if (err .ne. 0) then
write(*,'(a20,i3)') 'allocate returned ',err
stop 1
endif
end subroutine alloc
subroutine dealloc()
implicit none
integer :: err
#ifndef USE_MANAGED
!$omp target exit data map(delete: A,B,C)
#endif
deallocate( A, B, C, stat=err)
if (err .ne. 0) then
write(*,'(a20,i3)') 'deallocate returned ',err
stop 1
endif
end subroutine dealloc
subroutine init_arrays(initA, initB, initC)
implicit none
real(kind=REAL64), intent(in) :: initA, initB, initC
integer(kind=StreamIntKind) :: i
!$omp target teams loop
do i=1,N
A(i) = initA
B(i) = initB
C(i) = initC
end do
end subroutine init_arrays
subroutine read_arrays(h_A, h_B, h_C)
implicit none
real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
integer(kind=StreamIntKind) :: i
! this might need to use a copy API instead...
!$omp target teams loop
do i=1,N
h_A(i) = A(i)
h_B(i) = B(i)
h_C(i) = C(i)
end do
end subroutine read_arrays
subroutine copy()
implicit none
integer(kind=StreamIntKind) :: i
!$omp target teams loop
do i=1,N
C(i) = A(i)
end do
end subroutine copy
subroutine add()
implicit none
integer(kind=StreamIntKind) :: i
!$omp target teams loop
do i=1,N
C(i) = A(i) + B(i)
end do
end subroutine add
subroutine mul(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
scalar = startScalar
!$omp target teams loop
do i=1,N
B(i) = scalar * C(i)
end do
end subroutine mul
subroutine triad(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
scalar = startScalar
!$omp target teams loop
do i=1,N
A(i) = B(i) + scalar * C(i)
end do
end subroutine triad
subroutine nstream(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
scalar = startScalar
!$omp target teams loop
do i=1,N
A(i) = A(i) + B(i) + scalar * C(i)
end do
end subroutine nstream
function dot() result(s)
implicit none
real(kind=REAL64) :: s
integer(kind=StreamIntKind) :: i
s = real(0,kind=REAL64)
!$omp target teams loop reduction(+:s)
do i=1,N
s = s + A(i) * B(i)
end do
end function dot
end module OpenMPTargetLoopStream

View File

@ -0,0 +1,163 @@
module OpenMPTargetStream
use, intrinsic :: ISO_Fortran_env
use BabelStreamTypes
implicit none
character(len=12), parameter :: implementation_name = "OpenMPTarget"
integer(kind=StreamIntKind) :: N
real(kind=REAL64), allocatable :: A(:), B(:), C(:)
contains
subroutine list_devices()
use omp_lib
implicit none
integer :: num
num = omp_get_num_devices()
if (num.eq.0) then
write(*,'(a17)') "No devices found."
else
write(*,'(a10,i1,a8)') "There are ",num," devices."
end if
end subroutine list_devices
subroutine set_device(dev)
use omp_lib
implicit none
integer, intent(in) :: dev
integer :: num
num = omp_get_num_devices()
if (num.eq.0) then
write(*,'(a17)') "No devices found."
stop
else if (dev.gt.num) then
write(*,'(a21)') "Invalid device index."
stop
else
call omp_set_default_device(dev)
end if
end subroutine set_device
subroutine alloc(array_size)
implicit none
integer(kind=StreamIntKind) :: array_size
integer :: err
N = array_size
allocate( A(1:N), B(1:N), C(1:N), stat=err)
#ifndef USE_MANAGED
!$omp target enter data map(alloc: A,B,C)
#endif
if (err .ne. 0) then
write(*,'(a20,i3)') 'allocate returned ',err
stop 1
endif
end subroutine alloc
subroutine dealloc()
implicit none
integer :: err
#ifndef USE_MANAGED
!$omp target exit data map(delete: A,B,C)
#endif
deallocate( A, B, C, stat=err)
if (err .ne. 0) then
write(*,'(a20,i3)') 'deallocate returned ',err
stop 1
endif
end subroutine dealloc
subroutine init_arrays(initA, initB, initC)
implicit none
real(kind=REAL64), intent(in) :: initA, initB, initC
integer(kind=StreamIntKind) :: i
!$omp target teams distribute parallel do simd
do i=1,N
A(i) = initA
B(i) = initB
C(i) = initC
end do
end subroutine init_arrays
subroutine read_arrays(h_A, h_B, h_C)
implicit none
real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
integer(kind=StreamIntKind) :: i
! this might need to use a copy API instead...
!$omp target teams distribute parallel do simd
do i=1,N
h_A(i) = A(i)
h_B(i) = B(i)
h_C(i) = C(i)
end do
end subroutine read_arrays
subroutine copy()
implicit none
integer(kind=StreamIntKind) :: i
!$omp target teams distribute parallel do simd
do i=1,N
C(i) = A(i)
end do
!$omp barrier
end subroutine copy
subroutine add()
implicit none
integer(kind=StreamIntKind) :: i
!$omp target teams distribute parallel do simd
do i=1,N
C(i) = A(i) + B(i)
end do
end subroutine add
subroutine mul(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
scalar = startScalar
!$omp target teams distribute parallel do simd
do i=1,N
B(i) = scalar * C(i)
end do
end subroutine mul
subroutine triad(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
scalar = startScalar
!$omp target teams distribute parallel do simd
do i=1,N
A(i) = B(i) + scalar * C(i)
end do
end subroutine triad
subroutine nstream(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
scalar = startScalar
!$omp target teams distribute parallel do simd
do i=1,N
A(i) = A(i) + B(i) + scalar * C(i)
end do
end subroutine nstream
function dot() result(s)
implicit none
real(kind=REAL64) :: s
integer(kind=StreamIntKind) :: i
s = real(0,kind=REAL64)
!$omp target teams distribute parallel do simd reduction(+:s)
do i=1,N
s = s + A(i) * B(i)
end do
end function dot
end module OpenMPTargetStream

View File

@ -0,0 +1,169 @@
module OpenMPTaskloopStream
use, intrinsic :: ISO_Fortran_env
use BabelStreamTypes
implicit none
character(len=14), parameter :: implementation_name = "OpenMPTaskloop"
integer(kind=StreamIntKind) :: N
real(kind=REAL64), allocatable :: A(:), B(:), C(:)
contains
subroutine list_devices()
implicit none
write(*,'(a36,a12)') "Listing devices is not supported by ", implementation_name
end subroutine list_devices
subroutine set_device(dev)
implicit none
integer, intent(in) :: dev
write(*,'(a32,a12)') "Device != 0 is not supported by ", implementation_name
end subroutine set_device
subroutine alloc(array_size)
implicit none
integer(kind=StreamIntKind) :: array_size
integer :: err
N = array_size
allocate( A(1:N), B(1:N), C(1:N), stat=err)
if (err .ne. 0) then
write(*,'(a20,i3)') 'allocate returned ',err
stop 1
endif
end subroutine alloc
subroutine dealloc()
implicit none
integer :: err
deallocate( A, B, C, stat=err)
if (err .ne. 0) then
write(*,'(a20,i3)') 'deallocate returned ',err
stop 1
endif
end subroutine dealloc
subroutine init_arrays(initA, initB, initC)
implicit none
real(kind=REAL64), intent(in) :: initA, initB, initC
integer(kind=StreamIntKind) :: i
!$omp parallel
!$omp master
!$omp taskloop
do i=1,N
A(i) = initA
B(i) = initB
C(i) = initC
end do
!$omp end master
!$omp end parallel
end subroutine init_arrays
subroutine read_arrays(h_A, h_B, h_C)
implicit none
real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
integer(kind=StreamIntKind) :: i
!$omp parallel
!$omp master
!$omp taskloop
do i=1,N
h_A(i) = A(i)
h_B(i) = B(i)
h_C(i) = C(i)
end do
!$omp end master
!$omp end parallel
end subroutine read_arrays
subroutine copy()
implicit none
integer(kind=StreamIntKind) :: i
!$omp parallel
!$omp master
!$omp taskloop
do i=1,N
C(i) = A(i)
end do
!$omp end master
!$omp end parallel
end subroutine copy
subroutine add()
implicit none
integer(kind=StreamIntKind) :: i
!$omp parallel
!$omp master
!$omp taskloop
do i=1,N
C(i) = A(i) + B(i)
end do
!$omp end master
!$omp end parallel
end subroutine add
subroutine mul(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
scalar = startScalar
!$omp parallel
!$omp master
!$omp taskloop
do i=1,N
B(i) = scalar * C(i)
end do
!$omp end master
!$omp end parallel
end subroutine mul
subroutine triad(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
scalar = startScalar
!$omp parallel
!$omp master
!$omp taskloop
do i=1,N
A(i) = B(i) + scalar * C(i)
end do
!$omp end master
!$omp end parallel
end subroutine triad
subroutine nstream(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
scalar = startScalar
!$omp parallel
!$omp master
!$omp taskloop
do i=1,N
A(i) = A(i) + B(i) + scalar * C(i)
end do
!$omp end master
!$omp end parallel
end subroutine nstream
function dot() result(s)
implicit none
real(kind=REAL64) :: s
integer(kind=StreamIntKind) :: i
s = real(0,kind=REAL64)
!$omp parallel
!$omp master
!$omp taskloop reduction(+:s)
do i=1,N
s = s + A(i) * B(i)
end do
!$omp end master
!$omp end parallel
end function dot
end module OpenMPTaskloopStream

View File

@ -0,0 +1,120 @@
module OpenMPWorkshareStream
use, intrinsic :: ISO_Fortran_env
use BabelStreamTypes
implicit none
character(len=15), parameter :: implementation_name = "OpenMPWorkshare"
integer(kind=StreamIntKind) :: N
real(kind=REAL64), allocatable :: A(:), B(:), C(:)
contains
subroutine list_devices()
implicit none
write(*,'(a36,a12)') "Listing devices is not supported by ", implementation_name
end subroutine list_devices
subroutine set_device(dev)
implicit none
integer, intent(in) :: dev
write(*,'(a32,a12)') "Device != 0 is not supported by ", implementation_name
end subroutine set_device
subroutine alloc(array_size)
implicit none
integer(kind=StreamIntKind) :: array_size
integer :: err
N = array_size
allocate( A(1:N), B(1:N), C(1:N), stat=err)
if (err .ne. 0) then
write(*,'(a20,i3)') 'allocate returned ',err
stop 1
endif
end subroutine alloc
subroutine dealloc()
implicit none
integer :: err
deallocate( A, B, C, stat=err)
if (err .ne. 0) then
write(*,'(a20,i3)') 'deallocate returned ',err
stop 1
endif
end subroutine dealloc
subroutine init_arrays(initA, initB, initC)
implicit none
real(kind=REAL64), intent(in) :: initA, initB, initC
!$omp parallel workshare
A = initA
B = initB
C = initC
!$omp end parallel workshare
end subroutine init_arrays
subroutine read_arrays(h_A, h_B, h_C)
implicit none
real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
!$omp parallel workshare
h_A = A
h_B = B
h_C = C
!$omp end parallel workshare
end subroutine read_arrays
subroutine copy()
implicit none
!$omp parallel workshare
C = A
!$omp end parallel workshare
end subroutine copy
subroutine add()
implicit none
!$omp parallel workshare
C = A + B
!$omp end parallel workshare
end subroutine add
subroutine mul(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
scalar = startScalar
!$omp parallel workshare
B = scalar * C
!$omp end parallel workshare
end subroutine mul
subroutine triad(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
scalar = startScalar
!$omp parallel workshare
A = B + scalar * C
!$omp end parallel workshare
end subroutine triad
subroutine nstream(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
scalar = startScalar
!$omp parallel workshare
A = A + B + scalar * C
!$omp end parallel workshare
end subroutine nstream
function dot() result(s)
implicit none
real(kind=REAL64) :: s
!$omp parallel workshare
s = dot_product(A,B)
!$omp end parallel workshare
end function dot
end module OpenMPWorkshareStream

View File

@ -0,0 +1,130 @@
module SequentialStream
use, intrinsic :: ISO_Fortran_env
use BabelStreamTypes
implicit none
character(len=10), parameter :: implementation_name = "Sequential"
integer(kind=StreamIntKind) :: N
real(kind=REAL64), allocatable :: A(:), B(:), C(:)
contains
subroutine list_devices()
implicit none
integer :: num
write(*,'(a36,a10)') "Listing devices is not supported by ", implementation_name
end subroutine list_devices
subroutine set_device(dev)
implicit none
integer, intent(in) :: dev
write(*,'(a32,a10)') "Device != 0 is not supported by ", implementation_name
end subroutine set_device
subroutine alloc(array_size)
implicit none
integer(kind=StreamIntKind) :: array_size
integer :: err
N = array_size
allocate( A(1:N), B(1:N), C(1:N), stat=err)
if (err .ne. 0) then
write(*,'(a20,i3)') 'allocate returned ',err
stop 1
endif
end subroutine alloc
subroutine dealloc()
implicit none
integer :: err
deallocate( A, B, C, stat=err)
if (err .ne. 0) then
write(*,'(a20,i3)') 'deallocate returned ',err
stop 1
endif
end subroutine dealloc
subroutine init_arrays(initA, initB, initC)
implicit none
real(kind=REAL64), intent(in) :: initA, initB, initC
integer(kind=StreamIntKind) :: i
do i=1,N
A(i) = initA
B(i) = initB
C(i) = initC
end do
end subroutine init_arrays
subroutine read_arrays(h_A, h_B, h_C)
implicit none
real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
integer(kind=StreamIntKind) :: i
do i=1,N
h_A(i) = A(i)
h_B(i) = B(i)
h_C(i) = C(i)
end do
end subroutine read_arrays
subroutine copy()
implicit none
integer(kind=StreamIntKind) :: i
do i=1,N
C(i) = A(i)
end do
end subroutine copy
subroutine add()
implicit none
integer(kind=StreamIntKind) :: i
do i=1,N
C(i) = A(i) + B(i)
end do
end subroutine add
subroutine mul(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
scalar = startScalar
do i=1,N
B(i) = scalar * C(i)
end do
end subroutine mul
subroutine triad(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
scalar = startScalar
do i=1,N
A(i) = B(i) + scalar * C(i)
end do
end subroutine triad
subroutine nstream(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
scalar = startScalar
do i=1,N
A(i) = A(i) + B(i) + scalar * C(i)
end do
end subroutine nstream
function dot() result(s)
implicit none
real(kind=REAL64) :: s
integer(kind=StreamIntKind) :: i
s = real(0,kind=REAL64)
do i=1,N
s = s + A(i) * B(i)
end do
end function dot
end module SequentialStream

54
src/fortran/build.sh Executable file
View File

@ -0,0 +1,54 @@
#!/bin/bash
# uncomment to disable GPU targets
#HAS_GPU=0
# Orin
#if [ "x${compiler}" == "xgcc" ] ; then
# export MCPU=cortex-a78ae
#fi
#if [ "x${compiler}" == "xarm" ] ; then
# export MCPU=cortex-a78
#fi
COMPILERS="gcc"
if [ $(which nvfortran) ] ; then
COMPILERS="${COMPILERS} nvhpc"
fi
if [ $(which crayftn) ] ; then
COMPILERS="${COMPILERS} cray"
fi
if [ $(uname -m) == "aarch64" ] ; then
if [ $(which armflang) ] ; then
COMPILERS="${COMPILERS} arm"
fi
if [ $(which frt) ] ; then
COMPILERS="${COMPILERS} fj"
fi
elif [ $(uname -m) == "x86_64" ] ; then
if [ $(which lscpu >& /dev/null && lscpu | grep GenuineIntel | awk '{print $3}') == "GenuineIntel" ] ; then
COMPILERS="${COMPILERS} oneapi"
if [ -f /opt/intel/oneapi/setvars.sh ] ; then
. /opt/intel/oneapi/setvars.sh >& /dev/null
fi
else
# ^ this detection can be improved
COMPILERS="${COMPILERS} amd"
fi
fi
for compiler in ${COMPILERS} ; do
TARGETS="DoConcurrent Array OpenMP OpenMPTaskloop OpenMPWorkshare"
if [ "${HAS_GPU}" != "0" ] ; then
TARGETS="${TARGETS} OpenMPTarget OpenMPTargetLoop"
if [ "x${compiler}" == "xnvhpc" ] ; then
TARGETS="${TARGETS} CUDA CUDAKernel"
fi
fi
if [ "x${compiler}" == "xnvhpc" ] || [ "x${compiler}" == "xgcc" ] || [ "x${compiler}" == "xcray" ] ; then
TARGETS="${TARGETS} OpenACC OpenACCArray"
fi
for implementation in ${TARGETS} ; do
make COMPILER=${compiler} IMPLEMENTATION=${implementation}
done
done

666
src/fortran/main.F90 Normal file
View File

@ -0,0 +1,666 @@
module BabelStreamUtil
use, intrinsic :: ISO_Fortran_env, only: REAL64,INT64
use BabelStreamTypes
implicit none
integer(kind=StreamIntKind) :: array_size = 33554432
integer(kind=StreamIntKind) :: num_times = 100
logical :: mibibytes = .false.
logical :: use_gigs = .false.
logical :: csv = .false.
character(len=1), parameter :: csv_sep = ","
! 1 = All
! 2 = Triad
! 3 = Nstream
integer :: selection = 1
real(kind=REAL64), parameter :: startA = real(0.1d0,kind=REAL64)
real(kind=REAL64), parameter :: startB = real(0.2d0,kind=REAL64)
real(kind=REAL64), parameter :: startC = real(0.0d0,kind=REAL64)
real(kind=REAL64), parameter :: startScalar = real(0.4d0,kind=REAL64)
contains
function get_wtime() result(t)
#if defined(USE_OMP_GET_WTIME)
use omp_lib
implicit none
real(kind=REAL64) :: t
t = omp_get_wtime()
#elif defined(USE_CPU_TIME)
implicit none
real(kind=REAL64) :: t
real :: r
call cpu_time(r)
t = r
#else
implicit none
real(kind=REAL64) :: t
integer(kind=INT64) :: c, r
call system_clock(count = c, count_rate = r)
t = real(c,REAL64) / real(r,REAL64)
#endif
end function get_wtime
subroutine parseArguments()
use, intrinsic :: ISO_Fortran_env, only: compiler_version, compiler_options
#if defined(USE_DOCONCURRENT)
use DoConcurrentStream, only: list_devices, set_device
#elif defined(USE_ARRAY)
use ArrayStream, only: list_devices, set_device
#elif defined(USE_OPENMP)
use OpenMPStream, only: list_devices, set_device
#elif defined(USE_OPENMPWORKSHARE)
use OpenMPWorkshareStream, only: list_devices, set_device
#elif defined(USE_OPENMPTARGET)
use OpenMPTargetStream, only: list_devices, set_device
#elif defined(USE_OPENMPTARGETLOOP)
use OpenMPTargetLoopStream, only: list_devices, set_device
#elif defined(USE_OPENMPTASKLOOP)
use OpenMPTaskloopStream, only: list_devices, set_device
#elif defined(USE_OPENACC)
use OpenACCStream, only: list_devices, set_device
#elif defined(USE_OPENACCARRAY)
use OpenACCArrayStream, only: list_devices, set_device
#elif defined(USE_CUDA)
use CUDAStream, only: list_devices, set_device
#elif defined(USE_CUDAKERNEL)
use CUDAKernelStream, only: list_devices, set_device
#elif defined(USE_SEQUENTIAL)
use SequentialStream, only: list_devices, set_device
#endif
implicit none
integer :: i, argc
integer :: arglen,err,pos(2)
character(len=64) :: argtmp
argc = command_argument_count()
do i=1,argc
call get_command_argument(i,argtmp,arglen,err)
if (err.eq.0) then
!
! list devices
!
pos(1) = index(argtmp,"--list")
if (pos(1).eq.1) then
call list_devices()
stop
endif
!
! set device number
!
pos(1) = index(argtmp,"--device")
if (pos(1).eq.1) then
if (i+1.gt.argc) then
print*,'You failed to provide a value for ',argtmp
stop
else
call get_command_argument(i+1,argtmp,arglen,err)
block
integer :: dev
read(argtmp,'(i15)') dev
call set_device(dev)
end block
endif
cycle
endif
!
! array size
!
pos(1) = index(argtmp,"--arraysize")
pos(2) = index(argtmp,"-s")
if (any(pos(:).eq.1) ) then
if (i+1.gt.argc) then
print*,'You failed to provide a value for ',argtmp
else
call get_command_argument(i+1,argtmp,arglen,err)
block
integer(kind=INT64) :: big_size
read(argtmp,'(i15)') big_size
if (big_size .gt. HUGE(array_size)) then
print*,'Array size does not fit into integer:'
print*,big_size,'>',HUGE(array_size)
print*,'Stop using USE_INT32'
stop
else
array_size = INT(big_size,kind=StreamIntKind)
endif
end block
endif
cycle
endif
!
! number of iterations
!
pos(1) = index(argtmp,"--numtimes")
pos(2) = index(argtmp,"-n")
if (any(pos(:).eq.1) ) then
if (i+1.gt.argc) then
print*,'You failed to provide a value for ',argtmp
else
call get_command_argument(i+1,argtmp,arglen,err)
read(argtmp,'(i15)') num_times
if (num_times.lt.2) then
write(*,'(a)') "Number of times must be 2 or more"
stop
end if
endif
cycle
endif
!
! precision
!
pos(1) = index(argtmp,"--float")
if (pos(1).eq.1) then
write(*,'(a46,a39)') "Sorry, you have to recompile with -DUSE_FLOAT ", &
"to run BabelStream in single precision."
stop
endif
!
! selection (All, Triad, Nstream)
!
pos(1) = index(argtmp,"--triad-only")
if (pos(1).eq.1) then
selection = 2
cycle
endif
pos(1) = index(argtmp,"--nstream-only")
if (pos(1).eq.1) then
selection = 3
cycle
endif
!
! CSV
!
pos(1) = index(argtmp,"--csv")
if (pos(1).eq.1) then
csv = .true.
!write(*,'(a39)') "Sorry, CSV support isn't available yet."
!stop
endif
!
! units
!
pos(1) = index(argtmp,"--mibibytes")
if (pos(1).eq.1) then
mibibytes = .true.
cycle
endif
!
! giga/gibi instead of mega/mebi
!
pos(1) = index(argtmp,"--gigs")
if (pos(1).eq.1) then
use_gigs = .true.
cycle
endif
!
!
!
pos(1) = index(argtmp,"--compiler-info")
if (pos(1).eq.1) then
write(*,'(a)') 'Compiler version: ',compiler_version()
write(*,'(a)') 'Compiler options: ',compiler_options()
stop
endif
!
! help
!
pos(1) = index(argtmp,"--help")
pos(2) = index(argtmp,"-h")
if (any(pos(:).eq.1) ) then
call get_command_argument(0,argtmp,arglen,err)
write(*,'(a7,a,a10)') "Usage: ", trim(argtmp), " [OPTIONS]"
write(*,'(a)') "Options:"
write(*,'(a)') " -h --help Print the message"
write(*,'(a)') " --list List available devices"
write(*,'(a)') " --device INDEX Select device at INDEX"
write(*,'(a)') " -s --arraysize SIZE Use SIZE elements in the array"
write(*,'(a)') " -n --numtimes NUM Run the test NUM times (NUM >= 2)"
!write(*,'(a)') " --float Use floats (rather than doubles)"
write(*,'(a)') " --triad-only Only run triad"
write(*,'(a)') " --nstream-only Only run nstream"
write(*,'(a)') " --csv Output as csv table"
write(*,'(a)') " --mibibytes Use MiB=2^20 for bandwidth calculation (default MB=10^6)"
write(*,'(a)') " --gigs Use GiB=2^30 or GB=10^9 instead of MiB/MB"
write(*,'(a)') " --compiler-info Print information about compiler and flags, then exit."
stop
endif
end if
end do
end subroutine parseArguments
subroutine run_all(timings, summ)
#if defined(USE_DOCONCURRENT)
use DoConcurrentStream
#elif defined(USE_ARRAY)
use ArrayStream
#elif defined(USE_OPENMP)
use OpenMPStream
#elif defined(USE_OPENMPWORKSHARE)
use OpenMPWorkshareStream
#elif defined(USE_OPENMPTARGET)
use OpenMPTargetStream
#elif defined(USE_OPENMPTARGETLOOP)
use OpenMPTargetLoopStream
#elif defined(USE_OPENMPTASKLOOP)
use OpenMPTaskloopStream
#elif defined(USE_OPENACC)
use OpenACCStream
#elif defined(USE_OPENACCARRAY)
use OpenACCArrayStream
#elif defined(USE_CUDA)
use CUDAStream
#elif defined(USE_CUDAKERNEL)
use CUDAKernelStream
#elif defined(USE_SEQUENTIAL)
use SequentialStream
#endif
implicit none
real(kind=REAL64), intent(inout) :: timings(:,:)
real(kind=REAL64), intent(out) :: summ
real(kind=REAL64) :: t1, t2
integer(kind=StreamIntKind) :: i
do i=1,num_times
t1 = get_wtime()
call copy()
t2 = get_wtime()
timings(1,i) = t2-t1
t1 = get_wtime()
call mul(startScalar)
t2 = get_wtime()
timings(2,i) = t2-t1
t1 = get_wtime()
call add()
t2 = get_wtime()
timings(3,i) = t2-t1
t1 = get_wtime()
call triad(startScalar)
t2 = get_wtime()
timings(4,i) = t2-t1
t1 = get_wtime()
summ = dot()
t2 = get_wtime()
timings(5,i) = t2-t1
end do
end subroutine run_all
subroutine run_triad(timings)
#if defined(USE_DOCONCURRENT)
use DoConcurrentStream
#elif defined(USE_ARRAY)
use ArrayStream
#elif defined(USE_OPENMP)
use OpenMPStream
#elif defined(USE_OPENMPWORKSHARE)
use OpenMPWorkshareStream
#elif defined(USE_OPENMPTARGET)
use OpenMPTargetStream
#elif defined(USE_OPENMPTARGETLOOP)
use OpenMPTargetLoopStream
#elif defined(USE_OPENMPTASKLOOP)
use OpenMPTaskloopStream
#elif defined(USE_OPENACC)
use OpenACCStream
#elif defined(USE_OPENACCARRAY)
use OpenACCArrayStream
#elif defined(USE_CUDA)
use CUDAStream
#elif defined(USE_CUDAKERNEL)
use CUDAKernelStream
#elif defined(USE_SEQUENTIAL)
use SequentialStream
#endif
implicit none
real(kind=REAL64), intent(inout) :: timings(:,:)
real(kind=REAL64) :: t1, t2
integer(kind=StreamIntKind) :: i
do i=1,num_times
t1 = get_wtime()
call triad(startScalar)
t2 = get_wtime()
timings(1,i) = t2-t1
end do
end subroutine run_triad
subroutine run_nstream(timings)
#if defined(USE_DOCONCURRENT)
use DoConcurrentStream
#elif defined(USE_ARRAY)
use ArrayStream
#elif defined(USE_OPENMP)
use OpenMPStream
#elif defined(USE_OPENMPWORKSHARE)
use OpenMPWorkshareStream
#elif defined(USE_OPENMPTARGET)
use OpenMPTargetStream
#elif defined(USE_OPENMPTARGETLOOP)
use OpenMPTargetLoopStream
#elif defined(USE_OPENMPTASKLOOP)
use OpenMPTaskloopStream
#elif defined(USE_OPENACC)
use OpenACCStream
#elif defined(USE_OPENACCARRAY)
use OpenACCArrayStream
#elif defined(USE_CUDA)
use CUDAStream
#elif defined(USE_CUDAKERNEL)
use CUDAKernelStream
#elif defined(USE_SEQUENTIAL)
use SequentialStream
#endif
implicit none
real(kind=REAL64), intent(inout) :: timings(:,:)
real(kind=REAL64) :: t1, t2
integer(kind=StreamIntKind) :: i
do i=1,num_times
t1 = get_wtime()
call nstream(startScalar)
t2 = get_wtime()
timings(1,i) = t2-t1
end do
end subroutine run_nstream
subroutine check_solution(A, B, C, summ)
use, intrinsic :: IEEE_Arithmetic, only: IEEE_Is_Normal
implicit none
real(kind=REAL64), intent(in) :: A(:), B(:), C(:)
real(kind=REAL64), intent(in) :: summ
integer(kind=StreamIntKind) :: i
real(kind=REAL64) :: goldA, goldB, goldC, goldSum
real(kind=REAL64) :: scalar
! always use double because of accumulation error
real(kind=REAL64) :: errA, errB, errC, errSum, epsi
logical :: cleanA, cleanB, cleanC, cleanSum
goldA = startA
goldB = startB
goldC = startC
goldSum = 0.0d0
scalar = startScalar
do i=1,num_times
if (selection.eq.1) then
goldC = goldA
goldB = scalar * goldC
goldC = goldA + goldB
goldA = goldB + scalar * goldC
else if (selection.eq.2) then
goldA = goldB + scalar * goldC
else if (selection.eq.3) then
goldA = goldA + goldB + scalar * goldC;
endif
end do
goldSum = goldA * goldB * array_size
cleanA = ALL(IEEE_Is_Normal(A))
cleanB = ALL(IEEE_Is_Normal(B))
cleanC = ALL(IEEE_Is_Normal(C))
cleanSum = IEEE_Is_Normal(summ)
if (.not. cleanA) then
write(*,'(a51)') "Validation failed on A. Contains NaA/Inf/Subnormal."
end if
if (.not. cleanB) then
write(*,'(a51)') "Validation failed on B. Contains NaA/Inf/Subnormal."
end if
if (.not. cleanC) then
write(*,'(a51)') "Validation failed on C. Contains NaA/Inf/Subnormal."
end if
if (.not. cleanSum) then
write(*,'(a54,e20.12)') "Validation failed on Sum. Contains NaA/Inf/Subnormal: ",summ
end if
errA = SUM( ABS( A - goldA ) ) / array_size
errB = SUM( ABS( B - goldB ) ) / array_size
errC = SUM( ABS( C - goldC ) ) / array_size
errSum = ABS( (summ - goldSum) / goldSum)
epsi = epsilon(real(0,kind=StreamRealKind)) * 100.0d0
if (errA .gt. epsi) then
write(*,'(a38,e20.12)') "Validation failed on A. Average error ", errA
end if
if (errB .gt. epsi) then
write(*,'(a38,e20.12)') "Validation failed on B. Average error ", errB
end if
if (errC .gt. epsi) then
write(*,'(a38,e20.12)') "Validation failed on C. Average error ", errC
end if
if (selection.eq.1) then
if (errSum .gt. 1.0e-8) then
write(*,'(a38,e20.12)') "Validation failed on Sum. Error ", errSum
write(*,'(a8,e20.12,a15,e20.12)') "Sum was ",summ, " but should be ", errSum
end if
endif
end subroutine check_solution
end module BabelStreamUtil
program BabelStream
use BabelStreamUtil
#if defined(USE_DOCONCURRENT)
use DoConcurrentStream
#elif defined(USE_ARRAY)
use ArrayStream
#elif defined(USE_OPENMP)
use OpenMPStream
#elif defined(USE_OPENMPWORKSHARE)
use OpenMPWorkshareStream
#elif defined(USE_OPENMPTARGET)
use OpenMPTargetStream
#elif defined(USE_OPENMPTARGETLOOP)
use OpenMPTargetLoopStream
#elif defined(USE_OPENMPTASKLOOP)
use OpenMPTaskloopStream
#elif defined(USE_OPENACC)
use OpenACCStream
#elif defined(USE_OPENACCARRAY)
use OpenACCArrayStream
#elif defined(USE_CUDA)
use CUDAStream
#elif defined(USE_CUDAKERNEL)
use CUDAKernelStream
#elif defined(USE_SEQUENTIAL)
use SequentialStream
#endif
implicit none
integer :: element_size, err
real(kind=REAL64) :: scaling
character(len=3) :: label
real(kind=REAL64), allocatable :: timings(:,:)
real(kind=REAL64), allocatable :: h_A(:), h_B(:), h_C(:)
real(kind=REAL64) :: summ
call parseArguments()
element_size = storage_size(real(0,kind=StreamRealKind)) / 8
if (mibibytes) then
if (use_gigs) then
scaling = 2.0d0**(-30)
label = "GiB"
else
scaling = 2.0d0**(-20)
label = "MiB"
endif
else
if (use_gigs) then
scaling = 1.0d-9
label = "GB"
else
scaling = 1.0d-6
label = "MB"
endif
endif
if (.not.csv) then
write(*,'(a)') "BabelStream Fortran"
write(*,'(a9,f4.1)') "Version: ", VERSION_STRING
write(*,'(a16,a)') "Implementation: ", implementation_name
block
character(len=32) :: printout
write(printout,'(i9,1x,a5)') num_times,'times'
write(*,'(a16,a)') 'Running kernels ',ADJUSTL(printout)
end block
write(*,'(a11,a6)') 'Precision: ',ADJUSTL(StreamRealName)
write(*,'(a12,f9.1,a3)') 'Array size: ',1.0d0 * element_size * (array_size * scaling), label
write(*,'(a12,f9.1,a3)') 'Total size: ',3.0d0 * element_size * (array_size * scaling), label
endif ! csv
allocate( timings(5,num_times) )
call alloc(array_size)
call init_arrays(startA, startB, startC)
summ = 0.0d0
timings = -1.0d0
if (selection.eq.1) then
call run_all(timings, summ)
else if (selection.eq.2) then
call run_triad(timings)
else if (selection.eq.3) then
call run_nstream(timings)
endif
allocate( h_A(1:array_size), h_B(1:array_size), h_C(1:array_size), stat=err)
if (err .ne. 0) then
write(*,'(a20,i3)') 'allocate returned ',err
stop 1
endif
call read_arrays(h_A, h_B, h_C)
call check_solution(h_A, h_B, h_C, summ)
block
character(len=20) :: printout(8)
real(kind=REAL64) :: tmin,tmax,tavg,nbytes
if (csv) then
write(*,'(a,a1)',advance='no') 'function', csv_sep
write(*,'(a,a1)',advance='no') 'num_times', csv_sep
write(*,'(a,a1)',advance='no') 'n_elements',csv_sep
write(*,'(a,a1)',advance='no') 'sizeof', csv_sep
if (mibibytes) then
write(*,'(a,a1)',advance='no') 'max_mibytes_per_sec',csv_sep
else
write(*,'(a,a1)',advance='no') 'max_mbytes_per_sec', csv_sep
endif
write(*,'(a,a1)',advance='no') 'min_runtime',csv_sep
write(*,'(a,a1)',advance='no') 'max_runtime',csv_sep
write(*,'(a,a1)',advance='yes') 'avg_runtime'
else
write(printout(1),'(a8)') 'Function'
write(printout(2),'(a3,a8)') TRIM(label),'ytes/sec'
write(printout(3),'(a9)') 'Min (sec)'
write(printout(4),'(a3)') 'Max'
write(printout(5),'(a7)') 'Average'
write(*,'(5a12)') ADJUSTL(printout(1:5))
endif ! csv
if (selection.eq.1) then
block
integer, parameter :: sizes(5) = [2,2,3,3,2]
character(len=5), parameter :: labels(5) = ["Copy ", "Mul ", "Add ", "Triad", "Dot "]
integer :: i
do i=1,5
tmin = MINVAL(timings(i,2:num_times))
tmax = MAXVAL(timings(i,2:num_times))
tavg = SUM(timings(i,2:num_times)) / (num_times-1)
nbytes = element_size * REAL(array_size,kind=REAL64) * sizes(i)
write(printout(1),'(a)') labels(i)
if (csv) then
write(printout(2),'(i20)') num_times
write(printout(3),'(i20)') array_size
write(printout(4),'(i20)') element_size
write(printout(5),'(i20)') INT(scaling*nbytes/tmin)
write(printout(6),'(f20.8)') tmin
write(printout(7),'(f20.8)') tmax
write(printout(8),'(f20.8)') tavg
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(1))),csv_sep
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(2))),csv_sep
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(3))),csv_sep
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(4))),csv_sep
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(5))),csv_sep
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(6))),csv_sep
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(7))),csv_sep
write(*,'(a,a1)',advance='yes') TRIM(ADJUSTL(printout(8)))
else
write(printout(2),'(f12.3)') scaling*nbytes/tmin
write(printout(3),'(f12.5)') tmin
write(printout(4),'(f12.5)') tmax
write(printout(5),'(f12.5)') tavg
write(*,'(5a12)') ADJUSTL(printout(1:5))
endif
enddo
end block
else if ((selection.eq.2).or.(selection.eq.3)) then
tmin = MINVAL(timings(1,2:num_times))
tmax = MAXVAL(timings(1,2:num_times))
tavg = SUM(timings(1,2:num_times)) / (num_times-1)
if (selection.eq.2) then
nbytes = element_size * REAL(array_size,kind=REAL64) * 3
write(printout(1),'(a12)') "Triad"
else if (selection.eq.3) then
nbytes = element_size * REAL(array_size,kind=REAL64) * 4
write(printout(1),'(a12)') "Nstream"
endif
if (csv) then
write(printout(2),'(i20)') num_times
write(printout(3),'(i20)') array_size
write(printout(4),'(i20)') element_size
write(printout(5),'(i20)') INT(scaling*nbytes/tmin)
write(printout(6),'(f20.8)') tmin
write(printout(7),'(f20.8)') tmax
write(printout(8),'(f20.8)') tavg
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(1))),csv_sep
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(2))),csv_sep
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(3))),csv_sep
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(4))),csv_sep
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(5))),csv_sep
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(6))),csv_sep
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(7))),csv_sep
write(*,'(a,a1)',advance='yes') TRIM(ADJUSTL(printout(8)))
else
write(printout(2),'(f12.3)') scaling*nbytes/tmin
write(printout(3),'(f12.5)') tmin
write(printout(4),'(f12.5)') tmax
write(printout(5),'(f12.5)') tavg
write(*,'(5a12)') ADJUSTL(printout(1:5))
endif
endif
end block
call dealloc()
end program BabelStream

25
src/fortran/make.inc.amd Normal file
View File

@ -0,0 +1,25 @@
FC := /opt/rocm/llvm/bin/flang
FC := /global/u1/j/jhammond/AMD/aocc-compiler-3.2.0/bin/flang
FCFLAGS := -std=f2018 -O3
FCFLAGS += -Wall -Wno-unused-variable
ifdef MARCH
FCFLAGS += -march=$(MARCH)
else
FCFLAGS += -march=native
endif
DOCONCURRENT_FLAG = -fopenmp # libomp.so required
ARRAY_FLAG = -fopenmp # libomp.so required
OPENMP_FLAG = -fopenmp
#OPENMP_FLAG += -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx908
OPENACC_FLAG = -fopenacc
CUDA_FLAG =
SEQUENTIAL_FLAG =
ifeq ($(IMPLEMENTATION),CUDA)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif
ifeq ($(IMPLEMENTATION),CUDAKernels)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif

39
src/fortran/make.inc.arm Normal file
View File

@ -0,0 +1,39 @@
FC = armflang
FCFLAGS = -std=f2018 -O3
FCFLAGS += -Wall -Wno-unused-variable
# MARCH=neoverse-v1,neoverse-n1,icelake-server,znver3,cortex-a78
ARCH=$(shell uname -m)
ifeq ($(ARCH),aarch64)
ifdef MCPU
FCFLAGS += -mcpu=$(MCPU)
else
FCFLAGS += -mcpu=native
endif
else
ifdef MARCH
FCFLAGS += -march=$(MARCH)
else
FCFLAGS += -march=native
endif
endif
DOCONCURRENT_FLAG = -fopenmp
ARRAY_FLAG = -fopenmp
OPENMP_FLAG = -fopenmp
OPENACC_FLAG = -fopenacc
CUDA_FLAG =
SEQUENTIAL_FLAG =
ifeq ($(IMPLEMENTATION),OpenACC)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif
ifeq ($(IMPLEMENTATION),OpenACCArray)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif
ifeq ($(IMPLEMENTATION),CUDA)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif
ifeq ($(IMPLEMENTATION),CUDAKernels)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif

18
src/fortran/make.inc.cray Normal file
View File

@ -0,0 +1,18 @@
FC := ftn
FCFLAGS = -e F -O3
DOCONCURRENT_FLAG = -h thread_do_concurrent -DCRAY_THREAD_DOCONCURRENT
ARRAY_FLAG = -h autothread
OPENMP_FLAG = -h omp
OPENACC_FLAG = -h acc
# CPU only
OPENACC_FLAG += -h omp
CUDA_FLAG =
SEQUENTIAL_FLAG =
ifeq ($(IMPLEMENTATION),CUDA)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif
ifeq ($(IMPLEMENTATION),CUDAKernels)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif

21
src/fortran/make.inc.fj Normal file
View File

@ -0,0 +1,21 @@
FC := frt
FCFLAGS = -X08 -Kfast -KA64FX -KSVE -KARMV8_3_A -Kzfill=100 -Kprefetch_sequential=soft -Kprefetch_line=8 -Kprefetch_line_L2=16 -Koptmsg=2 -Keval -DUSE_OMP_GET_WTIME=1 # FJ Fortran system_clock is low resolution
DOCONCURRENT_FLAG = -Kparallel,reduction -DNOTSHARED
ARRAY_FLAG = -Kparallel,reduction
OPENMP_FLAG = -fopenmp
OPENACC_FLAG =
# CPU only
OPENACC_FLAG +=
CUDA_FLAG =
SEQUENTIAL_FLAG =
ifeq ($(IMPLEMENTATION),OPENACC)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif
ifeq ($(IMPLEMENTATION),CUDA)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif
ifeq ($(IMPLEMENTATION),CUDAKernels)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif

33
src/fortran/make.inc.gcc Normal file
View File

@ -0,0 +1,33 @@
FC = gfortran
FCFLAGS = -std=f2018 -O3
FCFLAGS += -Wall -Wno-unused-dummy-argument -Wno-unused-variable
# MARCH=neoverse-v1,neoverse-n1,icelake-server,znver3,cortex-a78ae
ARCH=$(shell uname -m)
ifeq ($(ARCH),aarch64)
ifdef MCPU
FCFLAGS += -mcpu=$(MCPU)
else
FCFLAGS += -mcpu=native
endif
else
ifdef MARCH
FCFLAGS += -march=$(MARCH)
else
FCFLAGS += -march=native
endif
endif
DOCONCURRENT_FLAG = -ftree-parallelize-loops=4
ARRAY_FLAG =
OPENMP_FLAG = -fopenmp
OPENACC_FLAG = -fopenacc
CUDA_FLAG =
SEQUENTIAL_FLAG =
ifeq ($(IMPLEMENTATION),CUDA)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif
ifeq ($(IMPLEMENTATION),CUDAKernels)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif

View File

@ -0,0 +1,70 @@
FC := nvfortran
#FCFLAGS := -O3 -Minform=inform -Minfo=all
FCFLAGS := -O3 -Minform=warn
#TARGET=gpu
TARGET=multicore
NVARCH=$(shell which nvidia-smi > /dev/null && nvidia-smi -q | grep "Product Architecture")
ifeq ($(findstring Ampere,$(NVARCH)),Ampere)
$(info Ampere detected)
GPU = cc80
endif
ifeq ($(findstring Turing,$(NVARCH)),Turing)
$(info Turing detected)
GPU = cc75
endif
ifeq ($(findstring Volta,$(NVARCH)),Volta)
$(info Volta detected)
GPU = cc70
endif
ifeq ($(findstring Pascal,$(NVARCH)),Pascal)
$(info Pascal detected)
GPU = cc60,cc61
endif
ifeq ($(shell which jetson_clocks > /dev/null && echo 1),1)
$(info Jetson AGX Orin detected)
GPU = ccn87,cc86
# figure out Xavier later
#GPU = cc72
endif
ifeq ($(GPU),)
$(error Your GPU architecture could not be detected. Set it manually.)
endif
GPUFLAG = -gpu=$(GPU)
# MARCH=neoverse-v1,neoverse-n1,zen3
ARCH=$(shell uname -m)
ifdef MARCH
ifeq ($(ARCH),aarch64)
ifeq ($(MARCH),neoverse-n1)
FCFLAGS += -tp=$(MARCH)
else
ifeq ($(MARCH),neoverse-v1)
FCFLAGS += -tp=$(MARCH)
else
FCFLAGS += -tp=native
endif
endif
else
FCFLAGS += -tp=$(MARCH)
endif
else
FCFLAGS += -tp=native
endif
# this is to allow apples-to-apples comparison with DC in non-DC GPU impls
# set exactly one of these!
#MANAGED = -DUSE_MANAGED -gpu=managed
#DEVICE = -DUSE_DEVICE -cuda -gpu=nomanaged
DOCONCURRENT_FLAG = $(GPUFLAG) -stdpar=$(TARGET) $(DEVICE)
ARRAY_FLAG = $(GPUFLAG) -stdpar=$(TARGET) $(MANAGED)
OPENMP_FLAG = $(GPUFLAG) -mp=$(TARGET) $(MANAGED)
OPENACC_FLAG = $(GPUFLAG) -acc=$(TARGET) $(MANAGED)
CUDA_FLAG = $(GPUFLAG) -cuda -acc=gpu $(MANAGED)
SEQUENTIAL_FLAG =
ifeq ($(IMPLEMENTATION),OpenMPTaskloop)
$(error IMPLEMENTATION=OpenMPTaskloop is not supported by this compiler.)
endif

View File

@ -0,0 +1,32 @@
FC := ifx
FCFLAGS = -std18
FCFLAGS += -Ofast -xHOST
FCFLAGS += -qopt-zmm-usage=low
ifeq ($(FC),ifort)
FCFLAGS += -qopt-streaming-stores=always
PARALLEL = -parallel
endif
DOCONCURRENT_FLAG = -qopenmp $(PARALLEL)
ARRAY_FLAG = -qopenmp $(PARALLEL)
OPENMP_FLAG = -qopenmp
ifeq ($(FC),ifx)
OPENMP_FLAG += -fopenmp-targets=spir64 -DUSE_FLOAT=1
endif
OPENACC_FLAG =
CUDA_FLAG =
SEQUENTIAL_FLAG =
ifeq ($(IMPLEMENTATION),OpenACC)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif
ifeq ($(IMPLEMENTATION),OpenACCArray)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif
ifeq ($(IMPLEMENTATION),CUDA)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif
ifeq ($(IMPLEMENTATION),CUDAKernels)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif

35
src/fortran/run.sh Executable file
View File

@ -0,0 +1,35 @@
#!/bin/bash
cat ./run.sh
if [ `uname -s` == Darwin ] ; then
NUM_HWTHREADS=`sysctl -n hw.ncpu`
MEMORY_BYTES=`sysctl -n hw.memsize`
else
NUM_HWTHREADS=`nproc`
MEMORY_KILOS=`grep MemTotal /proc/meminfo | awk '{print $2}'`
fi
M=128
export OMP_NUM_THREADS=8
export OMP_PROC_BIND=close
export OMP_PLACES=threads
export ACC_NUM_CORES=${OMP_NUM_THREADS}
AFFCONTROL="numactl -N 0 -m 0 -C `seq -s "," 0 $((${OMP_NUM_THREADS}-1))`"
for compiler in gcc nvhpc cray oneapi arm amd fj ; do
#if [ "x$compiler" == "xgcc" ] ; then
# export LD_PRELOAD=/usr/lib/gcc/aarch64-linux-gnu/11/libgomp.so
#fi
for implementation in OpenMP OpenMPTaskloop OpenMPWorkshare DoConcurrent Array OpenACC OpenACCArray CUDA CUDAKernel ; do
if [ -f BabelStream.${compiler}.${implementation} ] ; then
echo "BabelStream.${compiler}.${implementation}"
ldd BabelStream.${compiler}.${implementation}
time $AFFCONTROL \
./BabelStream.${compiler}.${implementation} -s $((1024*1024*${M}))
fi
done
done

View File

@ -9,7 +9,7 @@
#include "hip/hip_runtime.h" #include "hip/hip_runtime.h"
#define TBSIZE 1024 #define TBSIZE 1024
#define DOT_NUM_BLOCKS 256
void check_error(void) void check_error(void)
{ {
@ -47,9 +47,15 @@ HIPStream<T>::HIPStream(const int ARRAY_SIZE, const int device_index)
std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl; std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl;
array_size = ARRAY_SIZE; array_size = ARRAY_SIZE;
// Round dot_num_blocks up to next multiple of (TBSIZE * dot_elements_per_lane)
dot_num_blocks = (array_size + (TBSIZE * dot_elements_per_lane - 1)) / (TBSIZE * dot_elements_per_lane);
// Allocate the host array for partial sums for dot kernels // Allocate the host array for partial sums for dot kernels using hipHostMalloc.
sums = (T*)malloc(sizeof(T) * DOT_NUM_BLOCKS); // This creates an array on the host which is visible to the device. However, it requires
// synchronization (e.g. hipDeviceSynchronize) for the result to be available on the host
// after it has been passed through to a kernel.
hipHostMalloc(&sums, sizeof(T) * dot_num_blocks, hipHostMallocNonCoherent);
check_error();
// Check buffers fit on the device // Check buffers fit on the device
hipDeviceProp_t props; hipDeviceProp_t props;
@ -64,15 +70,14 @@ HIPStream<T>::HIPStream(const int ARRAY_SIZE, const int device_index)
check_error(); check_error();
hipMalloc(&d_c, ARRAY_SIZE*sizeof(T)); hipMalloc(&d_c, ARRAY_SIZE*sizeof(T));
check_error(); check_error();
hipMalloc(&d_sum, DOT_NUM_BLOCKS*sizeof(T));
check_error();
} }
template <class T> template <class T>
HIPStream<T>::~HIPStream() HIPStream<T>::~HIPStream()
{ {
free(sums); hipHostFree(sums);
check_error();
hipFree(d_a); hipFree(d_a);
check_error(); check_error();
@ -80,15 +85,13 @@ HIPStream<T>::~HIPStream()
check_error(); check_error();
hipFree(d_c); hipFree(d_c);
check_error(); check_error();
hipFree(d_sum);
check_error();
} }
template <typename T> template <typename T>
__global__ void init_kernel(T * a, T * b, T * c, T initA, T initB, T initC) __global__ void init_kernel(T * a, T * b, T * c, T initA, T initB, T initC)
{ {
const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; const size_t i = blockDim.x * blockIdx.x + threadIdx.x;
a[i] = initA; a[i] = initA;
b[i] = initB; b[i] = initB;
c[i] = initC; c[i] = initC;
@ -97,7 +100,7 @@ __global__ void init_kernel(T * a, T * b, T * c, T initA, T initB, T initC)
template <class T> template <class T>
void HIPStream<T>::init_arrays(T initA, T initB, T initC) void HIPStream<T>::init_arrays(T initA, T initB, T initC)
{ {
hipLaunchKernelGGL(HIP_KERNEL_NAME(init_kernel<T>), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_a, d_b, d_c, initA, initB, initC); init_kernel<T><<<dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0>>>(d_a, d_b, d_c, initA, initB, initC);
check_error(); check_error();
hipDeviceSynchronize(); hipDeviceSynchronize();
check_error(); check_error();
@ -115,18 +118,17 @@ void HIPStream<T>::read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector
check_error(); check_error();
} }
template <typename T> template <typename T>
__global__ void copy_kernel(const T * a, T * c) __global__ void copy_kernel(const T * a, T * c)
{ {
const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; const size_t i = threadIdx.x + blockIdx.x * blockDim.x;
c[i] = a[i]; c[i] = a[i];
} }
template <class T> template <class T>
void HIPStream<T>::copy() void HIPStream<T>::copy()
{ {
hipLaunchKernelGGL(HIP_KERNEL_NAME(copy_kernel<T>), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_a, d_c); copy_kernel<T><<<dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0>>>(d_a, d_c);
check_error(); check_error();
hipDeviceSynchronize(); hipDeviceSynchronize();
check_error(); check_error();
@ -136,14 +138,14 @@ template <typename T>
__global__ void mul_kernel(T * b, const T * c) __global__ void mul_kernel(T * b, const T * c)
{ {
const T scalar = startScalar; const T scalar = startScalar;
const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; const size_t i = threadIdx.x + blockIdx.x * blockDim.x;
b[i] = scalar * c[i]; b[i] = scalar * c[i];
} }
template <class T> template <class T>
void HIPStream<T>::mul() void HIPStream<T>::mul()
{ {
hipLaunchKernelGGL(HIP_KERNEL_NAME(mul_kernel<T>), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_b, d_c); mul_kernel<T><<<dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0>>>(d_b, d_c);
check_error(); check_error();
hipDeviceSynchronize(); hipDeviceSynchronize();
check_error(); check_error();
@ -152,14 +154,14 @@ void HIPStream<T>::mul()
template <typename T> template <typename T>
__global__ void add_kernel(const T * a, const T * b, T * c) __global__ void add_kernel(const T * a, const T * b, T * c)
{ {
const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; const size_t i = threadIdx.x + blockIdx.x * blockDim.x;
c[i] = a[i] + b[i]; c[i] = a[i] + b[i];
} }
template <class T> template <class T>
void HIPStream<T>::add() void HIPStream<T>::add()
{ {
hipLaunchKernelGGL(HIP_KERNEL_NAME(add_kernel<T>), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_a, d_b, d_c); add_kernel<T><<<dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0>>>(d_a, d_b, d_c);
check_error(); check_error();
hipDeviceSynchronize(); hipDeviceSynchronize();
check_error(); check_error();
@ -169,14 +171,14 @@ template <typename T>
__global__ void triad_kernel(T * a, const T * b, const T * c) __global__ void triad_kernel(T * a, const T * b, const T * c)
{ {
const T scalar = startScalar; const T scalar = startScalar;
const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; const size_t i = threadIdx.x + blockIdx.x * blockDim.x;
a[i] = b[i] + scalar * c[i]; a[i] = b[i] + scalar * c[i];
} }
template <class T> template <class T>
void HIPStream<T>::triad() void HIPStream<T>::triad()
{ {
hipLaunchKernelGGL(HIP_KERNEL_NAME(triad_kernel<T>), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_a, d_b, d_c); triad_kernel<T><<<dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0>>>(d_a, d_b, d_c);
check_error(); check_error();
hipDeviceSynchronize(); hipDeviceSynchronize();
check_error(); check_error();
@ -186,32 +188,32 @@ template <typename T>
__global__ void nstream_kernel(T * a, const T * b, const T * c) __global__ void nstream_kernel(T * a, const T * b, const T * c)
{ {
const T scalar = startScalar; const T scalar = startScalar;
const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; const size_t i = threadIdx.x + blockIdx.x * blockDim.x;
a[i] += b[i] + scalar * c[i]; a[i] += b[i] + scalar * c[i];
} }
template <class T> template <class T>
void HIPStream<T>::nstream() void HIPStream<T>::nstream()
{ {
hipLaunchKernelGGL(HIP_KERNEL_NAME(nstream_kernel<T>), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_a, d_b, d_c); nstream_kernel<T><<<dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0>>>(d_a, d_b, d_c);
check_error(); check_error();
hipDeviceSynchronize(); hipDeviceSynchronize();
check_error(); check_error();
} }
template <class T> template <typename T>
__global__ void dot_kernel(const T * a, const T * b, T * sum, int array_size) __global__ void dot_kernel(const T * a, const T * b, T * sum, int array_size)
{ {
__shared__ T tb_sum[TBSIZE]; __shared__ T tb_sum[TBSIZE];
int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; const size_t local_i = threadIdx.x;
const size_t local_i = hipThreadIdx_x; size_t i = blockDim.x * blockIdx.x + local_i;
tb_sum[local_i] = 0.0; tb_sum[local_i] = 0.0;
for (; i < array_size; i += hipBlockDim_x*hipGridDim_x) for (; i < array_size; i += blockDim.x*gridDim.x)
tb_sum[local_i] += a[i] * b[i]; tb_sum[local_i] += a[i] * b[i];
for (int offset = hipBlockDim_x / 2; offset > 0; offset /= 2) for (size_t offset = blockDim.x / 2; offset > 0; offset /= 2)
{ {
__syncthreads(); __syncthreads();
if (local_i < offset) if (local_i < offset)
@ -221,20 +223,19 @@ __global__ void dot_kernel(const T * a, const T * b, T * sum, int array_size)
} }
if (local_i == 0) if (local_i == 0)
sum[hipBlockIdx_x] = tb_sum[local_i]; sum[blockIdx.x] = tb_sum[local_i];
} }
template <class T> template <class T>
T HIPStream<T>::dot() T HIPStream<T>::dot()
{ {
hipLaunchKernelGGL(HIP_KERNEL_NAME(dot_kernel<T>), dim3(DOT_NUM_BLOCKS), dim3(TBSIZE), 0, 0, d_a, d_b, d_sum, array_size); dot_kernel<T><<<dim3(dot_num_blocks), dim3(TBSIZE), 0, 0>>>(d_a, d_b, sums, array_size);
check_error(); check_error();
hipDeviceSynchronize();
hipMemcpy(sums, d_sum, DOT_NUM_BLOCKS*sizeof(T), hipMemcpyDeviceToHost);
check_error(); check_error();
T sum = 0.0; T sum = 0.0;
for (int i = 0; i < DOT_NUM_BLOCKS; i++) for (int i = 0; i < dot_num_blocks; i++)
sum += sums[i]; sum += sums[i];
return sum; return sum;

View File

@ -14,13 +14,31 @@
#include "Stream.h" #include "Stream.h"
#define IMPLEMENTATION_STRING "HIP" #define IMPLEMENTATION_STRING "HIP"
#define DOT_READ_DWORDS_PER_LANE 4
template <class T> template <class T>
class HIPStream : public Stream<T> class HIPStream : public Stream<T>
{ {
// Make sure that either:
// DOT_READ_DWORDS_PER_LANE is less than sizeof(T), in which case we default to 1 element
// or
// DOT_READ_DWORDS_PER_LANE is divisible by sizeof(T)
static_assert((DOT_READ_DWORDS_PER_LANE * sizeof(unsigned int) < sizeof(T)) ||
(DOT_READ_DWORDS_PER_LANE * sizeof(unsigned int) % sizeof(T) == 0),
"DOT_READ_DWORDS_PER_LANE not divisible by sizeof(element_type)");
// Take into account the datatype size
// That is, for 4 DOT_READ_DWORDS_PER_LANE, this is 2 FP64 elements
// and 4 FP32 elements
static constexpr unsigned int dot_elements_per_lane{
(DOT_READ_DWORDS_PER_LANE * sizeof(unsigned int)) < sizeof(T) ? 1 : (
DOT_READ_DWORDS_PER_LANE * sizeof(unsigned int) / sizeof(T))};
protected: protected:
// Size of arrays // Size of arrays
int array_size; int array_size;
int dot_num_blocks;
// Host array for partial sums for dot kernel // Host array for partial sums for dot kernel
T *sums; T *sums;
@ -29,7 +47,6 @@ class HIPStream : public Stream<T>
T *d_a; T *d_a;
T *d_b; T *d_b;
T *d_c; T *d_c;
T *d_sum;
public: public:

View File

@ -1,4 +1,4 @@
// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, // Copyright (c) 2015-23 Tom Deakin, Simon McIntosh-Smith, Wei-Chen (Tom) Lin
// University of Bristol HPC // University of Bristol HPC
// //
// For full license terms please see the LICENSE file distributed with this // For full license terms please see the LICENSE file distributed with this

View File

@ -10,7 +10,6 @@
#include <stdexcept> #include <stdexcept>
#include <Kokkos_Core.hpp> #include <Kokkos_Core.hpp>
#include "Stream.h" #include "Stream.h"
#define IMPLEMENTATION_STRING "Kokkos" #define IMPLEMENTATION_STRING "Kokkos"

View File

@ -1,32 +1,38 @@
register_flag_optional(CMAKE_CXX_COMPILER register_flag_optional(CMAKE_CXX_COMPILER
"Any CXX compiler that is supported by CMake detection and RAJA. "Any CXX compiler that is supported by CMake detection and RAJA.
See https://github.com/kokkos/kokkos#primary-tested-compilers-on-x86-are" See https://github.com/kokkos/kokkos#primary-tested-compilers-on-x86-are"
"c++") "c++")
register_flag_required(KOKKOS_IN_TREE register_flag_optional(KOKKOS_IN_TREE
"Absolute path to the *source* distribution directory of Kokkos. "Absolute path to the *source* distribution directory of Kokkos.
Remember to append Kokkos specific flags as well, for example: Remember to append Kokkos specific flags as well, for example:
-DKOKKOS_IN_TREE=... -DKokkos_ENABLE_OPENMP=ON -DKokkos_ARCH_ZEN=ON ... -DKOKKOS_IN_TREE=... -DKokkos_ENABLE_OPENMP=ON -DKokkos_ARCH_ZEN=ON ...
See https://github.com/kokkos/kokkos/blob/master/BUILD.md for all available options" "")
See https://github.com/kokkos/kokkos/blob/master/BUILD.md for all available options") register_flag_optional(KOKKOS_IN_PACKAGE
"Absolute path to package R-Path containing Kokkos libs.
Use this instead of KOKKOS_IN_TREE if Kokkos is from a package manager like Spack." "")
# compiler vendor and arch specific flags # compiler vendor and arch specific flags
set(KOKKOS_FLAGS_CPU_INTEL -qopt-streaming-stores=always) set(KOKKOS_FLAGS_CPU_INTEL -qopt-streaming-stores=always)
macro(setup) macro(setup)
set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD 17) # Kokkos 4+ requires CXX >= 17
cmake_policy(SET CMP0074 NEW) #see https://github.com/kokkos/kokkos/blob/master/BUILD.md cmake_policy(SET CMP0074 NEW) #see https://github.com/kokkos/kokkos/blob/master/BUILD.md
message(STATUS "Building using in-tree Kokkos source at `${KOKKOS_IN_TREE}`")
if (EXISTS "${KOKKOS_IN_TREE}") if (EXISTS "${KOKKOS_IN_TREE}")
message(STATUS "Build using in-tree Kokkos source at `${KOKKOS_IN_TREE}`")
add_subdirectory(${KOKKOS_IN_TREE} ${CMAKE_BINARY_DIR}/kokkos) add_subdirectory(${KOKKOS_IN_TREE} ${CMAKE_BINARY_DIR}/kokkos)
register_link_library(Kokkos::kokkos) register_link_library(Kokkos::kokkos)
elseif (EXISTS "${KOKKOS_IN_PACKAGE}")
message(STATUS "Build using packaged Kokkos at `${KOKKOS_IN_PACKAGE}`")
set (Kokkos_DIR "${KOKKOS_IN_PACKAGE}/lib64/cmake/Kokkos")
find_package(Kokkos REQUIRED)
register_link_library(Kokkos::kokkos)
else() else()
message(FATAL_ERROR "`${KOKKOS_IN_TREE}` does not exist") message(FATAL_ERROR "Neither `KOKKOS_IN_TREE`, or `KOKKOS_IN_PACKAGE` was set!")
endif () endif ()
register_append_compiler_and_arch_specific_cxx_flags( register_append_compiler_and_arch_specific_cxx_flags(
@ -36,5 +42,3 @@ macro(setup)
) )
endmacro() endmacro()

View File

@ -1,18 +1,19 @@
register_flag_optional(CMAKE_CXX_COMPILER register_flag_optional(CMAKE_CXX_COMPILER
"Any CXX compiler that is supported by CMake detection and RAJA. "Any CXX compiler that is supported by CMake detection and RAJA.
See https://raja.readthedocs.io/en/main/getting_started.html#build-and-install" See https://raja.readthedocs.io/en/main/getting_started.html#build-and-install"
"c++") "c++")
register_flag_required(RAJA_IN_TREE register_flag_optional(RAJA_IN_TREE
"Absolute path to the *source* distribution directory of RAJA. "Absolute path to the *source* distribution directory of RAJA.
Make sure to use the release version of RAJA or clone RAJA recursively with submodules. Make sure to use the release version of RAJA or clone RAJA recursively with submodules.
Remember to append RAJA specific flags as well, for example: Remember to append RAJA specific flags as well, for example:
-DRAJA_IN_TREE=... -DENABLE_OPENMP=ON -DENABLE_CUDA=ON ... -DRAJA_IN_TREE=... -DENABLE_OPENMP=ON -DENABLE_CUDA=ON ...
See https://github.com/LLNL/RAJA/blob/08cbbafd2d21589ebf341f7275c229412d0fe903/CMakeLists.txt#L44 for all available options See https://github.com/LLNL/RAJA/blob/08cbbafd2d21589ebf341f7275c229412d0fe903/CMakeLists.txt#L44 for all available options
") " "")
register_flag_optional(RAJA_IN_PACKAGE
"Use if Raja is part of a package dependency:
Path to installation" "")
register_flag_optional(TARGET register_flag_optional(TARGET
"Target offload device, implemented values are CPU, NVIDIA" "Target offload device, implemented values are CPU, NVIDIA"
@ -76,16 +77,22 @@ macro(setup)
register_link_library(RAJA) register_link_library(RAJA)
# RAJA's cmake screws with where the binary will end up, resetting it here: # RAJA's cmake screws with where the binary will end up, resetting it here:
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
elseif (EXISTS "${RAJA_IN_PACKAGE}")
message(STATUS "Building using packaged Raja at `${RAJA_IN_PACKAGE}`")
find_package(RAJA REQUIRED)
register_link_library(RAJA)
else () else ()
message(FATAL_ERROR "`${RAJA_IN_TREE}` does not exist") message(FATAL_ERROR "Neither `${RAJA_IN_TREE}` or `${RAJA_IN_PACKAGE}` exists")
endif () endif ()
if (ENABLE_CUDA) if (ENABLE_CUDA)
# RAJA needs the codebase to be compiled with nvcc, so we tell cmake to treat sources as *.cu # RAJA needs the codebase to be compiled with nvcc, so we tell cmake to treat sources as *.cu
enable_language(CUDA) enable_language(CUDA)
set_source_files_properties(RAJAStream.cpp PROPERTIES LANGUAGE CUDA) set_source_files_properties(src/raja/RAJAStream.cpp PROPERTIES LANGUAGE CUDA)
set_source_files_properties(main.cpp PROPERTIES LANGUAGE CUDA) set_source_files_properties(src/main.cpp PROPERTIES LANGUAGE CUDA)
endif () endif ()

View File

@ -4,13 +4,131 @@ version = 3
[[package]] [[package]]
name = "ansi_term" name = "ansi_term"
version = "0.11.0" version = "0.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ee49baf6cb617b853aa8d93bf420db2383fab46d314482ca2803b40d5fde979b" checksum = "d52a9bb7ec0cf484c551830a7ce27bd20d67eac647e1befb56b0be4ee39a55d2"
dependencies = [ dependencies = [
"winapi 0.3.9", "winapi 0.3.9",
] ]
[[package]]
name = "async-attributes"
version = "1.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a3203e79f4dd9bdda415ed03cf14dae5a2bf775c683a00f94e9cd1faf0f596e5"
dependencies = [
"quote",
"syn",
]
[[package]]
name = "async-channel"
version = "1.7.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e14485364214912d3b19cc3435dde4df66065127f05fa0d75c712f36f12c2f28"
dependencies = [
"concurrent-queue",
"event-listener",
"futures-core",
]
[[package]]
name = "async-executor"
version = "1.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "871f9bb5e0a22eeb7e8cf16641feb87c9dc67032ccf8ff49e772eb9941d3a965"
dependencies = [
"async-task",
"concurrent-queue",
"fastrand",
"futures-lite",
"once_cell",
"slab",
]
[[package]]
name = "async-global-executor"
version = "2.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0da5b41ee986eed3f524c380e6d64965aea573882a8907682ad100f7859305ca"
dependencies = [
"async-channel",
"async-executor",
"async-io",
"async-lock",
"blocking",
"futures-lite",
"once_cell",
]
[[package]]
name = "async-io"
version = "1.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "83e21f3a490c72b3b0cf44962180e60045de2925d8dff97918f7ee43c8f637c7"
dependencies = [
"autocfg",
"concurrent-queue",
"futures-lite",
"libc",
"log",
"once_cell",
"parking",
"polling",
"slab",
"socket2",
"waker-fn",
"winapi 0.3.9",
]
[[package]]
name = "async-lock"
version = "2.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e97a171d191782fba31bb902b14ad94e24a68145032b7eedf871ab0bc0d077b6"
dependencies = [
"event-listener",
]
[[package]]
name = "async-std"
version = "1.12.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "62565bb4402e926b29953c785397c6dc0391b7b446e45008b0049eb43cec6f5d"
dependencies = [
"async-attributes",
"async-channel",
"async-global-executor",
"async-io",
"async-lock",
"crossbeam-utils",
"futures-channel",
"futures-core",
"futures-io",
"futures-lite",
"gloo-timers",
"kv-log-macro",
"log",
"memchr",
"once_cell",
"pin-project-lite",
"pin-utils",
"slab",
"wasm-bindgen-futures",
]
[[package]]
name = "async-task"
version = "4.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7a40729d2133846d9ed0ea60a8b9541bccddab49cd30f0715a1da672fe9a2524"
[[package]]
name = "atomic-waker"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "065374052e7df7ee4047b1160cca5e1467a12351a40b3da123c870ba0b8eda2a"
[[package]] [[package]]
name = "atty" name = "atty"
version = "0.2.14" version = "0.2.14"
@ -24,9 +142,9 @@ dependencies = [
[[package]] [[package]]
name = "autocfg" name = "autocfg"
version = "1.0.1" version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cdb031dd78e28731d87d56cc8ffef4a8f36ca26c38fe2de700543e627f8a464a" checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa"
[[package]] [[package]]
name = "bitflags" name = "bitflags"
@ -34,6 +152,38 @@ version = "1.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
[[package]]
name = "blocking"
version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c6ccb65d468978a086b69884437ded69a90faab3bbe6e67f242173ea728acccc"
dependencies = [
"async-channel",
"async-task",
"atomic-waker",
"fastrand",
"futures-lite",
"once_cell",
]
[[package]]
name = "bumpalo"
version = "3.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c1ad822118d20d2c234f427000d5acc36eabe1e29a348c89b63dd60b13f28e5d"
[[package]]
name = "cache-padded"
version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c1db59621ec70f09c5e9b597b220c7a2b43611f4710dc03ceb8748637775692c"
[[package]]
name = "cc"
version = "1.0.73"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2fff2a6927b3bb87f9595d67196a70493f627687a71d87a0d692242c33f58c11"
[[package]] [[package]]
name = "cfg-if" name = "cfg-if"
version = "1.0.0" version = "1.0.0"
@ -42,9 +192,9 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
[[package]] [[package]]
name = "clap" name = "clap"
version = "2.33.3" version = "2.34.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "37e58ac78573c40708d45522f0d80fa2f01cc4f9b4e2bf749807255454312002" checksum = "a0610544180c38b88101fecf2dd634b174a62eef6946f84dfc6a7127512b381c"
dependencies = [ dependencies = [
"ansi_term", "ansi_term",
"atty", "atty",
@ -64,6 +214,15 @@ dependencies = [
"crossterm", "crossterm",
] ]
[[package]]
name = "concurrent-queue"
version = "1.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "af4780a44ab5696ea9e28294517f1fffb421a83a25af521333c838635509db9c"
dependencies = [
"cache-padded",
]
[[package]] [[package]]
name = "core_affinity" name = "core_affinity"
version = "0.5.10" version = "0.5.10"
@ -78,9 +237,9 @@ dependencies = [
[[package]] [[package]]
name = "crossbeam" name = "crossbeam"
version = "0.8.1" version = "0.8.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4ae5588f6b3c3cb05239e90bd110f257254aecd01e4635400391aeae07497845" checksum = "2801af0d36612ae591caa9568261fddce32ce6e08a7275ea334a06a4ad021a2c"
dependencies = [ dependencies = [
"cfg-if", "cfg-if",
"crossbeam-channel", "crossbeam-channel",
@ -92,9 +251,9 @@ dependencies = [
[[package]] [[package]]
name = "crossbeam-channel" name = "crossbeam-channel"
version = "0.5.1" version = "0.5.6"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "06ed27e177f16d65f0f0c22a213e17c696ace5dd64b14258b52f9417ccb52db4" checksum = "c2dd04ddaf88237dc3b8d8f9a3c1004b506b54b3313403944054d23c0870c521"
dependencies = [ dependencies = [
"cfg-if", "cfg-if",
"crossbeam-utils", "crossbeam-utils",
@ -102,9 +261,9 @@ dependencies = [
[[package]] [[package]]
name = "crossbeam-deque" name = "crossbeam-deque"
version = "0.8.1" version = "0.8.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6455c0ca19f0d2fbf751b908d5c55c1f5cbc65e03c4225427254b46890bdde1e" checksum = "715e8152b692bba2d374b53d4875445368fdf21a94751410af607a5ac677d1fc"
dependencies = [ dependencies = [
"cfg-if", "cfg-if",
"crossbeam-epoch", "crossbeam-epoch",
@ -113,22 +272,22 @@ dependencies = [
[[package]] [[package]]
name = "crossbeam-epoch" name = "crossbeam-epoch"
version = "0.9.5" version = "0.9.11"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4ec02e091aa634e2c3ada4a392989e7c3116673ef0ac5b72232439094d73b7fd" checksum = "f916dfc5d356b0ed9dae65f1db9fc9770aa2851d2662b988ccf4fe3516e86348"
dependencies = [ dependencies = [
"autocfg",
"cfg-if", "cfg-if",
"crossbeam-utils", "crossbeam-utils",
"lazy_static",
"memoffset", "memoffset",
"scopeguard", "scopeguard",
] ]
[[package]] [[package]]
name = "crossbeam-queue" name = "crossbeam-queue"
version = "0.3.2" version = "0.3.6"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9b10ddc024425c88c2ad148c1b0fd53f4c6d38db9697c9f1588381212fa657c9" checksum = "1cd42583b04998a5363558e5f9291ee5a5ff6b49944332103f251e7479a82aa7"
dependencies = [ dependencies = [
"cfg-if", "cfg-if",
"crossbeam-utils", "crossbeam-utils",
@ -136,12 +295,11 @@ dependencies = [
[[package]] [[package]]
name = "crossbeam-utils" name = "crossbeam-utils"
version = "0.8.5" version = "0.8.12"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d82cfc11ce7f2c3faef78d8a684447b40d503d9681acebed6cb728d45940c4db" checksum = "edbafec5fa1f196ca66527c1b12c2ec4745ca14b50f1ad8f9f6f720b55d11fac"
dependencies = [ dependencies = [
"cfg-if", "cfg-if",
"lazy_static",
] ]
[[package]] [[package]]
@ -170,10 +328,157 @@ dependencies = [
] ]
[[package]] [[package]]
name = "either" name = "ctor"
version = "1.6.1" version = "0.1.23"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457" checksum = "cdffe87e1d521a10f9696f833fe502293ea446d7f256c06128293a4119bdf4cb"
dependencies = [
"quote",
"syn",
]
[[package]]
name = "either"
version = "1.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "90e5c1c8368803113bf0c9584fc495a58b86dc8a29edbf8fe877d21d9507e797"
[[package]]
name = "event-listener"
version = "2.5.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0206175f82b8d6bf6652ff7d71a1e27fd2e4efde587fd368662814d6ec1d9ce0"
[[package]]
name = "fastrand"
version = "1.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a7a407cfaa3385c4ae6b23e84623d48c2798d06e3e6a1878f7f59f17b3f86499"
dependencies = [
"instant",
]
[[package]]
name = "futures"
version = "0.3.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7f21eda599937fba36daeb58a22e8f5cee2d14c4a17b5b7739c7c8e5e3b8230c"
dependencies = [
"futures-channel",
"futures-core",
"futures-executor",
"futures-io",
"futures-sink",
"futures-task",
"futures-util",
]
[[package]]
name = "futures-channel"
version = "0.3.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "30bdd20c28fadd505d0fd6712cdfcb0d4b5648baf45faef7f852afb2399bb050"
dependencies = [
"futures-core",
"futures-sink",
]
[[package]]
name = "futures-core"
version = "0.3.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4e5aa3de05362c3fb88de6531e6296e85cde7739cccad4b9dfeeb7f6ebce56bf"
[[package]]
name = "futures-executor"
version = "0.3.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9ff63c23854bee61b6e9cd331d523909f238fc7636290b96826e9cfa5faa00ab"
dependencies = [
"futures-core",
"futures-task",
"futures-util",
]
[[package]]
name = "futures-io"
version = "0.3.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bbf4d2a7a308fd4578637c0b17c7e1c7ba127b8f6ba00b29f717e9655d85eb68"
[[package]]
name = "futures-lite"
version = "1.12.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7694489acd39452c77daa48516b894c153f192c3578d5a839b62c58099fcbf48"
dependencies = [
"fastrand",
"futures-core",
"futures-io",
"memchr",
"parking",
"pin-project-lite",
"waker-fn",
]
[[package]]
name = "futures-macro"
version = "0.3.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "42cd15d1c7456c04dbdf7e88bcd69760d74f3a798d6444e16974b505b0e62f17"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "futures-sink"
version = "0.3.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "21b20ba5a92e727ba30e72834706623d94ac93a725410b6a6b6fbc1b07f7ba56"
[[package]]
name = "futures-task"
version = "0.3.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a6508c467c73851293f390476d4491cf4d227dbabcd4170f3bb6044959b294f1"
[[package]]
name = "futures-timer"
version = "3.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e64b03909df88034c26dc1547e8970b91f98bdb65165d6a4e9110d94263dbb2c"
[[package]]
name = "futures-util"
version = "0.3.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "44fb6cb1be61cc1d2e43b262516aafcf63b241cffdb1d3fa115f91d9c7b09c90"
dependencies = [
"futures-channel",
"futures-core",
"futures-io",
"futures-macro",
"futures-sink",
"futures-task",
"memchr",
"pin-project-lite",
"pin-utils",
"slab",
]
[[package]]
name = "gloo-timers"
version = "0.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5fb7d06c1c8cc2a29bee7ec961009a0b2caa0793ee4900c2ffb348734ba1c8f9"
dependencies = [
"futures-channel",
"futures-core",
"js-sys",
"wasm-bindgen",
]
[[package]] [[package]]
name = "heck" name = "heck"
@ -202,6 +507,15 @@ dependencies = [
"cfg-if", "cfg-if",
] ]
[[package]]
name = "js-sys"
version = "0.3.60"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "49409df3e3bf0856b916e2ceaca09ee28e6871cf7d9ce97a692cacfdb2a25a47"
dependencies = [
"wasm-bindgen",
]
[[package]] [[package]]
name = "kernel32-sys" name = "kernel32-sys"
version = "0.2.2" version = "0.2.2"
@ -212,6 +526,15 @@ dependencies = [
"winapi-build", "winapi-build",
] ]
[[package]]
name = "kv-log-macro"
version = "1.0.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0de8b303297635ad57c9f5059fd9cee7a47f8e8daa09df0fcd07dd39fb22977f"
dependencies = [
"log",
]
[[package]] [[package]]
name = "lazy_static" name = "lazy_static"
version = "1.4.0" version = "1.4.0"
@ -220,33 +543,41 @@ checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
[[package]] [[package]]
name = "libc" name = "libc"
version = "0.2.108" version = "0.2.134"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8521a1b57e76b1ec69af7599e75e38e7b7fad6610f037db8c79b127201b5d119" checksum = "329c933548736bc49fd575ee68c89e8be4d260064184389a5b77517cddd99ffb"
[[package]] [[package]]
name = "lock_api" name = "lock_api"
version = "0.4.5" version = "0.4.9"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "712a4d093c9976e24e7dbca41db895dabcbac38eb5f4045393d17a95bdfb1109" checksum = "435011366fe56583b16cf956f9df0095b405b82d76425bc8981c0e22e60ec4df"
dependencies = [ dependencies = [
"autocfg",
"scopeguard", "scopeguard",
] ]
[[package]] [[package]]
name = "log" name = "log"
version = "0.4.14" version = "0.4.17"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "51b9bbe6c47d51fc3e1a9b945965946b4c44142ab8792c50835a980d362c2710" checksum = "abb12e687cfb44aa40f41fc3978ef76448f9b6038cad6aef4259d3c095a2382e"
dependencies = [ dependencies = [
"cfg-if", "cfg-if",
"value-bag",
] ]
[[package]] [[package]]
name = "memoffset" name = "memchr"
version = "0.6.4" version = "2.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "59accc507f1338036a0477ef61afdae33cde60840f4dfe481319ce3ad116ddf9" checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d"
[[package]]
name = "memoffset"
version = "0.6.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5aa361d4faea93603064a027415f07bd8e1d5c88c9fbf68bf56a285428fd79ce"
dependencies = [ dependencies = [
"autocfg", "autocfg",
] ]
@ -275,32 +606,44 @@ dependencies = [
[[package]] [[package]]
name = "ntapi" name = "ntapi"
version = "0.3.6" version = "0.3.7"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3f6bb902e437b6d86e03cce10a7e2af662292c5dfef23b65899ea3ac9354ad44" checksum = "c28774a7fd2fbb4f0babd8237ce554b73af68021b5f695a3cebd6c59bac0980f"
dependencies = [ dependencies = [
"winapi 0.3.9", "winapi 0.3.9",
] ]
[[package]] [[package]]
name = "num-traits" name = "num-traits"
version = "0.2.14" version = "0.2.15"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9a64b1ec5cda2586e284722486d802acf1f7dbdc623e2bfc57e65ca1cd099290" checksum = "578ede34cf02f8924ab9447f50c28075b4d3e5b269972345e7e0372b38c6cdcd"
dependencies = [ dependencies = [
"autocfg", "autocfg",
] ]
[[package]] [[package]]
name = "num_cpus" name = "num_cpus"
version = "1.13.0" version = "1.13.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "05499f3756671c15885fee9034446956fff3f243d6077b91e5767df161f766b3" checksum = "19e64526ebdee182341572e50e9ad03965aa510cd94427a4549448f285e957a1"
dependencies = [ dependencies = [
"hermit-abi", "hermit-abi",
"libc", "libc",
] ]
[[package]]
name = "once_cell"
version = "1.15.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e82dad04139b71a90c080c8463fe0dc7902db5192d939bd0950f074d014339e1"
[[package]]
name = "parking"
version = "2.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "427c3892f9e783d91cc128285287e70a59e206ca452770ece88a76f7a3eddd72"
[[package]] [[package]]
name = "parking_lot" name = "parking_lot"
version = "0.11.2" version = "0.11.2"
@ -327,12 +670,29 @@ dependencies = [
] ]
[[package]] [[package]]
name = "pest" name = "pin-project-lite"
version = "2.1.3" version = "0.2.9"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "10f4872ae94d7b90ae48754df22fd42ad52ce740b8f370b03da4835417403e53" checksum = "e0a7ae3ac2f1173085d398531c705756c94a4c56843785df85a60c1a0afac116"
[[package]]
name = "pin-utils"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184"
[[package]]
name = "polling"
version = "2.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "899b00b9c8ab553c743b3e11e87c5c7d423b2a2de229ba95b24a756344748011"
dependencies = [ dependencies = [
"ucd-trie", "autocfg",
"cfg-if",
"libc",
"log",
"wepoll-ffi",
"winapi 0.3.9",
] ]
[[package]] [[package]]
@ -361,27 +721,27 @@ dependencies = [
[[package]] [[package]]
name = "proc-macro2" name = "proc-macro2"
version = "1.0.32" version = "1.0.46"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ba508cc11742c0dc5c1659771673afbab7a0efab23aa17e854cbab0837ed0b43" checksum = "94e2ef8dbfc347b10c094890f778ee2e36ca9bb4262e86dc99cd217e35f3470b"
dependencies = [ dependencies = [
"unicode-xid", "unicode-ident",
] ]
[[package]] [[package]]
name = "quote" name = "quote"
version = "1.0.10" version = "1.0.21"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "38bc8cc6a5f2e3655e0899c1b848643b2562f853f114bfec7be120678e3ace05" checksum = "bbe448f377a7d6961e30f5955f9b8d106c3f5e449d493ee1b125c1d43c2b5179"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
] ]
[[package]] [[package]]
name = "rayon" name = "rayon"
version = "1.5.1" version = "1.5.3"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c06aca804d41dbc8ba42dfd964f0d01334eceb64314b9ecf7c5fad5188a06d90" checksum = "bd99e5772ead8baa5215278c9b15bf92087709e9c1b2d1f97cdb5a183c933a7d"
dependencies = [ dependencies = [
"autocfg", "autocfg",
"crossbeam-deque", "crossbeam-deque",
@ -391,31 +751,43 @@ dependencies = [
[[package]] [[package]]
name = "rayon-core" name = "rayon-core"
version = "1.9.1" version = "1.9.3"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d78120e2c850279833f1dd3582f730c4ab53ed95aeaaaa862a2a5c71b1656d8e" checksum = "258bcdb5ac6dad48491bb2992db6b7cf74878b0384908af124823d118c99683f"
dependencies = [ dependencies = [
"crossbeam-channel", "crossbeam-channel",
"crossbeam-deque", "crossbeam-deque",
"crossbeam-utils", "crossbeam-utils",
"lazy_static",
"num_cpus", "num_cpus",
] ]
[[package]] [[package]]
name = "redox_syscall" name = "redox_syscall"
version = "0.2.10" version = "0.2.16"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8383f39639269cde97d255a32bdb68c047337295414940c68bdd30c2e13203ff" checksum = "fb5a58c1855b4b6819d59012155603f0b22ad30cad752600aadfcb695265519a"
dependencies = [ dependencies = [
"bitflags", "bitflags",
] ]
[[package]] [[package]]
name = "rstest" name = "rstest"
version = "0.10.0" version = "0.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "041bb0202c14f6a158bbbf086afb03d0c6e975c2dec7d4912f8061ed44f290af" checksum = "b939295f93cb1d12bc1a83cf9ee963199b133fb8a79832dd51b68bb9f59a04dc"
dependencies = [
"async-std",
"futures",
"futures-timer",
"rstest_macros",
"rustc_version",
]
[[package]]
name = "rstest_macros"
version = "0.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f78aba848123782ba59340928ec7d876ebe745aa0365d6af8a630f19a5c16116"
dependencies = [ dependencies = [
"cfg-if", "cfg-if",
"proc-macro2", "proc-macro2",
@ -426,7 +798,7 @@ dependencies = [
[[package]] [[package]]
name = "rust-stream" name = "rust-stream"
version = "3.4.0" version = "4.0.0"
dependencies = [ dependencies = [
"colour", "colour",
"core_affinity", "core_affinity",
@ -443,18 +815,18 @@ dependencies = [
[[package]] [[package]]
name = "rustc_version" name = "rustc_version"
version = "0.3.3" version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f0dfe2087c51c460008730de8b57e6a320782fbfb312e1f4d520e6c6fae155ee" checksum = "bfa0f585226d2e68097d4f95d113b15b83a82e819ab25717ec0590d9584ef366"
dependencies = [ dependencies = [
"semver", "semver",
] ]
[[package]] [[package]]
name = "rustversion" name = "rustversion"
version = "1.0.5" version = "1.0.9"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "61b3909d758bb75c79f23d4736fac9433868679d3ad2ea7a61e3c25cfda9a088" checksum = "97477e48b4cf8603ad5f7aaf897467cf42ab4218a38ef76fb14c2d6773a6d6a8"
[[package]] [[package]]
name = "scopeguard" name = "scopeguard"
@ -464,21 +836,9 @@ checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd"
[[package]] [[package]]
name = "semver" name = "semver"
version = "0.11.0" version = "1.0.14"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f301af10236f6df4160f7c3f04eec6dbc70ace82d23326abad5edee88801c6b6" checksum = "e25dfac463d778e353db5be2449d1cce89bd6fd23c9f1ea21310ce6e5a1b29c4"
dependencies = [
"semver-parser",
]
[[package]]
name = "semver-parser"
version = "0.10.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "00b0bef5b7f9e0df16536d3961cfb6e84331c065b4066afb39768d0e319411f7"
dependencies = [
"pest",
]
[[package]] [[package]]
name = "signal-hook" name = "signal-hook"
@ -501,10 +861,29 @@ dependencies = [
] ]
[[package]] [[package]]
name = "smallvec" name = "slab"
version = "1.7.0" version = "0.4.7"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1ecab6c735a6bb4139c0caafd0cc3635748bbb3acf4550e8138122099251f309" checksum = "4614a76b2a8be0058caa9dbbaf66d988527d86d003c11a94fbd335d7661edcef"
dependencies = [
"autocfg",
]
[[package]]
name = "smallvec"
version = "1.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2fd0db749597d91ff862fd1d55ea87f7855a744a8425a64695b6fca237d1dad1"
[[package]]
name = "socket2"
version = "0.4.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "02e2d2db9033d13a1567121ddd7a095ee144db4e1ca1b1bda3419bc0da294ebd"
dependencies = [
"libc",
"winapi 0.3.9",
]
[[package]] [[package]]
name = "strsim" name = "strsim"
@ -514,9 +893,9 @@ checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a"
[[package]] [[package]]
name = "structopt" name = "structopt"
version = "0.3.25" version = "0.3.26"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "40b9788f4202aa75c240ecc9c15c65185e6a39ccdeb0fd5d008b98825464c87c" checksum = "0c6b5c64445ba8094a6ab0c3cd2ad323e07171012d9c98b0b15651daf1787a10"
dependencies = [ dependencies = [
"clap", "clap",
"lazy_static", "lazy_static",
@ -538,20 +917,20 @@ dependencies = [
[[package]] [[package]]
name = "syn" name = "syn"
version = "1.0.82" version = "1.0.101"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8daf5dd0bb60cbd4137b1b587d2fc0ae729bc07cf01cd70b36a1ed5ade3b9d59" checksum = "e90cde112c4b9690b8cbe810cba9ddd8bc1d7472e2cae317b69e9438c1cba7d2"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
"unicode-xid", "unicode-ident",
] ]
[[package]] [[package]]
name = "tabular" name = "tabular"
version = "0.1.4" version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e7e35bee02dcefe64a74065b6b869d241eab1a02fea0d65e6074ce4e51894c3b" checksum = "d9a2882c514780a1973df90de9d68adcd8871bacc9a6331c3f28e6d2ff91a3d1"
dependencies = [ dependencies = [
"unicode-width", "unicode-width",
] ]
@ -566,28 +945,32 @@ dependencies = [
] ]
[[package]] [[package]]
name = "ucd-trie" name = "unicode-ident"
version = "0.1.3" version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "56dee185309b50d1f11bfedef0fe6d036842e3fb77413abef29f8f8d1c5d4c1c" checksum = "dcc811dc4066ac62f84f11307873c4850cb653bfa9b1719cee2bd2204a4bc5dd"
[[package]] [[package]]
name = "unicode-segmentation" name = "unicode-segmentation"
version = "1.8.0" version = "1.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8895849a949e7845e06bd6dc1aa51731a103c42707010a5b591c0038fb73385b" checksum = "0fdbf052a0783de01e944a6ce7a8cb939e295b1e7be835a1112c3b9a7f047a5a"
[[package]] [[package]]
name = "unicode-width" name = "unicode-width"
version = "0.1.9" version = "0.1.10"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3ed742d4ea2bd1176e236172c8429aaf54486e7ac098db29ffe6529e0ce50973" checksum = "c0edd1e5b14653f783770bce4a4dabb4a5108a5370a5f5d8cfe8710c361f6c8b"
[[package]] [[package]]
name = "unicode-xid" name = "value-bag"
version = "0.2.2" version = "1.0.0-alpha.9"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8ccb82d61f80a663efe1f787a51b16b5a51e3314d6ac365b08639f52387b33f3" checksum = "2209b78d1249f7e6f3293657c9779fe31ced465df091bbd433a1cf88e916ec55"
dependencies = [
"ctor",
"version_check",
]
[[package]] [[package]]
name = "vec_map" name = "vec_map"
@ -597,9 +980,100 @@ checksum = "f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191"
[[package]] [[package]]
name = "version_check" name = "version_check"
version = "0.9.3" version = "0.9.4"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5fecdca9a5291cc2b8dcf7dc02453fee791a280f3743cb0905f8822ae463b3fe" checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f"
[[package]]
name = "waker-fn"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9d5b2c62b4012a3e1eca5a7e077d13b3bf498c4073e33ccd58626607748ceeca"
[[package]]
name = "wasm-bindgen"
version = "0.2.83"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eaf9f5aceeec8be17c128b2e93e031fb8a4d469bb9c4ae2d7dc1888b26887268"
dependencies = [
"cfg-if",
"wasm-bindgen-macro",
]
[[package]]
name = "wasm-bindgen-backend"
version = "0.2.83"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4c8ffb332579b0557b52d268b91feab8df3615f265d5270fec2a8c95b17c1142"
dependencies = [
"bumpalo",
"log",
"once_cell",
"proc-macro2",
"quote",
"syn",
"wasm-bindgen-shared",
]
[[package]]
name = "wasm-bindgen-futures"
version = "0.4.33"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "23639446165ca5a5de86ae1d8896b737ae80319560fbaa4c2887b7da6e7ebd7d"
dependencies = [
"cfg-if",
"js-sys",
"wasm-bindgen",
"web-sys",
]
[[package]]
name = "wasm-bindgen-macro"
version = "0.2.83"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "052be0f94026e6cbc75cdefc9bae13fd6052cdcaf532fa6c45e7ae33a1e6c810"
dependencies = [
"quote",
"wasm-bindgen-macro-support",
]
[[package]]
name = "wasm-bindgen-macro-support"
version = "0.2.83"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "07bc0c051dc5f23e307b13285f9d75df86bfdf816c5721e573dec1f9b8aa193c"
dependencies = [
"proc-macro2",
"quote",
"syn",
"wasm-bindgen-backend",
"wasm-bindgen-shared",
]
[[package]]
name = "wasm-bindgen-shared"
version = "0.2.83"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1c38c045535d93ec4f0b4defec448e4291638ee608530863b1e2ba115d4fff7f"
[[package]]
name = "web-sys"
version = "0.3.60"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bcda906d8be16e728fd5adc5b729afad4e444e106ab28cd1c7256e54fa61510f"
dependencies = [
"js-sys",
"wasm-bindgen",
]
[[package]]
name = "wepoll-ffi"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d743fdedc5c64377b5fc2bc036b01c7fd642205a0d96356034ae3404d49eb7fb"
dependencies = [
"cc",
]
[[package]] [[package]]
name = "winapi" name = "winapi"

View File

@ -7,19 +7,19 @@ edition = "2018"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies] [dependencies]
num-traits = "0.2.14" num-traits = "0.2.15"
structopt = "0.3.13" structopt = "0.3.26"
tabular = "0.1.4" tabular = "0.2.0"
rayon = "1.5.1" rayon = "1.5.3"
crossbeam = "0.8.1" crossbeam = "0.8.2"
num_cpus = "1.13.0" num_cpus = "1.13.1"
rustversion = "1.0" rustversion = "1.0.9"
libc = "0.2.97" libc = "0.2.134"
core_affinity = "0.5.10" core_affinity = "0.5.10"
colour = "0.6.0" colour = "0.6.0"
[dev-dependencies] [dev-dependencies]
rstest = "0.10.0" rstest = "0.13.0"
[build-dependencies] [build-dependencies]
rustversion = "1.0" rustversion = "1.0"

View File

@ -7,6 +7,7 @@ register_flag_required(SYCL_COMPILER
"Compile using the specified SYCL compiler implementation "Compile using the specified SYCL compiler implementation
Supported values are Supported values are
ONEAPI-DPCPP - dpc++ that is part of an oneAPI Base Toolkit distribution (https://software.intel.com/content/www/us/en/develop/tools/oneapi/base-toolkit.html) ONEAPI-DPCPP - dpc++ that is part of an oneAPI Base Toolkit distribution (https://software.intel.com/content/www/us/en/develop/tools/oneapi/base-toolkit.html)
ONEAPI-ICPX - icpx as a standalone compiler
DPCPP - dpc++ as a standalone compiler (https://github.com/intel/llvm) DPCPP - dpc++ as a standalone compiler (https://github.com/intel/llvm)
HIPSYCL - hipSYCL compiler (https://github.com/illuhad/hipSYCL) HIPSYCL - hipSYCL compiler (https://github.com/illuhad/hipSYCL)
COMPUTECPP - ComputeCpp compiler (https://developer.codeplay.com/products/computecpp/ce/home)") COMPUTECPP - ComputeCpp compiler (https://developer.codeplay.com/products/computecpp/ce/home)")
@ -14,6 +15,7 @@ register_flag_required(SYCL_COMPILER
register_flag_optional(SYCL_COMPILER_DIR register_flag_optional(SYCL_COMPILER_DIR
"Absolute path to the selected SYCL compiler directory, most are packaged differently so set the path according to `SYCL_COMPILER`: "Absolute path to the selected SYCL compiler directory, most are packaged differently so set the path according to `SYCL_COMPILER`:
ONEAPI-DPCPP - not required but `dpcpp` must be on PATH, load oneAPI as per documentation (i.e `source /opt/intel/oneapi/setvars.sh` first) ONEAPI-DPCPP - not required but `dpcpp` must be on PATH, load oneAPI as per documentation (i.e `source /opt/intel/oneapi/setvars.sh` first)
ONEAPI-ICPX - `icpx` must be used for OneAPI 2023 and later on releases (i.e `source /opt/intel/oneapi/setvars.sh` first)
HIPSYCL|DPCPP|COMPUTECPP - set to the root of the binary distribution that contains at least `bin/`, `include/`, and `lib/`" HIPSYCL|DPCPP|COMPUTECPP - set to the root of the binary distribution that contains at least `bin/`, `include/`, and `lib/`"
"") "")
@ -65,6 +67,12 @@ macro(setup)
elseif (${SYCL_COMPILER} STREQUAL "ONEAPI-DPCPP") elseif (${SYCL_COMPILER} STREQUAL "ONEAPI-DPCPP")
set(CMAKE_CXX_COMPILER dpcpp) set(CMAKE_CXX_COMPILER dpcpp)
register_definitions(CL_TARGET_OPENCL_VERSION=220) register_definitions(CL_TARGET_OPENCL_VERSION=220)
elseif (${SYCL_COMPILER} STREQUAL "ONEAPI-ICPX")
set(CMAKE_CXX_COMPILER icpx)
include_directories(${SYCL_COMPILER_DIR}/include/sycl)
register_definitions(CL_TARGET_OPENCL_VERSION=220)
register_append_cxx_flags(ANY -fsycl)
register_append_link_flags(-fsycl)
else () else ()
message(FATAL_ERROR "SYCL_COMPILER=${SYCL_COMPILER} is unsupported") message(FATAL_ERROR "SYCL_COMPILER=${SYCL_COMPILER} is unsupported")
endif () endif ()

View File

@ -145,7 +145,7 @@ T ThrustStream<T>::dot()
#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA || \ #if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA || \
(defined(THRUST_DEVICE_SYSTEM_HIP) && THRUST_DEVICE_SYSTEM_HIP == THRUST_DEVICE_SYSTEM) (defined(THRUST_DEVICE_SYSTEM_HIP) && THRUST_DEVICE_SYSTEM_HIP == THRUST_DEVICE_SYSTEM)
#ifdef __NVCC__ #if defined(__NVCC__) || defined(__NVCOMPILER_CUDA__)
#define IMPL_FN__(fn) cuda ## fn #define IMPL_FN__(fn) cuda ## fn
#define IMPL_TYPE__(tpe) cuda ## tpe #define IMPL_TYPE__(tpe) cuda ## tpe
#elif defined(__HIP_PLATFORM_HCC__) #elif defined(__HIP_PLATFORM_HCC__)