Merge branch 'develop' into option_for_vec

# Conflicts:
#	src/kokkos/KokkosStream.hpp
#	src/kokkos/model.cmake
This commit is contained in:
Tom Lin 2023-06-28 19:02:24 +01:00
commit 50fe7c102a
40 changed files with 3783 additions and 165 deletions

6
.gitignore vendored
View File

@ -10,12 +10,18 @@ sycl-stream
hip-stream
tbb-stream
src/fortran/BabelStream
src/fortran/BabelStream.*
*.o
*.bc
*.sycl
*.tar
*.gz
*.a
*.mod
*.cub
*.ptx
KokkosCore_config.*

View File

@ -2,8 +2,16 @@
All notable changes to this project will be documented in this file.
## Unreleased
### Added
- Ability to build Kokkos and RAJA versions against existing packages.
### Changed
- RAJA CUDA CMake build issues resolved.
- Fix CUDA memory limit check.
- Use long double for `check_solution` in case of large problem size.
- OneAPI DPCPP compiler is deprecated in favour of ICPX, so added new build option to SYCL 2020 version.
- Updates to the HIP kernels and API usage.
- Number of thread-blocks in CUDA dot kernel implementation changed to 1024.
## [v4.0] - 2021-12-22

View File

@ -186,7 +186,7 @@ message(STATUS "Default ${CMAKE_BUILD_TYPE} flags are `${DEFAULT_${BUILD_TYPE}_F
# setup common build flag defaults if there are no overrides
if (NOT DEFINED ${BUILD_TYPE}_FLAGS)
set(ACTUAL_${BUILD_TYPE}_FLAGS ${DEFAULT_${BUILD_TYPE}_FLAGS})
elseif ()
else ()
set(ACTUAL_${BUILD_TYPE}_FLAGS ${${BUILD_TYPE}_FLAGS})
endif ()

View File

@ -100,7 +100,7 @@ The source for each model's implementations are located in `./src/<model>`.
Currently available models are:
```
omp;ocl;std;std20;hip;cuda;kokkos;sycl;sycl2020;acc;raja;tbb;thrust
omp;ocl;std-data;std-indices;std-ranges;hip;cuda;kokkos;sycl;sycl2020;acc;raja;tbb;thrust
```
#### Overriding default flags
@ -163,11 +163,11 @@ Pull Requests should be made against the `develop` branch.
Please cite BabelStream via this reference:
Deakin T, Price J, Martineau M, McIntosh-Smith S. GPU-STREAM v2.0: Benchmarking the achievable memory bandwidth of many-core processors across diverse parallel programming models. 2016. Paper presented at P^3MA Workshop at ISC High Performance, Frankfurt, Germany. DOI: 10.1007/978- 3-319-46079-6_34
Deakin T, Price J, Martineau M, McIntosh-Smith S. Evaluating attainable memory bandwidth of parallel programming models via BabelStream. International Journal of Computational Science and Engineering. Special issue. Vol. 17, No. 3, pp. 247262. 2018. DOI: 10.1504/IJCSE.2018.095847
### Other BabelStream publications
* Deakin T, Price J, Martineau M, McIntosh-Smith S. Evaluating attainable memory bandwidth of parallel programming models via BabelStream. International Journal of Computational Science and Engineering. Special issue. Vol. 17, No. 3, pp. 247262. 2018.DOI: 10.1504/IJCSE.2018.095847
* Deakin T, Price J, Martineau M, McIntosh-Smith S. GPU-STREAM v2.0: Benchmarking the achievable memory bandwidth of many-core processors across diverse parallel programming models. 2016. Paper presented at P^3MA Workshop at ISC High Performance, Frankfurt, Germany. DOI: 10.1007/978- 3-319-46079-6_34
* Deakin T, McIntosh-Smith S. GPU-STREAM: Benchmarking the achievable memory bandwidth of Graphics Processing Units. 2015. Poster session presented at IEEE/ACM SuperComputing, Austin, United States.
You can view the [Poster and Extended Abstract](http://sc15.supercomputing.org/sites/all/themes/SC15images/tech_poster/tech_poster_pages/post150.html).

2
src/.gitignore vendored
View File

@ -16,6 +16,8 @@
**/*.gz
**/*.a
**/*.swp
**/KokkosCore_Config_*
**/.DS_Store

View File

@ -22,7 +22,7 @@
#endif
#define TBSIZE 1024
#define DOT_NUM_BLOCKS 256
#define DOT_NUM_BLOCKS 1024
template <class T>
class CUDAStream : public Stream<T>

105
src/fortran/ArrayStream.F90 Normal file
View File

@ -0,0 +1,105 @@
module ArrayStream
use, intrinsic :: ISO_Fortran_env
use BabelStreamTypes
implicit none
character(len=5), parameter :: implementation_name = "Array"
integer(kind=StreamIntKind) :: N
real(kind=REAL64), allocatable :: A(:), B(:), C(:)
contains
subroutine list_devices()
implicit none
integer :: num
write(*,'(a36,a5)') "Listing devices is not supported by ", implementation_name
end subroutine list_devices
subroutine set_device(dev)
implicit none
integer, intent(in) :: dev
write(*,'(a32,a5)') "Device != 0 is not supported by ", implementation_name
end subroutine set_device
subroutine alloc(array_size)
implicit none
integer(kind=StreamIntKind) :: array_size
integer :: err
N = array_size
allocate( A(1:N), B(1:N), C(1:N), stat=err)
if (err .ne. 0) then
write(*,'(a20,i3)') 'allocate returned ',err
stop 1
endif
end subroutine alloc
subroutine dealloc()
implicit none
integer :: err
deallocate( A, B, C, stat=err)
if (err .ne. 0) then
write(*,'(a20,i3)') 'deallocate returned ',err
stop 1
endif
end subroutine dealloc
subroutine init_arrays(initA, initB, initC)
implicit none
real(kind=REAL64), intent(in) :: initA, initB, initC
A = initA
B = initB
C = initC
end subroutine init_arrays
subroutine read_arrays(h_A, h_B, h_C)
implicit none
real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
h_A = A
h_B = B
h_C = C
end subroutine read_arrays
subroutine copy()
implicit none
C = A
end subroutine copy
subroutine add()
implicit none
C = A + B
end subroutine add
subroutine mul(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
scalar = startScalar
B = scalar * C
end subroutine mul
subroutine triad(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
scalar = startScalar
A = B + scalar * C
end subroutine triad
subroutine nstream(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
scalar = startScalar
A = A + B + scalar * C
end subroutine nstream
function dot() result(s)
implicit none
real(kind=REAL64) :: s
s = dot_product(A,B)
end function dot
end module ArrayStream

View File

@ -0,0 +1,21 @@
module BabelStreamTypes
use, intrinsic :: ISO_Fortran_env, only: REAL64,REAL32,INT64,INT32
implicit none
#ifdef USE_FLOAT
integer, parameter :: StreamRealKind = REAL32
character(len=6) :: StreamRealName = "REAL32"
#else
integer, parameter :: StreamRealKind = REAL64
character(len=6) :: StreamRealName = "REAL64"
#endif
#ifdef USE_INT32
#warning There is no checking for overflowing INT32, so be careful.
integer, parameter :: StreamIntKind = INT32
#else
integer, parameter :: StreamIntKind = INT64
#endif
end module BabelStreamTypes

View File

@ -0,0 +1,230 @@
module CUDAKernelStream
use, intrinsic :: ISO_Fortran_env
use BabelStreamTypes
implicit none
character(len=10), parameter :: implementation_name = "CUDAKernel"
integer(kind=StreamIntKind) :: N
#ifdef USE_MANAGED
real(kind=REAL64), allocatable, managed :: A(:), B(:), C(:)
#else
real(kind=REAL64), allocatable, device :: A(:), B(:), C(:)
#endif
contains
subroutine list_devices()
use cudafor
implicit none
integer :: num, err
err = cudaGetDeviceCount(num)
if (err.ne.0) then
write(*,'(a)') "cudaGetDeviceCount failed"
write(*,'(a)') cudaGetErrorString(err)
stop
else if (num.eq.0) then
write(*,'(a17)') "No devices found."
else
write(*,'(a10,i1,a8)') "There are ",num," devices."
end if
end subroutine list_devices
subroutine set_device(dev)
use cudafor
implicit none
integer, intent(in) :: dev
integer :: num, err
err = cudaGetDeviceCount(num)
if (err.ne.0) then
write(*,'(a)') "cudaGetDeviceCount failed"
write(*,'(a)') cudaGetErrorString(err)
stop
else if (num.eq.0) then
write(*,'(a17)') "No devices found."
stop
else if (dev.ge.num) then
write(*,'(a21)') "Invalid device index."
stop
else
err = cudaSetDevice(dev)
if (err.ne.0) then
write(*,'(a)') "cudaSetDevice failed"
write(*,'(a)') cudaGetErrorString(err)
stop
end if
end if
end subroutine set_device
subroutine alloc(array_size)
implicit none
integer(kind=StreamIntKind) :: array_size
integer :: err
N = array_size
allocate( A(1:N), B(1:N), C(1:N), stat=err)
if (err .ne. 0) then
write(*,'(a20,i3)') 'allocate returned ',err
stop 1
endif
end subroutine alloc
subroutine dealloc()
implicit none
integer :: err
deallocate( A, B, C, stat=err)
if (err .ne. 0) then
write(*,'(a20,i3)') 'deallocate returned ',err
stop 1
endif
end subroutine dealloc
subroutine init_arrays(initA, initB, initC)
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
implicit none
real(kind=REAL64), intent(in) :: initA, initB, initC
integer(kind=StreamIntKind) :: i
integer :: err
A = initA
B = initB
C = initC
err = cudaDeviceSynchronize()
if (err.ne.0) then
write(*,'(a)') "cudaDeviceSynchronize failed"
write(*,'(a)') cudaGetErrorString(err)
stop
endif
end subroutine init_arrays
subroutine read_arrays(h_A, h_B, h_C)
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
implicit none
real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
integer(kind=StreamIntKind) :: i
integer :: err
h_A = A
h_B = B
h_C = C
err = cudaDeviceSynchronize()
if (err.ne.0) then
write(*,'(a)') "cudaDeviceSynchronize failed"
write(*,'(a)') cudaGetErrorString(err)
stop
endif
end subroutine read_arrays
subroutine copy()
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
implicit none
integer(kind=StreamIntKind) :: i
integer :: err
!$cuf kernel do <<< *, * >>>
do i=1,N
C(i) = A(i)
end do
err = cudaDeviceSynchronize()
if (err.ne.0) then
write(*,'(a)') "cudaDeviceSynchronize failed"
write(*,'(a)') cudaGetErrorString(err)
stop
endif
end subroutine copy
subroutine add()
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
implicit none
integer(kind=StreamIntKind) :: i
integer :: err
!$cuf kernel do <<< *, * >>>
do i=1,N
C(i) = A(i) + B(i)
end do
err = cudaDeviceSynchronize()
if (err.ne.0) then
write(*,'(a)') "cudaDeviceSynchronize failed"
write(*,'(a)') cudaGetErrorString(err)
stop
endif
end subroutine add
subroutine mul(startScalar)
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
integer :: err
scalar = startScalar
!$cuf kernel do <<< *, * >>>
do i=1,N
B(i) = scalar * C(i)
end do
err = cudaDeviceSynchronize()
if (err.ne.0) then
write(*,'(a)') "cudaDeviceSynchronize failed"
write(*,'(a)') cudaGetErrorString(err)
stop
endif
end subroutine mul
subroutine triad(startScalar)
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
integer :: err
scalar = startScalar
!$cuf kernel do <<< *, * >>>
do i=1,N
A(i) = B(i) + scalar * C(i)
end do
err = cudaDeviceSynchronize()
if (err.ne.0) then
write(*,'(a)') "cudaDeviceSynchronize failed"
write(*,'(a)') cudaGetErrorString(err)
stop
endif
end subroutine triad
subroutine nstream(startScalar)
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
integer :: err
scalar = startScalar
!$cuf kernel do <<< *, * >>>
do i=1,N
A(i) = A(i) + B(i) + scalar * C(i)
end do
err = cudaDeviceSynchronize()
if (err.ne.0) then
write(*,'(a)') "cudaDeviceSynchronize failed"
write(*,'(a)') cudaGetErrorString(err)
stop
endif
end subroutine nstream
function dot() result(r)
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
implicit none
real(kind=REAL64) :: r
integer(kind=StreamIntKind) :: i
integer :: err
r = real(0,kind=REAL64)
!$cuf kernel do <<< *, * >>>
do i=1,N
r = r + A(i) * B(i)
end do
err = cudaDeviceSynchronize()
if (err.ne.0) then
write(*,'(a)') "cudaDeviceSynchronize failed"
write(*,'(a)') cudaGetErrorString(err)
stop
endif
end function dot
end module CUDAKernelStream

309
src/fortran/CUDAStream.F90 Normal file
View File

@ -0,0 +1,309 @@
module CUDAFortranKernels
use, intrinsic :: ISO_Fortran_env
use BabelStreamTypes
contains
attributes(global) subroutine do_copy(n,A,C)
implicit none
integer(kind=StreamIntKind), intent(in), value :: n
real(kind=REAL64), intent(in) :: A(n)
real(kind=REAL64), intent(out) :: C(n)
integer(kind=StreamIntKind) :: i
i = blockDim%x * (blockIdx%x - 1) + threadIdx%x
if (i <= N) then
C(i) = A(i)
endif
end subroutine do_copy
attributes(global) subroutine do_add(n,A,B,C)
implicit none
integer(kind=StreamIntKind), intent(in), value :: n
real(kind=REAL64), intent(in) :: A(n), B(n)
real(kind=REAL64), intent(out) :: C(n)
integer(kind=StreamIntKind) :: i
i = blockDim%x * (blockIdx%x - 1) + threadIdx%x
if (i <= N) then
C(i) = A(i) + B(i)
endif
end subroutine do_add
attributes(global) subroutine do_mul(n,scalar,B,C)
implicit none
integer(kind=StreamIntKind), intent(in), value :: n
real(kind=REAL64), intent(in), value :: scalar
real(kind=REAL64), intent(out) :: B(n)
real(kind=REAL64), intent(in) :: C(n)
integer(kind=StreamIntKind) :: i
i = blockDim%x * (blockIdx%x - 1) + threadIdx%x
if (i <= N) then
B(i) = scalar * C(i)
endif
end subroutine do_mul
attributes(global) subroutine do_triad(n,scalar,A,B,C)
implicit none
integer(kind=StreamIntKind), intent(in), value :: n
real(kind=REAL64), intent(in), value :: scalar
real(kind=REAL64), intent(out) :: A(n)
real(kind=REAL64), intent(in) :: B(n), C(n)
integer(kind=StreamIntKind) :: i
i = blockDim%x * (blockIdx%x - 1) + threadIdx%x
if (i <= N) then
A(i) = B(i) + scalar * C(i)
endif
end subroutine do_triad
attributes(global) subroutine do_nstream(n,scalar,A,B,C)
implicit none
integer(kind=StreamIntKind), intent(in), value :: n
real(kind=REAL64), intent(in), value :: scalar
real(kind=REAL64), intent(inout) :: A(n)
real(kind=REAL64), intent(in) :: B(n), C(n)
integer(kind=StreamIntKind) :: i
i = blockDim%x * (blockIdx%x - 1) + threadIdx%x
if (i <= N) then
A(i) = A(i) + B(i) + scalar * C(i)
endif
end subroutine do_nstream
#if 0
attributes(global) subroutine do_dot(n,A,B,r)
implicit none
integer(kind=StreamIntKind), intent(in), value :: n
real(kind=REAL64), intent(in) :: A(n), B(n)
real(kind=REAL64), intent(out) :: r
integer(kind=StreamIntKind) :: i
r = real(0,kind=REAL64)
!$cuf kernel do <<< *, * >>>
do i=1,N
r = r + A(i) * B(i)
end do
end subroutine do_dot
#endif
end module CUDAFortranKernels
module CUDAStream
use, intrinsic :: ISO_Fortran_env
use BabelStreamTypes
use cudafor, only: dim3
implicit none
character(len=4), parameter :: implementation_name = "CUDA"
integer(kind=StreamIntKind) :: N
#ifdef USE_MANAGED
real(kind=REAL64), allocatable, managed :: A(:), B(:), C(:)
#else
real(kind=REAL64), allocatable, device :: A(:), B(:), C(:)
#endif
type(dim3) :: grid, tblock
contains
subroutine list_devices()
use cudafor
implicit none
integer :: num, err
err = cudaGetDeviceCount(num)
if (err.ne.0) then
write(*,'(a)') "cudaGetDeviceCount failed"
write(*,'(a)') cudaGetErrorString(err)
stop
else if (num.eq.0) then
write(*,'(a17)') "No devices found."
else
write(*,'(a10,i1,a8)') "There are ",num," devices."
end if
end subroutine list_devices
subroutine set_device(dev)
use cudafor
implicit none
integer, intent(in) :: dev
integer :: num, err
err = cudaGetDeviceCount(num)
if (err.ne.0) then
write(*,'(a)') "cudaGetDeviceCount failed"
write(*,'(a)') cudaGetErrorString(err)
stop
else if (num.eq.0) then
write(*,'(a17)') "No devices found."
stop
else if (dev.ge.num) then
write(*,'(a21)') "Invalid device index."
stop
else
err = cudaSetDevice(dev)
if (err.ne.0) then
write(*,'(a)') "cudaSetDevice failed"
write(*,'(a)') cudaGetErrorString(err)
stop
end if
end if
end subroutine set_device
subroutine alloc(array_size)
implicit none
integer(kind=StreamIntKind) :: array_size
integer :: err
N = array_size
allocate( A(1:N), B(1:N), C(1:N), stat=err)
if (err .ne. 0) then
write(*,'(a20,i3)') 'allocate returned ',err
stop 1
endif
! move to separate subroutine later
tblock = dim3(128,1,1)
grid = dim3(ceiling(real(N)/tblock%x),1,1)
end subroutine alloc
subroutine dealloc()
implicit none
integer :: err
deallocate( A, B, C, stat=err)
if (err .ne. 0) then
write(*,'(a20,i3)') 'deallocate returned ',err
stop 1
endif
end subroutine dealloc
subroutine init_arrays(initA, initB, initC)
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
implicit none
real(kind=REAL64), intent(in) :: initA, initB, initC
integer(kind=StreamIntKind) :: i
integer :: err
A = initA
B = initB
C = initC
err = cudaDeviceSynchronize()
if (err.ne.0) then
write(*,'(a)') "cudaDeviceSynchronize failed"
write(*,'(a)') cudaGetErrorString(err)
stop
endif
end subroutine init_arrays
subroutine read_arrays(h_A, h_B, h_C)
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
implicit none
real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
integer(kind=StreamIntKind) :: i
integer :: err
h_A = A
h_B = B
h_C = C
err = cudaDeviceSynchronize()
if (err.ne.0) then
write(*,'(a)') "cudaDeviceSynchronize failed"
write(*,'(a)') cudaGetErrorString(err)
stop
endif
end subroutine read_arrays
subroutine copy()
use CUDAFortranKernels, only: do_copy
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
implicit none
integer :: err
call do_copy<<<grid, tblock>>>(N, A, C)
err = cudaDeviceSynchronize()
if (err.ne.0) then
write(*,'(a)') "cudaDeviceSynchronize failed"
write(*,'(a)') cudaGetErrorString(err)
stop
endif
end subroutine copy
subroutine add()
use CUDAFortranKernels, only: do_add
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
implicit none
integer :: err
call do_add<<<grid, tblock>>>(N, A, B, C)
err = cudaDeviceSynchronize()
if (err.ne.0) then
write(*,'(a)') "cudaDeviceSynchronize failed"
write(*,'(a)') cudaGetErrorString(err)
stop
endif
end subroutine add
subroutine mul(startScalar)
use CUDAFortranKernels, only: do_mul
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer :: err
scalar = startScalar
call do_mul<<<grid, tblock>>>(N, scalar, B, C)
err = cudaDeviceSynchronize()
if (err.ne.0) then
write(*,'(a)') "cudaDeviceSynchronize failed"
write(*,'(a)') cudaGetErrorString(err)
stop
endif
end subroutine mul
subroutine triad(startScalar)
use CUDAFortranKernels, only: do_triad
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer :: err
scalar = startScalar
call do_triad<<<grid, tblock>>>(N, scalar, A, B, C)
err = cudaDeviceSynchronize()
if (err.ne.0) then
write(*,'(a)') "cudaDeviceSynchronize failed"
write(*,'(a)') cudaGetErrorString(err)
stop
endif
end subroutine triad
subroutine nstream(startScalar)
use CUDAFortranKernels, only: do_nstream
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer :: err
scalar = startScalar
call do_nstream<<<grid, tblock>>>(N, scalar, A, B, C)
err = cudaDeviceSynchronize()
if (err.ne.0) then
write(*,'(a)') "cudaDeviceSynchronize failed"
write(*,'(a)') cudaGetErrorString(err)
stop
endif
end subroutine nstream
function dot() result(r)
!use CUDAFortranKernels, only: do_dot
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
implicit none
real(kind=REAL64) :: r
integer :: err
integer(kind=StreamIntKind) :: i
!call do_dot<<<grid, tblock>>>(N, B, C, r)
r = real(0,kind=REAL64)
!$cuf kernel do <<< *, * >>>
do i=1,N
r = r + A(i) * B(i)
end do
err = cudaDeviceSynchronize()
if (err.ne.0) then
write(*,'(a)') "cudaDeviceSynchronize failed"
write(*,'(a)') cudaGetErrorString(err)
stop
endif
end function dot
end module CUDAStream

View File

@ -0,0 +1,139 @@
module DoConcurrentStream
use, intrinsic :: ISO_Fortran_env
use BabelStreamTypes
implicit none
character(len=12), parameter :: implementation_name = "DoConcurrent"
integer(kind=StreamIntKind) :: N
#ifdef USE_DEVICE
real(kind=REAL64), allocatable, device :: A(:), B(:), C(:)
#else
real(kind=REAL64), allocatable :: A(:), B(:), C(:)
#endif
contains
subroutine list_devices()
implicit none
integer :: num
write(*,'(a36,a12)') "Listing devices is not supported by ", implementation_name
end subroutine list_devices
subroutine set_device(dev)
implicit none
integer, intent(in) :: dev
write(*,'(a32,a12)') "Device != 0 is not supported by ", implementation_name
end subroutine set_device
subroutine alloc(array_size)
implicit none
integer(kind=StreamIntKind) :: array_size
integer :: err
N = array_size
allocate( A(1:N), B(1:N), C(1:N), stat=err)
if (err .ne. 0) then
write(*,'(a20,i3)') 'allocate returned ',err
stop 1
endif
end subroutine alloc
subroutine dealloc()
implicit none
integer :: err
deallocate( A, B, C, stat=err)
if (err .ne. 0) then
write(*,'(a20,i3)') 'deallocate returned ',err
stop 1
endif
end subroutine dealloc
subroutine init_arrays(initA, initB, initC)
implicit none
real(kind=REAL64), intent(in) :: initA, initB, initC
integer(kind=StreamIntKind) :: i
do concurrent (i=1:N)
A(i) = initA
B(i) = initB
C(i) = initC
end do
end subroutine init_arrays
subroutine read_arrays(h_A, h_B, h_C)
implicit none
real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
integer(kind=StreamIntKind) :: i
do concurrent (i=1:N) !shared(A,B,C)
h_A(i) = A(i)
h_B(i) = B(i)
h_C(i) = C(i)
end do
end subroutine read_arrays
subroutine copy()
implicit none
integer(kind=StreamIntKind) :: i
do concurrent (i=1:N) !shared(A,C)
C(i) = A(i)
end do
end subroutine copy
subroutine add()
implicit none
integer(kind=StreamIntKind) :: i
do concurrent (i=1:N) !shared(A,B,C)
C(i) = A(i) + B(i)
end do
end subroutine add
subroutine mul(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
scalar = startScalar
do concurrent (i=1:N) !shared(B,C)
B(i) = scalar * C(i)
end do
end subroutine mul
subroutine triad(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
scalar = startScalar
do concurrent (i=1:N) !shared(A,B,C)
A(i) = B(i) + scalar * C(i)
end do
end subroutine triad
subroutine nstream(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
scalar = startScalar
do concurrent (i=1:N) !shared(A,B,C)
A(i) = A(i) + B(i) + scalar * C(i)
end do
end subroutine nstream
function dot() result(s)
implicit none
real(kind=REAL64) :: s
integer(kind=StreamIntKind) :: i
! reduction omitted because NVF infers it and other compilers do not support
s = real(0,kind=REAL64)
#ifdef CRAY_THREAD_DOCONCURRENT
do i=1,N
#else
do concurrent (i=1:N) !shared(A,B)
#endif
s = s + A(i) * B(i)
end do
end function dot
end module DoConcurrentStream

109
src/fortran/Makefile Normal file
View File

@ -0,0 +1,109 @@
ifeq ($(COMPILER),nvhpc)
include make.inc.nvhpc
else ifeq ($(COMPILER),oneapi)
include make.inc.oneapi
else ifeq ($(COMPILER),gcc)
include make.inc.gcc
else ifeq ($(COMPILER),amd)
include make.inc.amd
else ifeq ($(COMPILER),arm)
include make.inc.arm
else ifeq ($(COMPILER),cray)
include make.inc.cray
else ifeq ($(COMPILER),fj)
include make.inc.fj
else
$(info Set COMPILER={nvhpc,oneapi,amd,arm,cray,fj,gcc}. Default is gcc.)
include make.inc.gcc
COMPILER=gcc
endif
FCFLAGS += -DVERSION_STRING="4.0"
#FCFLAGS += -DUSE_INT32
ifeq ($(IMPLEMENTATION),DoConcurrent)
FCFLAGS += -DUSE_DOCONCURRENT $(DOCONCURRENT_FLAG)
IMPLEMENTATION_OBJECT = DoConcurrentStream.o
else ifeq ($(IMPLEMENTATION),Array)
FCFLAGS += -DUSE_ARRAY $(ARRAY_FLAG)
IMPLEMENTATION_OBJECT = ArrayStream.o
else ifeq ($(IMPLEMENTATION),OpenMP)
FCFLAGS += -DUSE_OPENMP $(OPENMP_FLAG)
IMPLEMENTATION_OBJECT = OpenMPStream.o
else ifeq ($(IMPLEMENTATION),OpenMPWorkshare)
FCFLAGS += -DUSE_OPENMPWORKSHARE $(OPENMP_FLAG)
IMPLEMENTATION_OBJECT = OpenMPWorkshareStream.o
else ifeq ($(IMPLEMENTATION),OpenMPTarget)
FCFLAGS += -DUSE_OPENMPTARGET $(OPENMP_FLAG)
IMPLEMENTATION_OBJECT = OpenMPTargetStream.o
else ifeq ($(IMPLEMENTATION),OpenMPTargetLoop)
FCFLAGS += -DUSE_OPENMPTARGETLOOP $(OPENMP_FLAG)
IMPLEMENTATION_OBJECT = OpenMPTargetLoopStream.o
else ifeq ($(IMPLEMENTATION),OpenMPTaskloop)
FCFLAGS += -DUSE_OPENMPTASKLOOP $(OPENMP_FLAG)
IMPLEMENTATION_OBJECT = OpenMPTaskloopStream.o
else ifeq ($(IMPLEMENTATION),OpenACC)
FCFLAGS += -DUSE_OPENACC $(OPENACC_FLAG)
IMPLEMENTATION_OBJECT = OpenACCStream.o
else ifeq ($(IMPLEMENTATION),OpenACCArray)
FCFLAGS += -DUSE_OPENACCARRAY $(OPENACC_FLAG)
IMPLEMENTATION_OBJECT = OpenACCArrayStream.o
else ifeq ($(IMPLEMENTATION),CUDA)
FCFLAGS += -DUSE_CUDA $(CUDA_FLAG)
IMPLEMENTATION_OBJECT = CUDAStream.o
else ifeq ($(IMPLEMENTATION),CUDAKernel)
FCFLAGS += -DUSE_CUDAKERNEL $(CUDA_FLAG)
IMPLEMENTATION_OBJECT = CUDAKernelStream.o
else ifeq ($(IMPLEMENTATION),Sequential)
FCFLAGS += -DUSE_SEQUENTIAL $(SEQUENTIAL_FLAG)
IMPLEMENTATION_OBJECT = SequentialStream.o
else
$(info Set IMPLEMENTATION={DoConcurrent,Array,OpenMP,OpenMPWorkshare,OpenMPTarget,OpenMPTargetLoop,OpenMPTaskloop,OpenACC,OpenACCArray,CUDA,CUDAKernel}.)
FCFLAGS += -DUSE_SEQUENTIAL $(SEQUENTIAL_FLAG)
IMPLEMENTATION=Sequential
IMPLEMENTATION_OBJECT = SequentialStream.o
endif
all: BabelStream.$(COMPILER).$(IMPLEMENTATION)
BabelStream.$(COMPILER).$(IMPLEMENTATION): main.F90 $(IMPLEMENTATION_OBJECT)
$(FC) $(FCFLAGS) $^ BabelStreamTypes.o -o $@
BabelStreamTypes.o BabelStreamTypes.mod: BabelStreamTypes.F90
$(FC) $(FCFLAGS) -c $<
%.o: %.F90 BabelStreamTypes.mod
$(FC) $(FCFLAGS) -c $<
clean:
-rm -f main.o BabelStreamUtil.mod babelstreamutil.mod
-rm -f BabelStreamTypes.o BabelStreamTypes.mod babelstreamtypes.mod
-rm -f DoConcurrentStream.o DoConcurrentStream.mod doconcurrentstream.mod
-rm -f ArrayStream.o ArrayStream.mod arraystream.mod
-rm -f SequentialStream.o SequentialStream.mod sequentialstream.mod
-rm -f OpenMPStream.o OpenMPStream.mod openmpstream.mod
-rm -f OpenMPWorkshareStream.o OpenMPWorkshareStream.mod openmpworksharestream.mod
-rm -f OpenMPTaskloopStream.o OpenMPTaskloopStream.mod openmptaskloopstream.mod
-rm -f OpenMPTargetStream.o OpenMPTargetStream.mod openmptargetstream.mod
-rm -f OpenMPTargetLoopStream.o OpenMPTargetLoopStream.mod openmptargetloopstream.mod
-rm -f OpenACCStream.o OpenACCStream.mod openaccstream.mod
-rm -f OpenACCArrayStream.o OpenACCArrayStream.mod openaccarraystream.mod
-rm -f CUDAStream.o CUDAStream.mod cudastream.mod CUDAFortranKernels.mod cudafortrankernels.mod
-rm -f CUDAKernelStream.o CUDAKernelStream.mod cudakernelstream.mod
-rm -f *.modmic *.mod *.o *.cub *.ptx
realclean: clean
-rm -f BabelStream.*

View File

@ -0,0 +1,144 @@
module OpenACCArrayStream
use, intrinsic :: ISO_Fortran_env
use BabelStreamTypes
implicit none
character(len=12), parameter :: implementation_name = "OpenACCArray"
integer(kind=StreamIntKind) :: N
real(kind=REAL64), allocatable :: A(:), B(:), C(:)
contains
subroutine list_devices()
use openacc
implicit none
integer :: num
num = acc_get_num_devices(acc_get_device_type())
if (num.eq.0) then
write(*,'(a17)') "No devices found."
else
write(*,'(a10,i1,a8)') "There are ",num," devices."
end if
end subroutine list_devices
subroutine set_device(dev)
use openacc
implicit none
integer, intent(in) :: dev
integer :: num
num = acc_get_num_devices(acc_get_device_type())
if (num.eq.0) then
write(*,'(a17)') "No devices found."
stop
else if (dev.gt.num) then
write(*,'(a21)') "Invalid device index."
stop
else
call acc_set_device_num(dev, acc_get_device_type())
end if
end subroutine set_device
subroutine alloc(array_size)
implicit none
integer(kind=StreamIntKind) :: array_size
integer :: err
N = array_size
allocate( A(1:N), B(1:N), C(1:N), stat=err)
#ifndef USE_MANAGED
!$acc enter data create(A,B,C)
#endif
if (err .ne. 0) then
write(*,'(a20,i3)') 'allocate returned ',err
stop 1
endif
end subroutine alloc
subroutine dealloc()
implicit none
integer :: err
#ifndef USE_MANAGED
!$acc exit data delete(A,B,C)
#endif
deallocate( A, B, C, stat=err)
if (err .ne. 0) then
write(*,'(a20,i3)') 'deallocate returned ',err
stop 1
endif
end subroutine dealloc
subroutine init_arrays(initA, initB, initC)
implicit none
real(kind=REAL64), intent(in) :: initA, initB, initC
!$acc kernels
A = initA
B = initB
C = initC
!$acc end kernels
end subroutine init_arrays
subroutine read_arrays(h_A, h_B, h_C)
implicit none
real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
!$acc kernels
h_A = A
h_B = B
h_C = C
!$acc end kernels
end subroutine read_arrays
subroutine copy()
implicit none
!$acc kernels
C = A
!$acc end kernels
end subroutine copy
subroutine add()
implicit none
!$acc kernels
C = A + B
!$acc end kernels
end subroutine add
subroutine mul(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
scalar = startScalar
!$acc kernels
B = scalar * C
!$acc end kernels
end subroutine mul
subroutine triad(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
scalar = startScalar
!$acc kernels
A = B + scalar * C
!$acc end kernels
end subroutine triad
subroutine nstream(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
scalar = startScalar
!$acc kernels
A = A + B + scalar * C
!$acc end kernels
end subroutine nstream
function dot() result(s)
implicit none
real(kind=REAL64) :: s
!$acc kernels
s = dot_product(A,B)
!$acc end kernels
end function dot
end module OpenACCArrayStream

View File

@ -0,0 +1,161 @@
module OpenACCStream
use, intrinsic :: ISO_Fortran_env
use BabelStreamTypes
implicit none
character(len=7), parameter :: implementation_name = "OpenACC"
integer(kind=StreamIntKind) :: N
real(kind=REAL64), allocatable :: A(:), B(:), C(:)
contains
subroutine list_devices()
use openacc
implicit none
integer :: num
num = acc_get_num_devices(acc_get_device_type())
if (num.eq.0) then
write(*,'(a17)') "No devices found."
else
write(*,'(a10,i1,a8)') "There are ",num," devices."
end if
end subroutine list_devices
subroutine set_device(dev)
use openacc
implicit none
integer, intent(in) :: dev
integer :: num
num = acc_get_num_devices(acc_get_device_type())
if (num.eq.0) then
write(*,'(a17)') "No devices found."
stop
else if (dev.gt.num) then
write(*,'(a21)') "Invalid device index."
stop
else
call acc_set_device_num(dev, acc_get_device_type())
end if
end subroutine set_device
subroutine alloc(array_size)
implicit none
integer(kind=StreamIntKind) :: array_size
integer :: err
N = array_size
allocate( A(1:N), B(1:N), C(1:N), stat=err)
#ifndef USE_MANAGED
!$acc enter data create(A,B,C)
#endif
if (err .ne. 0) then
write(*,'(a20,i3)') 'allocate returned ',err
stop 1
endif
end subroutine alloc
subroutine dealloc()
implicit none
integer :: err
#ifndef USE_MANAGED
!$acc exit data delete(A,B,C)
#endif
deallocate( A, B, C, stat=err)
if (err .ne. 0) then
write(*,'(a20,i3)') 'deallocate returned ',err
stop 1
endif
end subroutine dealloc
subroutine init_arrays(initA, initB, initC)
implicit none
real(kind=REAL64), intent(in) :: initA, initB, initC
integer(kind=StreamIntKind) :: i
!$acc parallel loop
do i=1,N
A(i) = initA
B(i) = initB
C(i) = initC
end do
end subroutine init_arrays
subroutine read_arrays(h_A, h_B, h_C)
implicit none
real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
integer(kind=StreamIntKind) :: i
!$acc parallel loop
do i=1,N
h_A(i) = A(i)
h_B(i) = B(i)
h_C(i) = C(i)
end do
end subroutine read_arrays
subroutine copy()
implicit none
integer(kind=StreamIntKind) :: i
!$acc parallel loop
do i=1,N
C(i) = A(i)
end do
end subroutine copy
subroutine add()
implicit none
integer(kind=StreamIntKind) :: i
!$acc parallel loop
do i=1,N
C(i) = A(i) + B(i)
end do
end subroutine add
subroutine mul(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
scalar = startScalar
!$acc parallel loop
do i=1,N
B(i) = scalar * C(i)
end do
end subroutine mul
subroutine triad(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
scalar = startScalar
!$acc parallel loop
do i=1,N
A(i) = B(i) + scalar * C(i)
end do
end subroutine triad
subroutine nstream(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
scalar = startScalar
!$acc parallel loop
do i=1,N
A(i) = A(i) + B(i) + scalar * C(i)
end do
end subroutine nstream
function dot() result(s)
implicit none
real(kind=REAL64) :: s
integer(kind=StreamIntKind) :: i
s = real(0,kind=REAL64)
!$acc parallel loop reduction(+:s)
do i=1,N
s = s + A(i) * B(i)
end do
end function dot
end module OpenACCStream

View File

@ -0,0 +1,137 @@
module OpenMPStream
use, intrinsic :: ISO_Fortran_env
use BabelStreamTypes
implicit none
character(len=6), parameter :: implementation_name = "OpenMP"
integer(kind=StreamIntKind) :: N
real(kind=REAL64), allocatable :: A(:), B(:), C(:)
contains
subroutine list_devices()
implicit none
write(*,'(a36,a12)') "Listing devices is not supported by ", implementation_name
end subroutine list_devices
subroutine set_device(dev)
implicit none
integer, intent(in) :: dev
write(*,'(a32,a12)') "Device != 0 is not supported by ", implementation_name
end subroutine set_device
subroutine alloc(array_size)
implicit none
integer(kind=StreamIntKind) :: array_size
integer :: err
N = array_size
allocate( A(1:N), B(1:N), C(1:N), stat=err)
if (err .ne. 0) then
write(*,'(a20,i3)') 'allocate returned ',err
stop 1
endif
end subroutine alloc
subroutine dealloc()
implicit none
integer :: err
deallocate( A, B, C, stat=err)
if (err .ne. 0) then
write(*,'(a20,i3)') 'deallocate returned ',err
stop 1
endif
end subroutine dealloc
subroutine init_arrays(initA, initB, initC)
implicit none
real(kind=REAL64), intent(in) :: initA, initB, initC
integer(kind=StreamIntKind) :: i
!$omp parallel do simd
do i=1,N
A(i) = initA
B(i) = initB
C(i) = initC
end do
end subroutine init_arrays
subroutine read_arrays(h_A, h_B, h_C)
implicit none
real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
integer(kind=StreamIntKind) :: i
!$omp parallel do simd
do i=1,N
h_A(i) = A(i)
h_B(i) = B(i)
h_C(i) = C(i)
end do
end subroutine read_arrays
subroutine copy()
implicit none
integer(kind=StreamIntKind) :: i
!$omp parallel do simd
do i=1,N
C(i) = A(i)
end do
end subroutine copy
subroutine add()
implicit none
integer(kind=StreamIntKind) :: i
!$omp parallel do simd
do i=1,N
C(i) = A(i) + B(i)
end do
end subroutine add
subroutine mul(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
scalar = startScalar
!$omp parallel do simd
do i=1,N
B(i) = scalar * C(i)
end do
end subroutine mul
subroutine triad(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
scalar = startScalar
!$omp parallel do simd
do i=1,N
A(i) = B(i) + scalar * C(i)
end do
end subroutine triad
subroutine nstream(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
scalar = startScalar
!$omp parallel do simd
do i=1,N
A(i) = A(i) + B(i) + scalar * C(i)
end do
end subroutine nstream
function dot() result(s)
implicit none
real(kind=REAL64) :: s
integer(kind=StreamIntKind) :: i
s = real(0,kind=REAL64)
!$omp parallel do simd reduction(+:s)
do i=1,N
s = s + A(i) * B(i)
end do
end function dot
end module OpenMPStream

View File

@ -0,0 +1,162 @@
module OpenMPTargetLoopStream
use, intrinsic :: ISO_Fortran_env
use BabelStreamTypes
implicit none
character(len=16), parameter :: implementation_name = "OpenMPTargetLoop"
integer(kind=StreamIntKind) :: N
real(kind=REAL64), allocatable :: A(:), B(:), C(:)
contains
subroutine list_devices()
use omp_lib
implicit none
integer :: num
num = omp_get_num_devices()
if (num.eq.0) then
write(*,'(a17)') "No devices found."
else
write(*,'(a10,i1,a8)') "There are ",num," devices."
end if
end subroutine list_devices
subroutine set_device(dev)
use omp_lib
implicit none
integer, intent(in) :: dev
integer :: num
num = omp_get_num_devices()
if (num.eq.0) then
write(*,'(a17)') "No devices found."
stop
else if (dev.gt.num) then
write(*,'(a21)') "Invalid device index."
stop
else
call omp_set_default_device(dev)
end if
end subroutine set_device
subroutine alloc(array_size)
implicit none
integer(kind=StreamIntKind) :: array_size
integer :: err
N = array_size
allocate( A(1:N), B(1:N), C(1:N), stat=err)
#ifndef USE_MANAGED
!$omp target enter data map(alloc: A,B,C)
#endif
if (err .ne. 0) then
write(*,'(a20,i3)') 'allocate returned ',err
stop 1
endif
end subroutine alloc
subroutine dealloc()
implicit none
integer :: err
#ifndef USE_MANAGED
!$omp target exit data map(delete: A,B,C)
#endif
deallocate( A, B, C, stat=err)
if (err .ne. 0) then
write(*,'(a20,i3)') 'deallocate returned ',err
stop 1
endif
end subroutine dealloc
subroutine init_arrays(initA, initB, initC)
implicit none
real(kind=REAL64), intent(in) :: initA, initB, initC
integer(kind=StreamIntKind) :: i
!$omp target teams loop
do i=1,N
A(i) = initA
B(i) = initB
C(i) = initC
end do
end subroutine init_arrays
subroutine read_arrays(h_A, h_B, h_C)
implicit none
real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
integer(kind=StreamIntKind) :: i
! this might need to use a copy API instead...
!$omp target teams loop
do i=1,N
h_A(i) = A(i)
h_B(i) = B(i)
h_C(i) = C(i)
end do
end subroutine read_arrays
subroutine copy()
implicit none
integer(kind=StreamIntKind) :: i
!$omp target teams loop
do i=1,N
C(i) = A(i)
end do
end subroutine copy
subroutine add()
implicit none
integer(kind=StreamIntKind) :: i
!$omp target teams loop
do i=1,N
C(i) = A(i) + B(i)
end do
end subroutine add
subroutine mul(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
scalar = startScalar
!$omp target teams loop
do i=1,N
B(i) = scalar * C(i)
end do
end subroutine mul
subroutine triad(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
scalar = startScalar
!$omp target teams loop
do i=1,N
A(i) = B(i) + scalar * C(i)
end do
end subroutine triad
subroutine nstream(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
scalar = startScalar
!$omp target teams loop
do i=1,N
A(i) = A(i) + B(i) + scalar * C(i)
end do
end subroutine nstream
function dot() result(s)
implicit none
real(kind=REAL64) :: s
integer(kind=StreamIntKind) :: i
s = real(0,kind=REAL64)
!$omp target teams loop reduction(+:s)
do i=1,N
s = s + A(i) * B(i)
end do
end function dot
end module OpenMPTargetLoopStream

View File

@ -0,0 +1,163 @@
module OpenMPTargetStream
use, intrinsic :: ISO_Fortran_env
use BabelStreamTypes
implicit none
character(len=12), parameter :: implementation_name = "OpenMPTarget"
integer(kind=StreamIntKind) :: N
real(kind=REAL64), allocatable :: A(:), B(:), C(:)
contains
subroutine list_devices()
use omp_lib
implicit none
integer :: num
num = omp_get_num_devices()
if (num.eq.0) then
write(*,'(a17)') "No devices found."
else
write(*,'(a10,i1,a8)') "There are ",num," devices."
end if
end subroutine list_devices
subroutine set_device(dev)
use omp_lib
implicit none
integer, intent(in) :: dev
integer :: num
num = omp_get_num_devices()
if (num.eq.0) then
write(*,'(a17)') "No devices found."
stop
else if (dev.gt.num) then
write(*,'(a21)') "Invalid device index."
stop
else
call omp_set_default_device(dev)
end if
end subroutine set_device
subroutine alloc(array_size)
implicit none
integer(kind=StreamIntKind) :: array_size
integer :: err
N = array_size
allocate( A(1:N), B(1:N), C(1:N), stat=err)
#ifndef USE_MANAGED
!$omp target enter data map(alloc: A,B,C)
#endif
if (err .ne. 0) then
write(*,'(a20,i3)') 'allocate returned ',err
stop 1
endif
end subroutine alloc
subroutine dealloc()
implicit none
integer :: err
#ifndef USE_MANAGED
!$omp target exit data map(delete: A,B,C)
#endif
deallocate( A, B, C, stat=err)
if (err .ne. 0) then
write(*,'(a20,i3)') 'deallocate returned ',err
stop 1
endif
end subroutine dealloc
subroutine init_arrays(initA, initB, initC)
implicit none
real(kind=REAL64), intent(in) :: initA, initB, initC
integer(kind=StreamIntKind) :: i
!$omp target teams distribute parallel do simd
do i=1,N
A(i) = initA
B(i) = initB
C(i) = initC
end do
end subroutine init_arrays
subroutine read_arrays(h_A, h_B, h_C)
implicit none
real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
integer(kind=StreamIntKind) :: i
! this might need to use a copy API instead...
!$omp target teams distribute parallel do simd
do i=1,N
h_A(i) = A(i)
h_B(i) = B(i)
h_C(i) = C(i)
end do
end subroutine read_arrays
subroutine copy()
implicit none
integer(kind=StreamIntKind) :: i
!$omp target teams distribute parallel do simd
do i=1,N
C(i) = A(i)
end do
!$omp barrier
end subroutine copy
subroutine add()
implicit none
integer(kind=StreamIntKind) :: i
!$omp target teams distribute parallel do simd
do i=1,N
C(i) = A(i) + B(i)
end do
end subroutine add
subroutine mul(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
scalar = startScalar
!$omp target teams distribute parallel do simd
do i=1,N
B(i) = scalar * C(i)
end do
end subroutine mul
subroutine triad(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
scalar = startScalar
!$omp target teams distribute parallel do simd
do i=1,N
A(i) = B(i) + scalar * C(i)
end do
end subroutine triad
subroutine nstream(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
scalar = startScalar
!$omp target teams distribute parallel do simd
do i=1,N
A(i) = A(i) + B(i) + scalar * C(i)
end do
end subroutine nstream
function dot() result(s)
implicit none
real(kind=REAL64) :: s
integer(kind=StreamIntKind) :: i
s = real(0,kind=REAL64)
!$omp target teams distribute parallel do simd reduction(+:s)
do i=1,N
s = s + A(i) * B(i)
end do
end function dot
end module OpenMPTargetStream

View File

@ -0,0 +1,169 @@
module OpenMPTaskloopStream
use, intrinsic :: ISO_Fortran_env
use BabelStreamTypes
implicit none
character(len=14), parameter :: implementation_name = "OpenMPTaskloop"
integer(kind=StreamIntKind) :: N
real(kind=REAL64), allocatable :: A(:), B(:), C(:)
contains
subroutine list_devices()
implicit none
write(*,'(a36,a12)') "Listing devices is not supported by ", implementation_name
end subroutine list_devices
subroutine set_device(dev)
implicit none
integer, intent(in) :: dev
write(*,'(a32,a12)') "Device != 0 is not supported by ", implementation_name
end subroutine set_device
subroutine alloc(array_size)
implicit none
integer(kind=StreamIntKind) :: array_size
integer :: err
N = array_size
allocate( A(1:N), B(1:N), C(1:N), stat=err)
if (err .ne. 0) then
write(*,'(a20,i3)') 'allocate returned ',err
stop 1
endif
end subroutine alloc
subroutine dealloc()
implicit none
integer :: err
deallocate( A, B, C, stat=err)
if (err .ne. 0) then
write(*,'(a20,i3)') 'deallocate returned ',err
stop 1
endif
end subroutine dealloc
subroutine init_arrays(initA, initB, initC)
implicit none
real(kind=REAL64), intent(in) :: initA, initB, initC
integer(kind=StreamIntKind) :: i
!$omp parallel
!$omp master
!$omp taskloop
do i=1,N
A(i) = initA
B(i) = initB
C(i) = initC
end do
!$omp end master
!$omp end parallel
end subroutine init_arrays
subroutine read_arrays(h_A, h_B, h_C)
implicit none
real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
integer(kind=StreamIntKind) :: i
!$omp parallel
!$omp master
!$omp taskloop
do i=1,N
h_A(i) = A(i)
h_B(i) = B(i)
h_C(i) = C(i)
end do
!$omp end master
!$omp end parallel
end subroutine read_arrays
subroutine copy()
implicit none
integer(kind=StreamIntKind) :: i
!$omp parallel
!$omp master
!$omp taskloop
do i=1,N
C(i) = A(i)
end do
!$omp end master
!$omp end parallel
end subroutine copy
subroutine add()
implicit none
integer(kind=StreamIntKind) :: i
!$omp parallel
!$omp master
!$omp taskloop
do i=1,N
C(i) = A(i) + B(i)
end do
!$omp end master
!$omp end parallel
end subroutine add
subroutine mul(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
scalar = startScalar
!$omp parallel
!$omp master
!$omp taskloop
do i=1,N
B(i) = scalar * C(i)
end do
!$omp end master
!$omp end parallel
end subroutine mul
subroutine triad(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
scalar = startScalar
!$omp parallel
!$omp master
!$omp taskloop
do i=1,N
A(i) = B(i) + scalar * C(i)
end do
!$omp end master
!$omp end parallel
end subroutine triad
subroutine nstream(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
scalar = startScalar
!$omp parallel
!$omp master
!$omp taskloop
do i=1,N
A(i) = A(i) + B(i) + scalar * C(i)
end do
!$omp end master
!$omp end parallel
end subroutine nstream
function dot() result(s)
implicit none
real(kind=REAL64) :: s
integer(kind=StreamIntKind) :: i
s = real(0,kind=REAL64)
!$omp parallel
!$omp master
!$omp taskloop reduction(+:s)
do i=1,N
s = s + A(i) * B(i)
end do
!$omp end master
!$omp end parallel
end function dot
end module OpenMPTaskloopStream

View File

@ -0,0 +1,120 @@
module OpenMPWorkshareStream
use, intrinsic :: ISO_Fortran_env
use BabelStreamTypes
implicit none
character(len=15), parameter :: implementation_name = "OpenMPWorkshare"
integer(kind=StreamIntKind) :: N
real(kind=REAL64), allocatable :: A(:), B(:), C(:)
contains
subroutine list_devices()
implicit none
write(*,'(a36,a12)') "Listing devices is not supported by ", implementation_name
end subroutine list_devices
subroutine set_device(dev)
implicit none
integer, intent(in) :: dev
write(*,'(a32,a12)') "Device != 0 is not supported by ", implementation_name
end subroutine set_device
subroutine alloc(array_size)
implicit none
integer(kind=StreamIntKind) :: array_size
integer :: err
N = array_size
allocate( A(1:N), B(1:N), C(1:N), stat=err)
if (err .ne. 0) then
write(*,'(a20,i3)') 'allocate returned ',err
stop 1
endif
end subroutine alloc
subroutine dealloc()
implicit none
integer :: err
deallocate( A, B, C, stat=err)
if (err .ne. 0) then
write(*,'(a20,i3)') 'deallocate returned ',err
stop 1
endif
end subroutine dealloc
subroutine init_arrays(initA, initB, initC)
implicit none
real(kind=REAL64), intent(in) :: initA, initB, initC
!$omp parallel workshare
A = initA
B = initB
C = initC
!$omp end parallel workshare
end subroutine init_arrays
subroutine read_arrays(h_A, h_B, h_C)
implicit none
real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
!$omp parallel workshare
h_A = A
h_B = B
h_C = C
!$omp end parallel workshare
end subroutine read_arrays
subroutine copy()
implicit none
!$omp parallel workshare
C = A
!$omp end parallel workshare
end subroutine copy
subroutine add()
implicit none
!$omp parallel workshare
C = A + B
!$omp end parallel workshare
end subroutine add
subroutine mul(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
scalar = startScalar
!$omp parallel workshare
B = scalar * C
!$omp end parallel workshare
end subroutine mul
subroutine triad(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
scalar = startScalar
!$omp parallel workshare
A = B + scalar * C
!$omp end parallel workshare
end subroutine triad
subroutine nstream(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
scalar = startScalar
!$omp parallel workshare
A = A + B + scalar * C
!$omp end parallel workshare
end subroutine nstream
function dot() result(s)
implicit none
real(kind=REAL64) :: s
!$omp parallel workshare
s = dot_product(A,B)
!$omp end parallel workshare
end function dot
end module OpenMPWorkshareStream

View File

@ -0,0 +1,130 @@
module SequentialStream
use, intrinsic :: ISO_Fortran_env
use BabelStreamTypes
implicit none
character(len=10), parameter :: implementation_name = "Sequential"
integer(kind=StreamIntKind) :: N
real(kind=REAL64), allocatable :: A(:), B(:), C(:)
contains
subroutine list_devices()
implicit none
integer :: num
write(*,'(a36,a10)') "Listing devices is not supported by ", implementation_name
end subroutine list_devices
subroutine set_device(dev)
implicit none
integer, intent(in) :: dev
write(*,'(a32,a10)') "Device != 0 is not supported by ", implementation_name
end subroutine set_device
subroutine alloc(array_size)
implicit none
integer(kind=StreamIntKind) :: array_size
integer :: err
N = array_size
allocate( A(1:N), B(1:N), C(1:N), stat=err)
if (err .ne. 0) then
write(*,'(a20,i3)') 'allocate returned ',err
stop 1
endif
end subroutine alloc
subroutine dealloc()
implicit none
integer :: err
deallocate( A, B, C, stat=err)
if (err .ne. 0) then
write(*,'(a20,i3)') 'deallocate returned ',err
stop 1
endif
end subroutine dealloc
subroutine init_arrays(initA, initB, initC)
implicit none
real(kind=REAL64), intent(in) :: initA, initB, initC
integer(kind=StreamIntKind) :: i
do i=1,N
A(i) = initA
B(i) = initB
C(i) = initC
end do
end subroutine init_arrays
subroutine read_arrays(h_A, h_B, h_C)
implicit none
real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
integer(kind=StreamIntKind) :: i
do i=1,N
h_A(i) = A(i)
h_B(i) = B(i)
h_C(i) = C(i)
end do
end subroutine read_arrays
subroutine copy()
implicit none
integer(kind=StreamIntKind) :: i
do i=1,N
C(i) = A(i)
end do
end subroutine copy
subroutine add()
implicit none
integer(kind=StreamIntKind) :: i
do i=1,N
C(i) = A(i) + B(i)
end do
end subroutine add
subroutine mul(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
scalar = startScalar
do i=1,N
B(i) = scalar * C(i)
end do
end subroutine mul
subroutine triad(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
scalar = startScalar
do i=1,N
A(i) = B(i) + scalar * C(i)
end do
end subroutine triad
subroutine nstream(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
scalar = startScalar
do i=1,N
A(i) = A(i) + B(i) + scalar * C(i)
end do
end subroutine nstream
function dot() result(s)
implicit none
real(kind=REAL64) :: s
integer(kind=StreamIntKind) :: i
s = real(0,kind=REAL64)
do i=1,N
s = s + A(i) * B(i)
end do
end function dot
end module SequentialStream

54
src/fortran/build.sh Executable file
View File

@ -0,0 +1,54 @@
#!/bin/bash
# uncomment to disable GPU targets
#HAS_GPU=0
# Orin
#if [ "x${compiler}" == "xgcc" ] ; then
# export MCPU=cortex-a78ae
#fi
#if [ "x${compiler}" == "xarm" ] ; then
# export MCPU=cortex-a78
#fi
COMPILERS="gcc"
if [ $(which nvfortran) ] ; then
COMPILERS="${COMPILERS} nvhpc"
fi
if [ $(which crayftn) ] ; then
COMPILERS="${COMPILERS} cray"
fi
if [ $(uname -m) == "aarch64" ] ; then
if [ $(which armflang) ] ; then
COMPILERS="${COMPILERS} arm"
fi
if [ $(which frt) ] ; then
COMPILERS="${COMPILERS} fj"
fi
elif [ $(uname -m) == "x86_64" ] ; then
if [ $(which lscpu >& /dev/null && lscpu | grep GenuineIntel | awk '{print $3}') == "GenuineIntel" ] ; then
COMPILERS="${COMPILERS} oneapi"
if [ -f /opt/intel/oneapi/setvars.sh ] ; then
. /opt/intel/oneapi/setvars.sh >& /dev/null
fi
else
# ^ this detection can be improved
COMPILERS="${COMPILERS} amd"
fi
fi
for compiler in ${COMPILERS} ; do
TARGETS="DoConcurrent Array OpenMP OpenMPTaskloop OpenMPWorkshare"
if [ "${HAS_GPU}" != "0" ] ; then
TARGETS="${TARGETS} OpenMPTarget OpenMPTargetLoop"
if [ "x${compiler}" == "xnvhpc" ] ; then
TARGETS="${TARGETS} CUDA CUDAKernel"
fi
fi
if [ "x${compiler}" == "xnvhpc" ] || [ "x${compiler}" == "xgcc" ] || [ "x${compiler}" == "xcray" ] ; then
TARGETS="${TARGETS} OpenACC OpenACCArray"
fi
for implementation in ${TARGETS} ; do
make COMPILER=${compiler} IMPLEMENTATION=${implementation}
done
done

666
src/fortran/main.F90 Normal file
View File

@ -0,0 +1,666 @@
module BabelStreamUtil
use, intrinsic :: ISO_Fortran_env, only: REAL64,INT64
use BabelStreamTypes
implicit none
integer(kind=StreamIntKind) :: array_size = 33554432
integer(kind=StreamIntKind) :: num_times = 100
logical :: mibibytes = .false.
logical :: use_gigs = .false.
logical :: csv = .false.
character(len=1), parameter :: csv_sep = ","
! 1 = All
! 2 = Triad
! 3 = Nstream
integer :: selection = 1
real(kind=REAL64), parameter :: startA = real(0.1d0,kind=REAL64)
real(kind=REAL64), parameter :: startB = real(0.2d0,kind=REAL64)
real(kind=REAL64), parameter :: startC = real(0.0d0,kind=REAL64)
real(kind=REAL64), parameter :: startScalar = real(0.4d0,kind=REAL64)
contains
function get_wtime() result(t)
#if defined(USE_OMP_GET_WTIME)
use omp_lib
implicit none
real(kind=REAL64) :: t
t = omp_get_wtime()
#elif defined(USE_CPU_TIME)
implicit none
real(kind=REAL64) :: t
real :: r
call cpu_time(r)
t = r
#else
implicit none
real(kind=REAL64) :: t
integer(kind=INT64) :: c, r
call system_clock(count = c, count_rate = r)
t = real(c,REAL64) / real(r,REAL64)
#endif
end function get_wtime
subroutine parseArguments()
use, intrinsic :: ISO_Fortran_env, only: compiler_version, compiler_options
#if defined(USE_DOCONCURRENT)
use DoConcurrentStream, only: list_devices, set_device
#elif defined(USE_ARRAY)
use ArrayStream, only: list_devices, set_device
#elif defined(USE_OPENMP)
use OpenMPStream, only: list_devices, set_device
#elif defined(USE_OPENMPWORKSHARE)
use OpenMPWorkshareStream, only: list_devices, set_device
#elif defined(USE_OPENMPTARGET)
use OpenMPTargetStream, only: list_devices, set_device
#elif defined(USE_OPENMPTARGETLOOP)
use OpenMPTargetLoopStream, only: list_devices, set_device
#elif defined(USE_OPENMPTASKLOOP)
use OpenMPTaskloopStream, only: list_devices, set_device
#elif defined(USE_OPENACC)
use OpenACCStream, only: list_devices, set_device
#elif defined(USE_OPENACCARRAY)
use OpenACCArrayStream, only: list_devices, set_device
#elif defined(USE_CUDA)
use CUDAStream, only: list_devices, set_device
#elif defined(USE_CUDAKERNEL)
use CUDAKernelStream, only: list_devices, set_device
#elif defined(USE_SEQUENTIAL)
use SequentialStream, only: list_devices, set_device
#endif
implicit none
integer :: i, argc
integer :: arglen,err,pos(2)
character(len=64) :: argtmp
argc = command_argument_count()
do i=1,argc
call get_command_argument(i,argtmp,arglen,err)
if (err.eq.0) then
!
! list devices
!
pos(1) = index(argtmp,"--list")
if (pos(1).eq.1) then
call list_devices()
stop
endif
!
! set device number
!
pos(1) = index(argtmp,"--device")
if (pos(1).eq.1) then
if (i+1.gt.argc) then
print*,'You failed to provide a value for ',argtmp
stop
else
call get_command_argument(i+1,argtmp,arglen,err)
block
integer :: dev
read(argtmp,'(i15)') dev
call set_device(dev)
end block
endif
cycle
endif
!
! array size
!
pos(1) = index(argtmp,"--arraysize")
pos(2) = index(argtmp,"-s")
if (any(pos(:).eq.1) ) then
if (i+1.gt.argc) then
print*,'You failed to provide a value for ',argtmp
else
call get_command_argument(i+1,argtmp,arglen,err)
block
integer(kind=INT64) :: big_size
read(argtmp,'(i15)') big_size
if (big_size .gt. HUGE(array_size)) then
print*,'Array size does not fit into integer:'
print*,big_size,'>',HUGE(array_size)
print*,'Stop using USE_INT32'
stop
else
array_size = INT(big_size,kind=StreamIntKind)
endif
end block
endif
cycle
endif
!
! number of iterations
!
pos(1) = index(argtmp,"--numtimes")
pos(2) = index(argtmp,"-n")
if (any(pos(:).eq.1) ) then
if (i+1.gt.argc) then
print*,'You failed to provide a value for ',argtmp
else
call get_command_argument(i+1,argtmp,arglen,err)
read(argtmp,'(i15)') num_times
if (num_times.lt.2) then
write(*,'(a)') "Number of times must be 2 or more"
stop
end if
endif
cycle
endif
!
! precision
!
pos(1) = index(argtmp,"--float")
if (pos(1).eq.1) then
write(*,'(a46,a39)') "Sorry, you have to recompile with -DUSE_FLOAT ", &
"to run BabelStream in single precision."
stop
endif
!
! selection (All, Triad, Nstream)
!
pos(1) = index(argtmp,"--triad-only")
if (pos(1).eq.1) then
selection = 2
cycle
endif
pos(1) = index(argtmp,"--nstream-only")
if (pos(1).eq.1) then
selection = 3
cycle
endif
!
! CSV
!
pos(1) = index(argtmp,"--csv")
if (pos(1).eq.1) then
csv = .true.
!write(*,'(a39)') "Sorry, CSV support isn't available yet."
!stop
endif
!
! units
!
pos(1) = index(argtmp,"--mibibytes")
if (pos(1).eq.1) then
mibibytes = .true.
cycle
endif
!
! giga/gibi instead of mega/mebi
!
pos(1) = index(argtmp,"--gigs")
if (pos(1).eq.1) then
use_gigs = .true.
cycle
endif
!
!
!
pos(1) = index(argtmp,"--compiler-info")
if (pos(1).eq.1) then
write(*,'(a)') 'Compiler version: ',compiler_version()
write(*,'(a)') 'Compiler options: ',compiler_options()
stop
endif
!
! help
!
pos(1) = index(argtmp,"--help")
pos(2) = index(argtmp,"-h")
if (any(pos(:).eq.1) ) then
call get_command_argument(0,argtmp,arglen,err)
write(*,'(a7,a,a10)') "Usage: ", trim(argtmp), " [OPTIONS]"
write(*,'(a)') "Options:"
write(*,'(a)') " -h --help Print the message"
write(*,'(a)') " --list List available devices"
write(*,'(a)') " --device INDEX Select device at INDEX"
write(*,'(a)') " -s --arraysize SIZE Use SIZE elements in the array"
write(*,'(a)') " -n --numtimes NUM Run the test NUM times (NUM >= 2)"
!write(*,'(a)') " --float Use floats (rather than doubles)"
write(*,'(a)') " --triad-only Only run triad"
write(*,'(a)') " --nstream-only Only run nstream"
write(*,'(a)') " --csv Output as csv table"
write(*,'(a)') " --mibibytes Use MiB=2^20 for bandwidth calculation (default MB=10^6)"
write(*,'(a)') " --gigs Use GiB=2^30 or GB=10^9 instead of MiB/MB"
write(*,'(a)') " --compiler-info Print information about compiler and flags, then exit."
stop
endif
end if
end do
end subroutine parseArguments
subroutine run_all(timings, summ)
#if defined(USE_DOCONCURRENT)
use DoConcurrentStream
#elif defined(USE_ARRAY)
use ArrayStream
#elif defined(USE_OPENMP)
use OpenMPStream
#elif defined(USE_OPENMPWORKSHARE)
use OpenMPWorkshareStream
#elif defined(USE_OPENMPTARGET)
use OpenMPTargetStream
#elif defined(USE_OPENMPTARGETLOOP)
use OpenMPTargetLoopStream
#elif defined(USE_OPENMPTASKLOOP)
use OpenMPTaskloopStream
#elif defined(USE_OPENACC)
use OpenACCStream
#elif defined(USE_OPENACCARRAY)
use OpenACCArrayStream
#elif defined(USE_CUDA)
use CUDAStream
#elif defined(USE_CUDAKERNEL)
use CUDAKernelStream
#elif defined(USE_SEQUENTIAL)
use SequentialStream
#endif
implicit none
real(kind=REAL64), intent(inout) :: timings(:,:)
real(kind=REAL64), intent(out) :: summ
real(kind=REAL64) :: t1, t2
integer(kind=StreamIntKind) :: i
do i=1,num_times
t1 = get_wtime()
call copy()
t2 = get_wtime()
timings(1,i) = t2-t1
t1 = get_wtime()
call mul(startScalar)
t2 = get_wtime()
timings(2,i) = t2-t1
t1 = get_wtime()
call add()
t2 = get_wtime()
timings(3,i) = t2-t1
t1 = get_wtime()
call triad(startScalar)
t2 = get_wtime()
timings(4,i) = t2-t1
t1 = get_wtime()
summ = dot()
t2 = get_wtime()
timings(5,i) = t2-t1
end do
end subroutine run_all
subroutine run_triad(timings)
#if defined(USE_DOCONCURRENT)
use DoConcurrentStream
#elif defined(USE_ARRAY)
use ArrayStream
#elif defined(USE_OPENMP)
use OpenMPStream
#elif defined(USE_OPENMPWORKSHARE)
use OpenMPWorkshareStream
#elif defined(USE_OPENMPTARGET)
use OpenMPTargetStream
#elif defined(USE_OPENMPTARGETLOOP)
use OpenMPTargetLoopStream
#elif defined(USE_OPENMPTASKLOOP)
use OpenMPTaskloopStream
#elif defined(USE_OPENACC)
use OpenACCStream
#elif defined(USE_OPENACCARRAY)
use OpenACCArrayStream
#elif defined(USE_CUDA)
use CUDAStream
#elif defined(USE_CUDAKERNEL)
use CUDAKernelStream
#elif defined(USE_SEQUENTIAL)
use SequentialStream
#endif
implicit none
real(kind=REAL64), intent(inout) :: timings(:,:)
real(kind=REAL64) :: t1, t2
integer(kind=StreamIntKind) :: i
do i=1,num_times
t1 = get_wtime()
call triad(startScalar)
t2 = get_wtime()
timings(1,i) = t2-t1
end do
end subroutine run_triad
subroutine run_nstream(timings)
#if defined(USE_DOCONCURRENT)
use DoConcurrentStream
#elif defined(USE_ARRAY)
use ArrayStream
#elif defined(USE_OPENMP)
use OpenMPStream
#elif defined(USE_OPENMPWORKSHARE)
use OpenMPWorkshareStream
#elif defined(USE_OPENMPTARGET)
use OpenMPTargetStream
#elif defined(USE_OPENMPTARGETLOOP)
use OpenMPTargetLoopStream
#elif defined(USE_OPENMPTASKLOOP)
use OpenMPTaskloopStream
#elif defined(USE_OPENACC)
use OpenACCStream
#elif defined(USE_OPENACCARRAY)
use OpenACCArrayStream
#elif defined(USE_CUDA)
use CUDAStream
#elif defined(USE_CUDAKERNEL)
use CUDAKernelStream
#elif defined(USE_SEQUENTIAL)
use SequentialStream
#endif
implicit none
real(kind=REAL64), intent(inout) :: timings(:,:)
real(kind=REAL64) :: t1, t2
integer(kind=StreamIntKind) :: i
do i=1,num_times
t1 = get_wtime()
call nstream(startScalar)
t2 = get_wtime()
timings(1,i) = t2-t1
end do
end subroutine run_nstream
subroutine check_solution(A, B, C, summ)
use, intrinsic :: IEEE_Arithmetic, only: IEEE_Is_Normal
implicit none
real(kind=REAL64), intent(in) :: A(:), B(:), C(:)
real(kind=REAL64), intent(in) :: summ
integer(kind=StreamIntKind) :: i
real(kind=REAL64) :: goldA, goldB, goldC, goldSum
real(kind=REAL64) :: scalar
! always use double because of accumulation error
real(kind=REAL64) :: errA, errB, errC, errSum, epsi
logical :: cleanA, cleanB, cleanC, cleanSum
goldA = startA
goldB = startB
goldC = startC
goldSum = 0.0d0
scalar = startScalar
do i=1,num_times
if (selection.eq.1) then
goldC = goldA
goldB = scalar * goldC
goldC = goldA + goldB
goldA = goldB + scalar * goldC
else if (selection.eq.2) then
goldA = goldB + scalar * goldC
else if (selection.eq.3) then
goldA = goldA + goldB + scalar * goldC;
endif
end do
goldSum = goldA * goldB * array_size
cleanA = ALL(IEEE_Is_Normal(A))
cleanB = ALL(IEEE_Is_Normal(B))
cleanC = ALL(IEEE_Is_Normal(C))
cleanSum = IEEE_Is_Normal(summ)
if (.not. cleanA) then
write(*,'(a51)') "Validation failed on A. Contains NaA/Inf/Subnormal."
end if
if (.not. cleanB) then
write(*,'(a51)') "Validation failed on B. Contains NaA/Inf/Subnormal."
end if
if (.not. cleanC) then
write(*,'(a51)') "Validation failed on C. Contains NaA/Inf/Subnormal."
end if
if (.not. cleanSum) then
write(*,'(a54,e20.12)') "Validation failed on Sum. Contains NaA/Inf/Subnormal: ",summ
end if
errA = SUM( ABS( A - goldA ) ) / array_size
errB = SUM( ABS( B - goldB ) ) / array_size
errC = SUM( ABS( C - goldC ) ) / array_size
errSum = ABS( (summ - goldSum) / goldSum)
epsi = epsilon(real(0,kind=StreamRealKind)) * 100.0d0
if (errA .gt. epsi) then
write(*,'(a38,e20.12)') "Validation failed on A. Average error ", errA
end if
if (errB .gt. epsi) then
write(*,'(a38,e20.12)') "Validation failed on B. Average error ", errB
end if
if (errC .gt. epsi) then
write(*,'(a38,e20.12)') "Validation failed on C. Average error ", errC
end if
if (selection.eq.1) then
if (errSum .gt. 1.0e-8) then
write(*,'(a38,e20.12)') "Validation failed on Sum. Error ", errSum
write(*,'(a8,e20.12,a15,e20.12)') "Sum was ",summ, " but should be ", errSum
end if
endif
end subroutine check_solution
end module BabelStreamUtil
program BabelStream
use BabelStreamUtil
#if defined(USE_DOCONCURRENT)
use DoConcurrentStream
#elif defined(USE_ARRAY)
use ArrayStream
#elif defined(USE_OPENMP)
use OpenMPStream
#elif defined(USE_OPENMPWORKSHARE)
use OpenMPWorkshareStream
#elif defined(USE_OPENMPTARGET)
use OpenMPTargetStream
#elif defined(USE_OPENMPTARGETLOOP)
use OpenMPTargetLoopStream
#elif defined(USE_OPENMPTASKLOOP)
use OpenMPTaskloopStream
#elif defined(USE_OPENACC)
use OpenACCStream
#elif defined(USE_OPENACCARRAY)
use OpenACCArrayStream
#elif defined(USE_CUDA)
use CUDAStream
#elif defined(USE_CUDAKERNEL)
use CUDAKernelStream
#elif defined(USE_SEQUENTIAL)
use SequentialStream
#endif
implicit none
integer :: element_size, err
real(kind=REAL64) :: scaling
character(len=3) :: label
real(kind=REAL64), allocatable :: timings(:,:)
real(kind=REAL64), allocatable :: h_A(:), h_B(:), h_C(:)
real(kind=REAL64) :: summ
call parseArguments()
element_size = storage_size(real(0,kind=StreamRealKind)) / 8
if (mibibytes) then
if (use_gigs) then
scaling = 2.0d0**(-30)
label = "GiB"
else
scaling = 2.0d0**(-20)
label = "MiB"
endif
else
if (use_gigs) then
scaling = 1.0d-9
label = "GB"
else
scaling = 1.0d-6
label = "MB"
endif
endif
if (.not.csv) then
write(*,'(a)') "BabelStream Fortran"
write(*,'(a9,f4.1)') "Version: ", VERSION_STRING
write(*,'(a16,a)') "Implementation: ", implementation_name
block
character(len=32) :: printout
write(printout,'(i9,1x,a5)') num_times,'times'
write(*,'(a16,a)') 'Running kernels ',ADJUSTL(printout)
end block
write(*,'(a11,a6)') 'Precision: ',ADJUSTL(StreamRealName)
write(*,'(a12,f9.1,a3)') 'Array size: ',1.0d0 * element_size * (array_size * scaling), label
write(*,'(a12,f9.1,a3)') 'Total size: ',3.0d0 * element_size * (array_size * scaling), label
endif ! csv
allocate( timings(5,num_times) )
call alloc(array_size)
call init_arrays(startA, startB, startC)
summ = 0.0d0
timings = -1.0d0
if (selection.eq.1) then
call run_all(timings, summ)
else if (selection.eq.2) then
call run_triad(timings)
else if (selection.eq.3) then
call run_nstream(timings)
endif
allocate( h_A(1:array_size), h_B(1:array_size), h_C(1:array_size), stat=err)
if (err .ne. 0) then
write(*,'(a20,i3)') 'allocate returned ',err
stop 1
endif
call read_arrays(h_A, h_B, h_C)
call check_solution(h_A, h_B, h_C, summ)
block
character(len=20) :: printout(8)
real(kind=REAL64) :: tmin,tmax,tavg,nbytes
if (csv) then
write(*,'(a,a1)',advance='no') 'function', csv_sep
write(*,'(a,a1)',advance='no') 'num_times', csv_sep
write(*,'(a,a1)',advance='no') 'n_elements',csv_sep
write(*,'(a,a1)',advance='no') 'sizeof', csv_sep
if (mibibytes) then
write(*,'(a,a1)',advance='no') 'max_mibytes_per_sec',csv_sep
else
write(*,'(a,a1)',advance='no') 'max_mbytes_per_sec', csv_sep
endif
write(*,'(a,a1)',advance='no') 'min_runtime',csv_sep
write(*,'(a,a1)',advance='no') 'max_runtime',csv_sep
write(*,'(a,a1)',advance='yes') 'avg_runtime'
else
write(printout(1),'(a8)') 'Function'
write(printout(2),'(a3,a8)') TRIM(label),'ytes/sec'
write(printout(3),'(a9)') 'Min (sec)'
write(printout(4),'(a3)') 'Max'
write(printout(5),'(a7)') 'Average'
write(*,'(5a12)') ADJUSTL(printout(1:5))
endif ! csv
if (selection.eq.1) then
block
integer, parameter :: sizes(5) = [2,2,3,3,2]
character(len=5), parameter :: labels(5) = ["Copy ", "Mul ", "Add ", "Triad", "Dot "]
integer :: i
do i=1,5
tmin = MINVAL(timings(i,2:num_times))
tmax = MAXVAL(timings(i,2:num_times))
tavg = SUM(timings(i,2:num_times)) / (num_times-1)
nbytes = element_size * REAL(array_size,kind=REAL64) * sizes(i)
write(printout(1),'(a)') labels(i)
if (csv) then
write(printout(2),'(i20)') num_times
write(printout(3),'(i20)') array_size
write(printout(4),'(i20)') element_size
write(printout(5),'(i20)') INT(scaling*nbytes/tmin)
write(printout(6),'(f20.8)') tmin
write(printout(7),'(f20.8)') tmax
write(printout(8),'(f20.8)') tavg
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(1))),csv_sep
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(2))),csv_sep
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(3))),csv_sep
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(4))),csv_sep
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(5))),csv_sep
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(6))),csv_sep
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(7))),csv_sep
write(*,'(a,a1)',advance='yes') TRIM(ADJUSTL(printout(8)))
else
write(printout(2),'(f12.3)') scaling*nbytes/tmin
write(printout(3),'(f12.5)') tmin
write(printout(4),'(f12.5)') tmax
write(printout(5),'(f12.5)') tavg
write(*,'(5a12)') ADJUSTL(printout(1:5))
endif
enddo
end block
else if ((selection.eq.2).or.(selection.eq.3)) then
tmin = MINVAL(timings(1,2:num_times))
tmax = MAXVAL(timings(1,2:num_times))
tavg = SUM(timings(1,2:num_times)) / (num_times-1)
if (selection.eq.2) then
nbytes = element_size * REAL(array_size,kind=REAL64) * 3
write(printout(1),'(a12)') "Triad"
else if (selection.eq.3) then
nbytes = element_size * REAL(array_size,kind=REAL64) * 4
write(printout(1),'(a12)') "Nstream"
endif
if (csv) then
write(printout(2),'(i20)') num_times
write(printout(3),'(i20)') array_size
write(printout(4),'(i20)') element_size
write(printout(5),'(i20)') INT(scaling*nbytes/tmin)
write(printout(6),'(f20.8)') tmin
write(printout(7),'(f20.8)') tmax
write(printout(8),'(f20.8)') tavg
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(1))),csv_sep
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(2))),csv_sep
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(3))),csv_sep
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(4))),csv_sep
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(5))),csv_sep
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(6))),csv_sep
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(7))),csv_sep
write(*,'(a,a1)',advance='yes') TRIM(ADJUSTL(printout(8)))
else
write(printout(2),'(f12.3)') scaling*nbytes/tmin
write(printout(3),'(f12.5)') tmin
write(printout(4),'(f12.5)') tmax
write(printout(5),'(f12.5)') tavg
write(*,'(5a12)') ADJUSTL(printout(1:5))
endif
endif
end block
call dealloc()
end program BabelStream

25
src/fortran/make.inc.amd Normal file
View File

@ -0,0 +1,25 @@
FC := /opt/rocm/llvm/bin/flang
FC := /global/u1/j/jhammond/AMD/aocc-compiler-3.2.0/bin/flang
FCFLAGS := -std=f2018 -O3
FCFLAGS += -Wall -Wno-unused-variable
ifdef MARCH
FCFLAGS += -march=$(MARCH)
else
FCFLAGS += -march=native
endif
DOCONCURRENT_FLAG = -fopenmp # libomp.so required
ARRAY_FLAG = -fopenmp # libomp.so required
OPENMP_FLAG = -fopenmp
#OPENMP_FLAG += -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx908
OPENACC_FLAG = -fopenacc
CUDA_FLAG =
SEQUENTIAL_FLAG =
ifeq ($(IMPLEMENTATION),CUDA)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif
ifeq ($(IMPLEMENTATION),CUDAKernels)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif

39
src/fortran/make.inc.arm Normal file
View File

@ -0,0 +1,39 @@
FC = armflang
FCFLAGS = -std=f2018 -O3
FCFLAGS += -Wall -Wno-unused-variable
# MARCH=neoverse-v1,neoverse-n1,icelake-server,znver3,cortex-a78
ARCH=$(shell uname -m)
ifeq ($(ARCH),aarch64)
ifdef MCPU
FCFLAGS += -mcpu=$(MCPU)
else
FCFLAGS += -mcpu=native
endif
else
ifdef MARCH
FCFLAGS += -march=$(MARCH)
else
FCFLAGS += -march=native
endif
endif
DOCONCURRENT_FLAG = -fopenmp
ARRAY_FLAG = -fopenmp
OPENMP_FLAG = -fopenmp
OPENACC_FLAG = -fopenacc
CUDA_FLAG =
SEQUENTIAL_FLAG =
ifeq ($(IMPLEMENTATION),OpenACC)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif
ifeq ($(IMPLEMENTATION),OpenACCArray)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif
ifeq ($(IMPLEMENTATION),CUDA)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif
ifeq ($(IMPLEMENTATION),CUDAKernels)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif

18
src/fortran/make.inc.cray Normal file
View File

@ -0,0 +1,18 @@
FC := ftn
FCFLAGS = -e F -O3
DOCONCURRENT_FLAG = -h thread_do_concurrent -DCRAY_THREAD_DOCONCURRENT
ARRAY_FLAG = -h autothread
OPENMP_FLAG = -h omp
OPENACC_FLAG = -h acc
# CPU only
OPENACC_FLAG += -h omp
CUDA_FLAG =
SEQUENTIAL_FLAG =
ifeq ($(IMPLEMENTATION),CUDA)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif
ifeq ($(IMPLEMENTATION),CUDAKernels)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif

21
src/fortran/make.inc.fj Normal file
View File

@ -0,0 +1,21 @@
FC := frt
FCFLAGS = -X08 -Kfast -KA64FX -KSVE -KARMV8_3_A -Kzfill=100 -Kprefetch_sequential=soft -Kprefetch_line=8 -Kprefetch_line_L2=16 -Koptmsg=2 -Keval -DUSE_OMP_GET_WTIME=1 # FJ Fortran system_clock is low resolution
DOCONCURRENT_FLAG = -Kparallel,reduction -DNOTSHARED
ARRAY_FLAG = -Kparallel,reduction
OPENMP_FLAG = -fopenmp
OPENACC_FLAG =
# CPU only
OPENACC_FLAG +=
CUDA_FLAG =
SEQUENTIAL_FLAG =
ifeq ($(IMPLEMENTATION),OPENACC)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif
ifeq ($(IMPLEMENTATION),CUDA)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif
ifeq ($(IMPLEMENTATION),CUDAKernels)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif

33
src/fortran/make.inc.gcc Normal file
View File

@ -0,0 +1,33 @@
FC = gfortran
FCFLAGS = -std=f2018 -O3
FCFLAGS += -Wall -Wno-unused-dummy-argument -Wno-unused-variable
# MARCH=neoverse-v1,neoverse-n1,icelake-server,znver3,cortex-a78ae
ARCH=$(shell uname -m)
ifeq ($(ARCH),aarch64)
ifdef MCPU
FCFLAGS += -mcpu=$(MCPU)
else
FCFLAGS += -mcpu=native
endif
else
ifdef MARCH
FCFLAGS += -march=$(MARCH)
else
FCFLAGS += -march=native
endif
endif
DOCONCURRENT_FLAG = -ftree-parallelize-loops=4
ARRAY_FLAG =
OPENMP_FLAG = -fopenmp
OPENACC_FLAG = -fopenacc
CUDA_FLAG =
SEQUENTIAL_FLAG =
ifeq ($(IMPLEMENTATION),CUDA)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif
ifeq ($(IMPLEMENTATION),CUDAKernels)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif

View File

@ -0,0 +1,70 @@
FC := nvfortran
#FCFLAGS := -O3 -Minform=inform -Minfo=all
FCFLAGS := -O3 -Minform=warn
#TARGET=gpu
TARGET=multicore
NVARCH=$(shell which nvidia-smi > /dev/null && nvidia-smi -q | grep "Product Architecture")
ifeq ($(findstring Ampere,$(NVARCH)),Ampere)
$(info Ampere detected)
GPU = cc80
endif
ifeq ($(findstring Turing,$(NVARCH)),Turing)
$(info Turing detected)
GPU = cc75
endif
ifeq ($(findstring Volta,$(NVARCH)),Volta)
$(info Volta detected)
GPU = cc70
endif
ifeq ($(findstring Pascal,$(NVARCH)),Pascal)
$(info Pascal detected)
GPU = cc60,cc61
endif
ifeq ($(shell which jetson_clocks > /dev/null && echo 1),1)
$(info Jetson AGX Orin detected)
GPU = ccn87,cc86
# figure out Xavier later
#GPU = cc72
endif
ifeq ($(GPU),)
$(error Your GPU architecture could not be detected. Set it manually.)
endif
GPUFLAG = -gpu=$(GPU)
# MARCH=neoverse-v1,neoverse-n1,zen3
ARCH=$(shell uname -m)
ifdef MARCH
ifeq ($(ARCH),aarch64)
ifeq ($(MARCH),neoverse-n1)
FCFLAGS += -tp=$(MARCH)
else
ifeq ($(MARCH),neoverse-v1)
FCFLAGS += -tp=$(MARCH)
else
FCFLAGS += -tp=native
endif
endif
else
FCFLAGS += -tp=$(MARCH)
endif
else
FCFLAGS += -tp=native
endif
# this is to allow apples-to-apples comparison with DC in non-DC GPU impls
# set exactly one of these!
#MANAGED = -DUSE_MANAGED -gpu=managed
#DEVICE = -DUSE_DEVICE -cuda -gpu=nomanaged
DOCONCURRENT_FLAG = $(GPUFLAG) -stdpar=$(TARGET) $(DEVICE)
ARRAY_FLAG = $(GPUFLAG) -stdpar=$(TARGET) $(MANAGED)
OPENMP_FLAG = $(GPUFLAG) -mp=$(TARGET) $(MANAGED)
OPENACC_FLAG = $(GPUFLAG) -acc=$(TARGET) $(MANAGED)
CUDA_FLAG = $(GPUFLAG) -cuda -acc=gpu $(MANAGED)
SEQUENTIAL_FLAG =
ifeq ($(IMPLEMENTATION),OpenMPTaskloop)
$(error IMPLEMENTATION=OpenMPTaskloop is not supported by this compiler.)
endif

View File

@ -0,0 +1,32 @@
FC := ifx
FCFLAGS = -std18
FCFLAGS += -Ofast -xHOST
FCFLAGS += -qopt-zmm-usage=low
ifeq ($(FC),ifort)
FCFLAGS += -qopt-streaming-stores=always
PARALLEL = -parallel
endif
DOCONCURRENT_FLAG = -qopenmp $(PARALLEL)
ARRAY_FLAG = -qopenmp $(PARALLEL)
OPENMP_FLAG = -qopenmp
ifeq ($(FC),ifx)
OPENMP_FLAG += -fopenmp-targets=spir64 -DUSE_FLOAT=1
endif
OPENACC_FLAG =
CUDA_FLAG =
SEQUENTIAL_FLAG =
ifeq ($(IMPLEMENTATION),OpenACC)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif
ifeq ($(IMPLEMENTATION),OpenACCArray)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif
ifeq ($(IMPLEMENTATION),CUDA)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif
ifeq ($(IMPLEMENTATION),CUDAKernels)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif

35
src/fortran/run.sh Executable file
View File

@ -0,0 +1,35 @@
#!/bin/bash
cat ./run.sh
if [ `uname -s` == Darwin ] ; then
NUM_HWTHREADS=`sysctl -n hw.ncpu`
MEMORY_BYTES=`sysctl -n hw.memsize`
else
NUM_HWTHREADS=`nproc`
MEMORY_KILOS=`grep MemTotal /proc/meminfo | awk '{print $2}'`
fi
M=128
export OMP_NUM_THREADS=8
export OMP_PROC_BIND=close
export OMP_PLACES=threads
export ACC_NUM_CORES=${OMP_NUM_THREADS}
AFFCONTROL="numactl -N 0 -m 0 -C `seq -s "," 0 $((${OMP_NUM_THREADS}-1))`"
for compiler in gcc nvhpc cray oneapi arm amd fj ; do
#if [ "x$compiler" == "xgcc" ] ; then
# export LD_PRELOAD=/usr/lib/gcc/aarch64-linux-gnu/11/libgomp.so
#fi
for implementation in OpenMP OpenMPTaskloop OpenMPWorkshare DoConcurrent Array OpenACC OpenACCArray CUDA CUDAKernel ; do
if [ -f BabelStream.${compiler}.${implementation} ] ; then
echo "BabelStream.${compiler}.${implementation}"
ldd BabelStream.${compiler}.${implementation}
time $AFFCONTROL \
./BabelStream.${compiler}.${implementation} -s $((1024*1024*${M}))
fi
done
done

View File

@ -9,7 +9,7 @@
#include "hip/hip_runtime.h"
#define TBSIZE 1024
#define DOT_NUM_BLOCKS 256
void check_error(void)
{
@ -47,9 +47,15 @@ HIPStream<T>::HIPStream(const int ARRAY_SIZE, const int device_index)
std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl;
array_size = ARRAY_SIZE;
// Round dot_num_blocks up to next multiple of (TBSIZE * dot_elements_per_lane)
dot_num_blocks = (array_size + (TBSIZE * dot_elements_per_lane - 1)) / (TBSIZE * dot_elements_per_lane);
// Allocate the host array for partial sums for dot kernels
sums = (T*)malloc(sizeof(T) * DOT_NUM_BLOCKS);
// Allocate the host array for partial sums for dot kernels using hipHostMalloc.
// This creates an array on the host which is visible to the device. However, it requires
// synchronization (e.g. hipDeviceSynchronize) for the result to be available on the host
// after it has been passed through to a kernel.
hipHostMalloc(&sums, sizeof(T) * dot_num_blocks, hipHostMallocNonCoherent);
check_error();
// Check buffers fit on the device
hipDeviceProp_t props;
@ -64,15 +70,14 @@ HIPStream<T>::HIPStream(const int ARRAY_SIZE, const int device_index)
check_error();
hipMalloc(&d_c, ARRAY_SIZE*sizeof(T));
check_error();
hipMalloc(&d_sum, DOT_NUM_BLOCKS*sizeof(T));
check_error();
}
template <class T>
HIPStream<T>::~HIPStream()
{
free(sums);
hipHostFree(sums);
check_error();
hipFree(d_a);
check_error();
@ -80,15 +85,13 @@ HIPStream<T>::~HIPStream()
check_error();
hipFree(d_c);
check_error();
hipFree(d_sum);
check_error();
}
template <typename T>
__global__ void init_kernel(T * a, T * b, T * c, T initA, T initB, T initC)
{
const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
const size_t i = blockDim.x * blockIdx.x + threadIdx.x;
a[i] = initA;
b[i] = initB;
c[i] = initC;
@ -97,7 +100,7 @@ __global__ void init_kernel(T * a, T * b, T * c, T initA, T initB, T initC)
template <class T>
void HIPStream<T>::init_arrays(T initA, T initB, T initC)
{
hipLaunchKernelGGL(HIP_KERNEL_NAME(init_kernel<T>), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_a, d_b, d_c, initA, initB, initC);
init_kernel<T><<<dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0>>>(d_a, d_b, d_c, initA, initB, initC);
check_error();
hipDeviceSynchronize();
check_error();
@ -115,18 +118,17 @@ void HIPStream<T>::read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector
check_error();
}
template <typename T>
__global__ void copy_kernel(const T * a, T * c)
{
const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
const size_t i = threadIdx.x + blockIdx.x * blockDim.x;
c[i] = a[i];
}
template <class T>
void HIPStream<T>::copy()
{
hipLaunchKernelGGL(HIP_KERNEL_NAME(copy_kernel<T>), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_a, d_c);
copy_kernel<T><<<dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0>>>(d_a, d_c);
check_error();
hipDeviceSynchronize();
check_error();
@ -136,14 +138,14 @@ template <typename T>
__global__ void mul_kernel(T * b, const T * c)
{
const T scalar = startScalar;
const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
const size_t i = threadIdx.x + blockIdx.x * blockDim.x;
b[i] = scalar * c[i];
}
template <class T>
void HIPStream<T>::mul()
{
hipLaunchKernelGGL(HIP_KERNEL_NAME(mul_kernel<T>), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_b, d_c);
mul_kernel<T><<<dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0>>>(d_b, d_c);
check_error();
hipDeviceSynchronize();
check_error();
@ -152,14 +154,14 @@ void HIPStream<T>::mul()
template <typename T>
__global__ void add_kernel(const T * a, const T * b, T * c)
{
const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
const size_t i = threadIdx.x + blockIdx.x * blockDim.x;
c[i] = a[i] + b[i];
}
template <class T>
void HIPStream<T>::add()
{
hipLaunchKernelGGL(HIP_KERNEL_NAME(add_kernel<T>), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_a, d_b, d_c);
add_kernel<T><<<dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0>>>(d_a, d_b, d_c);
check_error();
hipDeviceSynchronize();
check_error();
@ -169,14 +171,14 @@ template <typename T>
__global__ void triad_kernel(T * a, const T * b, const T * c)
{
const T scalar = startScalar;
const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
const size_t i = threadIdx.x + blockIdx.x * blockDim.x;
a[i] = b[i] + scalar * c[i];
}
template <class T>
void HIPStream<T>::triad()
{
hipLaunchKernelGGL(HIP_KERNEL_NAME(triad_kernel<T>), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_a, d_b, d_c);
triad_kernel<T><<<dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0>>>(d_a, d_b, d_c);
check_error();
hipDeviceSynchronize();
check_error();
@ -186,32 +188,32 @@ template <typename T>
__global__ void nstream_kernel(T * a, const T * b, const T * c)
{
const T scalar = startScalar;
const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
const size_t i = threadIdx.x + blockIdx.x * blockDim.x;
a[i] += b[i] + scalar * c[i];
}
template <class T>
void HIPStream<T>::nstream()
{
hipLaunchKernelGGL(HIP_KERNEL_NAME(nstream_kernel<T>), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_a, d_b, d_c);
nstream_kernel<T><<<dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0>>>(d_a, d_b, d_c);
check_error();
hipDeviceSynchronize();
check_error();
}
template <class T>
template <typename T>
__global__ void dot_kernel(const T * a, const T * b, T * sum, int array_size)
{
__shared__ T tb_sum[TBSIZE];
int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
const size_t local_i = hipThreadIdx_x;
const size_t local_i = threadIdx.x;
size_t i = blockDim.x * blockIdx.x + local_i;
tb_sum[local_i] = 0.0;
for (; i < array_size; i += hipBlockDim_x*hipGridDim_x)
for (; i < array_size; i += blockDim.x*gridDim.x)
tb_sum[local_i] += a[i] * b[i];
for (int offset = hipBlockDim_x / 2; offset > 0; offset /= 2)
for (size_t offset = blockDim.x / 2; offset > 0; offset /= 2)
{
__syncthreads();
if (local_i < offset)
@ -221,20 +223,19 @@ __global__ void dot_kernel(const T * a, const T * b, T * sum, int array_size)
}
if (local_i == 0)
sum[hipBlockIdx_x] = tb_sum[local_i];
sum[blockIdx.x] = tb_sum[local_i];
}
template <class T>
T HIPStream<T>::dot()
{
hipLaunchKernelGGL(HIP_KERNEL_NAME(dot_kernel<T>), dim3(DOT_NUM_BLOCKS), dim3(TBSIZE), 0, 0, d_a, d_b, d_sum, array_size);
dot_kernel<T><<<dim3(dot_num_blocks), dim3(TBSIZE), 0, 0>>>(d_a, d_b, sums, array_size);
check_error();
hipMemcpy(sums, d_sum, DOT_NUM_BLOCKS*sizeof(T), hipMemcpyDeviceToHost);
hipDeviceSynchronize();
check_error();
T sum = 0.0;
for (int i = 0; i < DOT_NUM_BLOCKS; i++)
for (int i = 0; i < dot_num_blocks; i++)
sum += sums[i];
return sum;

View File

@ -14,13 +14,31 @@
#include "Stream.h"
#define IMPLEMENTATION_STRING "HIP"
#define DOT_READ_DWORDS_PER_LANE 4
template <class T>
class HIPStream : public Stream<T>
{
// Make sure that either:
// DOT_READ_DWORDS_PER_LANE is less than sizeof(T), in which case we default to 1 element
// or
// DOT_READ_DWORDS_PER_LANE is divisible by sizeof(T)
static_assert((DOT_READ_DWORDS_PER_LANE * sizeof(unsigned int) < sizeof(T)) ||
(DOT_READ_DWORDS_PER_LANE * sizeof(unsigned int) % sizeof(T) == 0),
"DOT_READ_DWORDS_PER_LANE not divisible by sizeof(element_type)");
// Take into account the datatype size
// That is, for 4 DOT_READ_DWORDS_PER_LANE, this is 2 FP64 elements
// and 4 FP32 elements
static constexpr unsigned int dot_elements_per_lane{
(DOT_READ_DWORDS_PER_LANE * sizeof(unsigned int)) < sizeof(T) ? 1 : (
DOT_READ_DWORDS_PER_LANE * sizeof(unsigned int) / sizeof(T))};
protected:
// Size of arrays
int array_size;
int dot_num_blocks;
// Host array for partial sums for dot kernel
T *sums;
@ -29,7 +47,6 @@ class HIPStream : public Stream<T>
T *d_a;
T *d_b;
T *d_c;
T *d_sum;
public:

View File

@ -1,4 +1,4 @@
// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith,
// Copyright (c) 2015-23 Tom Deakin, Simon McIntosh-Smith, Wei-Chen (Tom) Lin
// University of Bristol HPC
//
// For full license terms please see the LICENSE file distributed with this

View File

@ -10,7 +10,6 @@
#include <stdexcept>
#include <Kokkos_Core.hpp>
#include "Stream.h"
#define IMPLEMENTATION_STRING "Kokkos"

View File

@ -1,32 +1,38 @@
register_flag_optional(CMAKE_CXX_COMPILER
"Any CXX compiler that is supported by CMake detection and RAJA.
See https://github.com/kokkos/kokkos#primary-tested-compilers-on-x86-are"
"c++")
register_flag_required(KOKKOS_IN_TREE
register_flag_optional(KOKKOS_IN_TREE
"Absolute path to the *source* distribution directory of Kokkos.
Remember to append Kokkos specific flags as well, for example:
-DKOKKOS_IN_TREE=... -DKokkos_ENABLE_OPENMP=ON -DKokkos_ARCH_ZEN=ON ...
See https://github.com/kokkos/kokkos/blob/master/BUILD.md for all available options" "")
See https://github.com/kokkos/kokkos/blob/master/BUILD.md for all available options")
register_flag_optional(KOKKOS_IN_PACKAGE
"Absolute path to package R-Path containing Kokkos libs.
Use this instead of KOKKOS_IN_TREE if Kokkos is from a package manager like Spack." "")
# compiler vendor and arch specific flags
set(KOKKOS_FLAGS_CPU_INTEL -qopt-streaming-stores=always)
macro(setup)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD 17) # Kokkos 4+ requires CXX >= 17
cmake_policy(SET CMP0074 NEW) #see https://github.com/kokkos/kokkos/blob/master/BUILD.md
message(STATUS "Building using in-tree Kokkos source at `${KOKKOS_IN_TREE}`")
if (EXISTS "${KOKKOS_IN_TREE}")
message(STATUS "Build using in-tree Kokkos source at `${KOKKOS_IN_TREE}`")
add_subdirectory(${KOKKOS_IN_TREE} ${CMAKE_BINARY_DIR}/kokkos)
register_link_library(Kokkos::kokkos)
else ()
message(FATAL_ERROR "`${KOKKOS_IN_TREE}` does not exist")
elseif (EXISTS "${KOKKOS_IN_PACKAGE}")
message(STATUS "Build using packaged Kokkos at `${KOKKOS_IN_PACKAGE}`")
set (Kokkos_DIR "${KOKKOS_IN_PACKAGE}/lib64/cmake/Kokkos")
find_package(Kokkos REQUIRED)
register_link_library(Kokkos::kokkos)
else()
message(FATAL_ERROR "Neither `KOKKOS_IN_TREE`, or `KOKKOS_IN_PACKAGE` was set!")
endif ()
register_append_compiler_and_arch_specific_cxx_flags(
@ -36,5 +42,3 @@ macro(setup)
)
endmacro()

View File

@ -1,18 +1,19 @@
register_flag_optional(CMAKE_CXX_COMPILER
"Any CXX compiler that is supported by CMake detection and RAJA.
See https://raja.readthedocs.io/en/main/getting_started.html#build-and-install"
"c++")
register_flag_required(RAJA_IN_TREE
register_flag_optional(RAJA_IN_TREE
"Absolute path to the *source* distribution directory of RAJA.
Make sure to use the release version of RAJA or clone RAJA recursively with submodules.
Remember to append RAJA specific flags as well, for example:
-DRAJA_IN_TREE=... -DENABLE_OPENMP=ON -DENABLE_CUDA=ON ...
See https://github.com/LLNL/RAJA/blob/08cbbafd2d21589ebf341f7275c229412d0fe903/CMakeLists.txt#L44 for all available options
")
" "")
register_flag_optional(RAJA_IN_PACKAGE
"Use if Raja is part of a package dependency:
Path to installation" "")
register_flag_optional(TARGET
"Target offload device, implemented values are CPU, NVIDIA"
@ -76,16 +77,22 @@ macro(setup)
register_link_library(RAJA)
# RAJA's cmake screws with where the binary will end up, resetting it here:
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
elseif (EXISTS "${RAJA_IN_PACKAGE}")
message(STATUS "Building using packaged Raja at `${RAJA_IN_PACKAGE}`")
find_package(RAJA REQUIRED)
register_link_library(RAJA)
else ()
message(FATAL_ERROR "`${RAJA_IN_TREE}` does not exist")
message(FATAL_ERROR "Neither `${RAJA_IN_TREE}` or `${RAJA_IN_PACKAGE}` exists")
endif ()
if (ENABLE_CUDA)
# RAJA needs the codebase to be compiled with nvcc, so we tell cmake to treat sources as *.cu
enable_language(CUDA)
set_source_files_properties(RAJAStream.cpp PROPERTIES LANGUAGE CUDA)
set_source_files_properties(main.cpp PROPERTIES LANGUAGE CUDA)
set_source_files_properties(src/raja/RAJAStream.cpp PROPERTIES LANGUAGE CUDA)
set_source_files_properties(src/main.cpp PROPERTIES LANGUAGE CUDA)
endif ()

View File

@ -4,13 +4,131 @@ version = 3
[[package]]
name = "ansi_term"
version = "0.11.0"
version = "0.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ee49baf6cb617b853aa8d93bf420db2383fab46d314482ca2803b40d5fde979b"
checksum = "d52a9bb7ec0cf484c551830a7ce27bd20d67eac647e1befb56b0be4ee39a55d2"
dependencies = [
"winapi 0.3.9",
]
[[package]]
name = "async-attributes"
version = "1.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a3203e79f4dd9bdda415ed03cf14dae5a2bf775c683a00f94e9cd1faf0f596e5"
dependencies = [
"quote",
"syn",
]
[[package]]
name = "async-channel"
version = "1.7.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e14485364214912d3b19cc3435dde4df66065127f05fa0d75c712f36f12c2f28"
dependencies = [
"concurrent-queue",
"event-listener",
"futures-core",
]
[[package]]
name = "async-executor"
version = "1.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "871f9bb5e0a22eeb7e8cf16641feb87c9dc67032ccf8ff49e772eb9941d3a965"
dependencies = [
"async-task",
"concurrent-queue",
"fastrand",
"futures-lite",
"once_cell",
"slab",
]
[[package]]
name = "async-global-executor"
version = "2.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0da5b41ee986eed3f524c380e6d64965aea573882a8907682ad100f7859305ca"
dependencies = [
"async-channel",
"async-executor",
"async-io",
"async-lock",
"blocking",
"futures-lite",
"once_cell",
]
[[package]]
name = "async-io"
version = "1.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "83e21f3a490c72b3b0cf44962180e60045de2925d8dff97918f7ee43c8f637c7"
dependencies = [
"autocfg",
"concurrent-queue",
"futures-lite",
"libc",
"log",
"once_cell",
"parking",
"polling",
"slab",
"socket2",
"waker-fn",
"winapi 0.3.9",
]
[[package]]
name = "async-lock"
version = "2.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e97a171d191782fba31bb902b14ad94e24a68145032b7eedf871ab0bc0d077b6"
dependencies = [
"event-listener",
]
[[package]]
name = "async-std"
version = "1.12.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "62565bb4402e926b29953c785397c6dc0391b7b446e45008b0049eb43cec6f5d"
dependencies = [
"async-attributes",
"async-channel",
"async-global-executor",
"async-io",
"async-lock",
"crossbeam-utils",
"futures-channel",
"futures-core",
"futures-io",
"futures-lite",
"gloo-timers",
"kv-log-macro",
"log",
"memchr",
"once_cell",
"pin-project-lite",
"pin-utils",
"slab",
"wasm-bindgen-futures",
]
[[package]]
name = "async-task"
version = "4.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7a40729d2133846d9ed0ea60a8b9541bccddab49cd30f0715a1da672fe9a2524"
[[package]]
name = "atomic-waker"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "065374052e7df7ee4047b1160cca5e1467a12351a40b3da123c870ba0b8eda2a"
[[package]]
name = "atty"
version = "0.2.14"
@ -24,9 +142,9 @@ dependencies = [
[[package]]
name = "autocfg"
version = "1.0.1"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cdb031dd78e28731d87d56cc8ffef4a8f36ca26c38fe2de700543e627f8a464a"
checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa"
[[package]]
name = "bitflags"
@ -34,6 +152,38 @@ version = "1.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
[[package]]
name = "blocking"
version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c6ccb65d468978a086b69884437ded69a90faab3bbe6e67f242173ea728acccc"
dependencies = [
"async-channel",
"async-task",
"atomic-waker",
"fastrand",
"futures-lite",
"once_cell",
]
[[package]]
name = "bumpalo"
version = "3.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c1ad822118d20d2c234f427000d5acc36eabe1e29a348c89b63dd60b13f28e5d"
[[package]]
name = "cache-padded"
version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c1db59621ec70f09c5e9b597b220c7a2b43611f4710dc03ceb8748637775692c"
[[package]]
name = "cc"
version = "1.0.73"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2fff2a6927b3bb87f9595d67196a70493f627687a71d87a0d692242c33f58c11"
[[package]]
name = "cfg-if"
version = "1.0.0"
@ -42,9 +192,9 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
[[package]]
name = "clap"
version = "2.33.3"
version = "2.34.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "37e58ac78573c40708d45522f0d80fa2f01cc4f9b4e2bf749807255454312002"
checksum = "a0610544180c38b88101fecf2dd634b174a62eef6946f84dfc6a7127512b381c"
dependencies = [
"ansi_term",
"atty",
@ -64,6 +214,15 @@ dependencies = [
"crossterm",
]
[[package]]
name = "concurrent-queue"
version = "1.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "af4780a44ab5696ea9e28294517f1fffb421a83a25af521333c838635509db9c"
dependencies = [
"cache-padded",
]
[[package]]
name = "core_affinity"
version = "0.5.10"
@ -78,9 +237,9 @@ dependencies = [
[[package]]
name = "crossbeam"
version = "0.8.1"
version = "0.8.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4ae5588f6b3c3cb05239e90bd110f257254aecd01e4635400391aeae07497845"
checksum = "2801af0d36612ae591caa9568261fddce32ce6e08a7275ea334a06a4ad021a2c"
dependencies = [
"cfg-if",
"crossbeam-channel",
@ -92,9 +251,9 @@ dependencies = [
[[package]]
name = "crossbeam-channel"
version = "0.5.1"
version = "0.5.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "06ed27e177f16d65f0f0c22a213e17c696ace5dd64b14258b52f9417ccb52db4"
checksum = "c2dd04ddaf88237dc3b8d8f9a3c1004b506b54b3313403944054d23c0870c521"
dependencies = [
"cfg-if",
"crossbeam-utils",
@ -102,9 +261,9 @@ dependencies = [
[[package]]
name = "crossbeam-deque"
version = "0.8.1"
version = "0.8.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6455c0ca19f0d2fbf751b908d5c55c1f5cbc65e03c4225427254b46890bdde1e"
checksum = "715e8152b692bba2d374b53d4875445368fdf21a94751410af607a5ac677d1fc"
dependencies = [
"cfg-if",
"crossbeam-epoch",
@ -113,22 +272,22 @@ dependencies = [
[[package]]
name = "crossbeam-epoch"
version = "0.9.5"
version = "0.9.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4ec02e091aa634e2c3ada4a392989e7c3116673ef0ac5b72232439094d73b7fd"
checksum = "f916dfc5d356b0ed9dae65f1db9fc9770aa2851d2662b988ccf4fe3516e86348"
dependencies = [
"autocfg",
"cfg-if",
"crossbeam-utils",
"lazy_static",
"memoffset",
"scopeguard",
]
[[package]]
name = "crossbeam-queue"
version = "0.3.2"
version = "0.3.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9b10ddc024425c88c2ad148c1b0fd53f4c6d38db9697c9f1588381212fa657c9"
checksum = "1cd42583b04998a5363558e5f9291ee5a5ff6b49944332103f251e7479a82aa7"
dependencies = [
"cfg-if",
"crossbeam-utils",
@ -136,12 +295,11 @@ dependencies = [
[[package]]
name = "crossbeam-utils"
version = "0.8.5"
version = "0.8.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d82cfc11ce7f2c3faef78d8a684447b40d503d9681acebed6cb728d45940c4db"
checksum = "edbafec5fa1f196ca66527c1b12c2ec4745ca14b50f1ad8f9f6f720b55d11fac"
dependencies = [
"cfg-if",
"lazy_static",
]
[[package]]
@ -170,10 +328,157 @@ dependencies = [
]
[[package]]
name = "either"
version = "1.6.1"
name = "ctor"
version = "0.1.23"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457"
checksum = "cdffe87e1d521a10f9696f833fe502293ea446d7f256c06128293a4119bdf4cb"
dependencies = [
"quote",
"syn",
]
[[package]]
name = "either"
version = "1.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "90e5c1c8368803113bf0c9584fc495a58b86dc8a29edbf8fe877d21d9507e797"
[[package]]
name = "event-listener"
version = "2.5.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0206175f82b8d6bf6652ff7d71a1e27fd2e4efde587fd368662814d6ec1d9ce0"
[[package]]
name = "fastrand"
version = "1.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a7a407cfaa3385c4ae6b23e84623d48c2798d06e3e6a1878f7f59f17b3f86499"
dependencies = [
"instant",
]
[[package]]
name = "futures"
version = "0.3.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7f21eda599937fba36daeb58a22e8f5cee2d14c4a17b5b7739c7c8e5e3b8230c"
dependencies = [
"futures-channel",
"futures-core",
"futures-executor",
"futures-io",
"futures-sink",
"futures-task",
"futures-util",
]
[[package]]
name = "futures-channel"
version = "0.3.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "30bdd20c28fadd505d0fd6712cdfcb0d4b5648baf45faef7f852afb2399bb050"
dependencies = [
"futures-core",
"futures-sink",
]
[[package]]
name = "futures-core"
version = "0.3.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4e5aa3de05362c3fb88de6531e6296e85cde7739cccad4b9dfeeb7f6ebce56bf"
[[package]]
name = "futures-executor"
version = "0.3.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9ff63c23854bee61b6e9cd331d523909f238fc7636290b96826e9cfa5faa00ab"
dependencies = [
"futures-core",
"futures-task",
"futures-util",
]
[[package]]
name = "futures-io"
version = "0.3.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bbf4d2a7a308fd4578637c0b17c7e1c7ba127b8f6ba00b29f717e9655d85eb68"
[[package]]
name = "futures-lite"
version = "1.12.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7694489acd39452c77daa48516b894c153f192c3578d5a839b62c58099fcbf48"
dependencies = [
"fastrand",
"futures-core",
"futures-io",
"memchr",
"parking",
"pin-project-lite",
"waker-fn",
]
[[package]]
name = "futures-macro"
version = "0.3.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "42cd15d1c7456c04dbdf7e88bcd69760d74f3a798d6444e16974b505b0e62f17"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "futures-sink"
version = "0.3.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "21b20ba5a92e727ba30e72834706623d94ac93a725410b6a6b6fbc1b07f7ba56"
[[package]]
name = "futures-task"
version = "0.3.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a6508c467c73851293f390476d4491cf4d227dbabcd4170f3bb6044959b294f1"
[[package]]
name = "futures-timer"
version = "3.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e64b03909df88034c26dc1547e8970b91f98bdb65165d6a4e9110d94263dbb2c"
[[package]]
name = "futures-util"
version = "0.3.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "44fb6cb1be61cc1d2e43b262516aafcf63b241cffdb1d3fa115f91d9c7b09c90"
dependencies = [
"futures-channel",
"futures-core",
"futures-io",
"futures-macro",
"futures-sink",
"futures-task",
"memchr",
"pin-project-lite",
"pin-utils",
"slab",
]
[[package]]
name = "gloo-timers"
version = "0.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5fb7d06c1c8cc2a29bee7ec961009a0b2caa0793ee4900c2ffb348734ba1c8f9"
dependencies = [
"futures-channel",
"futures-core",
"js-sys",
"wasm-bindgen",
]
[[package]]
name = "heck"
@ -202,6 +507,15 @@ dependencies = [
"cfg-if",
]
[[package]]
name = "js-sys"
version = "0.3.60"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "49409df3e3bf0856b916e2ceaca09ee28e6871cf7d9ce97a692cacfdb2a25a47"
dependencies = [
"wasm-bindgen",
]
[[package]]
name = "kernel32-sys"
version = "0.2.2"
@ -212,6 +526,15 @@ dependencies = [
"winapi-build",
]
[[package]]
name = "kv-log-macro"
version = "1.0.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0de8b303297635ad57c9f5059fd9cee7a47f8e8daa09df0fcd07dd39fb22977f"
dependencies = [
"log",
]
[[package]]
name = "lazy_static"
version = "1.4.0"
@ -220,33 +543,41 @@ checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
[[package]]
name = "libc"
version = "0.2.108"
version = "0.2.134"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8521a1b57e76b1ec69af7599e75e38e7b7fad6610f037db8c79b127201b5d119"
checksum = "329c933548736bc49fd575ee68c89e8be4d260064184389a5b77517cddd99ffb"
[[package]]
name = "lock_api"
version = "0.4.5"
version = "0.4.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "712a4d093c9976e24e7dbca41db895dabcbac38eb5f4045393d17a95bdfb1109"
checksum = "435011366fe56583b16cf956f9df0095b405b82d76425bc8981c0e22e60ec4df"
dependencies = [
"autocfg",
"scopeguard",
]
[[package]]
name = "log"
version = "0.4.14"
version = "0.4.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "51b9bbe6c47d51fc3e1a9b945965946b4c44142ab8792c50835a980d362c2710"
checksum = "abb12e687cfb44aa40f41fc3978ef76448f9b6038cad6aef4259d3c095a2382e"
dependencies = [
"cfg-if",
"value-bag",
]
[[package]]
name = "memoffset"
version = "0.6.4"
name = "memchr"
version = "2.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "59accc507f1338036a0477ef61afdae33cde60840f4dfe481319ce3ad116ddf9"
checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d"
[[package]]
name = "memoffset"
version = "0.6.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5aa361d4faea93603064a027415f07bd8e1d5c88c9fbf68bf56a285428fd79ce"
dependencies = [
"autocfg",
]
@ -275,32 +606,44 @@ dependencies = [
[[package]]
name = "ntapi"
version = "0.3.6"
version = "0.3.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3f6bb902e437b6d86e03cce10a7e2af662292c5dfef23b65899ea3ac9354ad44"
checksum = "c28774a7fd2fbb4f0babd8237ce554b73af68021b5f695a3cebd6c59bac0980f"
dependencies = [
"winapi 0.3.9",
]
[[package]]
name = "num-traits"
version = "0.2.14"
version = "0.2.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9a64b1ec5cda2586e284722486d802acf1f7dbdc623e2bfc57e65ca1cd099290"
checksum = "578ede34cf02f8924ab9447f50c28075b4d3e5b269972345e7e0372b38c6cdcd"
dependencies = [
"autocfg",
]
[[package]]
name = "num_cpus"
version = "1.13.0"
version = "1.13.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "05499f3756671c15885fee9034446956fff3f243d6077b91e5767df161f766b3"
checksum = "19e64526ebdee182341572e50e9ad03965aa510cd94427a4549448f285e957a1"
dependencies = [
"hermit-abi",
"libc",
]
[[package]]
name = "once_cell"
version = "1.15.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e82dad04139b71a90c080c8463fe0dc7902db5192d939bd0950f074d014339e1"
[[package]]
name = "parking"
version = "2.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "427c3892f9e783d91cc128285287e70a59e206ca452770ece88a76f7a3eddd72"
[[package]]
name = "parking_lot"
version = "0.11.2"
@ -327,12 +670,29 @@ dependencies = [
]
[[package]]
name = "pest"
version = "2.1.3"
name = "pin-project-lite"
version = "0.2.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "10f4872ae94d7b90ae48754df22fd42ad52ce740b8f370b03da4835417403e53"
checksum = "e0a7ae3ac2f1173085d398531c705756c94a4c56843785df85a60c1a0afac116"
[[package]]
name = "pin-utils"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184"
[[package]]
name = "polling"
version = "2.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "899b00b9c8ab553c743b3e11e87c5c7d423b2a2de229ba95b24a756344748011"
dependencies = [
"ucd-trie",
"autocfg",
"cfg-if",
"libc",
"log",
"wepoll-ffi",
"winapi 0.3.9",
]
[[package]]
@ -361,27 +721,27 @@ dependencies = [
[[package]]
name = "proc-macro2"
version = "1.0.32"
version = "1.0.46"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ba508cc11742c0dc5c1659771673afbab7a0efab23aa17e854cbab0837ed0b43"
checksum = "94e2ef8dbfc347b10c094890f778ee2e36ca9bb4262e86dc99cd217e35f3470b"
dependencies = [
"unicode-xid",
"unicode-ident",
]
[[package]]
name = "quote"
version = "1.0.10"
version = "1.0.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "38bc8cc6a5f2e3655e0899c1b848643b2562f853f114bfec7be120678e3ace05"
checksum = "bbe448f377a7d6961e30f5955f9b8d106c3f5e449d493ee1b125c1d43c2b5179"
dependencies = [
"proc-macro2",
]
[[package]]
name = "rayon"
version = "1.5.1"
version = "1.5.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c06aca804d41dbc8ba42dfd964f0d01334eceb64314b9ecf7c5fad5188a06d90"
checksum = "bd99e5772ead8baa5215278c9b15bf92087709e9c1b2d1f97cdb5a183c933a7d"
dependencies = [
"autocfg",
"crossbeam-deque",
@ -391,31 +751,43 @@ dependencies = [
[[package]]
name = "rayon-core"
version = "1.9.1"
version = "1.9.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d78120e2c850279833f1dd3582f730c4ab53ed95aeaaaa862a2a5c71b1656d8e"
checksum = "258bcdb5ac6dad48491bb2992db6b7cf74878b0384908af124823d118c99683f"
dependencies = [
"crossbeam-channel",
"crossbeam-deque",
"crossbeam-utils",
"lazy_static",
"num_cpus",
]
[[package]]
name = "redox_syscall"
version = "0.2.10"
version = "0.2.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8383f39639269cde97d255a32bdb68c047337295414940c68bdd30c2e13203ff"
checksum = "fb5a58c1855b4b6819d59012155603f0b22ad30cad752600aadfcb695265519a"
dependencies = [
"bitflags",
]
[[package]]
name = "rstest"
version = "0.10.0"
version = "0.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "041bb0202c14f6a158bbbf086afb03d0c6e975c2dec7d4912f8061ed44f290af"
checksum = "b939295f93cb1d12bc1a83cf9ee963199b133fb8a79832dd51b68bb9f59a04dc"
dependencies = [
"async-std",
"futures",
"futures-timer",
"rstest_macros",
"rustc_version",
]
[[package]]
name = "rstest_macros"
version = "0.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f78aba848123782ba59340928ec7d876ebe745aa0365d6af8a630f19a5c16116"
dependencies = [
"cfg-if",
"proc-macro2",
@ -426,7 +798,7 @@ dependencies = [
[[package]]
name = "rust-stream"
version = "3.4.0"
version = "4.0.0"
dependencies = [
"colour",
"core_affinity",
@ -443,18 +815,18 @@ dependencies = [
[[package]]
name = "rustc_version"
version = "0.3.3"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f0dfe2087c51c460008730de8b57e6a320782fbfb312e1f4d520e6c6fae155ee"
checksum = "bfa0f585226d2e68097d4f95d113b15b83a82e819ab25717ec0590d9584ef366"
dependencies = [
"semver",
]
[[package]]
name = "rustversion"
version = "1.0.5"
version = "1.0.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "61b3909d758bb75c79f23d4736fac9433868679d3ad2ea7a61e3c25cfda9a088"
checksum = "97477e48b4cf8603ad5f7aaf897467cf42ab4218a38ef76fb14c2d6773a6d6a8"
[[package]]
name = "scopeguard"
@ -464,21 +836,9 @@ checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd"
[[package]]
name = "semver"
version = "0.11.0"
version = "1.0.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f301af10236f6df4160f7c3f04eec6dbc70ace82d23326abad5edee88801c6b6"
dependencies = [
"semver-parser",
]
[[package]]
name = "semver-parser"
version = "0.10.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "00b0bef5b7f9e0df16536d3961cfb6e84331c065b4066afb39768d0e319411f7"
dependencies = [
"pest",
]
checksum = "e25dfac463d778e353db5be2449d1cce89bd6fd23c9f1ea21310ce6e5a1b29c4"
[[package]]
name = "signal-hook"
@ -501,10 +861,29 @@ dependencies = [
]
[[package]]
name = "smallvec"
version = "1.7.0"
name = "slab"
version = "0.4.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1ecab6c735a6bb4139c0caafd0cc3635748bbb3acf4550e8138122099251f309"
checksum = "4614a76b2a8be0058caa9dbbaf66d988527d86d003c11a94fbd335d7661edcef"
dependencies = [
"autocfg",
]
[[package]]
name = "smallvec"
version = "1.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2fd0db749597d91ff862fd1d55ea87f7855a744a8425a64695b6fca237d1dad1"
[[package]]
name = "socket2"
version = "0.4.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "02e2d2db9033d13a1567121ddd7a095ee144db4e1ca1b1bda3419bc0da294ebd"
dependencies = [
"libc",
"winapi 0.3.9",
]
[[package]]
name = "strsim"
@ -514,9 +893,9 @@ checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a"
[[package]]
name = "structopt"
version = "0.3.25"
version = "0.3.26"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "40b9788f4202aa75c240ecc9c15c65185e6a39ccdeb0fd5d008b98825464c87c"
checksum = "0c6b5c64445ba8094a6ab0c3cd2ad323e07171012d9c98b0b15651daf1787a10"
dependencies = [
"clap",
"lazy_static",
@ -538,20 +917,20 @@ dependencies = [
[[package]]
name = "syn"
version = "1.0.82"
version = "1.0.101"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8daf5dd0bb60cbd4137b1b587d2fc0ae729bc07cf01cd70b36a1ed5ade3b9d59"
checksum = "e90cde112c4b9690b8cbe810cba9ddd8bc1d7472e2cae317b69e9438c1cba7d2"
dependencies = [
"proc-macro2",
"quote",
"unicode-xid",
"unicode-ident",
]
[[package]]
name = "tabular"
version = "0.1.4"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e7e35bee02dcefe64a74065b6b869d241eab1a02fea0d65e6074ce4e51894c3b"
checksum = "d9a2882c514780a1973df90de9d68adcd8871bacc9a6331c3f28e6d2ff91a3d1"
dependencies = [
"unicode-width",
]
@ -566,28 +945,32 @@ dependencies = [
]
[[package]]
name = "ucd-trie"
version = "0.1.3"
name = "unicode-ident"
version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "56dee185309b50d1f11bfedef0fe6d036842e3fb77413abef29f8f8d1c5d4c1c"
checksum = "dcc811dc4066ac62f84f11307873c4850cb653bfa9b1719cee2bd2204a4bc5dd"
[[package]]
name = "unicode-segmentation"
version = "1.8.0"
version = "1.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8895849a949e7845e06bd6dc1aa51731a103c42707010a5b591c0038fb73385b"
checksum = "0fdbf052a0783de01e944a6ce7a8cb939e295b1e7be835a1112c3b9a7f047a5a"
[[package]]
name = "unicode-width"
version = "0.1.9"
version = "0.1.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3ed742d4ea2bd1176e236172c8429aaf54486e7ac098db29ffe6529e0ce50973"
checksum = "c0edd1e5b14653f783770bce4a4dabb4a5108a5370a5f5d8cfe8710c361f6c8b"
[[package]]
name = "unicode-xid"
version = "0.2.2"
name = "value-bag"
version = "1.0.0-alpha.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8ccb82d61f80a663efe1f787a51b16b5a51e3314d6ac365b08639f52387b33f3"
checksum = "2209b78d1249f7e6f3293657c9779fe31ced465df091bbd433a1cf88e916ec55"
dependencies = [
"ctor",
"version_check",
]
[[package]]
name = "vec_map"
@ -597,9 +980,100 @@ checksum = "f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191"
[[package]]
name = "version_check"
version = "0.9.3"
version = "0.9.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5fecdca9a5291cc2b8dcf7dc02453fee791a280f3743cb0905f8822ae463b3fe"
checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f"
[[package]]
name = "waker-fn"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9d5b2c62b4012a3e1eca5a7e077d13b3bf498c4073e33ccd58626607748ceeca"
[[package]]
name = "wasm-bindgen"
version = "0.2.83"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eaf9f5aceeec8be17c128b2e93e031fb8a4d469bb9c4ae2d7dc1888b26887268"
dependencies = [
"cfg-if",
"wasm-bindgen-macro",
]
[[package]]
name = "wasm-bindgen-backend"
version = "0.2.83"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4c8ffb332579b0557b52d268b91feab8df3615f265d5270fec2a8c95b17c1142"
dependencies = [
"bumpalo",
"log",
"once_cell",
"proc-macro2",
"quote",
"syn",
"wasm-bindgen-shared",
]
[[package]]
name = "wasm-bindgen-futures"
version = "0.4.33"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "23639446165ca5a5de86ae1d8896b737ae80319560fbaa4c2887b7da6e7ebd7d"
dependencies = [
"cfg-if",
"js-sys",
"wasm-bindgen",
"web-sys",
]
[[package]]
name = "wasm-bindgen-macro"
version = "0.2.83"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "052be0f94026e6cbc75cdefc9bae13fd6052cdcaf532fa6c45e7ae33a1e6c810"
dependencies = [
"quote",
"wasm-bindgen-macro-support",
]
[[package]]
name = "wasm-bindgen-macro-support"
version = "0.2.83"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "07bc0c051dc5f23e307b13285f9d75df86bfdf816c5721e573dec1f9b8aa193c"
dependencies = [
"proc-macro2",
"quote",
"syn",
"wasm-bindgen-backend",
"wasm-bindgen-shared",
]
[[package]]
name = "wasm-bindgen-shared"
version = "0.2.83"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1c38c045535d93ec4f0b4defec448e4291638ee608530863b1e2ba115d4fff7f"
[[package]]
name = "web-sys"
version = "0.3.60"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bcda906d8be16e728fd5adc5b729afad4e444e106ab28cd1c7256e54fa61510f"
dependencies = [
"js-sys",
"wasm-bindgen",
]
[[package]]
name = "wepoll-ffi"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d743fdedc5c64377b5fc2bc036b01c7fd642205a0d96356034ae3404d49eb7fb"
dependencies = [
"cc",
]
[[package]]
name = "winapi"

View File

@ -7,19 +7,19 @@ edition = "2018"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
num-traits = "0.2.14"
structopt = "0.3.13"
tabular = "0.1.4"
rayon = "1.5.1"
crossbeam = "0.8.1"
num_cpus = "1.13.0"
rustversion = "1.0"
libc = "0.2.97"
num-traits = "0.2.15"
structopt = "0.3.26"
tabular = "0.2.0"
rayon = "1.5.3"
crossbeam = "0.8.2"
num_cpus = "1.13.1"
rustversion = "1.0.9"
libc = "0.2.134"
core_affinity = "0.5.10"
colour = "0.6.0"
[dev-dependencies]
rstest = "0.10.0"
rstest = "0.13.0"
[build-dependencies]
rustversion = "1.0"

View File

@ -7,6 +7,7 @@ register_flag_required(SYCL_COMPILER
"Compile using the specified SYCL compiler implementation
Supported values are
ONEAPI-DPCPP - dpc++ that is part of an oneAPI Base Toolkit distribution (https://software.intel.com/content/www/us/en/develop/tools/oneapi/base-toolkit.html)
ONEAPI-ICPX - icpx as a standalone compiler
DPCPP - dpc++ as a standalone compiler (https://github.com/intel/llvm)
HIPSYCL - hipSYCL compiler (https://github.com/illuhad/hipSYCL)
COMPUTECPP - ComputeCpp compiler (https://developer.codeplay.com/products/computecpp/ce/home)")
@ -14,6 +15,7 @@ register_flag_required(SYCL_COMPILER
register_flag_optional(SYCL_COMPILER_DIR
"Absolute path to the selected SYCL compiler directory, most are packaged differently so set the path according to `SYCL_COMPILER`:
ONEAPI-DPCPP - not required but `dpcpp` must be on PATH, load oneAPI as per documentation (i.e `source /opt/intel/oneapi/setvars.sh` first)
ONEAPI-ICPX - `icpx` must be used for OneAPI 2023 and later on releases (i.e `source /opt/intel/oneapi/setvars.sh` first)
HIPSYCL|DPCPP|COMPUTECPP - set to the root of the binary distribution that contains at least `bin/`, `include/`, and `lib/`"
"")
@ -65,6 +67,12 @@ macro(setup)
elseif (${SYCL_COMPILER} STREQUAL "ONEAPI-DPCPP")
set(CMAKE_CXX_COMPILER dpcpp)
register_definitions(CL_TARGET_OPENCL_VERSION=220)
elseif (${SYCL_COMPILER} STREQUAL "ONEAPI-ICPX")
set(CMAKE_CXX_COMPILER icpx)
include_directories(${SYCL_COMPILER_DIR}/include/sycl)
register_definitions(CL_TARGET_OPENCL_VERSION=220)
register_append_cxx_flags(ANY -fsycl)
register_append_link_flags(-fsycl)
else ()
message(FATAL_ERROR "SYCL_COMPILER=${SYCL_COMPILER} is unsupported")
endif ()

View File

@ -145,7 +145,7 @@ T ThrustStream<T>::dot()
#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA || \
(defined(THRUST_DEVICE_SYSTEM_HIP) && THRUST_DEVICE_SYSTEM_HIP == THRUST_DEVICE_SYSTEM)
#ifdef __NVCC__
#if defined(__NVCC__) || defined(__NVCOMPILER_CUDA__)
#define IMPL_FN__(fn) cuda ## fn
#define IMPL_TYPE__(tpe) cuda ## tpe
#elif defined(__HIP_PLATFORM_HCC__)