Merge branch 'develop' into option_for_vec
# Conflicts: # src/kokkos/KokkosStream.hpp # src/kokkos/model.cmake
This commit is contained in:
commit
50fe7c102a
6
.gitignore
vendored
6
.gitignore
vendored
@ -10,12 +10,18 @@ sycl-stream
|
||||
hip-stream
|
||||
tbb-stream
|
||||
|
||||
src/fortran/BabelStream
|
||||
src/fortran/BabelStream.*
|
||||
|
||||
*.o
|
||||
*.bc
|
||||
*.sycl
|
||||
*.tar
|
||||
*.gz
|
||||
*.a
|
||||
*.mod
|
||||
*.cub
|
||||
*.ptx
|
||||
|
||||
KokkosCore_config.*
|
||||
|
||||
|
||||
@ -2,8 +2,16 @@
|
||||
All notable changes to this project will be documented in this file.
|
||||
|
||||
## Unreleased
|
||||
### Added
|
||||
- Ability to build Kokkos and RAJA versions against existing packages.
|
||||
|
||||
### Changed
|
||||
- RAJA CUDA CMake build issues resolved.
|
||||
- Fix CUDA memory limit check.
|
||||
- Use long double for `check_solution` in case of large problem size.
|
||||
- OneAPI DPCPP compiler is deprecated in favour of ICPX, so added new build option to SYCL 2020 version.
|
||||
- Updates to the HIP kernels and API usage.
|
||||
- Number of thread-blocks in CUDA dot kernel implementation changed to 1024.
|
||||
|
||||
## [v4.0] - 2021-12-22
|
||||
|
||||
|
||||
@ -186,7 +186,7 @@ message(STATUS "Default ${CMAKE_BUILD_TYPE} flags are `${DEFAULT_${BUILD_TYPE}_F
|
||||
# setup common build flag defaults if there are no overrides
|
||||
if (NOT DEFINED ${BUILD_TYPE}_FLAGS)
|
||||
set(ACTUAL_${BUILD_TYPE}_FLAGS ${DEFAULT_${BUILD_TYPE}_FLAGS})
|
||||
elseif ()
|
||||
else ()
|
||||
set(ACTUAL_${BUILD_TYPE}_FLAGS ${${BUILD_TYPE}_FLAGS})
|
||||
endif ()
|
||||
|
||||
|
||||
@ -100,7 +100,7 @@ The source for each model's implementations are located in `./src/<model>`.
|
||||
|
||||
Currently available models are:
|
||||
```
|
||||
omp;ocl;std;std20;hip;cuda;kokkos;sycl;sycl2020;acc;raja;tbb;thrust
|
||||
omp;ocl;std-data;std-indices;std-ranges;hip;cuda;kokkos;sycl;sycl2020;acc;raja;tbb;thrust
|
||||
```
|
||||
|
||||
#### Overriding default flags
|
||||
@ -163,11 +163,11 @@ Pull Requests should be made against the `develop` branch.
|
||||
|
||||
Please cite BabelStream via this reference:
|
||||
|
||||
Deakin T, Price J, Martineau M, McIntosh-Smith S. GPU-STREAM v2.0: Benchmarking the achievable memory bandwidth of many-core processors across diverse parallel programming models. 2016. Paper presented at P^3MA Workshop at ISC High Performance, Frankfurt, Germany. DOI: 10.1007/978- 3-319-46079-6_34
|
||||
Deakin T, Price J, Martineau M, McIntosh-Smith S. Evaluating attainable memory bandwidth of parallel programming models via BabelStream. International Journal of Computational Science and Engineering. Special issue. Vol. 17, No. 3, pp. 247–262. 2018. DOI: 10.1504/IJCSE.2018.095847
|
||||
|
||||
### Other BabelStream publications
|
||||
|
||||
* Deakin T, Price J, Martineau M, McIntosh-Smith S. Evaluating attainable memory bandwidth of parallel programming models via BabelStream. International Journal of Computational Science and Engineering. Special issue. Vol. 17, No. 3, pp. 247–262. 2018.DOI: 10.1504/IJCSE.2018.095847
|
||||
* Deakin T, Price J, Martineau M, McIntosh-Smith S. GPU-STREAM v2.0: Benchmarking the achievable memory bandwidth of many-core processors across diverse parallel programming models. 2016. Paper presented at P^3MA Workshop at ISC High Performance, Frankfurt, Germany. DOI: 10.1007/978- 3-319-46079-6_34
|
||||
|
||||
* Deakin T, McIntosh-Smith S. GPU-STREAM: Benchmarking the achievable memory bandwidth of Graphics Processing Units. 2015. Poster session presented at IEEE/ACM SuperComputing, Austin, United States.
|
||||
You can view the [Poster and Extended Abstract](http://sc15.supercomputing.org/sites/all/themes/SC15images/tech_poster/tech_poster_pages/post150.html).
|
||||
|
||||
2
src/.gitignore
vendored
2
src/.gitignore
vendored
@ -16,6 +16,8 @@
|
||||
**/*.gz
|
||||
**/*.a
|
||||
|
||||
**/*.swp
|
||||
|
||||
**/KokkosCore_Config_*
|
||||
|
||||
**/.DS_Store
|
||||
|
||||
@ -22,7 +22,7 @@
|
||||
#endif
|
||||
|
||||
#define TBSIZE 1024
|
||||
#define DOT_NUM_BLOCKS 256
|
||||
#define DOT_NUM_BLOCKS 1024
|
||||
|
||||
template <class T>
|
||||
class CUDAStream : public Stream<T>
|
||||
|
||||
105
src/fortran/ArrayStream.F90
Normal file
105
src/fortran/ArrayStream.F90
Normal file
@ -0,0 +1,105 @@
|
||||
module ArrayStream
|
||||
use, intrinsic :: ISO_Fortran_env
|
||||
use BabelStreamTypes
|
||||
|
||||
implicit none
|
||||
|
||||
character(len=5), parameter :: implementation_name = "Array"
|
||||
|
||||
integer(kind=StreamIntKind) :: N
|
||||
|
||||
real(kind=REAL64), allocatable :: A(:), B(:), C(:)
|
||||
|
||||
contains
|
||||
|
||||
subroutine list_devices()
|
||||
implicit none
|
||||
integer :: num
|
||||
write(*,'(a36,a5)') "Listing devices is not supported by ", implementation_name
|
||||
end subroutine list_devices
|
||||
|
||||
subroutine set_device(dev)
|
||||
implicit none
|
||||
integer, intent(in) :: dev
|
||||
write(*,'(a32,a5)') "Device != 0 is not supported by ", implementation_name
|
||||
end subroutine set_device
|
||||
|
||||
subroutine alloc(array_size)
|
||||
implicit none
|
||||
integer(kind=StreamIntKind) :: array_size
|
||||
integer :: err
|
||||
N = array_size
|
||||
allocate( A(1:N), B(1:N), C(1:N), stat=err)
|
||||
if (err .ne. 0) then
|
||||
write(*,'(a20,i3)') 'allocate returned ',err
|
||||
stop 1
|
||||
endif
|
||||
end subroutine alloc
|
||||
|
||||
subroutine dealloc()
|
||||
implicit none
|
||||
integer :: err
|
||||
deallocate( A, B, C, stat=err)
|
||||
if (err .ne. 0) then
|
||||
write(*,'(a20,i3)') 'deallocate returned ',err
|
||||
stop 1
|
||||
endif
|
||||
end subroutine dealloc
|
||||
|
||||
subroutine init_arrays(initA, initB, initC)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: initA, initB, initC
|
||||
A = initA
|
||||
B = initB
|
||||
C = initC
|
||||
end subroutine init_arrays
|
||||
|
||||
subroutine read_arrays(h_A, h_B, h_C)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
|
||||
h_A = A
|
||||
h_B = B
|
||||
h_C = C
|
||||
end subroutine read_arrays
|
||||
|
||||
subroutine copy()
|
||||
implicit none
|
||||
C = A
|
||||
end subroutine copy
|
||||
|
||||
subroutine add()
|
||||
implicit none
|
||||
C = A + B
|
||||
end subroutine add
|
||||
|
||||
subroutine mul(startScalar)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: startScalar
|
||||
real(kind=REAL64) :: scalar
|
||||
scalar = startScalar
|
||||
B = scalar * C
|
||||
end subroutine mul
|
||||
|
||||
subroutine triad(startScalar)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: startScalar
|
||||
real(kind=REAL64) :: scalar
|
||||
scalar = startScalar
|
||||
A = B + scalar * C
|
||||
end subroutine triad
|
||||
|
||||
subroutine nstream(startScalar)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: startScalar
|
||||
real(kind=REAL64) :: scalar
|
||||
scalar = startScalar
|
||||
A = A + B + scalar * C
|
||||
end subroutine nstream
|
||||
|
||||
function dot() result(s)
|
||||
implicit none
|
||||
real(kind=REAL64) :: s
|
||||
s = dot_product(A,B)
|
||||
end function dot
|
||||
|
||||
end module ArrayStream
|
||||
21
src/fortran/BabelStreamTypes.F90
Normal file
21
src/fortran/BabelStreamTypes.F90
Normal file
@ -0,0 +1,21 @@
|
||||
module BabelStreamTypes
|
||||
use, intrinsic :: ISO_Fortran_env, only: REAL64,REAL32,INT64,INT32
|
||||
|
||||
implicit none
|
||||
|
||||
#ifdef USE_FLOAT
|
||||
integer, parameter :: StreamRealKind = REAL32
|
||||
character(len=6) :: StreamRealName = "REAL32"
|
||||
#else
|
||||
integer, parameter :: StreamRealKind = REAL64
|
||||
character(len=6) :: StreamRealName = "REAL64"
|
||||
#endif
|
||||
|
||||
#ifdef USE_INT32
|
||||
#warning There is no checking for overflowing INT32, so be careful.
|
||||
integer, parameter :: StreamIntKind = INT32
|
||||
#else
|
||||
integer, parameter :: StreamIntKind = INT64
|
||||
#endif
|
||||
|
||||
end module BabelStreamTypes
|
||||
230
src/fortran/CUDAKernelStream.F90
Normal file
230
src/fortran/CUDAKernelStream.F90
Normal file
@ -0,0 +1,230 @@
|
||||
module CUDAKernelStream
|
||||
use, intrinsic :: ISO_Fortran_env
|
||||
use BabelStreamTypes
|
||||
|
||||
implicit none
|
||||
|
||||
character(len=10), parameter :: implementation_name = "CUDAKernel"
|
||||
|
||||
integer(kind=StreamIntKind) :: N
|
||||
|
||||
#ifdef USE_MANAGED
|
||||
real(kind=REAL64), allocatable, managed :: A(:), B(:), C(:)
|
||||
#else
|
||||
real(kind=REAL64), allocatable, device :: A(:), B(:), C(:)
|
||||
#endif
|
||||
|
||||
contains
|
||||
|
||||
subroutine list_devices()
|
||||
use cudafor
|
||||
implicit none
|
||||
integer :: num, err
|
||||
err = cudaGetDeviceCount(num)
|
||||
if (err.ne.0) then
|
||||
write(*,'(a)') "cudaGetDeviceCount failed"
|
||||
write(*,'(a)') cudaGetErrorString(err)
|
||||
stop
|
||||
else if (num.eq.0) then
|
||||
write(*,'(a17)') "No devices found."
|
||||
else
|
||||
write(*,'(a10,i1,a8)') "There are ",num," devices."
|
||||
end if
|
||||
end subroutine list_devices
|
||||
|
||||
subroutine set_device(dev)
|
||||
use cudafor
|
||||
implicit none
|
||||
integer, intent(in) :: dev
|
||||
integer :: num, err
|
||||
err = cudaGetDeviceCount(num)
|
||||
if (err.ne.0) then
|
||||
write(*,'(a)') "cudaGetDeviceCount failed"
|
||||
write(*,'(a)') cudaGetErrorString(err)
|
||||
stop
|
||||
else if (num.eq.0) then
|
||||
write(*,'(a17)') "No devices found."
|
||||
stop
|
||||
else if (dev.ge.num) then
|
||||
write(*,'(a21)') "Invalid device index."
|
||||
stop
|
||||
else
|
||||
err = cudaSetDevice(dev)
|
||||
if (err.ne.0) then
|
||||
write(*,'(a)') "cudaSetDevice failed"
|
||||
write(*,'(a)') cudaGetErrorString(err)
|
||||
stop
|
||||
end if
|
||||
end if
|
||||
end subroutine set_device
|
||||
|
||||
subroutine alloc(array_size)
|
||||
implicit none
|
||||
integer(kind=StreamIntKind) :: array_size
|
||||
integer :: err
|
||||
N = array_size
|
||||
allocate( A(1:N), B(1:N), C(1:N), stat=err)
|
||||
if (err .ne. 0) then
|
||||
write(*,'(a20,i3)') 'allocate returned ',err
|
||||
stop 1
|
||||
endif
|
||||
end subroutine alloc
|
||||
|
||||
subroutine dealloc()
|
||||
implicit none
|
||||
integer :: err
|
||||
deallocate( A, B, C, stat=err)
|
||||
if (err .ne. 0) then
|
||||
write(*,'(a20,i3)') 'deallocate returned ',err
|
||||
stop 1
|
||||
endif
|
||||
end subroutine dealloc
|
||||
|
||||
subroutine init_arrays(initA, initB, initC)
|
||||
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: initA, initB, initC
|
||||
integer(kind=StreamIntKind) :: i
|
||||
integer :: err
|
||||
A = initA
|
||||
B = initB
|
||||
C = initC
|
||||
err = cudaDeviceSynchronize()
|
||||
if (err.ne.0) then
|
||||
write(*,'(a)') "cudaDeviceSynchronize failed"
|
||||
write(*,'(a)') cudaGetErrorString(err)
|
||||
stop
|
||||
endif
|
||||
end subroutine init_arrays
|
||||
|
||||
subroutine read_arrays(h_A, h_B, h_C)
|
||||
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
|
||||
implicit none
|
||||
real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
|
||||
integer(kind=StreamIntKind) :: i
|
||||
integer :: err
|
||||
h_A = A
|
||||
h_B = B
|
||||
h_C = C
|
||||
err = cudaDeviceSynchronize()
|
||||
if (err.ne.0) then
|
||||
write(*,'(a)') "cudaDeviceSynchronize failed"
|
||||
write(*,'(a)') cudaGetErrorString(err)
|
||||
stop
|
||||
endif
|
||||
end subroutine read_arrays
|
||||
|
||||
subroutine copy()
|
||||
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
|
||||
implicit none
|
||||
integer(kind=StreamIntKind) :: i
|
||||
integer :: err
|
||||
!$cuf kernel do <<< *, * >>>
|
||||
do i=1,N
|
||||
C(i) = A(i)
|
||||
end do
|
||||
err = cudaDeviceSynchronize()
|
||||
if (err.ne.0) then
|
||||
write(*,'(a)') "cudaDeviceSynchronize failed"
|
||||
write(*,'(a)') cudaGetErrorString(err)
|
||||
stop
|
||||
endif
|
||||
end subroutine copy
|
||||
|
||||
subroutine add()
|
||||
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
|
||||
implicit none
|
||||
integer(kind=StreamIntKind) :: i
|
||||
integer :: err
|
||||
!$cuf kernel do <<< *, * >>>
|
||||
do i=1,N
|
||||
C(i) = A(i) + B(i)
|
||||
end do
|
||||
err = cudaDeviceSynchronize()
|
||||
if (err.ne.0) then
|
||||
write(*,'(a)') "cudaDeviceSynchronize failed"
|
||||
write(*,'(a)') cudaGetErrorString(err)
|
||||
stop
|
||||
endif
|
||||
end subroutine add
|
||||
|
||||
subroutine mul(startScalar)
|
||||
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: startScalar
|
||||
real(kind=REAL64) :: scalar
|
||||
integer(kind=StreamIntKind) :: i
|
||||
integer :: err
|
||||
scalar = startScalar
|
||||
!$cuf kernel do <<< *, * >>>
|
||||
do i=1,N
|
||||
B(i) = scalar * C(i)
|
||||
end do
|
||||
err = cudaDeviceSynchronize()
|
||||
if (err.ne.0) then
|
||||
write(*,'(a)') "cudaDeviceSynchronize failed"
|
||||
write(*,'(a)') cudaGetErrorString(err)
|
||||
stop
|
||||
endif
|
||||
end subroutine mul
|
||||
|
||||
subroutine triad(startScalar)
|
||||
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: startScalar
|
||||
real(kind=REAL64) :: scalar
|
||||
integer(kind=StreamIntKind) :: i
|
||||
integer :: err
|
||||
scalar = startScalar
|
||||
!$cuf kernel do <<< *, * >>>
|
||||
do i=1,N
|
||||
A(i) = B(i) + scalar * C(i)
|
||||
end do
|
||||
err = cudaDeviceSynchronize()
|
||||
if (err.ne.0) then
|
||||
write(*,'(a)') "cudaDeviceSynchronize failed"
|
||||
write(*,'(a)') cudaGetErrorString(err)
|
||||
stop
|
||||
endif
|
||||
end subroutine triad
|
||||
|
||||
subroutine nstream(startScalar)
|
||||
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: startScalar
|
||||
real(kind=REAL64) :: scalar
|
||||
integer(kind=StreamIntKind) :: i
|
||||
integer :: err
|
||||
scalar = startScalar
|
||||
!$cuf kernel do <<< *, * >>>
|
||||
do i=1,N
|
||||
A(i) = A(i) + B(i) + scalar * C(i)
|
||||
end do
|
||||
err = cudaDeviceSynchronize()
|
||||
if (err.ne.0) then
|
||||
write(*,'(a)') "cudaDeviceSynchronize failed"
|
||||
write(*,'(a)') cudaGetErrorString(err)
|
||||
stop
|
||||
endif
|
||||
end subroutine nstream
|
||||
|
||||
function dot() result(r)
|
||||
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
|
||||
implicit none
|
||||
real(kind=REAL64) :: r
|
||||
integer(kind=StreamIntKind) :: i
|
||||
integer :: err
|
||||
r = real(0,kind=REAL64)
|
||||
!$cuf kernel do <<< *, * >>>
|
||||
do i=1,N
|
||||
r = r + A(i) * B(i)
|
||||
end do
|
||||
err = cudaDeviceSynchronize()
|
||||
if (err.ne.0) then
|
||||
write(*,'(a)') "cudaDeviceSynchronize failed"
|
||||
write(*,'(a)') cudaGetErrorString(err)
|
||||
stop
|
||||
endif
|
||||
end function dot
|
||||
|
||||
end module CUDAKernelStream
|
||||
309
src/fortran/CUDAStream.F90
Normal file
309
src/fortran/CUDAStream.F90
Normal file
@ -0,0 +1,309 @@
|
||||
module CUDAFortranKernels
|
||||
use, intrinsic :: ISO_Fortran_env
|
||||
use BabelStreamTypes
|
||||
|
||||
contains
|
||||
|
||||
attributes(global) subroutine do_copy(n,A,C)
|
||||
implicit none
|
||||
integer(kind=StreamIntKind), intent(in), value :: n
|
||||
real(kind=REAL64), intent(in) :: A(n)
|
||||
real(kind=REAL64), intent(out) :: C(n)
|
||||
integer(kind=StreamIntKind) :: i
|
||||
i = blockDim%x * (blockIdx%x - 1) + threadIdx%x
|
||||
if (i <= N) then
|
||||
C(i) = A(i)
|
||||
endif
|
||||
end subroutine do_copy
|
||||
|
||||
attributes(global) subroutine do_add(n,A,B,C)
|
||||
implicit none
|
||||
integer(kind=StreamIntKind), intent(in), value :: n
|
||||
real(kind=REAL64), intent(in) :: A(n), B(n)
|
||||
real(kind=REAL64), intent(out) :: C(n)
|
||||
integer(kind=StreamIntKind) :: i
|
||||
i = blockDim%x * (blockIdx%x - 1) + threadIdx%x
|
||||
if (i <= N) then
|
||||
C(i) = A(i) + B(i)
|
||||
endif
|
||||
end subroutine do_add
|
||||
|
||||
attributes(global) subroutine do_mul(n,scalar,B,C)
|
||||
implicit none
|
||||
integer(kind=StreamIntKind), intent(in), value :: n
|
||||
real(kind=REAL64), intent(in), value :: scalar
|
||||
real(kind=REAL64), intent(out) :: B(n)
|
||||
real(kind=REAL64), intent(in) :: C(n)
|
||||
integer(kind=StreamIntKind) :: i
|
||||
i = blockDim%x * (blockIdx%x - 1) + threadIdx%x
|
||||
if (i <= N) then
|
||||
B(i) = scalar * C(i)
|
||||
endif
|
||||
end subroutine do_mul
|
||||
|
||||
attributes(global) subroutine do_triad(n,scalar,A,B,C)
|
||||
implicit none
|
||||
integer(kind=StreamIntKind), intent(in), value :: n
|
||||
real(kind=REAL64), intent(in), value :: scalar
|
||||
real(kind=REAL64), intent(out) :: A(n)
|
||||
real(kind=REAL64), intent(in) :: B(n), C(n)
|
||||
integer(kind=StreamIntKind) :: i
|
||||
i = blockDim%x * (blockIdx%x - 1) + threadIdx%x
|
||||
if (i <= N) then
|
||||
A(i) = B(i) + scalar * C(i)
|
||||
endif
|
||||
end subroutine do_triad
|
||||
|
||||
attributes(global) subroutine do_nstream(n,scalar,A,B,C)
|
||||
implicit none
|
||||
integer(kind=StreamIntKind), intent(in), value :: n
|
||||
real(kind=REAL64), intent(in), value :: scalar
|
||||
real(kind=REAL64), intent(inout) :: A(n)
|
||||
real(kind=REAL64), intent(in) :: B(n), C(n)
|
||||
integer(kind=StreamIntKind) :: i
|
||||
i = blockDim%x * (blockIdx%x - 1) + threadIdx%x
|
||||
if (i <= N) then
|
||||
A(i) = A(i) + B(i) + scalar * C(i)
|
||||
endif
|
||||
end subroutine do_nstream
|
||||
|
||||
#if 0
|
||||
attributes(global) subroutine do_dot(n,A,B,r)
|
||||
implicit none
|
||||
integer(kind=StreamIntKind), intent(in), value :: n
|
||||
real(kind=REAL64), intent(in) :: A(n), B(n)
|
||||
real(kind=REAL64), intent(out) :: r
|
||||
integer(kind=StreamIntKind) :: i
|
||||
r = real(0,kind=REAL64)
|
||||
!$cuf kernel do <<< *, * >>>
|
||||
do i=1,N
|
||||
r = r + A(i) * B(i)
|
||||
end do
|
||||
end subroutine do_dot
|
||||
#endif
|
||||
|
||||
end module CUDAFortranKernels
|
||||
|
||||
module CUDAStream
|
||||
use, intrinsic :: ISO_Fortran_env
|
||||
use BabelStreamTypes
|
||||
use cudafor, only: dim3
|
||||
|
||||
implicit none
|
||||
|
||||
character(len=4), parameter :: implementation_name = "CUDA"
|
||||
|
||||
integer(kind=StreamIntKind) :: N
|
||||
|
||||
#ifdef USE_MANAGED
|
||||
real(kind=REAL64), allocatable, managed :: A(:), B(:), C(:)
|
||||
#else
|
||||
real(kind=REAL64), allocatable, device :: A(:), B(:), C(:)
|
||||
#endif
|
||||
|
||||
type(dim3) :: grid, tblock
|
||||
|
||||
contains
|
||||
|
||||
subroutine list_devices()
|
||||
use cudafor
|
||||
implicit none
|
||||
integer :: num, err
|
||||
err = cudaGetDeviceCount(num)
|
||||
if (err.ne.0) then
|
||||
write(*,'(a)') "cudaGetDeviceCount failed"
|
||||
write(*,'(a)') cudaGetErrorString(err)
|
||||
stop
|
||||
else if (num.eq.0) then
|
||||
write(*,'(a17)') "No devices found."
|
||||
else
|
||||
write(*,'(a10,i1,a8)') "There are ",num," devices."
|
||||
end if
|
||||
end subroutine list_devices
|
||||
|
||||
subroutine set_device(dev)
|
||||
use cudafor
|
||||
implicit none
|
||||
integer, intent(in) :: dev
|
||||
integer :: num, err
|
||||
err = cudaGetDeviceCount(num)
|
||||
if (err.ne.0) then
|
||||
write(*,'(a)') "cudaGetDeviceCount failed"
|
||||
write(*,'(a)') cudaGetErrorString(err)
|
||||
stop
|
||||
else if (num.eq.0) then
|
||||
write(*,'(a17)') "No devices found."
|
||||
stop
|
||||
else if (dev.ge.num) then
|
||||
write(*,'(a21)') "Invalid device index."
|
||||
stop
|
||||
else
|
||||
err = cudaSetDevice(dev)
|
||||
if (err.ne.0) then
|
||||
write(*,'(a)') "cudaSetDevice failed"
|
||||
write(*,'(a)') cudaGetErrorString(err)
|
||||
stop
|
||||
end if
|
||||
end if
|
||||
end subroutine set_device
|
||||
|
||||
subroutine alloc(array_size)
|
||||
implicit none
|
||||
integer(kind=StreamIntKind) :: array_size
|
||||
integer :: err
|
||||
N = array_size
|
||||
allocate( A(1:N), B(1:N), C(1:N), stat=err)
|
||||
if (err .ne. 0) then
|
||||
write(*,'(a20,i3)') 'allocate returned ',err
|
||||
stop 1
|
||||
endif
|
||||
! move to separate subroutine later
|
||||
tblock = dim3(128,1,1)
|
||||
grid = dim3(ceiling(real(N)/tblock%x),1,1)
|
||||
end subroutine alloc
|
||||
|
||||
subroutine dealloc()
|
||||
implicit none
|
||||
integer :: err
|
||||
deallocate( A, B, C, stat=err)
|
||||
if (err .ne. 0) then
|
||||
write(*,'(a20,i3)') 'deallocate returned ',err
|
||||
stop 1
|
||||
endif
|
||||
end subroutine dealloc
|
||||
|
||||
subroutine init_arrays(initA, initB, initC)
|
||||
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: initA, initB, initC
|
||||
integer(kind=StreamIntKind) :: i
|
||||
integer :: err
|
||||
A = initA
|
||||
B = initB
|
||||
C = initC
|
||||
err = cudaDeviceSynchronize()
|
||||
if (err.ne.0) then
|
||||
write(*,'(a)') "cudaDeviceSynchronize failed"
|
||||
write(*,'(a)') cudaGetErrorString(err)
|
||||
stop
|
||||
endif
|
||||
end subroutine init_arrays
|
||||
|
||||
subroutine read_arrays(h_A, h_B, h_C)
|
||||
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
|
||||
implicit none
|
||||
real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
|
||||
integer(kind=StreamIntKind) :: i
|
||||
integer :: err
|
||||
h_A = A
|
||||
h_B = B
|
||||
h_C = C
|
||||
err = cudaDeviceSynchronize()
|
||||
if (err.ne.0) then
|
||||
write(*,'(a)') "cudaDeviceSynchronize failed"
|
||||
write(*,'(a)') cudaGetErrorString(err)
|
||||
stop
|
||||
endif
|
||||
end subroutine read_arrays
|
||||
|
||||
subroutine copy()
|
||||
use CUDAFortranKernels, only: do_copy
|
||||
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
|
||||
implicit none
|
||||
integer :: err
|
||||
call do_copy<<<grid, tblock>>>(N, A, C)
|
||||
err = cudaDeviceSynchronize()
|
||||
if (err.ne.0) then
|
||||
write(*,'(a)') "cudaDeviceSynchronize failed"
|
||||
write(*,'(a)') cudaGetErrorString(err)
|
||||
stop
|
||||
endif
|
||||
end subroutine copy
|
||||
|
||||
subroutine add()
|
||||
use CUDAFortranKernels, only: do_add
|
||||
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
|
||||
implicit none
|
||||
integer :: err
|
||||
call do_add<<<grid, tblock>>>(N, A, B, C)
|
||||
err = cudaDeviceSynchronize()
|
||||
if (err.ne.0) then
|
||||
write(*,'(a)') "cudaDeviceSynchronize failed"
|
||||
write(*,'(a)') cudaGetErrorString(err)
|
||||
stop
|
||||
endif
|
||||
end subroutine add
|
||||
|
||||
subroutine mul(startScalar)
|
||||
use CUDAFortranKernels, only: do_mul
|
||||
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: startScalar
|
||||
real(kind=REAL64) :: scalar
|
||||
integer :: err
|
||||
scalar = startScalar
|
||||
call do_mul<<<grid, tblock>>>(N, scalar, B, C)
|
||||
err = cudaDeviceSynchronize()
|
||||
if (err.ne.0) then
|
||||
write(*,'(a)') "cudaDeviceSynchronize failed"
|
||||
write(*,'(a)') cudaGetErrorString(err)
|
||||
stop
|
||||
endif
|
||||
end subroutine mul
|
||||
|
||||
subroutine triad(startScalar)
|
||||
use CUDAFortranKernels, only: do_triad
|
||||
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: startScalar
|
||||
real(kind=REAL64) :: scalar
|
||||
integer :: err
|
||||
scalar = startScalar
|
||||
call do_triad<<<grid, tblock>>>(N, scalar, A, B, C)
|
||||
err = cudaDeviceSynchronize()
|
||||
if (err.ne.0) then
|
||||
write(*,'(a)') "cudaDeviceSynchronize failed"
|
||||
write(*,'(a)') cudaGetErrorString(err)
|
||||
stop
|
||||
endif
|
||||
end subroutine triad
|
||||
|
||||
subroutine nstream(startScalar)
|
||||
use CUDAFortranKernels, only: do_nstream
|
||||
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: startScalar
|
||||
real(kind=REAL64) :: scalar
|
||||
integer :: err
|
||||
scalar = startScalar
|
||||
call do_nstream<<<grid, tblock>>>(N, scalar, A, B, C)
|
||||
err = cudaDeviceSynchronize()
|
||||
if (err.ne.0) then
|
||||
write(*,'(a)') "cudaDeviceSynchronize failed"
|
||||
write(*,'(a)') cudaGetErrorString(err)
|
||||
stop
|
||||
endif
|
||||
end subroutine nstream
|
||||
|
||||
function dot() result(r)
|
||||
!use CUDAFortranKernels, only: do_dot
|
||||
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
|
||||
implicit none
|
||||
real(kind=REAL64) :: r
|
||||
integer :: err
|
||||
integer(kind=StreamIntKind) :: i
|
||||
!call do_dot<<<grid, tblock>>>(N, B, C, r)
|
||||
r = real(0,kind=REAL64)
|
||||
!$cuf kernel do <<< *, * >>>
|
||||
do i=1,N
|
||||
r = r + A(i) * B(i)
|
||||
end do
|
||||
err = cudaDeviceSynchronize()
|
||||
if (err.ne.0) then
|
||||
write(*,'(a)') "cudaDeviceSynchronize failed"
|
||||
write(*,'(a)') cudaGetErrorString(err)
|
||||
stop
|
||||
endif
|
||||
end function dot
|
||||
|
||||
end module CUDAStream
|
||||
139
src/fortran/DoConcurrentStream.F90
Normal file
139
src/fortran/DoConcurrentStream.F90
Normal file
@ -0,0 +1,139 @@
|
||||
module DoConcurrentStream
|
||||
use, intrinsic :: ISO_Fortran_env
|
||||
use BabelStreamTypes
|
||||
|
||||
implicit none
|
||||
|
||||
character(len=12), parameter :: implementation_name = "DoConcurrent"
|
||||
|
||||
integer(kind=StreamIntKind) :: N
|
||||
|
||||
#ifdef USE_DEVICE
|
||||
real(kind=REAL64), allocatable, device :: A(:), B(:), C(:)
|
||||
#else
|
||||
real(kind=REAL64), allocatable :: A(:), B(:), C(:)
|
||||
#endif
|
||||
|
||||
contains
|
||||
|
||||
subroutine list_devices()
|
||||
implicit none
|
||||
integer :: num
|
||||
write(*,'(a36,a12)') "Listing devices is not supported by ", implementation_name
|
||||
end subroutine list_devices
|
||||
|
||||
subroutine set_device(dev)
|
||||
implicit none
|
||||
integer, intent(in) :: dev
|
||||
write(*,'(a32,a12)') "Device != 0 is not supported by ", implementation_name
|
||||
end subroutine set_device
|
||||
|
||||
subroutine alloc(array_size)
|
||||
implicit none
|
||||
integer(kind=StreamIntKind) :: array_size
|
||||
integer :: err
|
||||
N = array_size
|
||||
allocate( A(1:N), B(1:N), C(1:N), stat=err)
|
||||
if (err .ne. 0) then
|
||||
write(*,'(a20,i3)') 'allocate returned ',err
|
||||
stop 1
|
||||
endif
|
||||
end subroutine alloc
|
||||
|
||||
subroutine dealloc()
|
||||
implicit none
|
||||
integer :: err
|
||||
deallocate( A, B, C, stat=err)
|
||||
if (err .ne. 0) then
|
||||
write(*,'(a20,i3)') 'deallocate returned ',err
|
||||
stop 1
|
||||
endif
|
||||
end subroutine dealloc
|
||||
|
||||
subroutine init_arrays(initA, initB, initC)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: initA, initB, initC
|
||||
integer(kind=StreamIntKind) :: i
|
||||
do concurrent (i=1:N)
|
||||
A(i) = initA
|
||||
B(i) = initB
|
||||
C(i) = initC
|
||||
end do
|
||||
end subroutine init_arrays
|
||||
|
||||
subroutine read_arrays(h_A, h_B, h_C)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
|
||||
integer(kind=StreamIntKind) :: i
|
||||
do concurrent (i=1:N) !shared(A,B,C)
|
||||
h_A(i) = A(i)
|
||||
h_B(i) = B(i)
|
||||
h_C(i) = C(i)
|
||||
end do
|
||||
end subroutine read_arrays
|
||||
|
||||
subroutine copy()
|
||||
implicit none
|
||||
integer(kind=StreamIntKind) :: i
|
||||
do concurrent (i=1:N) !shared(A,C)
|
||||
C(i) = A(i)
|
||||
end do
|
||||
end subroutine copy
|
||||
|
||||
subroutine add()
|
||||
implicit none
|
||||
integer(kind=StreamIntKind) :: i
|
||||
do concurrent (i=1:N) !shared(A,B,C)
|
||||
C(i) = A(i) + B(i)
|
||||
end do
|
||||
end subroutine add
|
||||
|
||||
subroutine mul(startScalar)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: startScalar
|
||||
real(kind=REAL64) :: scalar
|
||||
integer(kind=StreamIntKind) :: i
|
||||
scalar = startScalar
|
||||
do concurrent (i=1:N) !shared(B,C)
|
||||
B(i) = scalar * C(i)
|
||||
end do
|
||||
end subroutine mul
|
||||
|
||||
subroutine triad(startScalar)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: startScalar
|
||||
real(kind=REAL64) :: scalar
|
||||
integer(kind=StreamIntKind) :: i
|
||||
scalar = startScalar
|
||||
do concurrent (i=1:N) !shared(A,B,C)
|
||||
A(i) = B(i) + scalar * C(i)
|
||||
end do
|
||||
end subroutine triad
|
||||
|
||||
subroutine nstream(startScalar)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: startScalar
|
||||
real(kind=REAL64) :: scalar
|
||||
integer(kind=StreamIntKind) :: i
|
||||
scalar = startScalar
|
||||
do concurrent (i=1:N) !shared(A,B,C)
|
||||
A(i) = A(i) + B(i) + scalar * C(i)
|
||||
end do
|
||||
end subroutine nstream
|
||||
|
||||
function dot() result(s)
|
||||
implicit none
|
||||
real(kind=REAL64) :: s
|
||||
integer(kind=StreamIntKind) :: i
|
||||
! reduction omitted because NVF infers it and other compilers do not support
|
||||
s = real(0,kind=REAL64)
|
||||
#ifdef CRAY_THREAD_DOCONCURRENT
|
||||
do i=1,N
|
||||
#else
|
||||
do concurrent (i=1:N) !shared(A,B)
|
||||
#endif
|
||||
s = s + A(i) * B(i)
|
||||
end do
|
||||
end function dot
|
||||
|
||||
end module DoConcurrentStream
|
||||
109
src/fortran/Makefile
Normal file
109
src/fortran/Makefile
Normal file
@ -0,0 +1,109 @@
|
||||
ifeq ($(COMPILER),nvhpc)
|
||||
include make.inc.nvhpc
|
||||
else ifeq ($(COMPILER),oneapi)
|
||||
include make.inc.oneapi
|
||||
else ifeq ($(COMPILER),gcc)
|
||||
include make.inc.gcc
|
||||
else ifeq ($(COMPILER),amd)
|
||||
include make.inc.amd
|
||||
else ifeq ($(COMPILER),arm)
|
||||
include make.inc.arm
|
||||
else ifeq ($(COMPILER),cray)
|
||||
include make.inc.cray
|
||||
else ifeq ($(COMPILER),fj)
|
||||
include make.inc.fj
|
||||
else
|
||||
$(info Set COMPILER={nvhpc,oneapi,amd,arm,cray,fj,gcc}. Default is gcc.)
|
||||
include make.inc.gcc
|
||||
COMPILER=gcc
|
||||
endif
|
||||
|
||||
FCFLAGS += -DVERSION_STRING="4.0"
|
||||
#FCFLAGS += -DUSE_INT32
|
||||
|
||||
ifeq ($(IMPLEMENTATION),DoConcurrent)
|
||||
FCFLAGS += -DUSE_DOCONCURRENT $(DOCONCURRENT_FLAG)
|
||||
IMPLEMENTATION_OBJECT = DoConcurrentStream.o
|
||||
|
||||
else ifeq ($(IMPLEMENTATION),Array)
|
||||
FCFLAGS += -DUSE_ARRAY $(ARRAY_FLAG)
|
||||
IMPLEMENTATION_OBJECT = ArrayStream.o
|
||||
|
||||
else ifeq ($(IMPLEMENTATION),OpenMP)
|
||||
FCFLAGS += -DUSE_OPENMP $(OPENMP_FLAG)
|
||||
IMPLEMENTATION_OBJECT = OpenMPStream.o
|
||||
|
||||
else ifeq ($(IMPLEMENTATION),OpenMPWorkshare)
|
||||
FCFLAGS += -DUSE_OPENMPWORKSHARE $(OPENMP_FLAG)
|
||||
IMPLEMENTATION_OBJECT = OpenMPWorkshareStream.o
|
||||
|
||||
else ifeq ($(IMPLEMENTATION),OpenMPTarget)
|
||||
FCFLAGS += -DUSE_OPENMPTARGET $(OPENMP_FLAG)
|
||||
IMPLEMENTATION_OBJECT = OpenMPTargetStream.o
|
||||
|
||||
else ifeq ($(IMPLEMENTATION),OpenMPTargetLoop)
|
||||
FCFLAGS += -DUSE_OPENMPTARGETLOOP $(OPENMP_FLAG)
|
||||
IMPLEMENTATION_OBJECT = OpenMPTargetLoopStream.o
|
||||
|
||||
else ifeq ($(IMPLEMENTATION),OpenMPTaskloop)
|
||||
FCFLAGS += -DUSE_OPENMPTASKLOOP $(OPENMP_FLAG)
|
||||
IMPLEMENTATION_OBJECT = OpenMPTaskloopStream.o
|
||||
|
||||
else ifeq ($(IMPLEMENTATION),OpenACC)
|
||||
FCFLAGS += -DUSE_OPENACC $(OPENACC_FLAG)
|
||||
IMPLEMENTATION_OBJECT = OpenACCStream.o
|
||||
|
||||
else ifeq ($(IMPLEMENTATION),OpenACCArray)
|
||||
FCFLAGS += -DUSE_OPENACCARRAY $(OPENACC_FLAG)
|
||||
IMPLEMENTATION_OBJECT = OpenACCArrayStream.o
|
||||
|
||||
else ifeq ($(IMPLEMENTATION),CUDA)
|
||||
FCFLAGS += -DUSE_CUDA $(CUDA_FLAG)
|
||||
IMPLEMENTATION_OBJECT = CUDAStream.o
|
||||
|
||||
else ifeq ($(IMPLEMENTATION),CUDAKernel)
|
||||
FCFLAGS += -DUSE_CUDAKERNEL $(CUDA_FLAG)
|
||||
IMPLEMENTATION_OBJECT = CUDAKernelStream.o
|
||||
|
||||
else ifeq ($(IMPLEMENTATION),Sequential)
|
||||
FCFLAGS += -DUSE_SEQUENTIAL $(SEQUENTIAL_FLAG)
|
||||
IMPLEMENTATION_OBJECT = SequentialStream.o
|
||||
|
||||
else
|
||||
$(info Set IMPLEMENTATION={DoConcurrent,Array,OpenMP,OpenMPWorkshare,OpenMPTarget,OpenMPTargetLoop,OpenMPTaskloop,OpenACC,OpenACCArray,CUDA,CUDAKernel}.)
|
||||
FCFLAGS += -DUSE_SEQUENTIAL $(SEQUENTIAL_FLAG)
|
||||
IMPLEMENTATION=Sequential
|
||||
IMPLEMENTATION_OBJECT = SequentialStream.o
|
||||
|
||||
endif
|
||||
|
||||
all: BabelStream.$(COMPILER).$(IMPLEMENTATION)
|
||||
|
||||
BabelStream.$(COMPILER).$(IMPLEMENTATION): main.F90 $(IMPLEMENTATION_OBJECT)
|
||||
$(FC) $(FCFLAGS) $^ BabelStreamTypes.o -o $@
|
||||
|
||||
BabelStreamTypes.o BabelStreamTypes.mod: BabelStreamTypes.F90
|
||||
$(FC) $(FCFLAGS) -c $<
|
||||
|
||||
%.o: %.F90 BabelStreamTypes.mod
|
||||
$(FC) $(FCFLAGS) -c $<
|
||||
|
||||
clean:
|
||||
-rm -f main.o BabelStreamUtil.mod babelstreamutil.mod
|
||||
-rm -f BabelStreamTypes.o BabelStreamTypes.mod babelstreamtypes.mod
|
||||
-rm -f DoConcurrentStream.o DoConcurrentStream.mod doconcurrentstream.mod
|
||||
-rm -f ArrayStream.o ArrayStream.mod arraystream.mod
|
||||
-rm -f SequentialStream.o SequentialStream.mod sequentialstream.mod
|
||||
-rm -f OpenMPStream.o OpenMPStream.mod openmpstream.mod
|
||||
-rm -f OpenMPWorkshareStream.o OpenMPWorkshareStream.mod openmpworksharestream.mod
|
||||
-rm -f OpenMPTaskloopStream.o OpenMPTaskloopStream.mod openmptaskloopstream.mod
|
||||
-rm -f OpenMPTargetStream.o OpenMPTargetStream.mod openmptargetstream.mod
|
||||
-rm -f OpenMPTargetLoopStream.o OpenMPTargetLoopStream.mod openmptargetloopstream.mod
|
||||
-rm -f OpenACCStream.o OpenACCStream.mod openaccstream.mod
|
||||
-rm -f OpenACCArrayStream.o OpenACCArrayStream.mod openaccarraystream.mod
|
||||
-rm -f CUDAStream.o CUDAStream.mod cudastream.mod CUDAFortranKernels.mod cudafortrankernels.mod
|
||||
-rm -f CUDAKernelStream.o CUDAKernelStream.mod cudakernelstream.mod
|
||||
-rm -f *.modmic *.mod *.o *.cub *.ptx
|
||||
|
||||
realclean: clean
|
||||
-rm -f BabelStream.*
|
||||
144
src/fortran/OpenACCArrayStream.F90
Normal file
144
src/fortran/OpenACCArrayStream.F90
Normal file
@ -0,0 +1,144 @@
|
||||
module OpenACCArrayStream
|
||||
use, intrinsic :: ISO_Fortran_env
|
||||
use BabelStreamTypes
|
||||
|
||||
implicit none
|
||||
|
||||
character(len=12), parameter :: implementation_name = "OpenACCArray"
|
||||
|
||||
integer(kind=StreamIntKind) :: N
|
||||
|
||||
real(kind=REAL64), allocatable :: A(:), B(:), C(:)
|
||||
|
||||
contains
|
||||
|
||||
subroutine list_devices()
|
||||
use openacc
|
||||
implicit none
|
||||
integer :: num
|
||||
num = acc_get_num_devices(acc_get_device_type())
|
||||
if (num.eq.0) then
|
||||
write(*,'(a17)') "No devices found."
|
||||
else
|
||||
write(*,'(a10,i1,a8)') "There are ",num," devices."
|
||||
end if
|
||||
end subroutine list_devices
|
||||
|
||||
subroutine set_device(dev)
|
||||
use openacc
|
||||
implicit none
|
||||
integer, intent(in) :: dev
|
||||
integer :: num
|
||||
num = acc_get_num_devices(acc_get_device_type())
|
||||
if (num.eq.0) then
|
||||
write(*,'(a17)') "No devices found."
|
||||
stop
|
||||
else if (dev.gt.num) then
|
||||
write(*,'(a21)') "Invalid device index."
|
||||
stop
|
||||
else
|
||||
call acc_set_device_num(dev, acc_get_device_type())
|
||||
end if
|
||||
end subroutine set_device
|
||||
|
||||
subroutine alloc(array_size)
|
||||
implicit none
|
||||
integer(kind=StreamIntKind) :: array_size
|
||||
integer :: err
|
||||
N = array_size
|
||||
allocate( A(1:N), B(1:N), C(1:N), stat=err)
|
||||
#ifndef USE_MANAGED
|
||||
!$acc enter data create(A,B,C)
|
||||
#endif
|
||||
if (err .ne. 0) then
|
||||
write(*,'(a20,i3)') 'allocate returned ',err
|
||||
stop 1
|
||||
endif
|
||||
end subroutine alloc
|
||||
|
||||
subroutine dealloc()
|
||||
implicit none
|
||||
integer :: err
|
||||
#ifndef USE_MANAGED
|
||||
!$acc exit data delete(A,B,C)
|
||||
#endif
|
||||
deallocate( A, B, C, stat=err)
|
||||
if (err .ne. 0) then
|
||||
write(*,'(a20,i3)') 'deallocate returned ',err
|
||||
stop 1
|
||||
endif
|
||||
end subroutine dealloc
|
||||
|
||||
subroutine init_arrays(initA, initB, initC)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: initA, initB, initC
|
||||
!$acc kernels
|
||||
A = initA
|
||||
B = initB
|
||||
C = initC
|
||||
!$acc end kernels
|
||||
end subroutine init_arrays
|
||||
|
||||
subroutine read_arrays(h_A, h_B, h_C)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
|
||||
!$acc kernels
|
||||
h_A = A
|
||||
h_B = B
|
||||
h_C = C
|
||||
!$acc end kernels
|
||||
end subroutine read_arrays
|
||||
|
||||
subroutine copy()
|
||||
implicit none
|
||||
!$acc kernels
|
||||
C = A
|
||||
!$acc end kernels
|
||||
end subroutine copy
|
||||
|
||||
subroutine add()
|
||||
implicit none
|
||||
!$acc kernels
|
||||
C = A + B
|
||||
!$acc end kernels
|
||||
end subroutine add
|
||||
|
||||
subroutine mul(startScalar)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: startScalar
|
||||
real(kind=REAL64) :: scalar
|
||||
scalar = startScalar
|
||||
!$acc kernels
|
||||
B = scalar * C
|
||||
!$acc end kernels
|
||||
end subroutine mul
|
||||
|
||||
subroutine triad(startScalar)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: startScalar
|
||||
real(kind=REAL64) :: scalar
|
||||
scalar = startScalar
|
||||
!$acc kernels
|
||||
A = B + scalar * C
|
||||
!$acc end kernels
|
||||
end subroutine triad
|
||||
|
||||
subroutine nstream(startScalar)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: startScalar
|
||||
real(kind=REAL64) :: scalar
|
||||
scalar = startScalar
|
||||
!$acc kernels
|
||||
A = A + B + scalar * C
|
||||
!$acc end kernels
|
||||
end subroutine nstream
|
||||
|
||||
function dot() result(s)
|
||||
implicit none
|
||||
real(kind=REAL64) :: s
|
||||
!$acc kernels
|
||||
s = dot_product(A,B)
|
||||
!$acc end kernels
|
||||
end function dot
|
||||
|
||||
end module OpenACCArrayStream
|
||||
161
src/fortran/OpenACCStream.F90
Normal file
161
src/fortran/OpenACCStream.F90
Normal file
@ -0,0 +1,161 @@
|
||||
module OpenACCStream
|
||||
use, intrinsic :: ISO_Fortran_env
|
||||
use BabelStreamTypes
|
||||
|
||||
implicit none
|
||||
|
||||
character(len=7), parameter :: implementation_name = "OpenACC"
|
||||
|
||||
integer(kind=StreamIntKind) :: N
|
||||
|
||||
real(kind=REAL64), allocatable :: A(:), B(:), C(:)
|
||||
|
||||
contains
|
||||
|
||||
subroutine list_devices()
|
||||
use openacc
|
||||
implicit none
|
||||
integer :: num
|
||||
num = acc_get_num_devices(acc_get_device_type())
|
||||
if (num.eq.0) then
|
||||
write(*,'(a17)') "No devices found."
|
||||
else
|
||||
write(*,'(a10,i1,a8)') "There are ",num," devices."
|
||||
end if
|
||||
end subroutine list_devices
|
||||
|
||||
subroutine set_device(dev)
|
||||
use openacc
|
||||
implicit none
|
||||
integer, intent(in) :: dev
|
||||
integer :: num
|
||||
num = acc_get_num_devices(acc_get_device_type())
|
||||
if (num.eq.0) then
|
||||
write(*,'(a17)') "No devices found."
|
||||
stop
|
||||
else if (dev.gt.num) then
|
||||
write(*,'(a21)') "Invalid device index."
|
||||
stop
|
||||
else
|
||||
call acc_set_device_num(dev, acc_get_device_type())
|
||||
end if
|
||||
end subroutine set_device
|
||||
|
||||
subroutine alloc(array_size)
|
||||
implicit none
|
||||
integer(kind=StreamIntKind) :: array_size
|
||||
integer :: err
|
||||
N = array_size
|
||||
allocate( A(1:N), B(1:N), C(1:N), stat=err)
|
||||
#ifndef USE_MANAGED
|
||||
!$acc enter data create(A,B,C)
|
||||
#endif
|
||||
if (err .ne. 0) then
|
||||
write(*,'(a20,i3)') 'allocate returned ',err
|
||||
stop 1
|
||||
endif
|
||||
end subroutine alloc
|
||||
|
||||
subroutine dealloc()
|
||||
implicit none
|
||||
integer :: err
|
||||
#ifndef USE_MANAGED
|
||||
!$acc exit data delete(A,B,C)
|
||||
#endif
|
||||
deallocate( A, B, C, stat=err)
|
||||
if (err .ne. 0) then
|
||||
write(*,'(a20,i3)') 'deallocate returned ',err
|
||||
stop 1
|
||||
endif
|
||||
end subroutine dealloc
|
||||
|
||||
subroutine init_arrays(initA, initB, initC)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: initA, initB, initC
|
||||
integer(kind=StreamIntKind) :: i
|
||||
!$acc parallel loop
|
||||
do i=1,N
|
||||
A(i) = initA
|
||||
B(i) = initB
|
||||
C(i) = initC
|
||||
end do
|
||||
end subroutine init_arrays
|
||||
|
||||
subroutine read_arrays(h_A, h_B, h_C)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
|
||||
integer(kind=StreamIntKind) :: i
|
||||
!$acc parallel loop
|
||||
do i=1,N
|
||||
h_A(i) = A(i)
|
||||
h_B(i) = B(i)
|
||||
h_C(i) = C(i)
|
||||
end do
|
||||
end subroutine read_arrays
|
||||
|
||||
subroutine copy()
|
||||
implicit none
|
||||
integer(kind=StreamIntKind) :: i
|
||||
!$acc parallel loop
|
||||
do i=1,N
|
||||
C(i) = A(i)
|
||||
end do
|
||||
end subroutine copy
|
||||
|
||||
subroutine add()
|
||||
implicit none
|
||||
integer(kind=StreamIntKind) :: i
|
||||
!$acc parallel loop
|
||||
do i=1,N
|
||||
C(i) = A(i) + B(i)
|
||||
end do
|
||||
end subroutine add
|
||||
|
||||
subroutine mul(startScalar)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: startScalar
|
||||
real(kind=REAL64) :: scalar
|
||||
integer(kind=StreamIntKind) :: i
|
||||
scalar = startScalar
|
||||
!$acc parallel loop
|
||||
do i=1,N
|
||||
B(i) = scalar * C(i)
|
||||
end do
|
||||
end subroutine mul
|
||||
|
||||
subroutine triad(startScalar)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: startScalar
|
||||
real(kind=REAL64) :: scalar
|
||||
integer(kind=StreamIntKind) :: i
|
||||
scalar = startScalar
|
||||
!$acc parallel loop
|
||||
do i=1,N
|
||||
A(i) = B(i) + scalar * C(i)
|
||||
end do
|
||||
end subroutine triad
|
||||
|
||||
subroutine nstream(startScalar)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: startScalar
|
||||
real(kind=REAL64) :: scalar
|
||||
integer(kind=StreamIntKind) :: i
|
||||
scalar = startScalar
|
||||
!$acc parallel loop
|
||||
do i=1,N
|
||||
A(i) = A(i) + B(i) + scalar * C(i)
|
||||
end do
|
||||
end subroutine nstream
|
||||
|
||||
function dot() result(s)
|
||||
implicit none
|
||||
real(kind=REAL64) :: s
|
||||
integer(kind=StreamIntKind) :: i
|
||||
s = real(0,kind=REAL64)
|
||||
!$acc parallel loop reduction(+:s)
|
||||
do i=1,N
|
||||
s = s + A(i) * B(i)
|
||||
end do
|
||||
end function dot
|
||||
|
||||
end module OpenACCStream
|
||||
137
src/fortran/OpenMPStream.F90
Normal file
137
src/fortran/OpenMPStream.F90
Normal file
@ -0,0 +1,137 @@
|
||||
module OpenMPStream
|
||||
use, intrinsic :: ISO_Fortran_env
|
||||
use BabelStreamTypes
|
||||
|
||||
implicit none
|
||||
|
||||
character(len=6), parameter :: implementation_name = "OpenMP"
|
||||
|
||||
integer(kind=StreamIntKind) :: N
|
||||
|
||||
real(kind=REAL64), allocatable :: A(:), B(:), C(:)
|
||||
|
||||
contains
|
||||
|
||||
subroutine list_devices()
|
||||
implicit none
|
||||
write(*,'(a36,a12)') "Listing devices is not supported by ", implementation_name
|
||||
end subroutine list_devices
|
||||
|
||||
subroutine set_device(dev)
|
||||
implicit none
|
||||
integer, intent(in) :: dev
|
||||
write(*,'(a32,a12)') "Device != 0 is not supported by ", implementation_name
|
||||
end subroutine set_device
|
||||
|
||||
subroutine alloc(array_size)
|
||||
implicit none
|
||||
integer(kind=StreamIntKind) :: array_size
|
||||
integer :: err
|
||||
N = array_size
|
||||
allocate( A(1:N), B(1:N), C(1:N), stat=err)
|
||||
if (err .ne. 0) then
|
||||
write(*,'(a20,i3)') 'allocate returned ',err
|
||||
stop 1
|
||||
endif
|
||||
end subroutine alloc
|
||||
|
||||
subroutine dealloc()
|
||||
implicit none
|
||||
integer :: err
|
||||
deallocate( A, B, C, stat=err)
|
||||
if (err .ne. 0) then
|
||||
write(*,'(a20,i3)') 'deallocate returned ',err
|
||||
stop 1
|
||||
endif
|
||||
end subroutine dealloc
|
||||
|
||||
subroutine init_arrays(initA, initB, initC)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: initA, initB, initC
|
||||
integer(kind=StreamIntKind) :: i
|
||||
!$omp parallel do simd
|
||||
do i=1,N
|
||||
A(i) = initA
|
||||
B(i) = initB
|
||||
C(i) = initC
|
||||
end do
|
||||
end subroutine init_arrays
|
||||
|
||||
subroutine read_arrays(h_A, h_B, h_C)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
|
||||
integer(kind=StreamIntKind) :: i
|
||||
!$omp parallel do simd
|
||||
do i=1,N
|
||||
h_A(i) = A(i)
|
||||
h_B(i) = B(i)
|
||||
h_C(i) = C(i)
|
||||
end do
|
||||
end subroutine read_arrays
|
||||
|
||||
subroutine copy()
|
||||
implicit none
|
||||
integer(kind=StreamIntKind) :: i
|
||||
!$omp parallel do simd
|
||||
do i=1,N
|
||||
C(i) = A(i)
|
||||
end do
|
||||
end subroutine copy
|
||||
|
||||
subroutine add()
|
||||
implicit none
|
||||
integer(kind=StreamIntKind) :: i
|
||||
!$omp parallel do simd
|
||||
do i=1,N
|
||||
C(i) = A(i) + B(i)
|
||||
end do
|
||||
end subroutine add
|
||||
|
||||
subroutine mul(startScalar)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: startScalar
|
||||
real(kind=REAL64) :: scalar
|
||||
integer(kind=StreamIntKind) :: i
|
||||
scalar = startScalar
|
||||
!$omp parallel do simd
|
||||
do i=1,N
|
||||
B(i) = scalar * C(i)
|
||||
end do
|
||||
end subroutine mul
|
||||
|
||||
subroutine triad(startScalar)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: startScalar
|
||||
real(kind=REAL64) :: scalar
|
||||
integer(kind=StreamIntKind) :: i
|
||||
scalar = startScalar
|
||||
!$omp parallel do simd
|
||||
do i=1,N
|
||||
A(i) = B(i) + scalar * C(i)
|
||||
end do
|
||||
end subroutine triad
|
||||
|
||||
subroutine nstream(startScalar)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: startScalar
|
||||
real(kind=REAL64) :: scalar
|
||||
integer(kind=StreamIntKind) :: i
|
||||
scalar = startScalar
|
||||
!$omp parallel do simd
|
||||
do i=1,N
|
||||
A(i) = A(i) + B(i) + scalar * C(i)
|
||||
end do
|
||||
end subroutine nstream
|
||||
|
||||
function dot() result(s)
|
||||
implicit none
|
||||
real(kind=REAL64) :: s
|
||||
integer(kind=StreamIntKind) :: i
|
||||
s = real(0,kind=REAL64)
|
||||
!$omp parallel do simd reduction(+:s)
|
||||
do i=1,N
|
||||
s = s + A(i) * B(i)
|
||||
end do
|
||||
end function dot
|
||||
|
||||
end module OpenMPStream
|
||||
162
src/fortran/OpenMPTargetLoopStream.F90
Normal file
162
src/fortran/OpenMPTargetLoopStream.F90
Normal file
@ -0,0 +1,162 @@
|
||||
module OpenMPTargetLoopStream
|
||||
use, intrinsic :: ISO_Fortran_env
|
||||
use BabelStreamTypes
|
||||
|
||||
implicit none
|
||||
|
||||
character(len=16), parameter :: implementation_name = "OpenMPTargetLoop"
|
||||
|
||||
integer(kind=StreamIntKind) :: N
|
||||
|
||||
real(kind=REAL64), allocatable :: A(:), B(:), C(:)
|
||||
|
||||
contains
|
||||
|
||||
subroutine list_devices()
|
||||
use omp_lib
|
||||
implicit none
|
||||
integer :: num
|
||||
num = omp_get_num_devices()
|
||||
if (num.eq.0) then
|
||||
write(*,'(a17)') "No devices found."
|
||||
else
|
||||
write(*,'(a10,i1,a8)') "There are ",num," devices."
|
||||
end if
|
||||
end subroutine list_devices
|
||||
|
||||
subroutine set_device(dev)
|
||||
use omp_lib
|
||||
implicit none
|
||||
integer, intent(in) :: dev
|
||||
integer :: num
|
||||
num = omp_get_num_devices()
|
||||
if (num.eq.0) then
|
||||
write(*,'(a17)') "No devices found."
|
||||
stop
|
||||
else if (dev.gt.num) then
|
||||
write(*,'(a21)') "Invalid device index."
|
||||
stop
|
||||
else
|
||||
call omp_set_default_device(dev)
|
||||
end if
|
||||
end subroutine set_device
|
||||
|
||||
subroutine alloc(array_size)
|
||||
implicit none
|
||||
integer(kind=StreamIntKind) :: array_size
|
||||
integer :: err
|
||||
N = array_size
|
||||
allocate( A(1:N), B(1:N), C(1:N), stat=err)
|
||||
#ifndef USE_MANAGED
|
||||
!$omp target enter data map(alloc: A,B,C)
|
||||
#endif
|
||||
if (err .ne. 0) then
|
||||
write(*,'(a20,i3)') 'allocate returned ',err
|
||||
stop 1
|
||||
endif
|
||||
end subroutine alloc
|
||||
|
||||
subroutine dealloc()
|
||||
implicit none
|
||||
integer :: err
|
||||
#ifndef USE_MANAGED
|
||||
!$omp target exit data map(delete: A,B,C)
|
||||
#endif
|
||||
deallocate( A, B, C, stat=err)
|
||||
if (err .ne. 0) then
|
||||
write(*,'(a20,i3)') 'deallocate returned ',err
|
||||
stop 1
|
||||
endif
|
||||
end subroutine dealloc
|
||||
|
||||
subroutine init_arrays(initA, initB, initC)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: initA, initB, initC
|
||||
integer(kind=StreamIntKind) :: i
|
||||
!$omp target teams loop
|
||||
do i=1,N
|
||||
A(i) = initA
|
||||
B(i) = initB
|
||||
C(i) = initC
|
||||
end do
|
||||
end subroutine init_arrays
|
||||
|
||||
subroutine read_arrays(h_A, h_B, h_C)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
|
||||
integer(kind=StreamIntKind) :: i
|
||||
! this might need to use a copy API instead...
|
||||
!$omp target teams loop
|
||||
do i=1,N
|
||||
h_A(i) = A(i)
|
||||
h_B(i) = B(i)
|
||||
h_C(i) = C(i)
|
||||
end do
|
||||
end subroutine read_arrays
|
||||
|
||||
subroutine copy()
|
||||
implicit none
|
||||
integer(kind=StreamIntKind) :: i
|
||||
!$omp target teams loop
|
||||
do i=1,N
|
||||
C(i) = A(i)
|
||||
end do
|
||||
end subroutine copy
|
||||
|
||||
subroutine add()
|
||||
implicit none
|
||||
integer(kind=StreamIntKind) :: i
|
||||
!$omp target teams loop
|
||||
do i=1,N
|
||||
C(i) = A(i) + B(i)
|
||||
end do
|
||||
end subroutine add
|
||||
|
||||
subroutine mul(startScalar)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: startScalar
|
||||
real(kind=REAL64) :: scalar
|
||||
integer(kind=StreamIntKind) :: i
|
||||
scalar = startScalar
|
||||
!$omp target teams loop
|
||||
do i=1,N
|
||||
B(i) = scalar * C(i)
|
||||
end do
|
||||
end subroutine mul
|
||||
|
||||
subroutine triad(startScalar)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: startScalar
|
||||
real(kind=REAL64) :: scalar
|
||||
integer(kind=StreamIntKind) :: i
|
||||
scalar = startScalar
|
||||
!$omp target teams loop
|
||||
do i=1,N
|
||||
A(i) = B(i) + scalar * C(i)
|
||||
end do
|
||||
end subroutine triad
|
||||
|
||||
subroutine nstream(startScalar)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: startScalar
|
||||
real(kind=REAL64) :: scalar
|
||||
integer(kind=StreamIntKind) :: i
|
||||
scalar = startScalar
|
||||
!$omp target teams loop
|
||||
do i=1,N
|
||||
A(i) = A(i) + B(i) + scalar * C(i)
|
||||
end do
|
||||
end subroutine nstream
|
||||
|
||||
function dot() result(s)
|
||||
implicit none
|
||||
real(kind=REAL64) :: s
|
||||
integer(kind=StreamIntKind) :: i
|
||||
s = real(0,kind=REAL64)
|
||||
!$omp target teams loop reduction(+:s)
|
||||
do i=1,N
|
||||
s = s + A(i) * B(i)
|
||||
end do
|
||||
end function dot
|
||||
|
||||
end module OpenMPTargetLoopStream
|
||||
163
src/fortran/OpenMPTargetStream.F90
Normal file
163
src/fortran/OpenMPTargetStream.F90
Normal file
@ -0,0 +1,163 @@
|
||||
module OpenMPTargetStream
|
||||
use, intrinsic :: ISO_Fortran_env
|
||||
use BabelStreamTypes
|
||||
|
||||
implicit none
|
||||
|
||||
character(len=12), parameter :: implementation_name = "OpenMPTarget"
|
||||
|
||||
integer(kind=StreamIntKind) :: N
|
||||
|
||||
real(kind=REAL64), allocatable :: A(:), B(:), C(:)
|
||||
|
||||
contains
|
||||
|
||||
subroutine list_devices()
|
||||
use omp_lib
|
||||
implicit none
|
||||
integer :: num
|
||||
num = omp_get_num_devices()
|
||||
if (num.eq.0) then
|
||||
write(*,'(a17)') "No devices found."
|
||||
else
|
||||
write(*,'(a10,i1,a8)') "There are ",num," devices."
|
||||
end if
|
||||
end subroutine list_devices
|
||||
|
||||
subroutine set_device(dev)
|
||||
use omp_lib
|
||||
implicit none
|
||||
integer, intent(in) :: dev
|
||||
integer :: num
|
||||
num = omp_get_num_devices()
|
||||
if (num.eq.0) then
|
||||
write(*,'(a17)') "No devices found."
|
||||
stop
|
||||
else if (dev.gt.num) then
|
||||
write(*,'(a21)') "Invalid device index."
|
||||
stop
|
||||
else
|
||||
call omp_set_default_device(dev)
|
||||
end if
|
||||
end subroutine set_device
|
||||
|
||||
subroutine alloc(array_size)
|
||||
implicit none
|
||||
integer(kind=StreamIntKind) :: array_size
|
||||
integer :: err
|
||||
N = array_size
|
||||
allocate( A(1:N), B(1:N), C(1:N), stat=err)
|
||||
#ifndef USE_MANAGED
|
||||
!$omp target enter data map(alloc: A,B,C)
|
||||
#endif
|
||||
if (err .ne. 0) then
|
||||
write(*,'(a20,i3)') 'allocate returned ',err
|
||||
stop 1
|
||||
endif
|
||||
end subroutine alloc
|
||||
|
||||
subroutine dealloc()
|
||||
implicit none
|
||||
integer :: err
|
||||
#ifndef USE_MANAGED
|
||||
!$omp target exit data map(delete: A,B,C)
|
||||
#endif
|
||||
deallocate( A, B, C, stat=err)
|
||||
if (err .ne. 0) then
|
||||
write(*,'(a20,i3)') 'deallocate returned ',err
|
||||
stop 1
|
||||
endif
|
||||
end subroutine dealloc
|
||||
|
||||
subroutine init_arrays(initA, initB, initC)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: initA, initB, initC
|
||||
integer(kind=StreamIntKind) :: i
|
||||
!$omp target teams distribute parallel do simd
|
||||
do i=1,N
|
||||
A(i) = initA
|
||||
B(i) = initB
|
||||
C(i) = initC
|
||||
end do
|
||||
end subroutine init_arrays
|
||||
|
||||
subroutine read_arrays(h_A, h_B, h_C)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
|
||||
integer(kind=StreamIntKind) :: i
|
||||
! this might need to use a copy API instead...
|
||||
!$omp target teams distribute parallel do simd
|
||||
do i=1,N
|
||||
h_A(i) = A(i)
|
||||
h_B(i) = B(i)
|
||||
h_C(i) = C(i)
|
||||
end do
|
||||
end subroutine read_arrays
|
||||
|
||||
subroutine copy()
|
||||
implicit none
|
||||
integer(kind=StreamIntKind) :: i
|
||||
!$omp target teams distribute parallel do simd
|
||||
do i=1,N
|
||||
C(i) = A(i)
|
||||
end do
|
||||
!$omp barrier
|
||||
end subroutine copy
|
||||
|
||||
subroutine add()
|
||||
implicit none
|
||||
integer(kind=StreamIntKind) :: i
|
||||
!$omp target teams distribute parallel do simd
|
||||
do i=1,N
|
||||
C(i) = A(i) + B(i)
|
||||
end do
|
||||
end subroutine add
|
||||
|
||||
subroutine mul(startScalar)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: startScalar
|
||||
real(kind=REAL64) :: scalar
|
||||
integer(kind=StreamIntKind) :: i
|
||||
scalar = startScalar
|
||||
!$omp target teams distribute parallel do simd
|
||||
do i=1,N
|
||||
B(i) = scalar * C(i)
|
||||
end do
|
||||
end subroutine mul
|
||||
|
||||
subroutine triad(startScalar)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: startScalar
|
||||
real(kind=REAL64) :: scalar
|
||||
integer(kind=StreamIntKind) :: i
|
||||
scalar = startScalar
|
||||
!$omp target teams distribute parallel do simd
|
||||
do i=1,N
|
||||
A(i) = B(i) + scalar * C(i)
|
||||
end do
|
||||
end subroutine triad
|
||||
|
||||
subroutine nstream(startScalar)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: startScalar
|
||||
real(kind=REAL64) :: scalar
|
||||
integer(kind=StreamIntKind) :: i
|
||||
scalar = startScalar
|
||||
!$omp target teams distribute parallel do simd
|
||||
do i=1,N
|
||||
A(i) = A(i) + B(i) + scalar * C(i)
|
||||
end do
|
||||
end subroutine nstream
|
||||
|
||||
function dot() result(s)
|
||||
implicit none
|
||||
real(kind=REAL64) :: s
|
||||
integer(kind=StreamIntKind) :: i
|
||||
s = real(0,kind=REAL64)
|
||||
!$omp target teams distribute parallel do simd reduction(+:s)
|
||||
do i=1,N
|
||||
s = s + A(i) * B(i)
|
||||
end do
|
||||
end function dot
|
||||
|
||||
end module OpenMPTargetStream
|
||||
169
src/fortran/OpenMPTaskloopStream.F90
Normal file
169
src/fortran/OpenMPTaskloopStream.F90
Normal file
@ -0,0 +1,169 @@
|
||||
module OpenMPTaskloopStream
|
||||
use, intrinsic :: ISO_Fortran_env
|
||||
use BabelStreamTypes
|
||||
|
||||
implicit none
|
||||
|
||||
character(len=14), parameter :: implementation_name = "OpenMPTaskloop"
|
||||
|
||||
integer(kind=StreamIntKind) :: N
|
||||
|
||||
real(kind=REAL64), allocatable :: A(:), B(:), C(:)
|
||||
|
||||
contains
|
||||
|
||||
subroutine list_devices()
|
||||
implicit none
|
||||
write(*,'(a36,a12)') "Listing devices is not supported by ", implementation_name
|
||||
end subroutine list_devices
|
||||
|
||||
subroutine set_device(dev)
|
||||
implicit none
|
||||
integer, intent(in) :: dev
|
||||
write(*,'(a32,a12)') "Device != 0 is not supported by ", implementation_name
|
||||
end subroutine set_device
|
||||
|
||||
subroutine alloc(array_size)
|
||||
implicit none
|
||||
integer(kind=StreamIntKind) :: array_size
|
||||
integer :: err
|
||||
N = array_size
|
||||
allocate( A(1:N), B(1:N), C(1:N), stat=err)
|
||||
if (err .ne. 0) then
|
||||
write(*,'(a20,i3)') 'allocate returned ',err
|
||||
stop 1
|
||||
endif
|
||||
end subroutine alloc
|
||||
|
||||
subroutine dealloc()
|
||||
implicit none
|
||||
integer :: err
|
||||
deallocate( A, B, C, stat=err)
|
||||
if (err .ne. 0) then
|
||||
write(*,'(a20,i3)') 'deallocate returned ',err
|
||||
stop 1
|
||||
endif
|
||||
end subroutine dealloc
|
||||
|
||||
subroutine init_arrays(initA, initB, initC)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: initA, initB, initC
|
||||
integer(kind=StreamIntKind) :: i
|
||||
!$omp parallel
|
||||
!$omp master
|
||||
!$omp taskloop
|
||||
do i=1,N
|
||||
A(i) = initA
|
||||
B(i) = initB
|
||||
C(i) = initC
|
||||
end do
|
||||
!$omp end master
|
||||
!$omp end parallel
|
||||
end subroutine init_arrays
|
||||
|
||||
subroutine read_arrays(h_A, h_B, h_C)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
|
||||
integer(kind=StreamIntKind) :: i
|
||||
!$omp parallel
|
||||
!$omp master
|
||||
!$omp taskloop
|
||||
do i=1,N
|
||||
h_A(i) = A(i)
|
||||
h_B(i) = B(i)
|
||||
h_C(i) = C(i)
|
||||
end do
|
||||
!$omp end master
|
||||
!$omp end parallel
|
||||
end subroutine read_arrays
|
||||
|
||||
subroutine copy()
|
||||
implicit none
|
||||
integer(kind=StreamIntKind) :: i
|
||||
!$omp parallel
|
||||
!$omp master
|
||||
!$omp taskloop
|
||||
do i=1,N
|
||||
C(i) = A(i)
|
||||
end do
|
||||
!$omp end master
|
||||
!$omp end parallel
|
||||
end subroutine copy
|
||||
|
||||
subroutine add()
|
||||
implicit none
|
||||
integer(kind=StreamIntKind) :: i
|
||||
!$omp parallel
|
||||
!$omp master
|
||||
!$omp taskloop
|
||||
do i=1,N
|
||||
C(i) = A(i) + B(i)
|
||||
end do
|
||||
!$omp end master
|
||||
!$omp end parallel
|
||||
end subroutine add
|
||||
|
||||
subroutine mul(startScalar)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: startScalar
|
||||
real(kind=REAL64) :: scalar
|
||||
integer(kind=StreamIntKind) :: i
|
||||
scalar = startScalar
|
||||
!$omp parallel
|
||||
!$omp master
|
||||
!$omp taskloop
|
||||
do i=1,N
|
||||
B(i) = scalar * C(i)
|
||||
end do
|
||||
!$omp end master
|
||||
!$omp end parallel
|
||||
end subroutine mul
|
||||
|
||||
subroutine triad(startScalar)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: startScalar
|
||||
real(kind=REAL64) :: scalar
|
||||
integer(kind=StreamIntKind) :: i
|
||||
scalar = startScalar
|
||||
!$omp parallel
|
||||
!$omp master
|
||||
!$omp taskloop
|
||||
do i=1,N
|
||||
A(i) = B(i) + scalar * C(i)
|
||||
end do
|
||||
!$omp end master
|
||||
!$omp end parallel
|
||||
end subroutine triad
|
||||
|
||||
subroutine nstream(startScalar)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: startScalar
|
||||
real(kind=REAL64) :: scalar
|
||||
integer(kind=StreamIntKind) :: i
|
||||
scalar = startScalar
|
||||
!$omp parallel
|
||||
!$omp master
|
||||
!$omp taskloop
|
||||
do i=1,N
|
||||
A(i) = A(i) + B(i) + scalar * C(i)
|
||||
end do
|
||||
!$omp end master
|
||||
!$omp end parallel
|
||||
end subroutine nstream
|
||||
|
||||
function dot() result(s)
|
||||
implicit none
|
||||
real(kind=REAL64) :: s
|
||||
integer(kind=StreamIntKind) :: i
|
||||
s = real(0,kind=REAL64)
|
||||
!$omp parallel
|
||||
!$omp master
|
||||
!$omp taskloop reduction(+:s)
|
||||
do i=1,N
|
||||
s = s + A(i) * B(i)
|
||||
end do
|
||||
!$omp end master
|
||||
!$omp end parallel
|
||||
end function dot
|
||||
|
||||
end module OpenMPTaskloopStream
|
||||
120
src/fortran/OpenMPWorkshareStream.F90
Normal file
120
src/fortran/OpenMPWorkshareStream.F90
Normal file
@ -0,0 +1,120 @@
|
||||
module OpenMPWorkshareStream
|
||||
use, intrinsic :: ISO_Fortran_env
|
||||
use BabelStreamTypes
|
||||
|
||||
implicit none
|
||||
|
||||
character(len=15), parameter :: implementation_name = "OpenMPWorkshare"
|
||||
|
||||
integer(kind=StreamIntKind) :: N
|
||||
|
||||
real(kind=REAL64), allocatable :: A(:), B(:), C(:)
|
||||
|
||||
contains
|
||||
|
||||
subroutine list_devices()
|
||||
implicit none
|
||||
write(*,'(a36,a12)') "Listing devices is not supported by ", implementation_name
|
||||
end subroutine list_devices
|
||||
|
||||
subroutine set_device(dev)
|
||||
implicit none
|
||||
integer, intent(in) :: dev
|
||||
write(*,'(a32,a12)') "Device != 0 is not supported by ", implementation_name
|
||||
end subroutine set_device
|
||||
|
||||
subroutine alloc(array_size)
|
||||
implicit none
|
||||
integer(kind=StreamIntKind) :: array_size
|
||||
integer :: err
|
||||
N = array_size
|
||||
allocate( A(1:N), B(1:N), C(1:N), stat=err)
|
||||
if (err .ne. 0) then
|
||||
write(*,'(a20,i3)') 'allocate returned ',err
|
||||
stop 1
|
||||
endif
|
||||
end subroutine alloc
|
||||
|
||||
subroutine dealloc()
|
||||
implicit none
|
||||
integer :: err
|
||||
deallocate( A, B, C, stat=err)
|
||||
if (err .ne. 0) then
|
||||
write(*,'(a20,i3)') 'deallocate returned ',err
|
||||
stop 1
|
||||
endif
|
||||
end subroutine dealloc
|
||||
|
||||
subroutine init_arrays(initA, initB, initC)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: initA, initB, initC
|
||||
!$omp parallel workshare
|
||||
A = initA
|
||||
B = initB
|
||||
C = initC
|
||||
!$omp end parallel workshare
|
||||
end subroutine init_arrays
|
||||
|
||||
subroutine read_arrays(h_A, h_B, h_C)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
|
||||
!$omp parallel workshare
|
||||
h_A = A
|
||||
h_B = B
|
||||
h_C = C
|
||||
!$omp end parallel workshare
|
||||
end subroutine read_arrays
|
||||
|
||||
subroutine copy()
|
||||
implicit none
|
||||
!$omp parallel workshare
|
||||
C = A
|
||||
!$omp end parallel workshare
|
||||
end subroutine copy
|
||||
|
||||
subroutine add()
|
||||
implicit none
|
||||
!$omp parallel workshare
|
||||
C = A + B
|
||||
!$omp end parallel workshare
|
||||
end subroutine add
|
||||
|
||||
subroutine mul(startScalar)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: startScalar
|
||||
real(kind=REAL64) :: scalar
|
||||
scalar = startScalar
|
||||
!$omp parallel workshare
|
||||
B = scalar * C
|
||||
!$omp end parallel workshare
|
||||
end subroutine mul
|
||||
|
||||
subroutine triad(startScalar)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: startScalar
|
||||
real(kind=REAL64) :: scalar
|
||||
scalar = startScalar
|
||||
!$omp parallel workshare
|
||||
A = B + scalar * C
|
||||
!$omp end parallel workshare
|
||||
end subroutine triad
|
||||
|
||||
subroutine nstream(startScalar)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: startScalar
|
||||
real(kind=REAL64) :: scalar
|
||||
scalar = startScalar
|
||||
!$omp parallel workshare
|
||||
A = A + B + scalar * C
|
||||
!$omp end parallel workshare
|
||||
end subroutine nstream
|
||||
|
||||
function dot() result(s)
|
||||
implicit none
|
||||
real(kind=REAL64) :: s
|
||||
!$omp parallel workshare
|
||||
s = dot_product(A,B)
|
||||
!$omp end parallel workshare
|
||||
end function dot
|
||||
|
||||
end module OpenMPWorkshareStream
|
||||
130
src/fortran/SequentialStream.F90
Normal file
130
src/fortran/SequentialStream.F90
Normal file
@ -0,0 +1,130 @@
|
||||
module SequentialStream
|
||||
use, intrinsic :: ISO_Fortran_env
|
||||
use BabelStreamTypes
|
||||
|
||||
implicit none
|
||||
|
||||
character(len=10), parameter :: implementation_name = "Sequential"
|
||||
|
||||
integer(kind=StreamIntKind) :: N
|
||||
|
||||
real(kind=REAL64), allocatable :: A(:), B(:), C(:)
|
||||
|
||||
contains
|
||||
|
||||
subroutine list_devices()
|
||||
implicit none
|
||||
integer :: num
|
||||
write(*,'(a36,a10)') "Listing devices is not supported by ", implementation_name
|
||||
end subroutine list_devices
|
||||
|
||||
subroutine set_device(dev)
|
||||
implicit none
|
||||
integer, intent(in) :: dev
|
||||
write(*,'(a32,a10)') "Device != 0 is not supported by ", implementation_name
|
||||
end subroutine set_device
|
||||
|
||||
subroutine alloc(array_size)
|
||||
implicit none
|
||||
integer(kind=StreamIntKind) :: array_size
|
||||
integer :: err
|
||||
N = array_size
|
||||
allocate( A(1:N), B(1:N), C(1:N), stat=err)
|
||||
if (err .ne. 0) then
|
||||
write(*,'(a20,i3)') 'allocate returned ',err
|
||||
stop 1
|
||||
endif
|
||||
end subroutine alloc
|
||||
|
||||
subroutine dealloc()
|
||||
implicit none
|
||||
integer :: err
|
||||
deallocate( A, B, C, stat=err)
|
||||
if (err .ne. 0) then
|
||||
write(*,'(a20,i3)') 'deallocate returned ',err
|
||||
stop 1
|
||||
endif
|
||||
end subroutine dealloc
|
||||
|
||||
subroutine init_arrays(initA, initB, initC)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: initA, initB, initC
|
||||
integer(kind=StreamIntKind) :: i
|
||||
do i=1,N
|
||||
A(i) = initA
|
||||
B(i) = initB
|
||||
C(i) = initC
|
||||
end do
|
||||
end subroutine init_arrays
|
||||
|
||||
subroutine read_arrays(h_A, h_B, h_C)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
|
||||
integer(kind=StreamIntKind) :: i
|
||||
do i=1,N
|
||||
h_A(i) = A(i)
|
||||
h_B(i) = B(i)
|
||||
h_C(i) = C(i)
|
||||
end do
|
||||
end subroutine read_arrays
|
||||
|
||||
subroutine copy()
|
||||
implicit none
|
||||
integer(kind=StreamIntKind) :: i
|
||||
do i=1,N
|
||||
C(i) = A(i)
|
||||
end do
|
||||
end subroutine copy
|
||||
|
||||
subroutine add()
|
||||
implicit none
|
||||
integer(kind=StreamIntKind) :: i
|
||||
do i=1,N
|
||||
C(i) = A(i) + B(i)
|
||||
end do
|
||||
end subroutine add
|
||||
|
||||
subroutine mul(startScalar)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: startScalar
|
||||
real(kind=REAL64) :: scalar
|
||||
integer(kind=StreamIntKind) :: i
|
||||
scalar = startScalar
|
||||
do i=1,N
|
||||
B(i) = scalar * C(i)
|
||||
end do
|
||||
end subroutine mul
|
||||
|
||||
subroutine triad(startScalar)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: startScalar
|
||||
real(kind=REAL64) :: scalar
|
||||
integer(kind=StreamIntKind) :: i
|
||||
scalar = startScalar
|
||||
do i=1,N
|
||||
A(i) = B(i) + scalar * C(i)
|
||||
end do
|
||||
end subroutine triad
|
||||
|
||||
subroutine nstream(startScalar)
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: startScalar
|
||||
real(kind=REAL64) :: scalar
|
||||
integer(kind=StreamIntKind) :: i
|
||||
scalar = startScalar
|
||||
do i=1,N
|
||||
A(i) = A(i) + B(i) + scalar * C(i)
|
||||
end do
|
||||
end subroutine nstream
|
||||
|
||||
function dot() result(s)
|
||||
implicit none
|
||||
real(kind=REAL64) :: s
|
||||
integer(kind=StreamIntKind) :: i
|
||||
s = real(0,kind=REAL64)
|
||||
do i=1,N
|
||||
s = s + A(i) * B(i)
|
||||
end do
|
||||
end function dot
|
||||
|
||||
end module SequentialStream
|
||||
54
src/fortran/build.sh
Executable file
54
src/fortran/build.sh
Executable file
@ -0,0 +1,54 @@
|
||||
#!/bin/bash
|
||||
|
||||
# uncomment to disable GPU targets
|
||||
#HAS_GPU=0
|
||||
|
||||
# Orin
|
||||
#if [ "x${compiler}" == "xgcc" ] ; then
|
||||
# export MCPU=cortex-a78ae
|
||||
#fi
|
||||
#if [ "x${compiler}" == "xarm" ] ; then
|
||||
# export MCPU=cortex-a78
|
||||
#fi
|
||||
|
||||
COMPILERS="gcc"
|
||||
if [ $(which nvfortran) ] ; then
|
||||
COMPILERS="${COMPILERS} nvhpc"
|
||||
fi
|
||||
if [ $(which crayftn) ] ; then
|
||||
COMPILERS="${COMPILERS} cray"
|
||||
fi
|
||||
if [ $(uname -m) == "aarch64" ] ; then
|
||||
if [ $(which armflang) ] ; then
|
||||
COMPILERS="${COMPILERS} arm"
|
||||
fi
|
||||
if [ $(which frt) ] ; then
|
||||
COMPILERS="${COMPILERS} fj"
|
||||
fi
|
||||
elif [ $(uname -m) == "x86_64" ] ; then
|
||||
if [ $(which lscpu >& /dev/null && lscpu | grep GenuineIntel | awk '{print $3}') == "GenuineIntel" ] ; then
|
||||
COMPILERS="${COMPILERS} oneapi"
|
||||
if [ -f /opt/intel/oneapi/setvars.sh ] ; then
|
||||
. /opt/intel/oneapi/setvars.sh >& /dev/null
|
||||
fi
|
||||
else
|
||||
# ^ this detection can be improved
|
||||
COMPILERS="${COMPILERS} amd"
|
||||
fi
|
||||
fi
|
||||
|
||||
for compiler in ${COMPILERS} ; do
|
||||
TARGETS="DoConcurrent Array OpenMP OpenMPTaskloop OpenMPWorkshare"
|
||||
if [ "${HAS_GPU}" != "0" ] ; then
|
||||
TARGETS="${TARGETS} OpenMPTarget OpenMPTargetLoop"
|
||||
if [ "x${compiler}" == "xnvhpc" ] ; then
|
||||
TARGETS="${TARGETS} CUDA CUDAKernel"
|
||||
fi
|
||||
fi
|
||||
if [ "x${compiler}" == "xnvhpc" ] || [ "x${compiler}" == "xgcc" ] || [ "x${compiler}" == "xcray" ] ; then
|
||||
TARGETS="${TARGETS} OpenACC OpenACCArray"
|
||||
fi
|
||||
for implementation in ${TARGETS} ; do
|
||||
make COMPILER=${compiler} IMPLEMENTATION=${implementation}
|
||||
done
|
||||
done
|
||||
666
src/fortran/main.F90
Normal file
666
src/fortran/main.F90
Normal file
@ -0,0 +1,666 @@
|
||||
module BabelStreamUtil
|
||||
use, intrinsic :: ISO_Fortran_env, only: REAL64,INT64
|
||||
use BabelStreamTypes
|
||||
|
||||
implicit none
|
||||
|
||||
integer(kind=StreamIntKind) :: array_size = 33554432
|
||||
integer(kind=StreamIntKind) :: num_times = 100
|
||||
logical :: mibibytes = .false.
|
||||
logical :: use_gigs = .false.
|
||||
logical :: csv = .false.
|
||||
character(len=1), parameter :: csv_sep = ","
|
||||
|
||||
! 1 = All
|
||||
! 2 = Triad
|
||||
! 3 = Nstream
|
||||
integer :: selection = 1
|
||||
|
||||
real(kind=REAL64), parameter :: startA = real(0.1d0,kind=REAL64)
|
||||
real(kind=REAL64), parameter :: startB = real(0.2d0,kind=REAL64)
|
||||
real(kind=REAL64), parameter :: startC = real(0.0d0,kind=REAL64)
|
||||
real(kind=REAL64), parameter :: startScalar = real(0.4d0,kind=REAL64)
|
||||
|
||||
contains
|
||||
|
||||
function get_wtime() result(t)
|
||||
#if defined(USE_OMP_GET_WTIME)
|
||||
use omp_lib
|
||||
implicit none
|
||||
real(kind=REAL64) :: t
|
||||
t = omp_get_wtime()
|
||||
#elif defined(USE_CPU_TIME)
|
||||
implicit none
|
||||
real(kind=REAL64) :: t
|
||||
real :: r
|
||||
call cpu_time(r)
|
||||
t = r
|
||||
#else
|
||||
implicit none
|
||||
real(kind=REAL64) :: t
|
||||
integer(kind=INT64) :: c, r
|
||||
call system_clock(count = c, count_rate = r)
|
||||
t = real(c,REAL64) / real(r,REAL64)
|
||||
#endif
|
||||
end function get_wtime
|
||||
|
||||
subroutine parseArguments()
|
||||
use, intrinsic :: ISO_Fortran_env, only: compiler_version, compiler_options
|
||||
#if defined(USE_DOCONCURRENT)
|
||||
use DoConcurrentStream, only: list_devices, set_device
|
||||
#elif defined(USE_ARRAY)
|
||||
use ArrayStream, only: list_devices, set_device
|
||||
#elif defined(USE_OPENMP)
|
||||
use OpenMPStream, only: list_devices, set_device
|
||||
#elif defined(USE_OPENMPWORKSHARE)
|
||||
use OpenMPWorkshareStream, only: list_devices, set_device
|
||||
#elif defined(USE_OPENMPTARGET)
|
||||
use OpenMPTargetStream, only: list_devices, set_device
|
||||
#elif defined(USE_OPENMPTARGETLOOP)
|
||||
use OpenMPTargetLoopStream, only: list_devices, set_device
|
||||
#elif defined(USE_OPENMPTASKLOOP)
|
||||
use OpenMPTaskloopStream, only: list_devices, set_device
|
||||
#elif defined(USE_OPENACC)
|
||||
use OpenACCStream, only: list_devices, set_device
|
||||
#elif defined(USE_OPENACCARRAY)
|
||||
use OpenACCArrayStream, only: list_devices, set_device
|
||||
#elif defined(USE_CUDA)
|
||||
use CUDAStream, only: list_devices, set_device
|
||||
#elif defined(USE_CUDAKERNEL)
|
||||
use CUDAKernelStream, only: list_devices, set_device
|
||||
#elif defined(USE_SEQUENTIAL)
|
||||
use SequentialStream, only: list_devices, set_device
|
||||
#endif
|
||||
implicit none
|
||||
integer :: i, argc
|
||||
integer :: arglen,err,pos(2)
|
||||
character(len=64) :: argtmp
|
||||
argc = command_argument_count()
|
||||
do i=1,argc
|
||||
call get_command_argument(i,argtmp,arglen,err)
|
||||
if (err.eq.0) then
|
||||
!
|
||||
! list devices
|
||||
!
|
||||
pos(1) = index(argtmp,"--list")
|
||||
if (pos(1).eq.1) then
|
||||
call list_devices()
|
||||
stop
|
||||
endif
|
||||
!
|
||||
! set device number
|
||||
!
|
||||
pos(1) = index(argtmp,"--device")
|
||||
if (pos(1).eq.1) then
|
||||
if (i+1.gt.argc) then
|
||||
print*,'You failed to provide a value for ',argtmp
|
||||
stop
|
||||
else
|
||||
call get_command_argument(i+1,argtmp,arglen,err)
|
||||
block
|
||||
integer :: dev
|
||||
read(argtmp,'(i15)') dev
|
||||
call set_device(dev)
|
||||
end block
|
||||
endif
|
||||
cycle
|
||||
endif
|
||||
!
|
||||
! array size
|
||||
!
|
||||
pos(1) = index(argtmp,"--arraysize")
|
||||
pos(2) = index(argtmp,"-s")
|
||||
if (any(pos(:).eq.1) ) then
|
||||
if (i+1.gt.argc) then
|
||||
print*,'You failed to provide a value for ',argtmp
|
||||
else
|
||||
call get_command_argument(i+1,argtmp,arglen,err)
|
||||
block
|
||||
integer(kind=INT64) :: big_size
|
||||
read(argtmp,'(i15)') big_size
|
||||
if (big_size .gt. HUGE(array_size)) then
|
||||
print*,'Array size does not fit into integer:'
|
||||
print*,big_size,'>',HUGE(array_size)
|
||||
print*,'Stop using USE_INT32'
|
||||
stop
|
||||
else
|
||||
array_size = INT(big_size,kind=StreamIntKind)
|
||||
endif
|
||||
end block
|
||||
endif
|
||||
cycle
|
||||
endif
|
||||
!
|
||||
! number of iterations
|
||||
!
|
||||
pos(1) = index(argtmp,"--numtimes")
|
||||
pos(2) = index(argtmp,"-n")
|
||||
if (any(pos(:).eq.1) ) then
|
||||
if (i+1.gt.argc) then
|
||||
print*,'You failed to provide a value for ',argtmp
|
||||
else
|
||||
call get_command_argument(i+1,argtmp,arglen,err)
|
||||
read(argtmp,'(i15)') num_times
|
||||
if (num_times.lt.2) then
|
||||
write(*,'(a)') "Number of times must be 2 or more"
|
||||
stop
|
||||
end if
|
||||
endif
|
||||
cycle
|
||||
endif
|
||||
!
|
||||
! precision
|
||||
!
|
||||
pos(1) = index(argtmp,"--float")
|
||||
if (pos(1).eq.1) then
|
||||
write(*,'(a46,a39)') "Sorry, you have to recompile with -DUSE_FLOAT ", &
|
||||
"to run BabelStream in single precision."
|
||||
stop
|
||||
endif
|
||||
!
|
||||
! selection (All, Triad, Nstream)
|
||||
!
|
||||
pos(1) = index(argtmp,"--triad-only")
|
||||
if (pos(1).eq.1) then
|
||||
selection = 2
|
||||
cycle
|
||||
endif
|
||||
pos(1) = index(argtmp,"--nstream-only")
|
||||
if (pos(1).eq.1) then
|
||||
selection = 3
|
||||
cycle
|
||||
endif
|
||||
!
|
||||
! CSV
|
||||
!
|
||||
pos(1) = index(argtmp,"--csv")
|
||||
if (pos(1).eq.1) then
|
||||
csv = .true.
|
||||
!write(*,'(a39)') "Sorry, CSV support isn't available yet."
|
||||
!stop
|
||||
endif
|
||||
!
|
||||
! units
|
||||
!
|
||||
pos(1) = index(argtmp,"--mibibytes")
|
||||
if (pos(1).eq.1) then
|
||||
mibibytes = .true.
|
||||
cycle
|
||||
endif
|
||||
!
|
||||
! giga/gibi instead of mega/mebi
|
||||
!
|
||||
pos(1) = index(argtmp,"--gigs")
|
||||
if (pos(1).eq.1) then
|
||||
use_gigs = .true.
|
||||
cycle
|
||||
endif
|
||||
!
|
||||
!
|
||||
!
|
||||
pos(1) = index(argtmp,"--compiler-info")
|
||||
if (pos(1).eq.1) then
|
||||
write(*,'(a)') 'Compiler version: ',compiler_version()
|
||||
write(*,'(a)') 'Compiler options: ',compiler_options()
|
||||
stop
|
||||
endif
|
||||
!
|
||||
! help
|
||||
!
|
||||
pos(1) = index(argtmp,"--help")
|
||||
pos(2) = index(argtmp,"-h")
|
||||
if (any(pos(:).eq.1) ) then
|
||||
call get_command_argument(0,argtmp,arglen,err)
|
||||
write(*,'(a7,a,a10)') "Usage: ", trim(argtmp), " [OPTIONS]"
|
||||
write(*,'(a)') "Options:"
|
||||
write(*,'(a)') " -h --help Print the message"
|
||||
write(*,'(a)') " --list List available devices"
|
||||
write(*,'(a)') " --device INDEX Select device at INDEX"
|
||||
write(*,'(a)') " -s --arraysize SIZE Use SIZE elements in the array"
|
||||
write(*,'(a)') " -n --numtimes NUM Run the test NUM times (NUM >= 2)"
|
||||
!write(*,'(a)') " --float Use floats (rather than doubles)"
|
||||
write(*,'(a)') " --triad-only Only run triad"
|
||||
write(*,'(a)') " --nstream-only Only run nstream"
|
||||
write(*,'(a)') " --csv Output as csv table"
|
||||
write(*,'(a)') " --mibibytes Use MiB=2^20 for bandwidth calculation (default MB=10^6)"
|
||||
write(*,'(a)') " --gigs Use GiB=2^30 or GB=10^9 instead of MiB/MB"
|
||||
write(*,'(a)') " --compiler-info Print information about compiler and flags, then exit."
|
||||
stop
|
||||
endif
|
||||
end if
|
||||
end do
|
||||
end subroutine parseArguments
|
||||
|
||||
subroutine run_all(timings, summ)
|
||||
#if defined(USE_DOCONCURRENT)
|
||||
use DoConcurrentStream
|
||||
#elif defined(USE_ARRAY)
|
||||
use ArrayStream
|
||||
#elif defined(USE_OPENMP)
|
||||
use OpenMPStream
|
||||
#elif defined(USE_OPENMPWORKSHARE)
|
||||
use OpenMPWorkshareStream
|
||||
#elif defined(USE_OPENMPTARGET)
|
||||
use OpenMPTargetStream
|
||||
#elif defined(USE_OPENMPTARGETLOOP)
|
||||
use OpenMPTargetLoopStream
|
||||
#elif defined(USE_OPENMPTASKLOOP)
|
||||
use OpenMPTaskloopStream
|
||||
#elif defined(USE_OPENACC)
|
||||
use OpenACCStream
|
||||
#elif defined(USE_OPENACCARRAY)
|
||||
use OpenACCArrayStream
|
||||
#elif defined(USE_CUDA)
|
||||
use CUDAStream
|
||||
#elif defined(USE_CUDAKERNEL)
|
||||
use CUDAKernelStream
|
||||
#elif defined(USE_SEQUENTIAL)
|
||||
use SequentialStream
|
||||
#endif
|
||||
implicit none
|
||||
real(kind=REAL64), intent(inout) :: timings(:,:)
|
||||
real(kind=REAL64), intent(out) :: summ
|
||||
real(kind=REAL64) :: t1, t2
|
||||
integer(kind=StreamIntKind) :: i
|
||||
|
||||
do i=1,num_times
|
||||
|
||||
t1 = get_wtime()
|
||||
call copy()
|
||||
t2 = get_wtime()
|
||||
timings(1,i) = t2-t1
|
||||
|
||||
t1 = get_wtime()
|
||||
call mul(startScalar)
|
||||
t2 = get_wtime()
|
||||
timings(2,i) = t2-t1
|
||||
|
||||
t1 = get_wtime()
|
||||
call add()
|
||||
t2 = get_wtime()
|
||||
timings(3,i) = t2-t1
|
||||
|
||||
t1 = get_wtime()
|
||||
call triad(startScalar)
|
||||
t2 = get_wtime()
|
||||
timings(4,i) = t2-t1
|
||||
|
||||
t1 = get_wtime()
|
||||
summ = dot()
|
||||
t2 = get_wtime()
|
||||
timings(5,i) = t2-t1
|
||||
|
||||
end do
|
||||
|
||||
end subroutine run_all
|
||||
|
||||
subroutine run_triad(timings)
|
||||
#if defined(USE_DOCONCURRENT)
|
||||
use DoConcurrentStream
|
||||
#elif defined(USE_ARRAY)
|
||||
use ArrayStream
|
||||
#elif defined(USE_OPENMP)
|
||||
use OpenMPStream
|
||||
#elif defined(USE_OPENMPWORKSHARE)
|
||||
use OpenMPWorkshareStream
|
||||
#elif defined(USE_OPENMPTARGET)
|
||||
use OpenMPTargetStream
|
||||
#elif defined(USE_OPENMPTARGETLOOP)
|
||||
use OpenMPTargetLoopStream
|
||||
#elif defined(USE_OPENMPTASKLOOP)
|
||||
use OpenMPTaskloopStream
|
||||
#elif defined(USE_OPENACC)
|
||||
use OpenACCStream
|
||||
#elif defined(USE_OPENACCARRAY)
|
||||
use OpenACCArrayStream
|
||||
#elif defined(USE_CUDA)
|
||||
use CUDAStream
|
||||
#elif defined(USE_CUDAKERNEL)
|
||||
use CUDAKernelStream
|
||||
#elif defined(USE_SEQUENTIAL)
|
||||
use SequentialStream
|
||||
#endif
|
||||
implicit none
|
||||
real(kind=REAL64), intent(inout) :: timings(:,:)
|
||||
real(kind=REAL64) :: t1, t2
|
||||
integer(kind=StreamIntKind) :: i
|
||||
|
||||
do i=1,num_times
|
||||
|
||||
t1 = get_wtime()
|
||||
call triad(startScalar)
|
||||
t2 = get_wtime()
|
||||
timings(1,i) = t2-t1
|
||||
|
||||
end do
|
||||
|
||||
end subroutine run_triad
|
||||
|
||||
subroutine run_nstream(timings)
|
||||
#if defined(USE_DOCONCURRENT)
|
||||
use DoConcurrentStream
|
||||
#elif defined(USE_ARRAY)
|
||||
use ArrayStream
|
||||
#elif defined(USE_OPENMP)
|
||||
use OpenMPStream
|
||||
#elif defined(USE_OPENMPWORKSHARE)
|
||||
use OpenMPWorkshareStream
|
||||
#elif defined(USE_OPENMPTARGET)
|
||||
use OpenMPTargetStream
|
||||
#elif defined(USE_OPENMPTARGETLOOP)
|
||||
use OpenMPTargetLoopStream
|
||||
#elif defined(USE_OPENMPTASKLOOP)
|
||||
use OpenMPTaskloopStream
|
||||
#elif defined(USE_OPENACC)
|
||||
use OpenACCStream
|
||||
#elif defined(USE_OPENACCARRAY)
|
||||
use OpenACCArrayStream
|
||||
#elif defined(USE_CUDA)
|
||||
use CUDAStream
|
||||
#elif defined(USE_CUDAKERNEL)
|
||||
use CUDAKernelStream
|
||||
#elif defined(USE_SEQUENTIAL)
|
||||
use SequentialStream
|
||||
#endif
|
||||
implicit none
|
||||
real(kind=REAL64), intent(inout) :: timings(:,:)
|
||||
real(kind=REAL64) :: t1, t2
|
||||
integer(kind=StreamIntKind) :: i
|
||||
|
||||
do i=1,num_times
|
||||
|
||||
t1 = get_wtime()
|
||||
call nstream(startScalar)
|
||||
t2 = get_wtime()
|
||||
timings(1,i) = t2-t1
|
||||
|
||||
end do
|
||||
|
||||
end subroutine run_nstream
|
||||
|
||||
subroutine check_solution(A, B, C, summ)
|
||||
use, intrinsic :: IEEE_Arithmetic, only: IEEE_Is_Normal
|
||||
implicit none
|
||||
real(kind=REAL64), intent(in) :: A(:), B(:), C(:)
|
||||
real(kind=REAL64), intent(in) :: summ
|
||||
|
||||
integer(kind=StreamIntKind) :: i
|
||||
real(kind=REAL64) :: goldA, goldB, goldC, goldSum
|
||||
real(kind=REAL64) :: scalar
|
||||
|
||||
! always use double because of accumulation error
|
||||
real(kind=REAL64) :: errA, errB, errC, errSum, epsi
|
||||
logical :: cleanA, cleanB, cleanC, cleanSum
|
||||
|
||||
goldA = startA
|
||||
goldB = startB
|
||||
goldC = startC
|
||||
goldSum = 0.0d0
|
||||
|
||||
scalar = startScalar
|
||||
|
||||
do i=1,num_times
|
||||
|
||||
if (selection.eq.1) then
|
||||
goldC = goldA
|
||||
goldB = scalar * goldC
|
||||
goldC = goldA + goldB
|
||||
goldA = goldB + scalar * goldC
|
||||
else if (selection.eq.2) then
|
||||
goldA = goldB + scalar * goldC
|
||||
else if (selection.eq.3) then
|
||||
goldA = goldA + goldB + scalar * goldC;
|
||||
endif
|
||||
|
||||
end do
|
||||
|
||||
goldSum = goldA * goldB * array_size
|
||||
|
||||
cleanA = ALL(IEEE_Is_Normal(A))
|
||||
cleanB = ALL(IEEE_Is_Normal(B))
|
||||
cleanC = ALL(IEEE_Is_Normal(C))
|
||||
cleanSum = IEEE_Is_Normal(summ)
|
||||
|
||||
if (.not. cleanA) then
|
||||
write(*,'(a51)') "Validation failed on A. Contains NaA/Inf/Subnormal."
|
||||
end if
|
||||
if (.not. cleanB) then
|
||||
write(*,'(a51)') "Validation failed on B. Contains NaA/Inf/Subnormal."
|
||||
end if
|
||||
if (.not. cleanC) then
|
||||
write(*,'(a51)') "Validation failed on C. Contains NaA/Inf/Subnormal."
|
||||
end if
|
||||
if (.not. cleanSum) then
|
||||
write(*,'(a54,e20.12)') "Validation failed on Sum. Contains NaA/Inf/Subnormal: ",summ
|
||||
end if
|
||||
|
||||
errA = SUM( ABS( A - goldA ) ) / array_size
|
||||
errB = SUM( ABS( B - goldB ) ) / array_size
|
||||
errC = SUM( ABS( C - goldC ) ) / array_size
|
||||
errSum = ABS( (summ - goldSum) / goldSum)
|
||||
|
||||
epsi = epsilon(real(0,kind=StreamRealKind)) * 100.0d0
|
||||
|
||||
if (errA .gt. epsi) then
|
||||
write(*,'(a38,e20.12)') "Validation failed on A. Average error ", errA
|
||||
end if
|
||||
if (errB .gt. epsi) then
|
||||
write(*,'(a38,e20.12)') "Validation failed on B. Average error ", errB
|
||||
end if
|
||||
if (errC .gt. epsi) then
|
||||
write(*,'(a38,e20.12)') "Validation failed on C. Average error ", errC
|
||||
end if
|
||||
|
||||
if (selection.eq.1) then
|
||||
if (errSum .gt. 1.0e-8) then
|
||||
write(*,'(a38,e20.12)') "Validation failed on Sum. Error ", errSum
|
||||
write(*,'(a8,e20.12,a15,e20.12)') "Sum was ",summ, " but should be ", errSum
|
||||
end if
|
||||
endif
|
||||
|
||||
end subroutine check_solution
|
||||
|
||||
end module BabelStreamUtil
|
||||
|
||||
program BabelStream
|
||||
use BabelStreamUtil
|
||||
#if defined(USE_DOCONCURRENT)
|
||||
use DoConcurrentStream
|
||||
#elif defined(USE_ARRAY)
|
||||
use ArrayStream
|
||||
#elif defined(USE_OPENMP)
|
||||
use OpenMPStream
|
||||
#elif defined(USE_OPENMPWORKSHARE)
|
||||
use OpenMPWorkshareStream
|
||||
#elif defined(USE_OPENMPTARGET)
|
||||
use OpenMPTargetStream
|
||||
#elif defined(USE_OPENMPTARGETLOOP)
|
||||
use OpenMPTargetLoopStream
|
||||
#elif defined(USE_OPENMPTASKLOOP)
|
||||
use OpenMPTaskloopStream
|
||||
#elif defined(USE_OPENACC)
|
||||
use OpenACCStream
|
||||
#elif defined(USE_OPENACCARRAY)
|
||||
use OpenACCArrayStream
|
||||
#elif defined(USE_CUDA)
|
||||
use CUDAStream
|
||||
#elif defined(USE_CUDAKERNEL)
|
||||
use CUDAKernelStream
|
||||
#elif defined(USE_SEQUENTIAL)
|
||||
use SequentialStream
|
||||
#endif
|
||||
implicit none
|
||||
integer :: element_size, err
|
||||
real(kind=REAL64) :: scaling
|
||||
character(len=3) :: label
|
||||
real(kind=REAL64), allocatable :: timings(:,:)
|
||||
real(kind=REAL64), allocatable :: h_A(:), h_B(:), h_C(:)
|
||||
real(kind=REAL64) :: summ
|
||||
|
||||
call parseArguments()
|
||||
|
||||
element_size = storage_size(real(0,kind=StreamRealKind)) / 8
|
||||
|
||||
if (mibibytes) then
|
||||
if (use_gigs) then
|
||||
scaling = 2.0d0**(-30)
|
||||
label = "GiB"
|
||||
else
|
||||
scaling = 2.0d0**(-20)
|
||||
label = "MiB"
|
||||
endif
|
||||
else
|
||||
if (use_gigs) then
|
||||
scaling = 1.0d-9
|
||||
label = "GB"
|
||||
else
|
||||
scaling = 1.0d-6
|
||||
label = "MB"
|
||||
endif
|
||||
endif
|
||||
|
||||
if (.not.csv) then
|
||||
|
||||
write(*,'(a)') "BabelStream Fortran"
|
||||
write(*,'(a9,f4.1)') "Version: ", VERSION_STRING
|
||||
write(*,'(a16,a)') "Implementation: ", implementation_name
|
||||
|
||||
block
|
||||
character(len=32) :: printout
|
||||
write(printout,'(i9,1x,a5)') num_times,'times'
|
||||
write(*,'(a16,a)') 'Running kernels ',ADJUSTL(printout)
|
||||
end block
|
||||
write(*,'(a11,a6)') 'Precision: ',ADJUSTL(StreamRealName)
|
||||
|
||||
write(*,'(a12,f9.1,a3)') 'Array size: ',1.0d0 * element_size * (array_size * scaling), label
|
||||
write(*,'(a12,f9.1,a3)') 'Total size: ',3.0d0 * element_size * (array_size * scaling), label
|
||||
|
||||
endif ! csv
|
||||
|
||||
allocate( timings(5,num_times) )
|
||||
|
||||
call alloc(array_size)
|
||||
|
||||
call init_arrays(startA, startB, startC)
|
||||
summ = 0.0d0
|
||||
|
||||
timings = -1.0d0
|
||||
if (selection.eq.1) then
|
||||
call run_all(timings, summ)
|
||||
else if (selection.eq.2) then
|
||||
call run_triad(timings)
|
||||
else if (selection.eq.3) then
|
||||
call run_nstream(timings)
|
||||
endif
|
||||
|
||||
allocate( h_A(1:array_size), h_B(1:array_size), h_C(1:array_size), stat=err)
|
||||
if (err .ne. 0) then
|
||||
write(*,'(a20,i3)') 'allocate returned ',err
|
||||
stop 1
|
||||
endif
|
||||
|
||||
call read_arrays(h_A, h_B, h_C)
|
||||
call check_solution(h_A, h_B, h_C, summ)
|
||||
|
||||
block
|
||||
character(len=20) :: printout(8)
|
||||
real(kind=REAL64) :: tmin,tmax,tavg,nbytes
|
||||
|
||||
if (csv) then
|
||||
write(*,'(a,a1)',advance='no') 'function', csv_sep
|
||||
write(*,'(a,a1)',advance='no') 'num_times', csv_sep
|
||||
write(*,'(a,a1)',advance='no') 'n_elements',csv_sep
|
||||
write(*,'(a,a1)',advance='no') 'sizeof', csv_sep
|
||||
if (mibibytes) then
|
||||
write(*,'(a,a1)',advance='no') 'max_mibytes_per_sec',csv_sep
|
||||
else
|
||||
write(*,'(a,a1)',advance='no') 'max_mbytes_per_sec', csv_sep
|
||||
endif
|
||||
write(*,'(a,a1)',advance='no') 'min_runtime',csv_sep
|
||||
write(*,'(a,a1)',advance='no') 'max_runtime',csv_sep
|
||||
write(*,'(a,a1)',advance='yes') 'avg_runtime'
|
||||
else
|
||||
write(printout(1),'(a8)') 'Function'
|
||||
write(printout(2),'(a3,a8)') TRIM(label),'ytes/sec'
|
||||
write(printout(3),'(a9)') 'Min (sec)'
|
||||
write(printout(4),'(a3)') 'Max'
|
||||
write(printout(5),'(a7)') 'Average'
|
||||
write(*,'(5a12)') ADJUSTL(printout(1:5))
|
||||
endif ! csv
|
||||
|
||||
if (selection.eq.1) then
|
||||
block
|
||||
integer, parameter :: sizes(5) = [2,2,3,3,2]
|
||||
character(len=5), parameter :: labels(5) = ["Copy ", "Mul ", "Add ", "Triad", "Dot "]
|
||||
integer :: i
|
||||
do i=1,5
|
||||
tmin = MINVAL(timings(i,2:num_times))
|
||||
tmax = MAXVAL(timings(i,2:num_times))
|
||||
tavg = SUM(timings(i,2:num_times)) / (num_times-1)
|
||||
nbytes = element_size * REAL(array_size,kind=REAL64) * sizes(i)
|
||||
write(printout(1),'(a)') labels(i)
|
||||
if (csv) then
|
||||
write(printout(2),'(i20)') num_times
|
||||
write(printout(3),'(i20)') array_size
|
||||
write(printout(4),'(i20)') element_size
|
||||
write(printout(5),'(i20)') INT(scaling*nbytes/tmin)
|
||||
write(printout(6),'(f20.8)') tmin
|
||||
write(printout(7),'(f20.8)') tmax
|
||||
write(printout(8),'(f20.8)') tavg
|
||||
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(1))),csv_sep
|
||||
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(2))),csv_sep
|
||||
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(3))),csv_sep
|
||||
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(4))),csv_sep
|
||||
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(5))),csv_sep
|
||||
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(6))),csv_sep
|
||||
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(7))),csv_sep
|
||||
write(*,'(a,a1)',advance='yes') TRIM(ADJUSTL(printout(8)))
|
||||
else
|
||||
write(printout(2),'(f12.3)') scaling*nbytes/tmin
|
||||
write(printout(3),'(f12.5)') tmin
|
||||
write(printout(4),'(f12.5)') tmax
|
||||
write(printout(5),'(f12.5)') tavg
|
||||
write(*,'(5a12)') ADJUSTL(printout(1:5))
|
||||
endif
|
||||
enddo
|
||||
end block
|
||||
else if ((selection.eq.2).or.(selection.eq.3)) then
|
||||
tmin = MINVAL(timings(1,2:num_times))
|
||||
tmax = MAXVAL(timings(1,2:num_times))
|
||||
tavg = SUM(timings(1,2:num_times)) / (num_times-1)
|
||||
if (selection.eq.2) then
|
||||
nbytes = element_size * REAL(array_size,kind=REAL64) * 3
|
||||
write(printout(1),'(a12)') "Triad"
|
||||
else if (selection.eq.3) then
|
||||
nbytes = element_size * REAL(array_size,kind=REAL64) * 4
|
||||
write(printout(1),'(a12)') "Nstream"
|
||||
endif
|
||||
if (csv) then
|
||||
write(printout(2),'(i20)') num_times
|
||||
write(printout(3),'(i20)') array_size
|
||||
write(printout(4),'(i20)') element_size
|
||||
write(printout(5),'(i20)') INT(scaling*nbytes/tmin)
|
||||
write(printout(6),'(f20.8)') tmin
|
||||
write(printout(7),'(f20.8)') tmax
|
||||
write(printout(8),'(f20.8)') tavg
|
||||
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(1))),csv_sep
|
||||
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(2))),csv_sep
|
||||
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(3))),csv_sep
|
||||
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(4))),csv_sep
|
||||
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(5))),csv_sep
|
||||
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(6))),csv_sep
|
||||
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(7))),csv_sep
|
||||
write(*,'(a,a1)',advance='yes') TRIM(ADJUSTL(printout(8)))
|
||||
else
|
||||
write(printout(2),'(f12.3)') scaling*nbytes/tmin
|
||||
write(printout(3),'(f12.5)') tmin
|
||||
write(printout(4),'(f12.5)') tmax
|
||||
write(printout(5),'(f12.5)') tavg
|
||||
write(*,'(5a12)') ADJUSTL(printout(1:5))
|
||||
endif
|
||||
endif
|
||||
end block
|
||||
|
||||
call dealloc()
|
||||
|
||||
end program BabelStream
|
||||
25
src/fortran/make.inc.amd
Normal file
25
src/fortran/make.inc.amd
Normal file
@ -0,0 +1,25 @@
|
||||
FC := /opt/rocm/llvm/bin/flang
|
||||
FC := /global/u1/j/jhammond/AMD/aocc-compiler-3.2.0/bin/flang
|
||||
FCFLAGS := -std=f2018 -O3
|
||||
FCFLAGS += -Wall -Wno-unused-variable
|
||||
|
||||
ifdef MARCH
|
||||
FCFLAGS += -march=$(MARCH)
|
||||
else
|
||||
FCFLAGS += -march=native
|
||||
endif
|
||||
|
||||
DOCONCURRENT_FLAG = -fopenmp # libomp.so required
|
||||
ARRAY_FLAG = -fopenmp # libomp.so required
|
||||
OPENMP_FLAG = -fopenmp
|
||||
#OPENMP_FLAG += -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx908
|
||||
OPENACC_FLAG = -fopenacc
|
||||
CUDA_FLAG =
|
||||
SEQUENTIAL_FLAG =
|
||||
|
||||
ifeq ($(IMPLEMENTATION),CUDA)
|
||||
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
|
||||
endif
|
||||
ifeq ($(IMPLEMENTATION),CUDAKernels)
|
||||
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
|
||||
endif
|
||||
39
src/fortran/make.inc.arm
Normal file
39
src/fortran/make.inc.arm
Normal file
@ -0,0 +1,39 @@
|
||||
FC = armflang
|
||||
FCFLAGS = -std=f2018 -O3
|
||||
FCFLAGS += -Wall -Wno-unused-variable
|
||||
|
||||
# MARCH=neoverse-v1,neoverse-n1,icelake-server,znver3,cortex-a78
|
||||
ARCH=$(shell uname -m)
|
||||
ifeq ($(ARCH),aarch64)
|
||||
ifdef MCPU
|
||||
FCFLAGS += -mcpu=$(MCPU)
|
||||
else
|
||||
FCFLAGS += -mcpu=native
|
||||
endif
|
||||
else
|
||||
ifdef MARCH
|
||||
FCFLAGS += -march=$(MARCH)
|
||||
else
|
||||
FCFLAGS += -march=native
|
||||
endif
|
||||
endif
|
||||
|
||||
DOCONCURRENT_FLAG = -fopenmp
|
||||
ARRAY_FLAG = -fopenmp
|
||||
OPENMP_FLAG = -fopenmp
|
||||
OPENACC_FLAG = -fopenacc
|
||||
CUDA_FLAG =
|
||||
SEQUENTIAL_FLAG =
|
||||
|
||||
ifeq ($(IMPLEMENTATION),OpenACC)
|
||||
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
|
||||
endif
|
||||
ifeq ($(IMPLEMENTATION),OpenACCArray)
|
||||
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
|
||||
endif
|
||||
ifeq ($(IMPLEMENTATION),CUDA)
|
||||
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
|
||||
endif
|
||||
ifeq ($(IMPLEMENTATION),CUDAKernels)
|
||||
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
|
||||
endif
|
||||
18
src/fortran/make.inc.cray
Normal file
18
src/fortran/make.inc.cray
Normal file
@ -0,0 +1,18 @@
|
||||
FC := ftn
|
||||
FCFLAGS = -e F -O3
|
||||
|
||||
DOCONCURRENT_FLAG = -h thread_do_concurrent -DCRAY_THREAD_DOCONCURRENT
|
||||
ARRAY_FLAG = -h autothread
|
||||
OPENMP_FLAG = -h omp
|
||||
OPENACC_FLAG = -h acc
|
||||
# CPU only
|
||||
OPENACC_FLAG += -h omp
|
||||
CUDA_FLAG =
|
||||
SEQUENTIAL_FLAG =
|
||||
|
||||
ifeq ($(IMPLEMENTATION),CUDA)
|
||||
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
|
||||
endif
|
||||
ifeq ($(IMPLEMENTATION),CUDAKernels)
|
||||
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
|
||||
endif
|
||||
21
src/fortran/make.inc.fj
Normal file
21
src/fortran/make.inc.fj
Normal file
@ -0,0 +1,21 @@
|
||||
FC := frt
|
||||
FCFLAGS = -X08 -Kfast -KA64FX -KSVE -KARMV8_3_A -Kzfill=100 -Kprefetch_sequential=soft -Kprefetch_line=8 -Kprefetch_line_L2=16 -Koptmsg=2 -Keval -DUSE_OMP_GET_WTIME=1 # FJ Fortran system_clock is low resolution
|
||||
|
||||
DOCONCURRENT_FLAG = -Kparallel,reduction -DNOTSHARED
|
||||
ARRAY_FLAG = -Kparallel,reduction
|
||||
OPENMP_FLAG = -fopenmp
|
||||
OPENACC_FLAG =
|
||||
# CPU only
|
||||
OPENACC_FLAG +=
|
||||
CUDA_FLAG =
|
||||
SEQUENTIAL_FLAG =
|
||||
|
||||
ifeq ($(IMPLEMENTATION),OPENACC)
|
||||
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
|
||||
endif
|
||||
ifeq ($(IMPLEMENTATION),CUDA)
|
||||
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
|
||||
endif
|
||||
ifeq ($(IMPLEMENTATION),CUDAKernels)
|
||||
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
|
||||
endif
|
||||
33
src/fortran/make.inc.gcc
Normal file
33
src/fortran/make.inc.gcc
Normal file
@ -0,0 +1,33 @@
|
||||
FC = gfortran
|
||||
FCFLAGS = -std=f2018 -O3
|
||||
FCFLAGS += -Wall -Wno-unused-dummy-argument -Wno-unused-variable
|
||||
|
||||
# MARCH=neoverse-v1,neoverse-n1,icelake-server,znver3,cortex-a78ae
|
||||
ARCH=$(shell uname -m)
|
||||
ifeq ($(ARCH),aarch64)
|
||||
ifdef MCPU
|
||||
FCFLAGS += -mcpu=$(MCPU)
|
||||
else
|
||||
FCFLAGS += -mcpu=native
|
||||
endif
|
||||
else
|
||||
ifdef MARCH
|
||||
FCFLAGS += -march=$(MARCH)
|
||||
else
|
||||
FCFLAGS += -march=native
|
||||
endif
|
||||
endif
|
||||
|
||||
DOCONCURRENT_FLAG = -ftree-parallelize-loops=4
|
||||
ARRAY_FLAG =
|
||||
OPENMP_FLAG = -fopenmp
|
||||
OPENACC_FLAG = -fopenacc
|
||||
CUDA_FLAG =
|
||||
SEQUENTIAL_FLAG =
|
||||
|
||||
ifeq ($(IMPLEMENTATION),CUDA)
|
||||
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
|
||||
endif
|
||||
ifeq ($(IMPLEMENTATION),CUDAKernels)
|
||||
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
|
||||
endif
|
||||
70
src/fortran/make.inc.nvhpc
Normal file
70
src/fortran/make.inc.nvhpc
Normal file
@ -0,0 +1,70 @@
|
||||
FC := nvfortran
|
||||
#FCFLAGS := -O3 -Minform=inform -Minfo=all
|
||||
FCFLAGS := -O3 -Minform=warn
|
||||
|
||||
#TARGET=gpu
|
||||
TARGET=multicore
|
||||
|
||||
NVARCH=$(shell which nvidia-smi > /dev/null && nvidia-smi -q | grep "Product Architecture")
|
||||
ifeq ($(findstring Ampere,$(NVARCH)),Ampere)
|
||||
$(info Ampere detected)
|
||||
GPU = cc80
|
||||
endif
|
||||
ifeq ($(findstring Turing,$(NVARCH)),Turing)
|
||||
$(info Turing detected)
|
||||
GPU = cc75
|
||||
endif
|
||||
ifeq ($(findstring Volta,$(NVARCH)),Volta)
|
||||
$(info Volta detected)
|
||||
GPU = cc70
|
||||
endif
|
||||
ifeq ($(findstring Pascal,$(NVARCH)),Pascal)
|
||||
$(info Pascal detected)
|
||||
GPU = cc60,cc61
|
||||
endif
|
||||
ifeq ($(shell which jetson_clocks > /dev/null && echo 1),1)
|
||||
$(info Jetson AGX Orin detected)
|
||||
GPU = ccn87,cc86
|
||||
# figure out Xavier later
|
||||
#GPU = cc72
|
||||
endif
|
||||
ifeq ($(GPU),)
|
||||
$(error Your GPU architecture could not be detected. Set it manually.)
|
||||
endif
|
||||
GPUFLAG = -gpu=$(GPU)
|
||||
|
||||
# MARCH=neoverse-v1,neoverse-n1,zen3
|
||||
ARCH=$(shell uname -m)
|
||||
ifdef MARCH
|
||||
ifeq ($(ARCH),aarch64)
|
||||
ifeq ($(MARCH),neoverse-n1)
|
||||
FCFLAGS += -tp=$(MARCH)
|
||||
else
|
||||
ifeq ($(MARCH),neoverse-v1)
|
||||
FCFLAGS += -tp=$(MARCH)
|
||||
else
|
||||
FCFLAGS += -tp=native
|
||||
endif
|
||||
endif
|
||||
else
|
||||
FCFLAGS += -tp=$(MARCH)
|
||||
endif
|
||||
else
|
||||
FCFLAGS += -tp=native
|
||||
endif
|
||||
|
||||
# this is to allow apples-to-apples comparison with DC in non-DC GPU impls
|
||||
# set exactly one of these!
|
||||
#MANAGED = -DUSE_MANAGED -gpu=managed
|
||||
#DEVICE = -DUSE_DEVICE -cuda -gpu=nomanaged
|
||||
|
||||
DOCONCURRENT_FLAG = $(GPUFLAG) -stdpar=$(TARGET) $(DEVICE)
|
||||
ARRAY_FLAG = $(GPUFLAG) -stdpar=$(TARGET) $(MANAGED)
|
||||
OPENMP_FLAG = $(GPUFLAG) -mp=$(TARGET) $(MANAGED)
|
||||
OPENACC_FLAG = $(GPUFLAG) -acc=$(TARGET) $(MANAGED)
|
||||
CUDA_FLAG = $(GPUFLAG) -cuda -acc=gpu $(MANAGED)
|
||||
SEQUENTIAL_FLAG =
|
||||
|
||||
ifeq ($(IMPLEMENTATION),OpenMPTaskloop)
|
||||
$(error IMPLEMENTATION=OpenMPTaskloop is not supported by this compiler.)
|
||||
endif
|
||||
32
src/fortran/make.inc.oneapi
Normal file
32
src/fortran/make.inc.oneapi
Normal file
@ -0,0 +1,32 @@
|
||||
FC := ifx
|
||||
FCFLAGS = -std18
|
||||
FCFLAGS += -Ofast -xHOST
|
||||
FCFLAGS += -qopt-zmm-usage=low
|
||||
|
||||
ifeq ($(FC),ifort)
|
||||
FCFLAGS += -qopt-streaming-stores=always
|
||||
PARALLEL = -parallel
|
||||
endif
|
||||
|
||||
DOCONCURRENT_FLAG = -qopenmp $(PARALLEL)
|
||||
ARRAY_FLAG = -qopenmp $(PARALLEL)
|
||||
OPENMP_FLAG = -qopenmp
|
||||
ifeq ($(FC),ifx)
|
||||
OPENMP_FLAG += -fopenmp-targets=spir64 -DUSE_FLOAT=1
|
||||
endif
|
||||
OPENACC_FLAG =
|
||||
CUDA_FLAG =
|
||||
SEQUENTIAL_FLAG =
|
||||
|
||||
ifeq ($(IMPLEMENTATION),OpenACC)
|
||||
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
|
||||
endif
|
||||
ifeq ($(IMPLEMENTATION),OpenACCArray)
|
||||
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
|
||||
endif
|
||||
ifeq ($(IMPLEMENTATION),CUDA)
|
||||
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
|
||||
endif
|
||||
ifeq ($(IMPLEMENTATION),CUDAKernels)
|
||||
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
|
||||
endif
|
||||
35
src/fortran/run.sh
Executable file
35
src/fortran/run.sh
Executable file
@ -0,0 +1,35 @@
|
||||
#!/bin/bash
|
||||
|
||||
cat ./run.sh
|
||||
|
||||
if [ `uname -s` == Darwin ] ; then
|
||||
NUM_HWTHREADS=`sysctl -n hw.ncpu`
|
||||
MEMORY_BYTES=`sysctl -n hw.memsize`
|
||||
else
|
||||
NUM_HWTHREADS=`nproc`
|
||||
MEMORY_KILOS=`grep MemTotal /proc/meminfo | awk '{print $2}'`
|
||||
fi
|
||||
|
||||
M=128
|
||||
|
||||
export OMP_NUM_THREADS=8
|
||||
export OMP_PROC_BIND=close
|
||||
export OMP_PLACES=threads
|
||||
|
||||
export ACC_NUM_CORES=${OMP_NUM_THREADS}
|
||||
|
||||
AFFCONTROL="numactl -N 0 -m 0 -C `seq -s "," 0 $((${OMP_NUM_THREADS}-1))`"
|
||||
|
||||
for compiler in gcc nvhpc cray oneapi arm amd fj ; do
|
||||
#if [ "x$compiler" == "xgcc" ] ; then
|
||||
# export LD_PRELOAD=/usr/lib/gcc/aarch64-linux-gnu/11/libgomp.so
|
||||
#fi
|
||||
for implementation in OpenMP OpenMPTaskloop OpenMPWorkshare DoConcurrent Array OpenACC OpenACCArray CUDA CUDAKernel ; do
|
||||
if [ -f BabelStream.${compiler}.${implementation} ] ; then
|
||||
echo "BabelStream.${compiler}.${implementation}"
|
||||
ldd BabelStream.${compiler}.${implementation}
|
||||
time $AFFCONTROL \
|
||||
./BabelStream.${compiler}.${implementation} -s $((1024*1024*${M}))
|
||||
fi
|
||||
done
|
||||
done
|
||||
@ -9,7 +9,7 @@
|
||||
#include "hip/hip_runtime.h"
|
||||
|
||||
#define TBSIZE 1024
|
||||
#define DOT_NUM_BLOCKS 256
|
||||
|
||||
|
||||
void check_error(void)
|
||||
{
|
||||
@ -47,9 +47,15 @@ HIPStream<T>::HIPStream(const int ARRAY_SIZE, const int device_index)
|
||||
std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl;
|
||||
|
||||
array_size = ARRAY_SIZE;
|
||||
// Round dot_num_blocks up to next multiple of (TBSIZE * dot_elements_per_lane)
|
||||
dot_num_blocks = (array_size + (TBSIZE * dot_elements_per_lane - 1)) / (TBSIZE * dot_elements_per_lane);
|
||||
|
||||
// Allocate the host array for partial sums for dot kernels
|
||||
sums = (T*)malloc(sizeof(T) * DOT_NUM_BLOCKS);
|
||||
// Allocate the host array for partial sums for dot kernels using hipHostMalloc.
|
||||
// This creates an array on the host which is visible to the device. However, it requires
|
||||
// synchronization (e.g. hipDeviceSynchronize) for the result to be available on the host
|
||||
// after it has been passed through to a kernel.
|
||||
hipHostMalloc(&sums, sizeof(T) * dot_num_blocks, hipHostMallocNonCoherent);
|
||||
check_error();
|
||||
|
||||
// Check buffers fit on the device
|
||||
hipDeviceProp_t props;
|
||||
@ -64,15 +70,14 @@ HIPStream<T>::HIPStream(const int ARRAY_SIZE, const int device_index)
|
||||
check_error();
|
||||
hipMalloc(&d_c, ARRAY_SIZE*sizeof(T));
|
||||
check_error();
|
||||
hipMalloc(&d_sum, DOT_NUM_BLOCKS*sizeof(T));
|
||||
check_error();
|
||||
}
|
||||
|
||||
|
||||
template <class T>
|
||||
HIPStream<T>::~HIPStream()
|
||||
{
|
||||
free(sums);
|
||||
hipHostFree(sums);
|
||||
check_error();
|
||||
|
||||
hipFree(d_a);
|
||||
check_error();
|
||||
@ -80,15 +85,13 @@ HIPStream<T>::~HIPStream()
|
||||
check_error();
|
||||
hipFree(d_c);
|
||||
check_error();
|
||||
hipFree(d_sum);
|
||||
check_error();
|
||||
}
|
||||
|
||||
|
||||
template <typename T>
|
||||
__global__ void init_kernel(T * a, T * b, T * c, T initA, T initB, T initC)
|
||||
{
|
||||
const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
|
||||
const size_t i = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
a[i] = initA;
|
||||
b[i] = initB;
|
||||
c[i] = initC;
|
||||
@ -97,7 +100,7 @@ __global__ void init_kernel(T * a, T * b, T * c, T initA, T initB, T initC)
|
||||
template <class T>
|
||||
void HIPStream<T>::init_arrays(T initA, T initB, T initC)
|
||||
{
|
||||
hipLaunchKernelGGL(HIP_KERNEL_NAME(init_kernel<T>), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_a, d_b, d_c, initA, initB, initC);
|
||||
init_kernel<T><<<dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0>>>(d_a, d_b, d_c, initA, initB, initC);
|
||||
check_error();
|
||||
hipDeviceSynchronize();
|
||||
check_error();
|
||||
@ -115,18 +118,17 @@ void HIPStream<T>::read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector
|
||||
check_error();
|
||||
}
|
||||
|
||||
|
||||
template <typename T>
|
||||
__global__ void copy_kernel(const T * a, T * c)
|
||||
{
|
||||
const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
|
||||
const size_t i = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
c[i] = a[i];
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void HIPStream<T>::copy()
|
||||
{
|
||||
hipLaunchKernelGGL(HIP_KERNEL_NAME(copy_kernel<T>), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_a, d_c);
|
||||
copy_kernel<T><<<dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0>>>(d_a, d_c);
|
||||
check_error();
|
||||
hipDeviceSynchronize();
|
||||
check_error();
|
||||
@ -136,14 +138,14 @@ template <typename T>
|
||||
__global__ void mul_kernel(T * b, const T * c)
|
||||
{
|
||||
const T scalar = startScalar;
|
||||
const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
|
||||
const size_t i = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
b[i] = scalar * c[i];
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void HIPStream<T>::mul()
|
||||
{
|
||||
hipLaunchKernelGGL(HIP_KERNEL_NAME(mul_kernel<T>), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_b, d_c);
|
||||
mul_kernel<T><<<dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0>>>(d_b, d_c);
|
||||
check_error();
|
||||
hipDeviceSynchronize();
|
||||
check_error();
|
||||
@ -152,14 +154,14 @@ void HIPStream<T>::mul()
|
||||
template <typename T>
|
||||
__global__ void add_kernel(const T * a, const T * b, T * c)
|
||||
{
|
||||
const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
|
||||
const size_t i = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
c[i] = a[i] + b[i];
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void HIPStream<T>::add()
|
||||
{
|
||||
hipLaunchKernelGGL(HIP_KERNEL_NAME(add_kernel<T>), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_a, d_b, d_c);
|
||||
add_kernel<T><<<dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0>>>(d_a, d_b, d_c);
|
||||
check_error();
|
||||
hipDeviceSynchronize();
|
||||
check_error();
|
||||
@ -169,14 +171,14 @@ template <typename T>
|
||||
__global__ void triad_kernel(T * a, const T * b, const T * c)
|
||||
{
|
||||
const T scalar = startScalar;
|
||||
const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
|
||||
const size_t i = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
a[i] = b[i] + scalar * c[i];
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void HIPStream<T>::triad()
|
||||
{
|
||||
hipLaunchKernelGGL(HIP_KERNEL_NAME(triad_kernel<T>), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_a, d_b, d_c);
|
||||
triad_kernel<T><<<dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0>>>(d_a, d_b, d_c);
|
||||
check_error();
|
||||
hipDeviceSynchronize();
|
||||
check_error();
|
||||
@ -186,32 +188,32 @@ template <typename T>
|
||||
__global__ void nstream_kernel(T * a, const T * b, const T * c)
|
||||
{
|
||||
const T scalar = startScalar;
|
||||
const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
|
||||
const size_t i = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
a[i] += b[i] + scalar * c[i];
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void HIPStream<T>::nstream()
|
||||
{
|
||||
hipLaunchKernelGGL(HIP_KERNEL_NAME(nstream_kernel<T>), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_a, d_b, d_c);
|
||||
nstream_kernel<T><<<dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0>>>(d_a, d_b, d_c);
|
||||
check_error();
|
||||
hipDeviceSynchronize();
|
||||
check_error();
|
||||
}
|
||||
|
||||
template <class T>
|
||||
template <typename T>
|
||||
__global__ void dot_kernel(const T * a, const T * b, T * sum, int array_size)
|
||||
{
|
||||
__shared__ T tb_sum[TBSIZE];
|
||||
|
||||
int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
|
||||
const size_t local_i = hipThreadIdx_x;
|
||||
const size_t local_i = threadIdx.x;
|
||||
size_t i = blockDim.x * blockIdx.x + local_i;
|
||||
|
||||
tb_sum[local_i] = 0.0;
|
||||
for (; i < array_size; i += hipBlockDim_x*hipGridDim_x)
|
||||
for (; i < array_size; i += blockDim.x*gridDim.x)
|
||||
tb_sum[local_i] += a[i] * b[i];
|
||||
|
||||
for (int offset = hipBlockDim_x / 2; offset > 0; offset /= 2)
|
||||
for (size_t offset = blockDim.x / 2; offset > 0; offset /= 2)
|
||||
{
|
||||
__syncthreads();
|
||||
if (local_i < offset)
|
||||
@ -221,20 +223,19 @@ __global__ void dot_kernel(const T * a, const T * b, T * sum, int array_size)
|
||||
}
|
||||
|
||||
if (local_i == 0)
|
||||
sum[hipBlockIdx_x] = tb_sum[local_i];
|
||||
sum[blockIdx.x] = tb_sum[local_i];
|
||||
}
|
||||
|
||||
template <class T>
|
||||
T HIPStream<T>::dot()
|
||||
{
|
||||
hipLaunchKernelGGL(HIP_KERNEL_NAME(dot_kernel<T>), dim3(DOT_NUM_BLOCKS), dim3(TBSIZE), 0, 0, d_a, d_b, d_sum, array_size);
|
||||
dot_kernel<T><<<dim3(dot_num_blocks), dim3(TBSIZE), 0, 0>>>(d_a, d_b, sums, array_size);
|
||||
check_error();
|
||||
|
||||
hipMemcpy(sums, d_sum, DOT_NUM_BLOCKS*sizeof(T), hipMemcpyDeviceToHost);
|
||||
hipDeviceSynchronize();
|
||||
check_error();
|
||||
|
||||
T sum = 0.0;
|
||||
for (int i = 0; i < DOT_NUM_BLOCKS; i++)
|
||||
for (int i = 0; i < dot_num_blocks; i++)
|
||||
sum += sums[i];
|
||||
|
||||
return sum;
|
||||
|
||||
@ -14,13 +14,31 @@
|
||||
#include "Stream.h"
|
||||
|
||||
#define IMPLEMENTATION_STRING "HIP"
|
||||
#define DOT_READ_DWORDS_PER_LANE 4
|
||||
|
||||
|
||||
template <class T>
|
||||
class HIPStream : public Stream<T>
|
||||
{
|
||||
// Make sure that either:
|
||||
// DOT_READ_DWORDS_PER_LANE is less than sizeof(T), in which case we default to 1 element
|
||||
// or
|
||||
// DOT_READ_DWORDS_PER_LANE is divisible by sizeof(T)
|
||||
static_assert((DOT_READ_DWORDS_PER_LANE * sizeof(unsigned int) < sizeof(T)) ||
|
||||
(DOT_READ_DWORDS_PER_LANE * sizeof(unsigned int) % sizeof(T) == 0),
|
||||
"DOT_READ_DWORDS_PER_LANE not divisible by sizeof(element_type)");
|
||||
|
||||
// Take into account the datatype size
|
||||
// That is, for 4 DOT_READ_DWORDS_PER_LANE, this is 2 FP64 elements
|
||||
// and 4 FP32 elements
|
||||
static constexpr unsigned int dot_elements_per_lane{
|
||||
(DOT_READ_DWORDS_PER_LANE * sizeof(unsigned int)) < sizeof(T) ? 1 : (
|
||||
DOT_READ_DWORDS_PER_LANE * sizeof(unsigned int) / sizeof(T))};
|
||||
|
||||
protected:
|
||||
// Size of arrays
|
||||
int array_size;
|
||||
int dot_num_blocks;
|
||||
|
||||
// Host array for partial sums for dot kernel
|
||||
T *sums;
|
||||
@ -29,7 +47,6 @@ class HIPStream : public Stream<T>
|
||||
T *d_a;
|
||||
T *d_b;
|
||||
T *d_c;
|
||||
T *d_sum;
|
||||
|
||||
|
||||
public:
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith,
|
||||
// Copyright (c) 2015-23 Tom Deakin, Simon McIntosh-Smith, Wei-Chen (Tom) Lin
|
||||
// University of Bristol HPC
|
||||
//
|
||||
// For full license terms please see the LICENSE file distributed with this
|
||||
|
||||
@ -10,7 +10,6 @@
|
||||
#include <stdexcept>
|
||||
|
||||
#include <Kokkos_Core.hpp>
|
||||
|
||||
#include "Stream.h"
|
||||
|
||||
#define IMPLEMENTATION_STRING "Kokkos"
|
||||
|
||||
@ -1,32 +1,38 @@
|
||||
|
||||
register_flag_optional(CMAKE_CXX_COMPILER
|
||||
"Any CXX compiler that is supported by CMake detection and RAJA.
|
||||
See https://github.com/kokkos/kokkos#primary-tested-compilers-on-x86-are"
|
||||
"c++")
|
||||
|
||||
register_flag_required(KOKKOS_IN_TREE
|
||||
register_flag_optional(KOKKOS_IN_TREE
|
||||
"Absolute path to the *source* distribution directory of Kokkos.
|
||||
Remember to append Kokkos specific flags as well, for example:
|
||||
|
||||
-DKOKKOS_IN_TREE=... -DKokkos_ENABLE_OPENMP=ON -DKokkos_ARCH_ZEN=ON ...
|
||||
See https://github.com/kokkos/kokkos/blob/master/BUILD.md for all available options" "")
|
||||
|
||||
See https://github.com/kokkos/kokkos/blob/master/BUILD.md for all available options")
|
||||
register_flag_optional(KOKKOS_IN_PACKAGE
|
||||
"Absolute path to package R-Path containing Kokkos libs.
|
||||
Use this instead of KOKKOS_IN_TREE if Kokkos is from a package manager like Spack." "")
|
||||
|
||||
# compiler vendor and arch specific flags
|
||||
set(KOKKOS_FLAGS_CPU_INTEL -qopt-streaming-stores=always)
|
||||
|
||||
macro(setup)
|
||||
|
||||
set(CMAKE_CXX_STANDARD 17)
|
||||
set(CMAKE_CXX_STANDARD 17) # Kokkos 4+ requires CXX >= 17
|
||||
cmake_policy(SET CMP0074 NEW) #see https://github.com/kokkos/kokkos/blob/master/BUILD.md
|
||||
|
||||
message(STATUS "Building using in-tree Kokkos source at `${KOKKOS_IN_TREE}`")
|
||||
|
||||
if (EXISTS "${KOKKOS_IN_TREE}")
|
||||
message(STATUS "Build using in-tree Kokkos source at `${KOKKOS_IN_TREE}`")
|
||||
add_subdirectory(${KOKKOS_IN_TREE} ${CMAKE_BINARY_DIR}/kokkos)
|
||||
register_link_library(Kokkos::kokkos)
|
||||
else ()
|
||||
message(FATAL_ERROR "`${KOKKOS_IN_TREE}` does not exist")
|
||||
elseif (EXISTS "${KOKKOS_IN_PACKAGE}")
|
||||
message(STATUS "Build using packaged Kokkos at `${KOKKOS_IN_PACKAGE}`")
|
||||
set (Kokkos_DIR "${KOKKOS_IN_PACKAGE}/lib64/cmake/Kokkos")
|
||||
find_package(Kokkos REQUIRED)
|
||||
register_link_library(Kokkos::kokkos)
|
||||
else()
|
||||
message(FATAL_ERROR "Neither `KOKKOS_IN_TREE`, or `KOKKOS_IN_PACKAGE` was set!")
|
||||
endif ()
|
||||
|
||||
register_append_compiler_and_arch_specific_cxx_flags(
|
||||
@ -36,5 +42,3 @@ macro(setup)
|
||||
)
|
||||
|
||||
endmacro()
|
||||
|
||||
|
||||
|
||||
@ -1,18 +1,19 @@
|
||||
|
||||
register_flag_optional(CMAKE_CXX_COMPILER
|
||||
"Any CXX compiler that is supported by CMake detection and RAJA.
|
||||
See https://raja.readthedocs.io/en/main/getting_started.html#build-and-install"
|
||||
"c++")
|
||||
|
||||
register_flag_required(RAJA_IN_TREE
|
||||
register_flag_optional(RAJA_IN_TREE
|
||||
"Absolute path to the *source* distribution directory of RAJA.
|
||||
Make sure to use the release version of RAJA or clone RAJA recursively with submodules.
|
||||
Remember to append RAJA specific flags as well, for example:
|
||||
|
||||
-DRAJA_IN_TREE=... -DENABLE_OPENMP=ON -DENABLE_CUDA=ON ...
|
||||
|
||||
See https://github.com/LLNL/RAJA/blob/08cbbafd2d21589ebf341f7275c229412d0fe903/CMakeLists.txt#L44 for all available options
|
||||
")
|
||||
" "")
|
||||
|
||||
register_flag_optional(RAJA_IN_PACKAGE
|
||||
"Use if Raja is part of a package dependency:
|
||||
Path to installation" "")
|
||||
|
||||
register_flag_optional(TARGET
|
||||
"Target offload device, implemented values are CPU, NVIDIA"
|
||||
@ -76,16 +77,22 @@ macro(setup)
|
||||
register_link_library(RAJA)
|
||||
# RAJA's cmake screws with where the binary will end up, resetting it here:
|
||||
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
|
||||
|
||||
elseif (EXISTS "${RAJA_IN_PACKAGE}")
|
||||
message(STATUS "Building using packaged Raja at `${RAJA_IN_PACKAGE}`")
|
||||
find_package(RAJA REQUIRED)
|
||||
register_link_library(RAJA)
|
||||
|
||||
else ()
|
||||
message(FATAL_ERROR "`${RAJA_IN_TREE}` does not exist")
|
||||
message(FATAL_ERROR "Neither `${RAJA_IN_TREE}` or `${RAJA_IN_PACKAGE}` exists")
|
||||
endif ()
|
||||
|
||||
|
||||
if (ENABLE_CUDA)
|
||||
# RAJA needs the codebase to be compiled with nvcc, so we tell cmake to treat sources as *.cu
|
||||
enable_language(CUDA)
|
||||
set_source_files_properties(RAJAStream.cpp PROPERTIES LANGUAGE CUDA)
|
||||
set_source_files_properties(main.cpp PROPERTIES LANGUAGE CUDA)
|
||||
set_source_files_properties(src/raja/RAJAStream.cpp PROPERTIES LANGUAGE CUDA)
|
||||
set_source_files_properties(src/main.cpp PROPERTIES LANGUAGE CUDA)
|
||||
endif ()
|
||||
|
||||
|
||||
|
||||
668
src/rust/rust-stream/Cargo.lock
generated
668
src/rust/rust-stream/Cargo.lock
generated
@ -4,13 +4,131 @@ version = 3
|
||||
|
||||
[[package]]
|
||||
name = "ansi_term"
|
||||
version = "0.11.0"
|
||||
version = "0.12.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ee49baf6cb617b853aa8d93bf420db2383fab46d314482ca2803b40d5fde979b"
|
||||
checksum = "d52a9bb7ec0cf484c551830a7ce27bd20d67eac647e1befb56b0be4ee39a55d2"
|
||||
dependencies = [
|
||||
"winapi 0.3.9",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "async-attributes"
|
||||
version = "1.1.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a3203e79f4dd9bdda415ed03cf14dae5a2bf775c683a00f94e9cd1faf0f596e5"
|
||||
dependencies = [
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "async-channel"
|
||||
version = "1.7.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e14485364214912d3b19cc3435dde4df66065127f05fa0d75c712f36f12c2f28"
|
||||
dependencies = [
|
||||
"concurrent-queue",
|
||||
"event-listener",
|
||||
"futures-core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "async-executor"
|
||||
version = "1.4.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "871f9bb5e0a22eeb7e8cf16641feb87c9dc67032ccf8ff49e772eb9941d3a965"
|
||||
dependencies = [
|
||||
"async-task",
|
||||
"concurrent-queue",
|
||||
"fastrand",
|
||||
"futures-lite",
|
||||
"once_cell",
|
||||
"slab",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "async-global-executor"
|
||||
version = "2.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0da5b41ee986eed3f524c380e6d64965aea573882a8907682ad100f7859305ca"
|
||||
dependencies = [
|
||||
"async-channel",
|
||||
"async-executor",
|
||||
"async-io",
|
||||
"async-lock",
|
||||
"blocking",
|
||||
"futures-lite",
|
||||
"once_cell",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "async-io"
|
||||
version = "1.9.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "83e21f3a490c72b3b0cf44962180e60045de2925d8dff97918f7ee43c8f637c7"
|
||||
dependencies = [
|
||||
"autocfg",
|
||||
"concurrent-queue",
|
||||
"futures-lite",
|
||||
"libc",
|
||||
"log",
|
||||
"once_cell",
|
||||
"parking",
|
||||
"polling",
|
||||
"slab",
|
||||
"socket2",
|
||||
"waker-fn",
|
||||
"winapi 0.3.9",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "async-lock"
|
||||
version = "2.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e97a171d191782fba31bb902b14ad94e24a68145032b7eedf871ab0bc0d077b6"
|
||||
dependencies = [
|
||||
"event-listener",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "async-std"
|
||||
version = "1.12.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "62565bb4402e926b29953c785397c6dc0391b7b446e45008b0049eb43cec6f5d"
|
||||
dependencies = [
|
||||
"async-attributes",
|
||||
"async-channel",
|
||||
"async-global-executor",
|
||||
"async-io",
|
||||
"async-lock",
|
||||
"crossbeam-utils",
|
||||
"futures-channel",
|
||||
"futures-core",
|
||||
"futures-io",
|
||||
"futures-lite",
|
||||
"gloo-timers",
|
||||
"kv-log-macro",
|
||||
"log",
|
||||
"memchr",
|
||||
"once_cell",
|
||||
"pin-project-lite",
|
||||
"pin-utils",
|
||||
"slab",
|
||||
"wasm-bindgen-futures",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "async-task"
|
||||
version = "4.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7a40729d2133846d9ed0ea60a8b9541bccddab49cd30f0715a1da672fe9a2524"
|
||||
|
||||
[[package]]
|
||||
name = "atomic-waker"
|
||||
version = "1.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "065374052e7df7ee4047b1160cca5e1467a12351a40b3da123c870ba0b8eda2a"
|
||||
|
||||
[[package]]
|
||||
name = "atty"
|
||||
version = "0.2.14"
|
||||
@ -24,9 +142,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "autocfg"
|
||||
version = "1.0.1"
|
||||
version = "1.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "cdb031dd78e28731d87d56cc8ffef4a8f36ca26c38fe2de700543e627f8a464a"
|
||||
checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa"
|
||||
|
||||
[[package]]
|
||||
name = "bitflags"
|
||||
@ -34,6 +152,38 @@ version = "1.3.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
|
||||
|
||||
[[package]]
|
||||
name = "blocking"
|
||||
version = "1.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c6ccb65d468978a086b69884437ded69a90faab3bbe6e67f242173ea728acccc"
|
||||
dependencies = [
|
||||
"async-channel",
|
||||
"async-task",
|
||||
"atomic-waker",
|
||||
"fastrand",
|
||||
"futures-lite",
|
||||
"once_cell",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "bumpalo"
|
||||
version = "3.11.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c1ad822118d20d2c234f427000d5acc36eabe1e29a348c89b63dd60b13f28e5d"
|
||||
|
||||
[[package]]
|
||||
name = "cache-padded"
|
||||
version = "1.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c1db59621ec70f09c5e9b597b220c7a2b43611f4710dc03ceb8748637775692c"
|
||||
|
||||
[[package]]
|
||||
name = "cc"
|
||||
version = "1.0.73"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2fff2a6927b3bb87f9595d67196a70493f627687a71d87a0d692242c33f58c11"
|
||||
|
||||
[[package]]
|
||||
name = "cfg-if"
|
||||
version = "1.0.0"
|
||||
@ -42,9 +192,9 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
|
||||
|
||||
[[package]]
|
||||
name = "clap"
|
||||
version = "2.33.3"
|
||||
version = "2.34.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "37e58ac78573c40708d45522f0d80fa2f01cc4f9b4e2bf749807255454312002"
|
||||
checksum = "a0610544180c38b88101fecf2dd634b174a62eef6946f84dfc6a7127512b381c"
|
||||
dependencies = [
|
||||
"ansi_term",
|
||||
"atty",
|
||||
@ -64,6 +214,15 @@ dependencies = [
|
||||
"crossterm",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "concurrent-queue"
|
||||
version = "1.2.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "af4780a44ab5696ea9e28294517f1fffb421a83a25af521333c838635509db9c"
|
||||
dependencies = [
|
||||
"cache-padded",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "core_affinity"
|
||||
version = "0.5.10"
|
||||
@ -78,9 +237,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "crossbeam"
|
||||
version = "0.8.1"
|
||||
version = "0.8.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4ae5588f6b3c3cb05239e90bd110f257254aecd01e4635400391aeae07497845"
|
||||
checksum = "2801af0d36612ae591caa9568261fddce32ce6e08a7275ea334a06a4ad021a2c"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"crossbeam-channel",
|
||||
@ -92,9 +251,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "crossbeam-channel"
|
||||
version = "0.5.1"
|
||||
version = "0.5.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "06ed27e177f16d65f0f0c22a213e17c696ace5dd64b14258b52f9417ccb52db4"
|
||||
checksum = "c2dd04ddaf88237dc3b8d8f9a3c1004b506b54b3313403944054d23c0870c521"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"crossbeam-utils",
|
||||
@ -102,9 +261,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "crossbeam-deque"
|
||||
version = "0.8.1"
|
||||
version = "0.8.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6455c0ca19f0d2fbf751b908d5c55c1f5cbc65e03c4225427254b46890bdde1e"
|
||||
checksum = "715e8152b692bba2d374b53d4875445368fdf21a94751410af607a5ac677d1fc"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"crossbeam-epoch",
|
||||
@ -113,22 +272,22 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "crossbeam-epoch"
|
||||
version = "0.9.5"
|
||||
version = "0.9.11"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4ec02e091aa634e2c3ada4a392989e7c3116673ef0ac5b72232439094d73b7fd"
|
||||
checksum = "f916dfc5d356b0ed9dae65f1db9fc9770aa2851d2662b988ccf4fe3516e86348"
|
||||
dependencies = [
|
||||
"autocfg",
|
||||
"cfg-if",
|
||||
"crossbeam-utils",
|
||||
"lazy_static",
|
||||
"memoffset",
|
||||
"scopeguard",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "crossbeam-queue"
|
||||
version = "0.3.2"
|
||||
version = "0.3.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9b10ddc024425c88c2ad148c1b0fd53f4c6d38db9697c9f1588381212fa657c9"
|
||||
checksum = "1cd42583b04998a5363558e5f9291ee5a5ff6b49944332103f251e7479a82aa7"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"crossbeam-utils",
|
||||
@ -136,12 +295,11 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "crossbeam-utils"
|
||||
version = "0.8.5"
|
||||
version = "0.8.12"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d82cfc11ce7f2c3faef78d8a684447b40d503d9681acebed6cb728d45940c4db"
|
||||
checksum = "edbafec5fa1f196ca66527c1b12c2ec4745ca14b50f1ad8f9f6f720b55d11fac"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"lazy_static",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@ -170,10 +328,157 @@ dependencies = [
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "either"
|
||||
version = "1.6.1"
|
||||
name = "ctor"
|
||||
version = "0.1.23"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457"
|
||||
checksum = "cdffe87e1d521a10f9696f833fe502293ea446d7f256c06128293a4119bdf4cb"
|
||||
dependencies = [
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "either"
|
||||
version = "1.8.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "90e5c1c8368803113bf0c9584fc495a58b86dc8a29edbf8fe877d21d9507e797"
|
||||
|
||||
[[package]]
|
||||
name = "event-listener"
|
||||
version = "2.5.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0206175f82b8d6bf6652ff7d71a1e27fd2e4efde587fd368662814d6ec1d9ce0"
|
||||
|
||||
[[package]]
|
||||
name = "fastrand"
|
||||
version = "1.8.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a7a407cfaa3385c4ae6b23e84623d48c2798d06e3e6a1878f7f59f17b3f86499"
|
||||
dependencies = [
|
||||
"instant",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "futures"
|
||||
version = "0.3.24"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7f21eda599937fba36daeb58a22e8f5cee2d14c4a17b5b7739c7c8e5e3b8230c"
|
||||
dependencies = [
|
||||
"futures-channel",
|
||||
"futures-core",
|
||||
"futures-executor",
|
||||
"futures-io",
|
||||
"futures-sink",
|
||||
"futures-task",
|
||||
"futures-util",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "futures-channel"
|
||||
version = "0.3.24"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "30bdd20c28fadd505d0fd6712cdfcb0d4b5648baf45faef7f852afb2399bb050"
|
||||
dependencies = [
|
||||
"futures-core",
|
||||
"futures-sink",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "futures-core"
|
||||
version = "0.3.24"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4e5aa3de05362c3fb88de6531e6296e85cde7739cccad4b9dfeeb7f6ebce56bf"
|
||||
|
||||
[[package]]
|
||||
name = "futures-executor"
|
||||
version = "0.3.24"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9ff63c23854bee61b6e9cd331d523909f238fc7636290b96826e9cfa5faa00ab"
|
||||
dependencies = [
|
||||
"futures-core",
|
||||
"futures-task",
|
||||
"futures-util",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "futures-io"
|
||||
version = "0.3.24"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bbf4d2a7a308fd4578637c0b17c7e1c7ba127b8f6ba00b29f717e9655d85eb68"
|
||||
|
||||
[[package]]
|
||||
name = "futures-lite"
|
||||
version = "1.12.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7694489acd39452c77daa48516b894c153f192c3578d5a839b62c58099fcbf48"
|
||||
dependencies = [
|
||||
"fastrand",
|
||||
"futures-core",
|
||||
"futures-io",
|
||||
"memchr",
|
||||
"parking",
|
||||
"pin-project-lite",
|
||||
"waker-fn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "futures-macro"
|
||||
version = "0.3.24"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "42cd15d1c7456c04dbdf7e88bcd69760d74f3a798d6444e16974b505b0e62f17"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "futures-sink"
|
||||
version = "0.3.24"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "21b20ba5a92e727ba30e72834706623d94ac93a725410b6a6b6fbc1b07f7ba56"
|
||||
|
||||
[[package]]
|
||||
name = "futures-task"
|
||||
version = "0.3.24"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a6508c467c73851293f390476d4491cf4d227dbabcd4170f3bb6044959b294f1"
|
||||
|
||||
[[package]]
|
||||
name = "futures-timer"
|
||||
version = "3.0.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e64b03909df88034c26dc1547e8970b91f98bdb65165d6a4e9110d94263dbb2c"
|
||||
|
||||
[[package]]
|
||||
name = "futures-util"
|
||||
version = "0.3.24"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "44fb6cb1be61cc1d2e43b262516aafcf63b241cffdb1d3fa115f91d9c7b09c90"
|
||||
dependencies = [
|
||||
"futures-channel",
|
||||
"futures-core",
|
||||
"futures-io",
|
||||
"futures-macro",
|
||||
"futures-sink",
|
||||
"futures-task",
|
||||
"memchr",
|
||||
"pin-project-lite",
|
||||
"pin-utils",
|
||||
"slab",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "gloo-timers"
|
||||
version = "0.2.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5fb7d06c1c8cc2a29bee7ec961009a0b2caa0793ee4900c2ffb348734ba1c8f9"
|
||||
dependencies = [
|
||||
"futures-channel",
|
||||
"futures-core",
|
||||
"js-sys",
|
||||
"wasm-bindgen",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "heck"
|
||||
@ -202,6 +507,15 @@ dependencies = [
|
||||
"cfg-if",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "js-sys"
|
||||
version = "0.3.60"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "49409df3e3bf0856b916e2ceaca09ee28e6871cf7d9ce97a692cacfdb2a25a47"
|
||||
dependencies = [
|
||||
"wasm-bindgen",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "kernel32-sys"
|
||||
version = "0.2.2"
|
||||
@ -212,6 +526,15 @@ dependencies = [
|
||||
"winapi-build",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "kv-log-macro"
|
||||
version = "1.0.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0de8b303297635ad57c9f5059fd9cee7a47f8e8daa09df0fcd07dd39fb22977f"
|
||||
dependencies = [
|
||||
"log",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "lazy_static"
|
||||
version = "1.4.0"
|
||||
@ -220,33 +543,41 @@ checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
|
||||
|
||||
[[package]]
|
||||
name = "libc"
|
||||
version = "0.2.108"
|
||||
version = "0.2.134"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8521a1b57e76b1ec69af7599e75e38e7b7fad6610f037db8c79b127201b5d119"
|
||||
checksum = "329c933548736bc49fd575ee68c89e8be4d260064184389a5b77517cddd99ffb"
|
||||
|
||||
[[package]]
|
||||
name = "lock_api"
|
||||
version = "0.4.5"
|
||||
version = "0.4.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "712a4d093c9976e24e7dbca41db895dabcbac38eb5f4045393d17a95bdfb1109"
|
||||
checksum = "435011366fe56583b16cf956f9df0095b405b82d76425bc8981c0e22e60ec4df"
|
||||
dependencies = [
|
||||
"autocfg",
|
||||
"scopeguard",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "log"
|
||||
version = "0.4.14"
|
||||
version = "0.4.17"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "51b9bbe6c47d51fc3e1a9b945965946b4c44142ab8792c50835a980d362c2710"
|
||||
checksum = "abb12e687cfb44aa40f41fc3978ef76448f9b6038cad6aef4259d3c095a2382e"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"value-bag",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "memoffset"
|
||||
version = "0.6.4"
|
||||
name = "memchr"
|
||||
version = "2.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "59accc507f1338036a0477ef61afdae33cde60840f4dfe481319ce3ad116ddf9"
|
||||
checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d"
|
||||
|
||||
[[package]]
|
||||
name = "memoffset"
|
||||
version = "0.6.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5aa361d4faea93603064a027415f07bd8e1d5c88c9fbf68bf56a285428fd79ce"
|
||||
dependencies = [
|
||||
"autocfg",
|
||||
]
|
||||
@ -275,32 +606,44 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "ntapi"
|
||||
version = "0.3.6"
|
||||
version = "0.3.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3f6bb902e437b6d86e03cce10a7e2af662292c5dfef23b65899ea3ac9354ad44"
|
||||
checksum = "c28774a7fd2fbb4f0babd8237ce554b73af68021b5f695a3cebd6c59bac0980f"
|
||||
dependencies = [
|
||||
"winapi 0.3.9",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "num-traits"
|
||||
version = "0.2.14"
|
||||
version = "0.2.15"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9a64b1ec5cda2586e284722486d802acf1f7dbdc623e2bfc57e65ca1cd099290"
|
||||
checksum = "578ede34cf02f8924ab9447f50c28075b4d3e5b269972345e7e0372b38c6cdcd"
|
||||
dependencies = [
|
||||
"autocfg",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "num_cpus"
|
||||
version = "1.13.0"
|
||||
version = "1.13.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "05499f3756671c15885fee9034446956fff3f243d6077b91e5767df161f766b3"
|
||||
checksum = "19e64526ebdee182341572e50e9ad03965aa510cd94427a4549448f285e957a1"
|
||||
dependencies = [
|
||||
"hermit-abi",
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "once_cell"
|
||||
version = "1.15.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e82dad04139b71a90c080c8463fe0dc7902db5192d939bd0950f074d014339e1"
|
||||
|
||||
[[package]]
|
||||
name = "parking"
|
||||
version = "2.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "427c3892f9e783d91cc128285287e70a59e206ca452770ece88a76f7a3eddd72"
|
||||
|
||||
[[package]]
|
||||
name = "parking_lot"
|
||||
version = "0.11.2"
|
||||
@ -327,12 +670,29 @@ dependencies = [
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pest"
|
||||
version = "2.1.3"
|
||||
name = "pin-project-lite"
|
||||
version = "0.2.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "10f4872ae94d7b90ae48754df22fd42ad52ce740b8f370b03da4835417403e53"
|
||||
checksum = "e0a7ae3ac2f1173085d398531c705756c94a4c56843785df85a60c1a0afac116"
|
||||
|
||||
[[package]]
|
||||
name = "pin-utils"
|
||||
version = "0.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184"
|
||||
|
||||
[[package]]
|
||||
name = "polling"
|
||||
version = "2.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "899b00b9c8ab553c743b3e11e87c5c7d423b2a2de229ba95b24a756344748011"
|
||||
dependencies = [
|
||||
"ucd-trie",
|
||||
"autocfg",
|
||||
"cfg-if",
|
||||
"libc",
|
||||
"log",
|
||||
"wepoll-ffi",
|
||||
"winapi 0.3.9",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@ -361,27 +721,27 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "proc-macro2"
|
||||
version = "1.0.32"
|
||||
version = "1.0.46"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ba508cc11742c0dc5c1659771673afbab7a0efab23aa17e854cbab0837ed0b43"
|
||||
checksum = "94e2ef8dbfc347b10c094890f778ee2e36ca9bb4262e86dc99cd217e35f3470b"
|
||||
dependencies = [
|
||||
"unicode-xid",
|
||||
"unicode-ident",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "quote"
|
||||
version = "1.0.10"
|
||||
version = "1.0.21"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "38bc8cc6a5f2e3655e0899c1b848643b2562f853f114bfec7be120678e3ace05"
|
||||
checksum = "bbe448f377a7d6961e30f5955f9b8d106c3f5e449d493ee1b125c1d43c2b5179"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rayon"
|
||||
version = "1.5.1"
|
||||
version = "1.5.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c06aca804d41dbc8ba42dfd964f0d01334eceb64314b9ecf7c5fad5188a06d90"
|
||||
checksum = "bd99e5772ead8baa5215278c9b15bf92087709e9c1b2d1f97cdb5a183c933a7d"
|
||||
dependencies = [
|
||||
"autocfg",
|
||||
"crossbeam-deque",
|
||||
@ -391,31 +751,43 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "rayon-core"
|
||||
version = "1.9.1"
|
||||
version = "1.9.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d78120e2c850279833f1dd3582f730c4ab53ed95aeaaaa862a2a5c71b1656d8e"
|
||||
checksum = "258bcdb5ac6dad48491bb2992db6b7cf74878b0384908af124823d118c99683f"
|
||||
dependencies = [
|
||||
"crossbeam-channel",
|
||||
"crossbeam-deque",
|
||||
"crossbeam-utils",
|
||||
"lazy_static",
|
||||
"num_cpus",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "redox_syscall"
|
||||
version = "0.2.10"
|
||||
version = "0.2.16"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8383f39639269cde97d255a32bdb68c047337295414940c68bdd30c2e13203ff"
|
||||
checksum = "fb5a58c1855b4b6819d59012155603f0b22ad30cad752600aadfcb695265519a"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rstest"
|
||||
version = "0.10.0"
|
||||
version = "0.13.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "041bb0202c14f6a158bbbf086afb03d0c6e975c2dec7d4912f8061ed44f290af"
|
||||
checksum = "b939295f93cb1d12bc1a83cf9ee963199b133fb8a79832dd51b68bb9f59a04dc"
|
||||
dependencies = [
|
||||
"async-std",
|
||||
"futures",
|
||||
"futures-timer",
|
||||
"rstest_macros",
|
||||
"rustc_version",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rstest_macros"
|
||||
version = "0.13.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f78aba848123782ba59340928ec7d876ebe745aa0365d6af8a630f19a5c16116"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"proc-macro2",
|
||||
@ -426,7 +798,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "rust-stream"
|
||||
version = "3.4.0"
|
||||
version = "4.0.0"
|
||||
dependencies = [
|
||||
"colour",
|
||||
"core_affinity",
|
||||
@ -443,18 +815,18 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "rustc_version"
|
||||
version = "0.3.3"
|
||||
version = "0.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f0dfe2087c51c460008730de8b57e6a320782fbfb312e1f4d520e6c6fae155ee"
|
||||
checksum = "bfa0f585226d2e68097d4f95d113b15b83a82e819ab25717ec0590d9584ef366"
|
||||
dependencies = [
|
||||
"semver",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rustversion"
|
||||
version = "1.0.5"
|
||||
version = "1.0.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "61b3909d758bb75c79f23d4736fac9433868679d3ad2ea7a61e3c25cfda9a088"
|
||||
checksum = "97477e48b4cf8603ad5f7aaf897467cf42ab4218a38ef76fb14c2d6773a6d6a8"
|
||||
|
||||
[[package]]
|
||||
name = "scopeguard"
|
||||
@ -464,21 +836,9 @@ checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd"
|
||||
|
||||
[[package]]
|
||||
name = "semver"
|
||||
version = "0.11.0"
|
||||
version = "1.0.14"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f301af10236f6df4160f7c3f04eec6dbc70ace82d23326abad5edee88801c6b6"
|
||||
dependencies = [
|
||||
"semver-parser",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "semver-parser"
|
||||
version = "0.10.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "00b0bef5b7f9e0df16536d3961cfb6e84331c065b4066afb39768d0e319411f7"
|
||||
dependencies = [
|
||||
"pest",
|
||||
]
|
||||
checksum = "e25dfac463d778e353db5be2449d1cce89bd6fd23c9f1ea21310ce6e5a1b29c4"
|
||||
|
||||
[[package]]
|
||||
name = "signal-hook"
|
||||
@ -501,10 +861,29 @@ dependencies = [
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "smallvec"
|
||||
version = "1.7.0"
|
||||
name = "slab"
|
||||
version = "0.4.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1ecab6c735a6bb4139c0caafd0cc3635748bbb3acf4550e8138122099251f309"
|
||||
checksum = "4614a76b2a8be0058caa9dbbaf66d988527d86d003c11a94fbd335d7661edcef"
|
||||
dependencies = [
|
||||
"autocfg",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "smallvec"
|
||||
version = "1.9.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2fd0db749597d91ff862fd1d55ea87f7855a744a8425a64695b6fca237d1dad1"
|
||||
|
||||
[[package]]
|
||||
name = "socket2"
|
||||
version = "0.4.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "02e2d2db9033d13a1567121ddd7a095ee144db4e1ca1b1bda3419bc0da294ebd"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"winapi 0.3.9",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "strsim"
|
||||
@ -514,9 +893,9 @@ checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a"
|
||||
|
||||
[[package]]
|
||||
name = "structopt"
|
||||
version = "0.3.25"
|
||||
version = "0.3.26"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "40b9788f4202aa75c240ecc9c15c65185e6a39ccdeb0fd5d008b98825464c87c"
|
||||
checksum = "0c6b5c64445ba8094a6ab0c3cd2ad323e07171012d9c98b0b15651daf1787a10"
|
||||
dependencies = [
|
||||
"clap",
|
||||
"lazy_static",
|
||||
@ -538,20 +917,20 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "syn"
|
||||
version = "1.0.82"
|
||||
version = "1.0.101"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8daf5dd0bb60cbd4137b1b587d2fc0ae729bc07cf01cd70b36a1ed5ade3b9d59"
|
||||
checksum = "e90cde112c4b9690b8cbe810cba9ddd8bc1d7472e2cae317b69e9438c1cba7d2"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"unicode-xid",
|
||||
"unicode-ident",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tabular"
|
||||
version = "0.1.4"
|
||||
version = "0.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e7e35bee02dcefe64a74065b6b869d241eab1a02fea0d65e6074ce4e51894c3b"
|
||||
checksum = "d9a2882c514780a1973df90de9d68adcd8871bacc9a6331c3f28e6d2ff91a3d1"
|
||||
dependencies = [
|
||||
"unicode-width",
|
||||
]
|
||||
@ -566,28 +945,32 @@ dependencies = [
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ucd-trie"
|
||||
version = "0.1.3"
|
||||
name = "unicode-ident"
|
||||
version = "1.0.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "56dee185309b50d1f11bfedef0fe6d036842e3fb77413abef29f8f8d1c5d4c1c"
|
||||
checksum = "dcc811dc4066ac62f84f11307873c4850cb653bfa9b1719cee2bd2204a4bc5dd"
|
||||
|
||||
[[package]]
|
||||
name = "unicode-segmentation"
|
||||
version = "1.8.0"
|
||||
version = "1.10.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8895849a949e7845e06bd6dc1aa51731a103c42707010a5b591c0038fb73385b"
|
||||
checksum = "0fdbf052a0783de01e944a6ce7a8cb939e295b1e7be835a1112c3b9a7f047a5a"
|
||||
|
||||
[[package]]
|
||||
name = "unicode-width"
|
||||
version = "0.1.9"
|
||||
version = "0.1.10"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3ed742d4ea2bd1176e236172c8429aaf54486e7ac098db29ffe6529e0ce50973"
|
||||
checksum = "c0edd1e5b14653f783770bce4a4dabb4a5108a5370a5f5d8cfe8710c361f6c8b"
|
||||
|
||||
[[package]]
|
||||
name = "unicode-xid"
|
||||
version = "0.2.2"
|
||||
name = "value-bag"
|
||||
version = "1.0.0-alpha.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8ccb82d61f80a663efe1f787a51b16b5a51e3314d6ac365b08639f52387b33f3"
|
||||
checksum = "2209b78d1249f7e6f3293657c9779fe31ced465df091bbd433a1cf88e916ec55"
|
||||
dependencies = [
|
||||
"ctor",
|
||||
"version_check",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "vec_map"
|
||||
@ -597,9 +980,100 @@ checksum = "f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191"
|
||||
|
||||
[[package]]
|
||||
name = "version_check"
|
||||
version = "0.9.3"
|
||||
version = "0.9.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5fecdca9a5291cc2b8dcf7dc02453fee791a280f3743cb0905f8822ae463b3fe"
|
||||
checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f"
|
||||
|
||||
[[package]]
|
||||
name = "waker-fn"
|
||||
version = "1.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9d5b2c62b4012a3e1eca5a7e077d13b3bf498c4073e33ccd58626607748ceeca"
|
||||
|
||||
[[package]]
|
||||
name = "wasm-bindgen"
|
||||
version = "0.2.83"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "eaf9f5aceeec8be17c128b2e93e031fb8a4d469bb9c4ae2d7dc1888b26887268"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"wasm-bindgen-macro",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "wasm-bindgen-backend"
|
||||
version = "0.2.83"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4c8ffb332579b0557b52d268b91feab8df3615f265d5270fec2a8c95b17c1142"
|
||||
dependencies = [
|
||||
"bumpalo",
|
||||
"log",
|
||||
"once_cell",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
"wasm-bindgen-shared",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "wasm-bindgen-futures"
|
||||
version = "0.4.33"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "23639446165ca5a5de86ae1d8896b737ae80319560fbaa4c2887b7da6e7ebd7d"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"js-sys",
|
||||
"wasm-bindgen",
|
||||
"web-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "wasm-bindgen-macro"
|
||||
version = "0.2.83"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "052be0f94026e6cbc75cdefc9bae13fd6052cdcaf532fa6c45e7ae33a1e6c810"
|
||||
dependencies = [
|
||||
"quote",
|
||||
"wasm-bindgen-macro-support",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "wasm-bindgen-macro-support"
|
||||
version = "0.2.83"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "07bc0c051dc5f23e307b13285f9d75df86bfdf816c5721e573dec1f9b8aa193c"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
"wasm-bindgen-backend",
|
||||
"wasm-bindgen-shared",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "wasm-bindgen-shared"
|
||||
version = "0.2.83"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1c38c045535d93ec4f0b4defec448e4291638ee608530863b1e2ba115d4fff7f"
|
||||
|
||||
[[package]]
|
||||
name = "web-sys"
|
||||
version = "0.3.60"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bcda906d8be16e728fd5adc5b729afad4e444e106ab28cd1c7256e54fa61510f"
|
||||
dependencies = [
|
||||
"js-sys",
|
||||
"wasm-bindgen",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "wepoll-ffi"
|
||||
version = "0.1.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d743fdedc5c64377b5fc2bc036b01c7fd642205a0d96356034ae3404d49eb7fb"
|
||||
dependencies = [
|
||||
"cc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "winapi"
|
||||
|
||||
@ -7,19 +7,19 @@ edition = "2018"
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
[dependencies]
|
||||
num-traits = "0.2.14"
|
||||
structopt = "0.3.13"
|
||||
tabular = "0.1.4"
|
||||
rayon = "1.5.1"
|
||||
crossbeam = "0.8.1"
|
||||
num_cpus = "1.13.0"
|
||||
rustversion = "1.0"
|
||||
libc = "0.2.97"
|
||||
num-traits = "0.2.15"
|
||||
structopt = "0.3.26"
|
||||
tabular = "0.2.0"
|
||||
rayon = "1.5.3"
|
||||
crossbeam = "0.8.2"
|
||||
num_cpus = "1.13.1"
|
||||
rustversion = "1.0.9"
|
||||
libc = "0.2.134"
|
||||
core_affinity = "0.5.10"
|
||||
colour = "0.6.0"
|
||||
|
||||
[dev-dependencies]
|
||||
rstest = "0.10.0"
|
||||
rstest = "0.13.0"
|
||||
|
||||
[build-dependencies]
|
||||
rustversion = "1.0"
|
||||
|
||||
@ -7,6 +7,7 @@ register_flag_required(SYCL_COMPILER
|
||||
"Compile using the specified SYCL compiler implementation
|
||||
Supported values are
|
||||
ONEAPI-DPCPP - dpc++ that is part of an oneAPI Base Toolkit distribution (https://software.intel.com/content/www/us/en/develop/tools/oneapi/base-toolkit.html)
|
||||
ONEAPI-ICPX - icpx as a standalone compiler
|
||||
DPCPP - dpc++ as a standalone compiler (https://github.com/intel/llvm)
|
||||
HIPSYCL - hipSYCL compiler (https://github.com/illuhad/hipSYCL)
|
||||
COMPUTECPP - ComputeCpp compiler (https://developer.codeplay.com/products/computecpp/ce/home)")
|
||||
@ -14,6 +15,7 @@ register_flag_required(SYCL_COMPILER
|
||||
register_flag_optional(SYCL_COMPILER_DIR
|
||||
"Absolute path to the selected SYCL compiler directory, most are packaged differently so set the path according to `SYCL_COMPILER`:
|
||||
ONEAPI-DPCPP - not required but `dpcpp` must be on PATH, load oneAPI as per documentation (i.e `source /opt/intel/oneapi/setvars.sh` first)
|
||||
ONEAPI-ICPX - `icpx` must be used for OneAPI 2023 and later on releases (i.e `source /opt/intel/oneapi/setvars.sh` first)
|
||||
HIPSYCL|DPCPP|COMPUTECPP - set to the root of the binary distribution that contains at least `bin/`, `include/`, and `lib/`"
|
||||
"")
|
||||
|
||||
@ -65,6 +67,12 @@ macro(setup)
|
||||
elseif (${SYCL_COMPILER} STREQUAL "ONEAPI-DPCPP")
|
||||
set(CMAKE_CXX_COMPILER dpcpp)
|
||||
register_definitions(CL_TARGET_OPENCL_VERSION=220)
|
||||
elseif (${SYCL_COMPILER} STREQUAL "ONEAPI-ICPX")
|
||||
set(CMAKE_CXX_COMPILER icpx)
|
||||
include_directories(${SYCL_COMPILER_DIR}/include/sycl)
|
||||
register_definitions(CL_TARGET_OPENCL_VERSION=220)
|
||||
register_append_cxx_flags(ANY -fsycl)
|
||||
register_append_link_flags(-fsycl)
|
||||
else ()
|
||||
message(FATAL_ERROR "SYCL_COMPILER=${SYCL_COMPILER} is unsupported")
|
||||
endif ()
|
||||
|
||||
@ -145,7 +145,7 @@ T ThrustStream<T>::dot()
|
||||
#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA || \
|
||||
(defined(THRUST_DEVICE_SYSTEM_HIP) && THRUST_DEVICE_SYSTEM_HIP == THRUST_DEVICE_SYSTEM)
|
||||
|
||||
#ifdef __NVCC__
|
||||
#if defined(__NVCC__) || defined(__NVCOMPILER_CUDA__)
|
||||
#define IMPL_FN__(fn) cuda ## fn
|
||||
#define IMPL_TYPE__(tpe) cuda ## tpe
|
||||
#elif defined(__HIP_PLATFORM_HCC__)
|
||||
|
||||
Loading…
Reference in New Issue
Block a user