Merge pull request #135 from jeffhammond/fortran-ports

Fortran ports
This commit is contained in:
Tom Deakin 2022-11-23 00:01:04 +00:00 committed by GitHub
commit 0df3ae79be
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
27 changed files with 3102 additions and 2 deletions

6
.gitignore vendored
View File

@ -10,12 +10,18 @@ sycl-stream
hip-stream
tbb-stream
src/fortran/BabelStream
src/fortran/BabelStream.*
*.o
*.bc
*.sycl
*.tar
*.gz
*.a
*.mod
*.cub
*.ptx
KokkosCore_config.*

4
src/.gitignore vendored
View File

@ -16,6 +16,8 @@
**/*.gz
**/*.a
**/*.swp
**/KokkosCore_Config_*
**/.DS_Store
@ -26,4 +28,4 @@ cmake-build-*/
CMakeFiles/
.idea/
.vscode/
.directory
.directory

105
src/fortran/ArrayStream.F90 Normal file
View File

@ -0,0 +1,105 @@
module ArrayStream
use, intrinsic :: ISO_Fortran_env
use BabelStreamTypes
implicit none
character(len=5), parameter :: implementation_name = "Array"
integer(kind=StreamIntKind) :: N
real(kind=REAL64), allocatable :: A(:), B(:), C(:)
contains
subroutine list_devices()
implicit none
integer :: num
write(*,'(a36,a5)') "Listing devices is not supported by ", implementation_name
end subroutine list_devices
subroutine set_device(dev)
implicit none
integer, intent(in) :: dev
write(*,'(a32,a5)') "Device != 0 is not supported by ", implementation_name
end subroutine set_device
subroutine alloc(array_size)
implicit none
integer(kind=StreamIntKind) :: array_size
integer :: err
N = array_size
allocate( A(1:N), B(1:N), C(1:N), stat=err)
if (err .ne. 0) then
write(*,'(a20,i3)') 'allocate returned ',err
stop 1
endif
end subroutine alloc
subroutine dealloc()
implicit none
integer :: err
deallocate( A, B, C, stat=err)
if (err .ne. 0) then
write(*,'(a20,i3)') 'deallocate returned ',err
stop 1
endif
end subroutine dealloc
subroutine init_arrays(initA, initB, initC)
implicit none
real(kind=REAL64), intent(in) :: initA, initB, initC
A = initA
B = initB
C = initC
end subroutine init_arrays
subroutine read_arrays(h_A, h_B, h_C)
implicit none
real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
h_A = A
h_B = B
h_C = C
end subroutine read_arrays
subroutine copy()
implicit none
C = A
end subroutine copy
subroutine add()
implicit none
C = A + B
end subroutine add
subroutine mul(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
scalar = startScalar
B = scalar * C
end subroutine mul
subroutine triad(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
scalar = startScalar
A = B + scalar * C
end subroutine triad
subroutine nstream(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
scalar = startScalar
A = A + B + scalar * C
end subroutine nstream
function dot() result(s)
implicit none
real(kind=REAL64) :: s
s = dot_product(A,B)
end function dot
end module ArrayStream

View File

@ -0,0 +1,21 @@
module BabelStreamTypes
use, intrinsic :: ISO_Fortran_env, only: REAL64,REAL32,INT64,INT32
implicit none
#ifdef USE_FLOAT
integer, parameter :: StreamRealKind = REAL32
character(len=6) :: StreamRealName = "REAL32"
#else
integer, parameter :: StreamRealKind = REAL64
character(len=6) :: StreamRealName = "REAL64"
#endif
#ifdef USE_INT32
#warning There is no checking for overflowing INT32, so be careful.
integer, parameter :: StreamIntKind = INT32
#else
integer, parameter :: StreamIntKind = INT64
#endif
end module BabelStreamTypes

View File

@ -0,0 +1,230 @@
module CUDAKernelStream
use, intrinsic :: ISO_Fortran_env
use BabelStreamTypes
implicit none
character(len=10), parameter :: implementation_name = "CUDAKernel"
integer(kind=StreamIntKind) :: N
#ifdef USE_MANAGED
real(kind=REAL64), allocatable, managed :: A(:), B(:), C(:)
#else
real(kind=REAL64), allocatable, device :: A(:), B(:), C(:)
#endif
contains
subroutine list_devices()
use cudafor
implicit none
integer :: num, err
err = cudaGetDeviceCount(num)
if (err.ne.0) then
write(*,'(a)') "cudaGetDeviceCount failed"
write(*,'(a)') cudaGetErrorString(err)
stop
else if (num.eq.0) then
write(*,'(a17)') "No devices found."
else
write(*,'(a10,i1,a8)') "There are ",num," devices."
end if
end subroutine list_devices
subroutine set_device(dev)
use cudafor
implicit none
integer, intent(in) :: dev
integer :: num, err
err = cudaGetDeviceCount(num)
if (err.ne.0) then
write(*,'(a)') "cudaGetDeviceCount failed"
write(*,'(a)') cudaGetErrorString(err)
stop
else if (num.eq.0) then
write(*,'(a17)') "No devices found."
stop
else if (dev.ge.num) then
write(*,'(a21)') "Invalid device index."
stop
else
err = cudaSetDevice(dev)
if (err.ne.0) then
write(*,'(a)') "cudaSetDevice failed"
write(*,'(a)') cudaGetErrorString(err)
stop
end if
end if
end subroutine set_device
subroutine alloc(array_size)
implicit none
integer(kind=StreamIntKind) :: array_size
integer :: err
N = array_size
allocate( A(1:N), B(1:N), C(1:N), stat=err)
if (err .ne. 0) then
write(*,'(a20,i3)') 'allocate returned ',err
stop 1
endif
end subroutine alloc
subroutine dealloc()
implicit none
integer :: err
deallocate( A, B, C, stat=err)
if (err .ne. 0) then
write(*,'(a20,i3)') 'deallocate returned ',err
stop 1
endif
end subroutine dealloc
subroutine init_arrays(initA, initB, initC)
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
implicit none
real(kind=REAL64), intent(in) :: initA, initB, initC
integer(kind=StreamIntKind) :: i
integer :: err
A = initA
B = initB
C = initC
err = cudaDeviceSynchronize()
if (err.ne.0) then
write(*,'(a)') "cudaDeviceSynchronize failed"
write(*,'(a)') cudaGetErrorString(err)
stop
endif
end subroutine init_arrays
subroutine read_arrays(h_A, h_B, h_C)
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
implicit none
real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
integer(kind=StreamIntKind) :: i
integer :: err
h_A = A
h_B = B
h_C = C
err = cudaDeviceSynchronize()
if (err.ne.0) then
write(*,'(a)') "cudaDeviceSynchronize failed"
write(*,'(a)') cudaGetErrorString(err)
stop
endif
end subroutine read_arrays
subroutine copy()
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
implicit none
integer(kind=StreamIntKind) :: i
integer :: err
!$cuf kernel do <<< *, * >>>
do i=1,N
C(i) = A(i)
end do
err = cudaDeviceSynchronize()
if (err.ne.0) then
write(*,'(a)') "cudaDeviceSynchronize failed"
write(*,'(a)') cudaGetErrorString(err)
stop
endif
end subroutine copy
subroutine add()
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
implicit none
integer(kind=StreamIntKind) :: i
integer :: err
!$cuf kernel do <<< *, * >>>
do i=1,N
C(i) = A(i) + B(i)
end do
err = cudaDeviceSynchronize()
if (err.ne.0) then
write(*,'(a)') "cudaDeviceSynchronize failed"
write(*,'(a)') cudaGetErrorString(err)
stop
endif
end subroutine add
subroutine mul(startScalar)
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
integer :: err
scalar = startScalar
!$cuf kernel do <<< *, * >>>
do i=1,N
B(i) = scalar * C(i)
end do
err = cudaDeviceSynchronize()
if (err.ne.0) then
write(*,'(a)') "cudaDeviceSynchronize failed"
write(*,'(a)') cudaGetErrorString(err)
stop
endif
end subroutine mul
subroutine triad(startScalar)
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
integer :: err
scalar = startScalar
!$cuf kernel do <<< *, * >>>
do i=1,N
A(i) = B(i) + scalar * C(i)
end do
err = cudaDeviceSynchronize()
if (err.ne.0) then
write(*,'(a)') "cudaDeviceSynchronize failed"
write(*,'(a)') cudaGetErrorString(err)
stop
endif
end subroutine triad
subroutine nstream(startScalar)
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
integer :: err
scalar = startScalar
!$cuf kernel do <<< *, * >>>
do i=1,N
A(i) = A(i) + B(i) + scalar * C(i)
end do
err = cudaDeviceSynchronize()
if (err.ne.0) then
write(*,'(a)') "cudaDeviceSynchronize failed"
write(*,'(a)') cudaGetErrorString(err)
stop
endif
end subroutine nstream
function dot() result(r)
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
implicit none
real(kind=REAL64) :: r
integer(kind=StreamIntKind) :: i
integer :: err
r = real(0,kind=REAL64)
!$cuf kernel do <<< *, * >>>
do i=1,N
r = r + A(i) * B(i)
end do
err = cudaDeviceSynchronize()
if (err.ne.0) then
write(*,'(a)') "cudaDeviceSynchronize failed"
write(*,'(a)') cudaGetErrorString(err)
stop
endif
end function dot
end module CUDAKernelStream

309
src/fortran/CUDAStream.F90 Normal file
View File

@ -0,0 +1,309 @@
module CUDAFortranKernels
use, intrinsic :: ISO_Fortran_env
use BabelStreamTypes
contains
attributes(global) subroutine do_copy(n,A,C)
implicit none
integer(kind=StreamIntKind), intent(in), value :: n
real(kind=REAL64), intent(in) :: A(n)
real(kind=REAL64), intent(out) :: C(n)
integer(kind=StreamIntKind) :: i
i = blockDim%x * (blockIdx%x - 1) + threadIdx%x
if (i <= N) then
C(i) = A(i)
endif
end subroutine do_copy
attributes(global) subroutine do_add(n,A,B,C)
implicit none
integer(kind=StreamIntKind), intent(in), value :: n
real(kind=REAL64), intent(in) :: A(n), B(n)
real(kind=REAL64), intent(out) :: C(n)
integer(kind=StreamIntKind) :: i
i = blockDim%x * (blockIdx%x - 1) + threadIdx%x
if (i <= N) then
C(i) = A(i) + B(i)
endif
end subroutine do_add
attributes(global) subroutine do_mul(n,scalar,B,C)
implicit none
integer(kind=StreamIntKind), intent(in), value :: n
real(kind=REAL64), intent(in), value :: scalar
real(kind=REAL64), intent(out) :: B(n)
real(kind=REAL64), intent(in) :: C(n)
integer(kind=StreamIntKind) :: i
i = blockDim%x * (blockIdx%x - 1) + threadIdx%x
if (i <= N) then
B(i) = scalar * C(i)
endif
end subroutine do_mul
attributes(global) subroutine do_triad(n,scalar,A,B,C)
implicit none
integer(kind=StreamIntKind), intent(in), value :: n
real(kind=REAL64), intent(in), value :: scalar
real(kind=REAL64), intent(out) :: A(n)
real(kind=REAL64), intent(in) :: B(n), C(n)
integer(kind=StreamIntKind) :: i
i = blockDim%x * (blockIdx%x - 1) + threadIdx%x
if (i <= N) then
A(i) = B(i) + scalar * C(i)
endif
end subroutine do_triad
attributes(global) subroutine do_nstream(n,scalar,A,B,C)
implicit none
integer(kind=StreamIntKind), intent(in), value :: n
real(kind=REAL64), intent(in), value :: scalar
real(kind=REAL64), intent(inout) :: A(n)
real(kind=REAL64), intent(in) :: B(n), C(n)
integer(kind=StreamIntKind) :: i
i = blockDim%x * (blockIdx%x - 1) + threadIdx%x
if (i <= N) then
A(i) = A(i) + B(i) + scalar * C(i)
endif
end subroutine do_nstream
#if 0
attributes(global) subroutine do_dot(n,A,B,r)
implicit none
integer(kind=StreamIntKind), intent(in), value :: n
real(kind=REAL64), intent(in) :: A(n), B(n)
real(kind=REAL64), intent(out) :: r
integer(kind=StreamIntKind) :: i
r = real(0,kind=REAL64)
!$cuf kernel do <<< *, * >>>
do i=1,N
r = r + A(i) * B(i)
end do
end subroutine do_dot
#endif
end module CUDAFortranKernels
module CUDAStream
use, intrinsic :: ISO_Fortran_env
use BabelStreamTypes
use cudafor, only: dim3
implicit none
character(len=4), parameter :: implementation_name = "CUDA"
integer(kind=StreamIntKind) :: N
#ifdef USE_MANAGED
real(kind=REAL64), allocatable, managed :: A(:), B(:), C(:)
#else
real(kind=REAL64), allocatable, device :: A(:), B(:), C(:)
#endif
type(dim3) :: grid, tblock
contains
subroutine list_devices()
use cudafor
implicit none
integer :: num, err
err = cudaGetDeviceCount(num)
if (err.ne.0) then
write(*,'(a)') "cudaGetDeviceCount failed"
write(*,'(a)') cudaGetErrorString(err)
stop
else if (num.eq.0) then
write(*,'(a17)') "No devices found."
else
write(*,'(a10,i1,a8)') "There are ",num," devices."
end if
end subroutine list_devices
subroutine set_device(dev)
use cudafor
implicit none
integer, intent(in) :: dev
integer :: num, err
err = cudaGetDeviceCount(num)
if (err.ne.0) then
write(*,'(a)') "cudaGetDeviceCount failed"
write(*,'(a)') cudaGetErrorString(err)
stop
else if (num.eq.0) then
write(*,'(a17)') "No devices found."
stop
else if (dev.ge.num) then
write(*,'(a21)') "Invalid device index."
stop
else
err = cudaSetDevice(dev)
if (err.ne.0) then
write(*,'(a)') "cudaSetDevice failed"
write(*,'(a)') cudaGetErrorString(err)
stop
end if
end if
end subroutine set_device
subroutine alloc(array_size)
implicit none
integer(kind=StreamIntKind) :: array_size
integer :: err
N = array_size
allocate( A(1:N), B(1:N), C(1:N), stat=err)
if (err .ne. 0) then
write(*,'(a20,i3)') 'allocate returned ',err
stop 1
endif
! move to separate subroutine later
tblock = dim3(128,1,1)
grid = dim3(ceiling(real(N)/tblock%x),1,1)
end subroutine alloc
subroutine dealloc()
implicit none
integer :: err
deallocate( A, B, C, stat=err)
if (err .ne. 0) then
write(*,'(a20,i3)') 'deallocate returned ',err
stop 1
endif
end subroutine dealloc
subroutine init_arrays(initA, initB, initC)
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
implicit none
real(kind=REAL64), intent(in) :: initA, initB, initC
integer(kind=StreamIntKind) :: i
integer :: err
A = initA
B = initB
C = initC
err = cudaDeviceSynchronize()
if (err.ne.0) then
write(*,'(a)') "cudaDeviceSynchronize failed"
write(*,'(a)') cudaGetErrorString(err)
stop
endif
end subroutine init_arrays
subroutine read_arrays(h_A, h_B, h_C)
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
implicit none
real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
integer(kind=StreamIntKind) :: i
integer :: err
h_A = A
h_B = B
h_C = C
err = cudaDeviceSynchronize()
if (err.ne.0) then
write(*,'(a)') "cudaDeviceSynchronize failed"
write(*,'(a)') cudaGetErrorString(err)
stop
endif
end subroutine read_arrays
subroutine copy()
use CUDAFortranKernels, only: do_copy
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
implicit none
integer :: err
call do_copy<<<grid, tblock>>>(N, A, C)
err = cudaDeviceSynchronize()
if (err.ne.0) then
write(*,'(a)') "cudaDeviceSynchronize failed"
write(*,'(a)') cudaGetErrorString(err)
stop
endif
end subroutine copy
subroutine add()
use CUDAFortranKernels, only: do_add
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
implicit none
integer :: err
call do_add<<<grid, tblock>>>(N, A, B, C)
err = cudaDeviceSynchronize()
if (err.ne.0) then
write(*,'(a)') "cudaDeviceSynchronize failed"
write(*,'(a)') cudaGetErrorString(err)
stop
endif
end subroutine add
subroutine mul(startScalar)
use CUDAFortranKernels, only: do_mul
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer :: err
scalar = startScalar
call do_mul<<<grid, tblock>>>(N, scalar, B, C)
err = cudaDeviceSynchronize()
if (err.ne.0) then
write(*,'(a)') "cudaDeviceSynchronize failed"
write(*,'(a)') cudaGetErrorString(err)
stop
endif
end subroutine mul
subroutine triad(startScalar)
use CUDAFortranKernels, only: do_triad
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer :: err
scalar = startScalar
call do_triad<<<grid, tblock>>>(N, scalar, A, B, C)
err = cudaDeviceSynchronize()
if (err.ne.0) then
write(*,'(a)') "cudaDeviceSynchronize failed"
write(*,'(a)') cudaGetErrorString(err)
stop
endif
end subroutine triad
subroutine nstream(startScalar)
use CUDAFortranKernels, only: do_nstream
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer :: err
scalar = startScalar
call do_nstream<<<grid, tblock>>>(N, scalar, A, B, C)
err = cudaDeviceSynchronize()
if (err.ne.0) then
write(*,'(a)') "cudaDeviceSynchronize failed"
write(*,'(a)') cudaGetErrorString(err)
stop
endif
end subroutine nstream
function dot() result(r)
!use CUDAFortranKernels, only: do_dot
use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
implicit none
real(kind=REAL64) :: r
integer :: err
integer(kind=StreamIntKind) :: i
!call do_dot<<<grid, tblock>>>(N, B, C, r)
r = real(0,kind=REAL64)
!$cuf kernel do <<< *, * >>>
do i=1,N
r = r + A(i) * B(i)
end do
err = cudaDeviceSynchronize()
if (err.ne.0) then
write(*,'(a)') "cudaDeviceSynchronize failed"
write(*,'(a)') cudaGetErrorString(err)
stop
endif
end function dot
end module CUDAStream

View File

@ -0,0 +1,139 @@
module DoConcurrentStream
use, intrinsic :: ISO_Fortran_env
use BabelStreamTypes
implicit none
character(len=12), parameter :: implementation_name = "DoConcurrent"
integer(kind=StreamIntKind) :: N
#ifdef USE_DEVICE
real(kind=REAL64), allocatable, device :: A(:), B(:), C(:)
#else
real(kind=REAL64), allocatable :: A(:), B(:), C(:)
#endif
contains
subroutine list_devices()
implicit none
integer :: num
write(*,'(a36,a12)') "Listing devices is not supported by ", implementation_name
end subroutine list_devices
subroutine set_device(dev)
implicit none
integer, intent(in) :: dev
write(*,'(a32,a12)') "Device != 0 is not supported by ", implementation_name
end subroutine set_device
subroutine alloc(array_size)
implicit none
integer(kind=StreamIntKind) :: array_size
integer :: err
N = array_size
allocate( A(1:N), B(1:N), C(1:N), stat=err)
if (err .ne. 0) then
write(*,'(a20,i3)') 'allocate returned ',err
stop 1
endif
end subroutine alloc
subroutine dealloc()
implicit none
integer :: err
deallocate( A, B, C, stat=err)
if (err .ne. 0) then
write(*,'(a20,i3)') 'deallocate returned ',err
stop 1
endif
end subroutine dealloc
subroutine init_arrays(initA, initB, initC)
implicit none
real(kind=REAL64), intent(in) :: initA, initB, initC
integer(kind=StreamIntKind) :: i
do concurrent (i=1:N)
A(i) = initA
B(i) = initB
C(i) = initC
end do
end subroutine init_arrays
subroutine read_arrays(h_A, h_B, h_C)
implicit none
real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
integer(kind=StreamIntKind) :: i
do concurrent (i=1:N) !shared(A,B,C)
h_A(i) = A(i)
h_B(i) = B(i)
h_C(i) = C(i)
end do
end subroutine read_arrays
subroutine copy()
implicit none
integer(kind=StreamIntKind) :: i
do concurrent (i=1:N) !shared(A,C)
C(i) = A(i)
end do
end subroutine copy
subroutine add()
implicit none
integer(kind=StreamIntKind) :: i
do concurrent (i=1:N) !shared(A,B,C)
C(i) = A(i) + B(i)
end do
end subroutine add
subroutine mul(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
scalar = startScalar
do concurrent (i=1:N) !shared(B,C)
B(i) = scalar * C(i)
end do
end subroutine mul
subroutine triad(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
scalar = startScalar
do concurrent (i=1:N) !shared(A,B,C)
A(i) = B(i) + scalar * C(i)
end do
end subroutine triad
subroutine nstream(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
scalar = startScalar
do concurrent (i=1:N) !shared(A,B,C)
A(i) = A(i) + B(i) + scalar * C(i)
end do
end subroutine nstream
function dot() result(s)
implicit none
real(kind=REAL64) :: s
integer(kind=StreamIntKind) :: i
! reduction omitted because NVF infers it and other compilers do not support
s = real(0,kind=REAL64)
#ifdef CRAY_THREAD_DOCONCURRENT
do i=1,N
#else
do concurrent (i=1:N) !shared(A,B)
#endif
s = s + A(i) * B(i)
end do
end function dot
end module DoConcurrentStream

109
src/fortran/Makefile Normal file
View File

@ -0,0 +1,109 @@
ifeq ($(COMPILER),nvhpc)
include make.inc.nvhpc
else ifeq ($(COMPILER),oneapi)
include make.inc.oneapi
else ifeq ($(COMPILER),gcc)
include make.inc.gcc
else ifeq ($(COMPILER),amd)
include make.inc.amd
else ifeq ($(COMPILER),arm)
include make.inc.arm
else ifeq ($(COMPILER),cray)
include make.inc.cray
else ifeq ($(COMPILER),fj)
include make.inc.fj
else
$(info Set COMPILER={nvhpc,oneapi,amd,arm,cray,fj,gcc}. Default is gcc.)
include make.inc.gcc
COMPILER=gcc
endif
FCFLAGS += -DVERSION_STRING="4.0"
#FCFLAGS += -DUSE_INT32
ifeq ($(IMPLEMENTATION),DoConcurrent)
FCFLAGS += -DUSE_DOCONCURRENT $(DOCONCURRENT_FLAG)
IMPLEMENTATION_OBJECT = DoConcurrentStream.o
else ifeq ($(IMPLEMENTATION),Array)
FCFLAGS += -DUSE_ARRAY $(ARRAY_FLAG)
IMPLEMENTATION_OBJECT = ArrayStream.o
else ifeq ($(IMPLEMENTATION),OpenMP)
FCFLAGS += -DUSE_OPENMP $(OPENMP_FLAG)
IMPLEMENTATION_OBJECT = OpenMPStream.o
else ifeq ($(IMPLEMENTATION),OpenMPWorkshare)
FCFLAGS += -DUSE_OPENMPWORKSHARE $(OPENMP_FLAG)
IMPLEMENTATION_OBJECT = OpenMPWorkshareStream.o
else ifeq ($(IMPLEMENTATION),OpenMPTarget)
FCFLAGS += -DUSE_OPENMPTARGET $(OPENMP_FLAG)
IMPLEMENTATION_OBJECT = OpenMPTargetStream.o
else ifeq ($(IMPLEMENTATION),OpenMPTargetLoop)
FCFLAGS += -DUSE_OPENMPTARGETLOOP $(OPENMP_FLAG)
IMPLEMENTATION_OBJECT = OpenMPTargetLoopStream.o
else ifeq ($(IMPLEMENTATION),OpenMPTaskloop)
FCFLAGS += -DUSE_OPENMPTASKLOOP $(OPENMP_FLAG)
IMPLEMENTATION_OBJECT = OpenMPTaskloopStream.o
else ifeq ($(IMPLEMENTATION),OpenACC)
FCFLAGS += -DUSE_OPENACC $(OPENACC_FLAG)
IMPLEMENTATION_OBJECT = OpenACCStream.o
else ifeq ($(IMPLEMENTATION),OpenACCArray)
FCFLAGS += -DUSE_OPENACCARRAY $(OPENACC_FLAG)
IMPLEMENTATION_OBJECT = OpenACCArrayStream.o
else ifeq ($(IMPLEMENTATION),CUDA)
FCFLAGS += -DUSE_CUDA $(CUDA_FLAG)
IMPLEMENTATION_OBJECT = CUDAStream.o
else ifeq ($(IMPLEMENTATION),CUDAKernel)
FCFLAGS += -DUSE_CUDAKERNEL $(CUDA_FLAG)
IMPLEMENTATION_OBJECT = CUDAKernelStream.o
else ifeq ($(IMPLEMENTATION),Sequential)
FCFLAGS += -DUSE_SEQUENTIAL $(SEQUENTIAL_FLAG)
IMPLEMENTATION_OBJECT = SequentialStream.o
else
$(info Set IMPLEMENTATION={DoConcurrent,Array,OpenMP,OpenMPWorkshare,OpenMPTarget,OpenMPTargetLoop,OpenMPTaskloop,OpenACC,OpenACCArray,CUDA,CUDAKernel}.)
FCFLAGS += -DUSE_SEQUENTIAL $(SEQUENTIAL_FLAG)
IMPLEMENTATION=Sequential
IMPLEMENTATION_OBJECT = SequentialStream.o
endif
all: BabelStream.$(COMPILER).$(IMPLEMENTATION)
BabelStream.$(COMPILER).$(IMPLEMENTATION): main.F90 $(IMPLEMENTATION_OBJECT)
$(FC) $(FCFLAGS) $^ BabelStreamTypes.o -o $@
BabelStreamTypes.o BabelStreamTypes.mod: BabelStreamTypes.F90
$(FC) $(FCFLAGS) -c $<
%.o: %.F90 BabelStreamTypes.mod
$(FC) $(FCFLAGS) -c $<
clean:
-rm -f main.o BabelStreamUtil.mod babelstreamutil.mod
-rm -f BabelStreamTypes.o BabelStreamTypes.mod babelstreamtypes.mod
-rm -f DoConcurrentStream.o DoConcurrentStream.mod doconcurrentstream.mod
-rm -f ArrayStream.o ArrayStream.mod arraystream.mod
-rm -f SequentialStream.o SequentialStream.mod sequentialstream.mod
-rm -f OpenMPStream.o OpenMPStream.mod openmpstream.mod
-rm -f OpenMPWorkshareStream.o OpenMPWorkshareStream.mod openmpworksharestream.mod
-rm -f OpenMPTaskloopStream.o OpenMPTaskloopStream.mod openmptaskloopstream.mod
-rm -f OpenMPTargetStream.o OpenMPTargetStream.mod openmptargetstream.mod
-rm -f OpenMPTargetLoopStream.o OpenMPTargetLoopStream.mod openmptargetloopstream.mod
-rm -f OpenACCStream.o OpenACCStream.mod openaccstream.mod
-rm -f OpenACCArrayStream.o OpenACCArrayStream.mod openaccarraystream.mod
-rm -f CUDAStream.o CUDAStream.mod cudastream.mod CUDAFortranKernels.mod cudafortrankernels.mod
-rm -f CUDAKernelStream.o CUDAKernelStream.mod cudakernelstream.mod
-rm -f *.modmic *.mod *.o *.cub *.ptx
realclean: clean
-rm -f BabelStream.*

View File

@ -0,0 +1,144 @@
module OpenACCArrayStream
use, intrinsic :: ISO_Fortran_env
use BabelStreamTypes
implicit none
character(len=12), parameter :: implementation_name = "OpenACCArray"
integer(kind=StreamIntKind) :: N
real(kind=REAL64), allocatable :: A(:), B(:), C(:)
contains
subroutine list_devices()
use openacc
implicit none
integer :: num
num = acc_get_num_devices(acc_get_device_type())
if (num.eq.0) then
write(*,'(a17)') "No devices found."
else
write(*,'(a10,i1,a8)') "There are ",num," devices."
end if
end subroutine list_devices
subroutine set_device(dev)
use openacc
implicit none
integer, intent(in) :: dev
integer :: num
num = acc_get_num_devices(acc_get_device_type())
if (num.eq.0) then
write(*,'(a17)') "No devices found."
stop
else if (dev.gt.num) then
write(*,'(a21)') "Invalid device index."
stop
else
call acc_set_device_num(dev, acc_get_device_type())
end if
end subroutine set_device
subroutine alloc(array_size)
implicit none
integer(kind=StreamIntKind) :: array_size
integer :: err
N = array_size
allocate( A(1:N), B(1:N), C(1:N), stat=err)
#ifndef USE_MANAGED
!$acc enter data create(A,B,C)
#endif
if (err .ne. 0) then
write(*,'(a20,i3)') 'allocate returned ',err
stop 1
endif
end subroutine alloc
subroutine dealloc()
implicit none
integer :: err
#ifndef USE_MANAGED
!$acc exit data delete(A,B,C)
#endif
deallocate( A, B, C, stat=err)
if (err .ne. 0) then
write(*,'(a20,i3)') 'deallocate returned ',err
stop 1
endif
end subroutine dealloc
subroutine init_arrays(initA, initB, initC)
implicit none
real(kind=REAL64), intent(in) :: initA, initB, initC
!$acc kernels
A = initA
B = initB
C = initC
!$acc end kernels
end subroutine init_arrays
subroutine read_arrays(h_A, h_B, h_C)
implicit none
real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
!$acc kernels
h_A = A
h_B = B
h_C = C
!$acc end kernels
end subroutine read_arrays
subroutine copy()
implicit none
!$acc kernels
C = A
!$acc end kernels
end subroutine copy
subroutine add()
implicit none
!$acc kernels
C = A + B
!$acc end kernels
end subroutine add
subroutine mul(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
scalar = startScalar
!$acc kernels
B = scalar * C
!$acc end kernels
end subroutine mul
subroutine triad(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
scalar = startScalar
!$acc kernels
A = B + scalar * C
!$acc end kernels
end subroutine triad
subroutine nstream(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
scalar = startScalar
!$acc kernels
A = A + B + scalar * C
!$acc end kernels
end subroutine nstream
function dot() result(s)
implicit none
real(kind=REAL64) :: s
!$acc kernels
s = dot_product(A,B)
!$acc end kernels
end function dot
end module OpenACCArrayStream

View File

@ -0,0 +1,161 @@
module OpenACCStream
use, intrinsic :: ISO_Fortran_env
use BabelStreamTypes
implicit none
character(len=7), parameter :: implementation_name = "OpenACC"
integer(kind=StreamIntKind) :: N
real(kind=REAL64), allocatable :: A(:), B(:), C(:)
contains
subroutine list_devices()
use openacc
implicit none
integer :: num
num = acc_get_num_devices(acc_get_device_type())
if (num.eq.0) then
write(*,'(a17)') "No devices found."
else
write(*,'(a10,i1,a8)') "There are ",num," devices."
end if
end subroutine list_devices
subroutine set_device(dev)
use openacc
implicit none
integer, intent(in) :: dev
integer :: num
num = acc_get_num_devices(acc_get_device_type())
if (num.eq.0) then
write(*,'(a17)') "No devices found."
stop
else if (dev.gt.num) then
write(*,'(a21)') "Invalid device index."
stop
else
call acc_set_device_num(dev, acc_get_device_type())
end if
end subroutine set_device
subroutine alloc(array_size)
implicit none
integer(kind=StreamIntKind) :: array_size
integer :: err
N = array_size
allocate( A(1:N), B(1:N), C(1:N), stat=err)
#ifndef USE_MANAGED
!$acc enter data create(A,B,C)
#endif
if (err .ne. 0) then
write(*,'(a20,i3)') 'allocate returned ',err
stop 1
endif
end subroutine alloc
subroutine dealloc()
implicit none
integer :: err
#ifndef USE_MANAGED
!$acc exit data delete(A,B,C)
#endif
deallocate( A, B, C, stat=err)
if (err .ne. 0) then
write(*,'(a20,i3)') 'deallocate returned ',err
stop 1
endif
end subroutine dealloc
subroutine init_arrays(initA, initB, initC)
implicit none
real(kind=REAL64), intent(in) :: initA, initB, initC
integer(kind=StreamIntKind) :: i
!$acc parallel loop
do i=1,N
A(i) = initA
B(i) = initB
C(i) = initC
end do
end subroutine init_arrays
subroutine read_arrays(h_A, h_B, h_C)
implicit none
real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
integer(kind=StreamIntKind) :: i
!$acc parallel loop
do i=1,N
h_A(i) = A(i)
h_B(i) = B(i)
h_C(i) = C(i)
end do
end subroutine read_arrays
subroutine copy()
implicit none
integer(kind=StreamIntKind) :: i
!$acc parallel loop
do i=1,N
C(i) = A(i)
end do
end subroutine copy
subroutine add()
implicit none
integer(kind=StreamIntKind) :: i
!$acc parallel loop
do i=1,N
C(i) = A(i) + B(i)
end do
end subroutine add
subroutine mul(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
scalar = startScalar
!$acc parallel loop
do i=1,N
B(i) = scalar * C(i)
end do
end subroutine mul
subroutine triad(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
scalar = startScalar
!$acc parallel loop
do i=1,N
A(i) = B(i) + scalar * C(i)
end do
end subroutine triad
subroutine nstream(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
scalar = startScalar
!$acc parallel loop
do i=1,N
A(i) = A(i) + B(i) + scalar * C(i)
end do
end subroutine nstream
function dot() result(s)
implicit none
real(kind=REAL64) :: s
integer(kind=StreamIntKind) :: i
s = real(0,kind=REAL64)
!$acc parallel loop reduction(+:s)
do i=1,N
s = s + A(i) * B(i)
end do
end function dot
end module OpenACCStream

View File

@ -0,0 +1,137 @@
module OpenMPStream
use, intrinsic :: ISO_Fortran_env
use BabelStreamTypes
implicit none
character(len=6), parameter :: implementation_name = "OpenMP"
integer(kind=StreamIntKind) :: N
real(kind=REAL64), allocatable :: A(:), B(:), C(:)
contains
subroutine list_devices()
implicit none
write(*,'(a36,a12)') "Listing devices is not supported by ", implementation_name
end subroutine list_devices
subroutine set_device(dev)
implicit none
integer, intent(in) :: dev
write(*,'(a32,a12)') "Device != 0 is not supported by ", implementation_name
end subroutine set_device
subroutine alloc(array_size)
implicit none
integer(kind=StreamIntKind) :: array_size
integer :: err
N = array_size
allocate( A(1:N), B(1:N), C(1:N), stat=err)
if (err .ne. 0) then
write(*,'(a20,i3)') 'allocate returned ',err
stop 1
endif
end subroutine alloc
subroutine dealloc()
implicit none
integer :: err
deallocate( A, B, C, stat=err)
if (err .ne. 0) then
write(*,'(a20,i3)') 'deallocate returned ',err
stop 1
endif
end subroutine dealloc
subroutine init_arrays(initA, initB, initC)
implicit none
real(kind=REAL64), intent(in) :: initA, initB, initC
integer(kind=StreamIntKind) :: i
!$omp parallel do simd
do i=1,N
A(i) = initA
B(i) = initB
C(i) = initC
end do
end subroutine init_arrays
subroutine read_arrays(h_A, h_B, h_C)
implicit none
real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
integer(kind=StreamIntKind) :: i
!$omp parallel do simd
do i=1,N
h_A(i) = A(i)
h_B(i) = B(i)
h_C(i) = C(i)
end do
end subroutine read_arrays
subroutine copy()
implicit none
integer(kind=StreamIntKind) :: i
!$omp parallel do simd
do i=1,N
C(i) = A(i)
end do
end subroutine copy
subroutine add()
implicit none
integer(kind=StreamIntKind) :: i
!$omp parallel do simd
do i=1,N
C(i) = A(i) + B(i)
end do
end subroutine add
subroutine mul(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
scalar = startScalar
!$omp parallel do simd
do i=1,N
B(i) = scalar * C(i)
end do
end subroutine mul
subroutine triad(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
scalar = startScalar
!$omp parallel do simd
do i=1,N
A(i) = B(i) + scalar * C(i)
end do
end subroutine triad
subroutine nstream(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
scalar = startScalar
!$omp parallel do simd
do i=1,N
A(i) = A(i) + B(i) + scalar * C(i)
end do
end subroutine nstream
function dot() result(s)
implicit none
real(kind=REAL64) :: s
integer(kind=StreamIntKind) :: i
s = real(0,kind=REAL64)
!$omp parallel do simd reduction(+:s)
do i=1,N
s = s + A(i) * B(i)
end do
end function dot
end module OpenMPStream

View File

@ -0,0 +1,162 @@
module OpenMPTargetLoopStream
use, intrinsic :: ISO_Fortran_env
use BabelStreamTypes
implicit none
character(len=16), parameter :: implementation_name = "OpenMPTargetLoop"
integer(kind=StreamIntKind) :: N
real(kind=REAL64), allocatable :: A(:), B(:), C(:)
contains
subroutine list_devices()
use omp_lib
implicit none
integer :: num
num = omp_get_num_devices()
if (num.eq.0) then
write(*,'(a17)') "No devices found."
else
write(*,'(a10,i1,a8)') "There are ",num," devices."
end if
end subroutine list_devices
subroutine set_device(dev)
use omp_lib
implicit none
integer, intent(in) :: dev
integer :: num
num = omp_get_num_devices()
if (num.eq.0) then
write(*,'(a17)') "No devices found."
stop
else if (dev.gt.num) then
write(*,'(a21)') "Invalid device index."
stop
else
call omp_set_default_device(dev)
end if
end subroutine set_device
subroutine alloc(array_size)
implicit none
integer(kind=StreamIntKind) :: array_size
integer :: err
N = array_size
allocate( A(1:N), B(1:N), C(1:N), stat=err)
#ifndef USE_MANAGED
!$omp target enter data map(alloc: A,B,C)
#endif
if (err .ne. 0) then
write(*,'(a20,i3)') 'allocate returned ',err
stop 1
endif
end subroutine alloc
subroutine dealloc()
implicit none
integer :: err
#ifndef USE_MANAGED
!$omp target exit data map(delete: A,B,C)
#endif
deallocate( A, B, C, stat=err)
if (err .ne. 0) then
write(*,'(a20,i3)') 'deallocate returned ',err
stop 1
endif
end subroutine dealloc
subroutine init_arrays(initA, initB, initC)
implicit none
real(kind=REAL64), intent(in) :: initA, initB, initC
integer(kind=StreamIntKind) :: i
!$omp target teams loop
do i=1,N
A(i) = initA
B(i) = initB
C(i) = initC
end do
end subroutine init_arrays
subroutine read_arrays(h_A, h_B, h_C)
implicit none
real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
integer(kind=StreamIntKind) :: i
! this might need to use a copy API instead...
!$omp target teams loop
do i=1,N
h_A(i) = A(i)
h_B(i) = B(i)
h_C(i) = C(i)
end do
end subroutine read_arrays
subroutine copy()
implicit none
integer(kind=StreamIntKind) :: i
!$omp target teams loop
do i=1,N
C(i) = A(i)
end do
end subroutine copy
subroutine add()
implicit none
integer(kind=StreamIntKind) :: i
!$omp target teams loop
do i=1,N
C(i) = A(i) + B(i)
end do
end subroutine add
subroutine mul(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
scalar = startScalar
!$omp target teams loop
do i=1,N
B(i) = scalar * C(i)
end do
end subroutine mul
subroutine triad(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
scalar = startScalar
!$omp target teams loop
do i=1,N
A(i) = B(i) + scalar * C(i)
end do
end subroutine triad
subroutine nstream(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
scalar = startScalar
!$omp target teams loop
do i=1,N
A(i) = A(i) + B(i) + scalar * C(i)
end do
end subroutine nstream
function dot() result(s)
implicit none
real(kind=REAL64) :: s
integer(kind=StreamIntKind) :: i
s = real(0,kind=REAL64)
!$omp target teams loop reduction(+:s)
do i=1,N
s = s + A(i) * B(i)
end do
end function dot
end module OpenMPTargetLoopStream

View File

@ -0,0 +1,163 @@
module OpenMPTargetStream
use, intrinsic :: ISO_Fortran_env
use BabelStreamTypes
implicit none
character(len=12), parameter :: implementation_name = "OpenMPTarget"
integer(kind=StreamIntKind) :: N
real(kind=REAL64), allocatable :: A(:), B(:), C(:)
contains
subroutine list_devices()
use omp_lib
implicit none
integer :: num
num = omp_get_num_devices()
if (num.eq.0) then
write(*,'(a17)') "No devices found."
else
write(*,'(a10,i1,a8)') "There are ",num," devices."
end if
end subroutine list_devices
subroutine set_device(dev)
use omp_lib
implicit none
integer, intent(in) :: dev
integer :: num
num = omp_get_num_devices()
if (num.eq.0) then
write(*,'(a17)') "No devices found."
stop
else if (dev.gt.num) then
write(*,'(a21)') "Invalid device index."
stop
else
call omp_set_default_device(dev)
end if
end subroutine set_device
subroutine alloc(array_size)
implicit none
integer(kind=StreamIntKind) :: array_size
integer :: err
N = array_size
allocate( A(1:N), B(1:N), C(1:N), stat=err)
#ifndef USE_MANAGED
!$omp target enter data map(alloc: A,B,C)
#endif
if (err .ne. 0) then
write(*,'(a20,i3)') 'allocate returned ',err
stop 1
endif
end subroutine alloc
subroutine dealloc()
implicit none
integer :: err
#ifndef USE_MANAGED
!$omp target exit data map(delete: A,B,C)
#endif
deallocate( A, B, C, stat=err)
if (err .ne. 0) then
write(*,'(a20,i3)') 'deallocate returned ',err
stop 1
endif
end subroutine dealloc
subroutine init_arrays(initA, initB, initC)
implicit none
real(kind=REAL64), intent(in) :: initA, initB, initC
integer(kind=StreamIntKind) :: i
!$omp target teams distribute parallel do simd
do i=1,N
A(i) = initA
B(i) = initB
C(i) = initC
end do
end subroutine init_arrays
subroutine read_arrays(h_A, h_B, h_C)
implicit none
real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
integer(kind=StreamIntKind) :: i
! this might need to use a copy API instead...
!$omp target teams distribute parallel do simd
do i=1,N
h_A(i) = A(i)
h_B(i) = B(i)
h_C(i) = C(i)
end do
end subroutine read_arrays
subroutine copy()
implicit none
integer(kind=StreamIntKind) :: i
!$omp target teams distribute parallel do simd
do i=1,N
C(i) = A(i)
end do
!$omp barrier
end subroutine copy
subroutine add()
implicit none
integer(kind=StreamIntKind) :: i
!$omp target teams distribute parallel do simd
do i=1,N
C(i) = A(i) + B(i)
end do
end subroutine add
subroutine mul(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
scalar = startScalar
!$omp target teams distribute parallel do simd
do i=1,N
B(i) = scalar * C(i)
end do
end subroutine mul
subroutine triad(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
scalar = startScalar
!$omp target teams distribute parallel do simd
do i=1,N
A(i) = B(i) + scalar * C(i)
end do
end subroutine triad
subroutine nstream(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
scalar = startScalar
!$omp target teams distribute parallel do simd
do i=1,N
A(i) = A(i) + B(i) + scalar * C(i)
end do
end subroutine nstream
function dot() result(s)
implicit none
real(kind=REAL64) :: s
integer(kind=StreamIntKind) :: i
s = real(0,kind=REAL64)
!$omp target teams distribute parallel do simd reduction(+:s)
do i=1,N
s = s + A(i) * B(i)
end do
end function dot
end module OpenMPTargetStream

View File

@ -0,0 +1,169 @@
module OpenMPTaskloopStream
use, intrinsic :: ISO_Fortran_env
use BabelStreamTypes
implicit none
character(len=14), parameter :: implementation_name = "OpenMPTaskloop"
integer(kind=StreamIntKind) :: N
real(kind=REAL64), allocatable :: A(:), B(:), C(:)
contains
subroutine list_devices()
implicit none
write(*,'(a36,a12)') "Listing devices is not supported by ", implementation_name
end subroutine list_devices
subroutine set_device(dev)
implicit none
integer, intent(in) :: dev
write(*,'(a32,a12)') "Device != 0 is not supported by ", implementation_name
end subroutine set_device
subroutine alloc(array_size)
implicit none
integer(kind=StreamIntKind) :: array_size
integer :: err
N = array_size
allocate( A(1:N), B(1:N), C(1:N), stat=err)
if (err .ne. 0) then
write(*,'(a20,i3)') 'allocate returned ',err
stop 1
endif
end subroutine alloc
subroutine dealloc()
implicit none
integer :: err
deallocate( A, B, C, stat=err)
if (err .ne. 0) then
write(*,'(a20,i3)') 'deallocate returned ',err
stop 1
endif
end subroutine dealloc
subroutine init_arrays(initA, initB, initC)
implicit none
real(kind=REAL64), intent(in) :: initA, initB, initC
integer(kind=StreamIntKind) :: i
!$omp parallel
!$omp master
!$omp taskloop
do i=1,N
A(i) = initA
B(i) = initB
C(i) = initC
end do
!$omp end master
!$omp end parallel
end subroutine init_arrays
subroutine read_arrays(h_A, h_B, h_C)
implicit none
real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
integer(kind=StreamIntKind) :: i
!$omp parallel
!$omp master
!$omp taskloop
do i=1,N
h_A(i) = A(i)
h_B(i) = B(i)
h_C(i) = C(i)
end do
!$omp end master
!$omp end parallel
end subroutine read_arrays
subroutine copy()
implicit none
integer(kind=StreamIntKind) :: i
!$omp parallel
!$omp master
!$omp taskloop
do i=1,N
C(i) = A(i)
end do
!$omp end master
!$omp end parallel
end subroutine copy
subroutine add()
implicit none
integer(kind=StreamIntKind) :: i
!$omp parallel
!$omp master
!$omp taskloop
do i=1,N
C(i) = A(i) + B(i)
end do
!$omp end master
!$omp end parallel
end subroutine add
subroutine mul(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
scalar = startScalar
!$omp parallel
!$omp master
!$omp taskloop
do i=1,N
B(i) = scalar * C(i)
end do
!$omp end master
!$omp end parallel
end subroutine mul
subroutine triad(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
scalar = startScalar
!$omp parallel
!$omp master
!$omp taskloop
do i=1,N
A(i) = B(i) + scalar * C(i)
end do
!$omp end master
!$omp end parallel
end subroutine triad
subroutine nstream(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
scalar = startScalar
!$omp parallel
!$omp master
!$omp taskloop
do i=1,N
A(i) = A(i) + B(i) + scalar * C(i)
end do
!$omp end master
!$omp end parallel
end subroutine nstream
function dot() result(s)
implicit none
real(kind=REAL64) :: s
integer(kind=StreamIntKind) :: i
s = real(0,kind=REAL64)
!$omp parallel
!$omp master
!$omp taskloop reduction(+:s)
do i=1,N
s = s + A(i) * B(i)
end do
!$omp end master
!$omp end parallel
end function dot
end module OpenMPTaskloopStream

View File

@ -0,0 +1,120 @@
module OpenMPWorkshareStream
use, intrinsic :: ISO_Fortran_env
use BabelStreamTypes
implicit none
character(len=15), parameter :: implementation_name = "OpenMPWorkshare"
integer(kind=StreamIntKind) :: N
real(kind=REAL64), allocatable :: A(:), B(:), C(:)
contains
subroutine list_devices()
implicit none
write(*,'(a36,a12)') "Listing devices is not supported by ", implementation_name
end subroutine list_devices
subroutine set_device(dev)
implicit none
integer, intent(in) :: dev
write(*,'(a32,a12)') "Device != 0 is not supported by ", implementation_name
end subroutine set_device
subroutine alloc(array_size)
implicit none
integer(kind=StreamIntKind) :: array_size
integer :: err
N = array_size
allocate( A(1:N), B(1:N), C(1:N), stat=err)
if (err .ne. 0) then
write(*,'(a20,i3)') 'allocate returned ',err
stop 1
endif
end subroutine alloc
subroutine dealloc()
implicit none
integer :: err
deallocate( A, B, C, stat=err)
if (err .ne. 0) then
write(*,'(a20,i3)') 'deallocate returned ',err
stop 1
endif
end subroutine dealloc
subroutine init_arrays(initA, initB, initC)
implicit none
real(kind=REAL64), intent(in) :: initA, initB, initC
!$omp parallel workshare
A = initA
B = initB
C = initC
!$omp end parallel workshare
end subroutine init_arrays
subroutine read_arrays(h_A, h_B, h_C)
implicit none
real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
!$omp parallel workshare
h_A = A
h_B = B
h_C = C
!$omp end parallel workshare
end subroutine read_arrays
subroutine copy()
implicit none
!$omp parallel workshare
C = A
!$omp end parallel workshare
end subroutine copy
subroutine add()
implicit none
!$omp parallel workshare
C = A + B
!$omp end parallel workshare
end subroutine add
subroutine mul(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
scalar = startScalar
!$omp parallel workshare
B = scalar * C
!$omp end parallel workshare
end subroutine mul
subroutine triad(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
scalar = startScalar
!$omp parallel workshare
A = B + scalar * C
!$omp end parallel workshare
end subroutine triad
subroutine nstream(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
scalar = startScalar
!$omp parallel workshare
A = A + B + scalar * C
!$omp end parallel workshare
end subroutine nstream
function dot() result(s)
implicit none
real(kind=REAL64) :: s
!$omp parallel workshare
s = dot_product(A,B)
!$omp end parallel workshare
end function dot
end module OpenMPWorkshareStream

View File

@ -0,0 +1,130 @@
module SequentialStream
use, intrinsic :: ISO_Fortran_env
use BabelStreamTypes
implicit none
character(len=10), parameter :: implementation_name = "Sequential"
integer(kind=StreamIntKind) :: N
real(kind=REAL64), allocatable :: A(:), B(:), C(:)
contains
subroutine list_devices()
implicit none
integer :: num
write(*,'(a36,a10)') "Listing devices is not supported by ", implementation_name
end subroutine list_devices
subroutine set_device(dev)
implicit none
integer, intent(in) :: dev
write(*,'(a32,a10)') "Device != 0 is not supported by ", implementation_name
end subroutine set_device
subroutine alloc(array_size)
implicit none
integer(kind=StreamIntKind) :: array_size
integer :: err
N = array_size
allocate( A(1:N), B(1:N), C(1:N), stat=err)
if (err .ne. 0) then
write(*,'(a20,i3)') 'allocate returned ',err
stop 1
endif
end subroutine alloc
subroutine dealloc()
implicit none
integer :: err
deallocate( A, B, C, stat=err)
if (err .ne. 0) then
write(*,'(a20,i3)') 'deallocate returned ',err
stop 1
endif
end subroutine dealloc
subroutine init_arrays(initA, initB, initC)
implicit none
real(kind=REAL64), intent(in) :: initA, initB, initC
integer(kind=StreamIntKind) :: i
do i=1,N
A(i) = initA
B(i) = initB
C(i) = initC
end do
end subroutine init_arrays
subroutine read_arrays(h_A, h_B, h_C)
implicit none
real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
integer(kind=StreamIntKind) :: i
do i=1,N
h_A(i) = A(i)
h_B(i) = B(i)
h_C(i) = C(i)
end do
end subroutine read_arrays
subroutine copy()
implicit none
integer(kind=StreamIntKind) :: i
do i=1,N
C(i) = A(i)
end do
end subroutine copy
subroutine add()
implicit none
integer(kind=StreamIntKind) :: i
do i=1,N
C(i) = A(i) + B(i)
end do
end subroutine add
subroutine mul(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
scalar = startScalar
do i=1,N
B(i) = scalar * C(i)
end do
end subroutine mul
subroutine triad(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
scalar = startScalar
do i=1,N
A(i) = B(i) + scalar * C(i)
end do
end subroutine triad
subroutine nstream(startScalar)
implicit none
real(kind=REAL64), intent(in) :: startScalar
real(kind=REAL64) :: scalar
integer(kind=StreamIntKind) :: i
scalar = startScalar
do i=1,N
A(i) = A(i) + B(i) + scalar * C(i)
end do
end subroutine nstream
function dot() result(s)
implicit none
real(kind=REAL64) :: s
integer(kind=StreamIntKind) :: i
s = real(0,kind=REAL64)
do i=1,N
s = s + A(i) * B(i)
end do
end function dot
end module SequentialStream

54
src/fortran/build.sh Executable file
View File

@ -0,0 +1,54 @@
#!/bin/bash
# uncomment to disable GPU targets
#HAS_GPU=0
# Orin
#if [ "x${compiler}" == "xgcc" ] ; then
# export MCPU=cortex-a78ae
#fi
#if [ "x${compiler}" == "xarm" ] ; then
# export MCPU=cortex-a78
#fi
COMPILERS="gcc"
if [ $(which nvfortran) ] ; then
COMPILERS="${COMPILERS} nvhpc"
fi
if [ $(which crayftn) ] ; then
COMPILERS="${COMPILERS} cray"
fi
if [ $(uname -m) == "aarch64" ] ; then
if [ $(which armflang) ] ; then
COMPILERS="${COMPILERS} arm"
fi
if [ $(which frt) ] ; then
COMPILERS="${COMPILERS} fj"
fi
elif [ $(uname -m) == "x86_64" ] ; then
if [ $(which lscpu >& /dev/null && lscpu | grep GenuineIntel | awk '{print $3}') == "GenuineIntel" ] ; then
COMPILERS="${COMPILERS} oneapi"
if [ -f /opt/intel/oneapi/setvars.sh ] ; then
. /opt/intel/oneapi/setvars.sh >& /dev/null
fi
else
# ^ this detection can be improved
COMPILERS="${COMPILERS} amd"
fi
fi
for compiler in ${COMPILERS} ; do
TARGETS="DoConcurrent Array OpenMP OpenMPTaskloop OpenMPWorkshare"
if [ "${HAS_GPU}" != "0" ] ; then
TARGETS="${TARGETS} OpenMPTarget OpenMPTargetLoop"
if [ "x${compiler}" == "xnvhpc" ] ; then
TARGETS="${TARGETS} CUDA CUDAKernel"
fi
fi
if [ "x${compiler}" == "xnvhpc" ] || [ "x${compiler}" == "xgcc" ] || [ "x${compiler}" == "xcray" ] ; then
TARGETS="${TARGETS} OpenACC OpenACCArray"
fi
for implementation in ${TARGETS} ; do
make COMPILER=${compiler} IMPLEMENTATION=${implementation}
done
done

666
src/fortran/main.F90 Normal file
View File

@ -0,0 +1,666 @@
module BabelStreamUtil
use, intrinsic :: ISO_Fortran_env, only: REAL64,INT64
use BabelStreamTypes
implicit none
integer(kind=StreamIntKind) :: array_size = 33554432
integer(kind=StreamIntKind) :: num_times = 100
logical :: mibibytes = .false.
logical :: use_gigs = .false.
logical :: csv = .false.
character(len=1), parameter :: csv_sep = ","
! 1 = All
! 2 = Triad
! 3 = Nstream
integer :: selection = 1
real(kind=REAL64), parameter :: startA = real(0.1d0,kind=REAL64)
real(kind=REAL64), parameter :: startB = real(0.2d0,kind=REAL64)
real(kind=REAL64), parameter :: startC = real(0.0d0,kind=REAL64)
real(kind=REAL64), parameter :: startScalar = real(0.4d0,kind=REAL64)
contains
function get_wtime() result(t)
#if defined(USE_OMP_GET_WTIME)
use omp_lib
implicit none
real(kind=REAL64) :: t
t = omp_get_wtime()
#elif defined(USE_CPU_TIME)
implicit none
real(kind=REAL64) :: t
real :: r
call cpu_time(r)
t = r
#else
implicit none
real(kind=REAL64) :: t
integer(kind=INT64) :: c, r
call system_clock(count = c, count_rate = r)
t = real(c,REAL64) / real(r,REAL64)
#endif
end function get_wtime
subroutine parseArguments()
use, intrinsic :: ISO_Fortran_env, only: compiler_version, compiler_options
#if defined(USE_DOCONCURRENT)
use DoConcurrentStream, only: list_devices, set_device
#elif defined(USE_ARRAY)
use ArrayStream, only: list_devices, set_device
#elif defined(USE_OPENMP)
use OpenMPStream, only: list_devices, set_device
#elif defined(USE_OPENMPWORKSHARE)
use OpenMPWorkshareStream, only: list_devices, set_device
#elif defined(USE_OPENMPTARGET)
use OpenMPTargetStream, only: list_devices, set_device
#elif defined(USE_OPENMPTARGETLOOP)
use OpenMPTargetLoopStream, only: list_devices, set_device
#elif defined(USE_OPENMPTASKLOOP)
use OpenMPTaskloopStream, only: list_devices, set_device
#elif defined(USE_OPENACC)
use OpenACCStream, only: list_devices, set_device
#elif defined(USE_OPENACCARRAY)
use OpenACCArrayStream, only: list_devices, set_device
#elif defined(USE_CUDA)
use CUDAStream, only: list_devices, set_device
#elif defined(USE_CUDAKERNEL)
use CUDAKernelStream, only: list_devices, set_device
#elif defined(USE_SEQUENTIAL)
use SequentialStream, only: list_devices, set_device
#endif
implicit none
integer :: i, argc
integer :: arglen,err,pos(2)
character(len=64) :: argtmp
argc = command_argument_count()
do i=1,argc
call get_command_argument(i,argtmp,arglen,err)
if (err.eq.0) then
!
! list devices
!
pos(1) = index(argtmp,"--list")
if (pos(1).eq.1) then
call list_devices()
stop
endif
!
! set device number
!
pos(1) = index(argtmp,"--device")
if (pos(1).eq.1) then
if (i+1.gt.argc) then
print*,'You failed to provide a value for ',argtmp
stop
else
call get_command_argument(i+1,argtmp,arglen,err)
block
integer :: dev
read(argtmp,'(i15)') dev
call set_device(dev)
end block
endif
cycle
endif
!
! array size
!
pos(1) = index(argtmp,"--arraysize")
pos(2) = index(argtmp,"-s")
if (any(pos(:).eq.1) ) then
if (i+1.gt.argc) then
print*,'You failed to provide a value for ',argtmp
else
call get_command_argument(i+1,argtmp,arglen,err)
block
integer(kind=INT64) :: big_size
read(argtmp,'(i15)') big_size
if (big_size .gt. HUGE(array_size)) then
print*,'Array size does not fit into integer:'
print*,big_size,'>',HUGE(array_size)
print*,'Stop using USE_INT32'
stop
else
array_size = INT(big_size,kind=StreamIntKind)
endif
end block
endif
cycle
endif
!
! number of iterations
!
pos(1) = index(argtmp,"--numtimes")
pos(2) = index(argtmp,"-n")
if (any(pos(:).eq.1) ) then
if (i+1.gt.argc) then
print*,'You failed to provide a value for ',argtmp
else
call get_command_argument(i+1,argtmp,arglen,err)
read(argtmp,'(i15)') num_times
if (num_times.lt.2) then
write(*,'(a)') "Number of times must be 2 or more"
stop
end if
endif
cycle
endif
!
! precision
!
pos(1) = index(argtmp,"--float")
if (pos(1).eq.1) then
write(*,'(a46,a39)') "Sorry, you have to recompile with -DUSE_FLOAT ", &
"to run BabelStream in single precision."
stop
endif
!
! selection (All, Triad, Nstream)
!
pos(1) = index(argtmp,"--triad-only")
if (pos(1).eq.1) then
selection = 2
cycle
endif
pos(1) = index(argtmp,"--nstream-only")
if (pos(1).eq.1) then
selection = 3
cycle
endif
!
! CSV
!
pos(1) = index(argtmp,"--csv")
if (pos(1).eq.1) then
csv = .true.
!write(*,'(a39)') "Sorry, CSV support isn't available yet."
!stop
endif
!
! units
!
pos(1) = index(argtmp,"--mibibytes")
if (pos(1).eq.1) then
mibibytes = .true.
cycle
endif
!
! giga/gibi instead of mega/mebi
!
pos(1) = index(argtmp,"--gigs")
if (pos(1).eq.1) then
use_gigs = .true.
cycle
endif
!
!
!
pos(1) = index(argtmp,"--compiler-info")
if (pos(1).eq.1) then
write(*,'(a)') 'Compiler version: ',compiler_version()
write(*,'(a)') 'Compiler options: ',compiler_options()
stop
endif
!
! help
!
pos(1) = index(argtmp,"--help")
pos(2) = index(argtmp,"-h")
if (any(pos(:).eq.1) ) then
call get_command_argument(0,argtmp,arglen,err)
write(*,'(a7,a,a10)') "Usage: ", trim(argtmp), " [OPTIONS]"
write(*,'(a)') "Options:"
write(*,'(a)') " -h --help Print the message"
write(*,'(a)') " --list List available devices"
write(*,'(a)') " --device INDEX Select device at INDEX"
write(*,'(a)') " -s --arraysize SIZE Use SIZE elements in the array"
write(*,'(a)') " -n --numtimes NUM Run the test NUM times (NUM >= 2)"
!write(*,'(a)') " --float Use floats (rather than doubles)"
write(*,'(a)') " --triad-only Only run triad"
write(*,'(a)') " --nstream-only Only run nstream"
write(*,'(a)') " --csv Output as csv table"
write(*,'(a)') " --mibibytes Use MiB=2^20 for bandwidth calculation (default MB=10^6)"
write(*,'(a)') " --gigs Use GiB=2^30 or GB=10^9 instead of MiB/MB"
write(*,'(a)') " --compiler-info Print information about compiler and flags, then exit."
stop
endif
end if
end do
end subroutine parseArguments
subroutine run_all(timings, summ)
#if defined(USE_DOCONCURRENT)
use DoConcurrentStream
#elif defined(USE_ARRAY)
use ArrayStream
#elif defined(USE_OPENMP)
use OpenMPStream
#elif defined(USE_OPENMPWORKSHARE)
use OpenMPWorkshareStream
#elif defined(USE_OPENMPTARGET)
use OpenMPTargetStream
#elif defined(USE_OPENMPTARGETLOOP)
use OpenMPTargetLoopStream
#elif defined(USE_OPENMPTASKLOOP)
use OpenMPTaskloopStream
#elif defined(USE_OPENACC)
use OpenACCStream
#elif defined(USE_OPENACCARRAY)
use OpenACCArrayStream
#elif defined(USE_CUDA)
use CUDAStream
#elif defined(USE_CUDAKERNEL)
use CUDAKernelStream
#elif defined(USE_SEQUENTIAL)
use SequentialStream
#endif
implicit none
real(kind=REAL64), intent(inout) :: timings(:,:)
real(kind=REAL64), intent(out) :: summ
real(kind=REAL64) :: t1, t2
integer(kind=StreamIntKind) :: i
do i=1,num_times
t1 = get_wtime()
call copy()
t2 = get_wtime()
timings(1,i) = t2-t1
t1 = get_wtime()
call mul(startScalar)
t2 = get_wtime()
timings(2,i) = t2-t1
t1 = get_wtime()
call add()
t2 = get_wtime()
timings(3,i) = t2-t1
t1 = get_wtime()
call triad(startScalar)
t2 = get_wtime()
timings(4,i) = t2-t1
t1 = get_wtime()
summ = dot()
t2 = get_wtime()
timings(5,i) = t2-t1
end do
end subroutine run_all
subroutine run_triad(timings)
#if defined(USE_DOCONCURRENT)
use DoConcurrentStream
#elif defined(USE_ARRAY)
use ArrayStream
#elif defined(USE_OPENMP)
use OpenMPStream
#elif defined(USE_OPENMPWORKSHARE)
use OpenMPWorkshareStream
#elif defined(USE_OPENMPTARGET)
use OpenMPTargetStream
#elif defined(USE_OPENMPTARGETLOOP)
use OpenMPTargetLoopStream
#elif defined(USE_OPENMPTASKLOOP)
use OpenMPTaskloopStream
#elif defined(USE_OPENACC)
use OpenACCStream
#elif defined(USE_OPENACCARRAY)
use OpenACCArrayStream
#elif defined(USE_CUDA)
use CUDAStream
#elif defined(USE_CUDAKERNEL)
use CUDAKernelStream
#elif defined(USE_SEQUENTIAL)
use SequentialStream
#endif
implicit none
real(kind=REAL64), intent(inout) :: timings(:,:)
real(kind=REAL64) :: t1, t2
integer(kind=StreamIntKind) :: i
do i=1,num_times
t1 = get_wtime()
call triad(startScalar)
t2 = get_wtime()
timings(1,i) = t2-t1
end do
end subroutine run_triad
subroutine run_nstream(timings)
#if defined(USE_DOCONCURRENT)
use DoConcurrentStream
#elif defined(USE_ARRAY)
use ArrayStream
#elif defined(USE_OPENMP)
use OpenMPStream
#elif defined(USE_OPENMPWORKSHARE)
use OpenMPWorkshareStream
#elif defined(USE_OPENMPTARGET)
use OpenMPTargetStream
#elif defined(USE_OPENMPTARGETLOOP)
use OpenMPTargetLoopStream
#elif defined(USE_OPENMPTASKLOOP)
use OpenMPTaskloopStream
#elif defined(USE_OPENACC)
use OpenACCStream
#elif defined(USE_OPENACCARRAY)
use OpenACCArrayStream
#elif defined(USE_CUDA)
use CUDAStream
#elif defined(USE_CUDAKERNEL)
use CUDAKernelStream
#elif defined(USE_SEQUENTIAL)
use SequentialStream
#endif
implicit none
real(kind=REAL64), intent(inout) :: timings(:,:)
real(kind=REAL64) :: t1, t2
integer(kind=StreamIntKind) :: i
do i=1,num_times
t1 = get_wtime()
call nstream(startScalar)
t2 = get_wtime()
timings(1,i) = t2-t1
end do
end subroutine run_nstream
subroutine check_solution(A, B, C, summ)
use, intrinsic :: IEEE_Arithmetic, only: IEEE_Is_Normal
implicit none
real(kind=REAL64), intent(in) :: A(:), B(:), C(:)
real(kind=REAL64), intent(in) :: summ
integer(kind=StreamIntKind) :: i
real(kind=REAL64) :: goldA, goldB, goldC, goldSum
real(kind=REAL64) :: scalar
! always use double because of accumulation error
real(kind=REAL64) :: errA, errB, errC, errSum, epsi
logical :: cleanA, cleanB, cleanC, cleanSum
goldA = startA
goldB = startB
goldC = startC
goldSum = 0.0d0
scalar = startScalar
do i=1,num_times
if (selection.eq.1) then
goldC = goldA
goldB = scalar * goldC
goldC = goldA + goldB
goldA = goldB + scalar * goldC
else if (selection.eq.2) then
goldA = goldB + scalar * goldC
else if (selection.eq.3) then
goldA = goldA + goldB + scalar * goldC;
endif
end do
goldSum = goldA * goldB * array_size
cleanA = ALL(IEEE_Is_Normal(A))
cleanB = ALL(IEEE_Is_Normal(B))
cleanC = ALL(IEEE_Is_Normal(C))
cleanSum = IEEE_Is_Normal(summ)
if (.not. cleanA) then
write(*,'(a51)') "Validation failed on A. Contains NaA/Inf/Subnormal."
end if
if (.not. cleanB) then
write(*,'(a51)') "Validation failed on B. Contains NaA/Inf/Subnormal."
end if
if (.not. cleanC) then
write(*,'(a51)') "Validation failed on C. Contains NaA/Inf/Subnormal."
end if
if (.not. cleanSum) then
write(*,'(a54,e20.12)') "Validation failed on Sum. Contains NaA/Inf/Subnormal: ",summ
end if
errA = SUM( ABS( A - goldA ) ) / array_size
errB = SUM( ABS( B - goldB ) ) / array_size
errC = SUM( ABS( C - goldC ) ) / array_size
errSum = ABS( (summ - goldSum) / goldSum)
epsi = epsilon(real(0,kind=StreamRealKind)) * 100.0d0
if (errA .gt. epsi) then
write(*,'(a38,e20.12)') "Validation failed on A. Average error ", errA
end if
if (errB .gt. epsi) then
write(*,'(a38,e20.12)') "Validation failed on B. Average error ", errB
end if
if (errC .gt. epsi) then
write(*,'(a38,e20.12)') "Validation failed on C. Average error ", errC
end if
if (selection.eq.1) then
if (errSum .gt. 1.0e-8) then
write(*,'(a38,e20.12)') "Validation failed on Sum. Error ", errSum
write(*,'(a8,e20.12,a15,e20.12)') "Sum was ",summ, " but should be ", errSum
end if
endif
end subroutine check_solution
end module BabelStreamUtil
program BabelStream
use BabelStreamUtil
#if defined(USE_DOCONCURRENT)
use DoConcurrentStream
#elif defined(USE_ARRAY)
use ArrayStream
#elif defined(USE_OPENMP)
use OpenMPStream
#elif defined(USE_OPENMPWORKSHARE)
use OpenMPWorkshareStream
#elif defined(USE_OPENMPTARGET)
use OpenMPTargetStream
#elif defined(USE_OPENMPTARGETLOOP)
use OpenMPTargetLoopStream
#elif defined(USE_OPENMPTASKLOOP)
use OpenMPTaskloopStream
#elif defined(USE_OPENACC)
use OpenACCStream
#elif defined(USE_OPENACCARRAY)
use OpenACCArrayStream
#elif defined(USE_CUDA)
use CUDAStream
#elif defined(USE_CUDAKERNEL)
use CUDAKernelStream
#elif defined(USE_SEQUENTIAL)
use SequentialStream
#endif
implicit none
integer :: element_size, err
real(kind=REAL64) :: scaling
character(len=3) :: label
real(kind=REAL64), allocatable :: timings(:,:)
real(kind=REAL64), allocatable :: h_A(:), h_B(:), h_C(:)
real(kind=REAL64) :: summ
call parseArguments()
element_size = storage_size(real(0,kind=StreamRealKind)) / 8
if (mibibytes) then
if (use_gigs) then
scaling = 2.0d0**(-30)
label = "GiB"
else
scaling = 2.0d0**(-20)
label = "MiB"
endif
else
if (use_gigs) then
scaling = 1.0d-9
label = "GB"
else
scaling = 1.0d-6
label = "MB"
endif
endif
if (.not.csv) then
write(*,'(a)') "BabelStream Fortran"
write(*,'(a9,f4.1)') "Version: ", VERSION_STRING
write(*,'(a16,a)') "Implementation: ", implementation_name
block
character(len=32) :: printout
write(printout,'(i9,1x,a5)') num_times,'times'
write(*,'(a16,a)') 'Running kernels ',ADJUSTL(printout)
end block
write(*,'(a11,a6)') 'Precision: ',ADJUSTL(StreamRealName)
write(*,'(a12,f9.1,a3)') 'Array size: ',1.0d0 * element_size * (array_size * scaling), label
write(*,'(a12,f9.1,a3)') 'Total size: ',3.0d0 * element_size * (array_size * scaling), label
endif ! csv
allocate( timings(5,num_times) )
call alloc(array_size)
call init_arrays(startA, startB, startC)
summ = 0.0d0
timings = -1.0d0
if (selection.eq.1) then
call run_all(timings, summ)
else if (selection.eq.2) then
call run_triad(timings)
else if (selection.eq.3) then
call run_nstream(timings)
endif
allocate( h_A(1:array_size), h_B(1:array_size), h_C(1:array_size), stat=err)
if (err .ne. 0) then
write(*,'(a20,i3)') 'allocate returned ',err
stop 1
endif
call read_arrays(h_A, h_B, h_C)
call check_solution(h_A, h_B, h_C, summ)
block
character(len=20) :: printout(8)
real(kind=REAL64) :: tmin,tmax,tavg,nbytes
if (csv) then
write(*,'(a,a1)',advance='no') 'function', csv_sep
write(*,'(a,a1)',advance='no') 'num_times', csv_sep
write(*,'(a,a1)',advance='no') 'n_elements',csv_sep
write(*,'(a,a1)',advance='no') 'sizeof', csv_sep
if (mibibytes) then
write(*,'(a,a1)',advance='no') 'max_mibytes_per_sec',csv_sep
else
write(*,'(a,a1)',advance='no') 'max_mbytes_per_sec', csv_sep
endif
write(*,'(a,a1)',advance='no') 'min_runtime',csv_sep
write(*,'(a,a1)',advance='no') 'max_runtime',csv_sep
write(*,'(a,a1)',advance='yes') 'avg_runtime'
else
write(printout(1),'(a8)') 'Function'
write(printout(2),'(a3,a8)') TRIM(label),'ytes/sec'
write(printout(3),'(a9)') 'Min (sec)'
write(printout(4),'(a3)') 'Max'
write(printout(5),'(a7)') 'Average'
write(*,'(5a12)') ADJUSTL(printout(1:5))
endif ! csv
if (selection.eq.1) then
block
integer, parameter :: sizes(5) = [2,2,3,3,2]
character(len=5), parameter :: labels(5) = ["Copy ", "Mul ", "Add ", "Triad", "Dot "]
integer :: i
do i=1,5
tmin = MINVAL(timings(i,2:num_times))
tmax = MAXVAL(timings(i,2:num_times))
tavg = SUM(timings(i,2:num_times)) / (num_times-1)
nbytes = element_size * REAL(array_size,kind=REAL64) * sizes(i)
write(printout(1),'(a)') labels(i)
if (csv) then
write(printout(2),'(i20)') num_times
write(printout(3),'(i20)') array_size
write(printout(4),'(i20)') element_size
write(printout(5),'(i20)') INT(scaling*nbytes/tmin)
write(printout(6),'(f20.8)') tmin
write(printout(7),'(f20.8)') tmax
write(printout(8),'(f20.8)') tavg
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(1))),csv_sep
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(2))),csv_sep
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(3))),csv_sep
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(4))),csv_sep
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(5))),csv_sep
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(6))),csv_sep
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(7))),csv_sep
write(*,'(a,a1)',advance='yes') TRIM(ADJUSTL(printout(8)))
else
write(printout(2),'(f12.3)') scaling*nbytes/tmin
write(printout(3),'(f12.5)') tmin
write(printout(4),'(f12.5)') tmax
write(printout(5),'(f12.5)') tavg
write(*,'(5a12)') ADJUSTL(printout(1:5))
endif
enddo
end block
else if ((selection.eq.2).or.(selection.eq.3)) then
tmin = MINVAL(timings(1,2:num_times))
tmax = MAXVAL(timings(1,2:num_times))
tavg = SUM(timings(1,2:num_times)) / (num_times-1)
if (selection.eq.2) then
nbytes = element_size * REAL(array_size,kind=REAL64) * 3
write(printout(1),'(a12)') "Triad"
else if (selection.eq.3) then
nbytes = element_size * REAL(array_size,kind=REAL64) * 4
write(printout(1),'(a12)') "Nstream"
endif
if (csv) then
write(printout(2),'(i20)') num_times
write(printout(3),'(i20)') array_size
write(printout(4),'(i20)') element_size
write(printout(5),'(i20)') INT(scaling*nbytes/tmin)
write(printout(6),'(f20.8)') tmin
write(printout(7),'(f20.8)') tmax
write(printout(8),'(f20.8)') tavg
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(1))),csv_sep
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(2))),csv_sep
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(3))),csv_sep
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(4))),csv_sep
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(5))),csv_sep
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(6))),csv_sep
write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(7))),csv_sep
write(*,'(a,a1)',advance='yes') TRIM(ADJUSTL(printout(8)))
else
write(printout(2),'(f12.3)') scaling*nbytes/tmin
write(printout(3),'(f12.5)') tmin
write(printout(4),'(f12.5)') tmax
write(printout(5),'(f12.5)') tavg
write(*,'(5a12)') ADJUSTL(printout(1:5))
endif
endif
end block
call dealloc()
end program BabelStream

25
src/fortran/make.inc.amd Normal file
View File

@ -0,0 +1,25 @@
FC := /opt/rocm/llvm/bin/flang
FC := /global/u1/j/jhammond/AMD/aocc-compiler-3.2.0/bin/flang
FCFLAGS := -std=f2018 -O3
FCFLAGS += -Wall -Wno-unused-variable
ifdef MARCH
FCFLAGS += -march=$(MARCH)
else
FCFLAGS += -march=native
endif
DOCONCURRENT_FLAG = -fopenmp # libomp.so required
ARRAY_FLAG = -fopenmp # libomp.so required
OPENMP_FLAG = -fopenmp
#OPENMP_FLAG += -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx908
OPENACC_FLAG = -fopenacc
CUDA_FLAG =
SEQUENTIAL_FLAG =
ifeq ($(IMPLEMENTATION),CUDA)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif
ifeq ($(IMPLEMENTATION),CUDAKernels)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif

39
src/fortran/make.inc.arm Normal file
View File

@ -0,0 +1,39 @@
FC = armflang
FCFLAGS = -std=f2018 -O3
FCFLAGS += -Wall -Wno-unused-variable
# MARCH=neoverse-v1,neoverse-n1,icelake-server,znver3,cortex-a78
ARCH=$(shell uname -m)
ifeq ($(ARCH),aarch64)
ifdef MCPU
FCFLAGS += -mcpu=$(MCPU)
else
FCFLAGS += -mcpu=native
endif
else
ifdef MARCH
FCFLAGS += -march=$(MARCH)
else
FCFLAGS += -march=native
endif
endif
DOCONCURRENT_FLAG = -fopenmp
ARRAY_FLAG = -fopenmp
OPENMP_FLAG = -fopenmp
OPENACC_FLAG = -fopenacc
CUDA_FLAG =
SEQUENTIAL_FLAG =
ifeq ($(IMPLEMENTATION),OpenACC)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif
ifeq ($(IMPLEMENTATION),OpenACCArray)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif
ifeq ($(IMPLEMENTATION),CUDA)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif
ifeq ($(IMPLEMENTATION),CUDAKernels)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif

18
src/fortran/make.inc.cray Normal file
View File

@ -0,0 +1,18 @@
FC := ftn
FCFLAGS = -e F -O3
DOCONCURRENT_FLAG = -h thread_do_concurrent -DCRAY_THREAD_DOCONCURRENT
ARRAY_FLAG = -h autothread
OPENMP_FLAG = -h omp
OPENACC_FLAG = -h acc
# CPU only
OPENACC_FLAG += -h omp
CUDA_FLAG =
SEQUENTIAL_FLAG =
ifeq ($(IMPLEMENTATION),CUDA)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif
ifeq ($(IMPLEMENTATION),CUDAKernels)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif

21
src/fortran/make.inc.fj Normal file
View File

@ -0,0 +1,21 @@
FC := frt
FCFLAGS = -X08 -Kfast -KA64FX -KSVE -KARMV8_3_A -Kzfill=100 -Kprefetch_sequential=soft -Kprefetch_line=8 -Kprefetch_line_L2=16 -Koptmsg=2 -Keval -DUSE_OMP_GET_WTIME=1 # FJ Fortran system_clock is low resolution
DOCONCURRENT_FLAG = -Kparallel,reduction -DNOTSHARED
ARRAY_FLAG = -Kparallel,reduction
OPENMP_FLAG = -fopenmp
OPENACC_FLAG =
# CPU only
OPENACC_FLAG +=
CUDA_FLAG =
SEQUENTIAL_FLAG =
ifeq ($(IMPLEMENTATION),OPENACC)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif
ifeq ($(IMPLEMENTATION),CUDA)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif
ifeq ($(IMPLEMENTATION),CUDAKernels)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif

33
src/fortran/make.inc.gcc Normal file
View File

@ -0,0 +1,33 @@
FC = gfortran
FCFLAGS = -std=f2018 -O3
FCFLAGS += -Wall -Wno-unused-dummy-argument -Wno-unused-variable
# MARCH=neoverse-v1,neoverse-n1,icelake-server,znver3,cortex-a78ae
ARCH=$(shell uname -m)
ifeq ($(ARCH),aarch64)
ifdef MCPU
FCFLAGS += -mcpu=$(MCPU)
else
FCFLAGS += -mcpu=native
endif
else
ifdef MARCH
FCFLAGS += -march=$(MARCH)
else
FCFLAGS += -march=native
endif
endif
DOCONCURRENT_FLAG = -ftree-parallelize-loops=4
ARRAY_FLAG =
OPENMP_FLAG = -fopenmp
OPENACC_FLAG = -fopenacc
CUDA_FLAG =
SEQUENTIAL_FLAG =
ifeq ($(IMPLEMENTATION),CUDA)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif
ifeq ($(IMPLEMENTATION),CUDAKernels)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif

View File

@ -0,0 +1,70 @@
FC := nvfortran
#FCFLAGS := -O3 -Minform=inform -Minfo=all
FCFLAGS := -O3 -Minform=warn
#TARGET=gpu
TARGET=multicore
NVARCH=$(shell which nvidia-smi > /dev/null && nvidia-smi -q | grep "Product Architecture")
ifeq ($(findstring Ampere,$(NVARCH)),Ampere)
$(info Ampere detected)
GPU = cc80
endif
ifeq ($(findstring Turing,$(NVARCH)),Turing)
$(info Turing detected)
GPU = cc75
endif
ifeq ($(findstring Volta,$(NVARCH)),Volta)
$(info Volta detected)
GPU = cc70
endif
ifeq ($(findstring Pascal,$(NVARCH)),Pascal)
$(info Pascal detected)
GPU = cc60,cc61
endif
ifeq ($(shell which jetson_clocks > /dev/null && echo 1),1)
$(info Jetson AGX Orin detected)
GPU = ccn87,cc86
# figure out Xavier later
#GPU = cc72
endif
ifeq ($(GPU),)
$(error Your GPU architecture could not be detected. Set it manually.)
endif
GPUFLAG = -gpu=$(GPU)
# MARCH=neoverse-v1,neoverse-n1,zen3
ARCH=$(shell uname -m)
ifdef MARCH
ifeq ($(ARCH),aarch64)
ifeq ($(MARCH),neoverse-n1)
FCFLAGS += -tp=$(MARCH)
else
ifeq ($(MARCH),neoverse-v1)
FCFLAGS += -tp=$(MARCH)
else
FCFLAGS += -tp=native
endif
endif
else
FCFLAGS += -tp=$(MARCH)
endif
else
FCFLAGS += -tp=native
endif
# this is to allow apples-to-apples comparison with DC in non-DC GPU impls
# set exactly one of these!
#MANAGED = -DUSE_MANAGED -gpu=managed
#DEVICE = -DUSE_DEVICE -cuda -gpu=nomanaged
DOCONCURRENT_FLAG = $(GPUFLAG) -stdpar=$(TARGET) $(DEVICE)
ARRAY_FLAG = $(GPUFLAG) -stdpar=$(TARGET) $(MANAGED)
OPENMP_FLAG = $(GPUFLAG) -mp=$(TARGET) $(MANAGED)
OPENACC_FLAG = $(GPUFLAG) -acc=$(TARGET) $(MANAGED)
CUDA_FLAG = $(GPUFLAG) -cuda -acc=gpu $(MANAGED)
SEQUENTIAL_FLAG =
ifeq ($(IMPLEMENTATION),OpenMPTaskloop)
$(error IMPLEMENTATION=OpenMPTaskloop is not supported by this compiler.)
endif

View File

@ -0,0 +1,32 @@
FC := ifx
FCFLAGS = -std18
FCFLAGS += -Ofast -xHOST
FCFLAGS += -qopt-zmm-usage=low
ifeq ($(FC),ifort)
FCFLAGS += -qopt-streaming-stores=always
PARALLEL = -parallel
endif
DOCONCURRENT_FLAG = -qopenmp $(PARALLEL)
ARRAY_FLAG = -qopenmp $(PARALLEL)
OPENMP_FLAG = -qopenmp
ifeq ($(FC),ifx)
OPENMP_FLAG += -fopenmp-targets=spir64 -DUSE_FLOAT=1
endif
OPENACC_FLAG =
CUDA_FLAG =
SEQUENTIAL_FLAG =
ifeq ($(IMPLEMENTATION),OpenACC)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif
ifeq ($(IMPLEMENTATION),OpenACCArray)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif
ifeq ($(IMPLEMENTATION),CUDA)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif
ifeq ($(IMPLEMENTATION),CUDAKernels)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif

35
src/fortran/run.sh Executable file
View File

@ -0,0 +1,35 @@
#!/bin/bash
cat ./run.sh
if [ `uname -s` == Darwin ] ; then
NUM_HWTHREADS=`sysctl -n hw.ncpu`
MEMORY_BYTES=`sysctl -n hw.memsize`
else
NUM_HWTHREADS=`nproc`
MEMORY_KILOS=`grep MemTotal /proc/meminfo | awk '{print $2}'`
fi
M=128
export OMP_NUM_THREADS=8
export OMP_PROC_BIND=close
export OMP_PLACES=threads
export ACC_NUM_CORES=${OMP_NUM_THREADS}
AFFCONTROL="numactl -N 0 -m 0 -C `seq -s "," 0 $((${OMP_NUM_THREADS}-1))`"
for compiler in gcc nvhpc cray oneapi arm amd fj ; do
#if [ "x$compiler" == "xgcc" ] ; then
# export LD_PRELOAD=/usr/lib/gcc/aarch64-linux-gnu/11/libgomp.so
#fi
for implementation in OpenMP OpenMPTaskloop OpenMPWorkshare DoConcurrent Array OpenACC OpenACCArray CUDA CUDAKernel ; do
if [ -f BabelStream.${compiler}.${implementation} ] ; then
echo "BabelStream.${compiler}.${implementation}"
ldd BabelStream.${compiler}.${implementation}
time $AFFCONTROL \
./BabelStream.${compiler}.${implementation} -s $((1024*1024*${M}))
fi
done
done

View File

@ -145,7 +145,7 @@ T ThrustStream<T>::dot()
#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA || \
(defined(THRUST_DEVICE_SYSTEM_HIP) && THRUST_DEVICE_SYSTEM_HIP == THRUST_DEVICE_SYSTEM)
#ifdef __NVCC__
#if defined(__NVCC__) || defined(__NVCOMPILER_CUDA__)
#define IMPL_FN__(fn) cuda ## fn
#define IMPL_TYPE__(tpe) cuda ## tpe
#elif defined(__HIP_PLATFORM_HCC__)