diff --git a/.gitignore b/.gitignore index 012d0e8..59ea5db 100644 --- a/.gitignore +++ b/.gitignore @@ -10,12 +10,18 @@ sycl-stream hip-stream tbb-stream +src/fortran/BabelStream +src/fortran/BabelStream.* + *.o *.bc *.sycl *.tar *.gz *.a +*.mod +*.cub +*.ptx KokkosCore_config.* diff --git a/src/.gitignore b/src/.gitignore index 568a953..9d8b17b 100644 --- a/src/.gitignore +++ b/src/.gitignore @@ -16,6 +16,8 @@ **/*.gz **/*.a +**/*.swp + **/KokkosCore_Config_* **/.DS_Store @@ -26,4 +28,4 @@ cmake-build-*/ CMakeFiles/ .idea/ .vscode/ -.directory \ No newline at end of file +.directory diff --git a/src/fortran/ArrayStream.F90 b/src/fortran/ArrayStream.F90 new file mode 100644 index 0000000..5a8d5bc --- /dev/null +++ b/src/fortran/ArrayStream.F90 @@ -0,0 +1,105 @@ +module ArrayStream + use, intrinsic :: ISO_Fortran_env + use BabelStreamTypes + + implicit none + + character(len=5), parameter :: implementation_name = "Array" + + integer(kind=StreamIntKind) :: N + + real(kind=REAL64), allocatable :: A(:), B(:), C(:) + + contains + + subroutine list_devices() + implicit none + integer :: num + write(*,'(a36,a5)') "Listing devices is not supported by ", implementation_name + end subroutine list_devices + + subroutine set_device(dev) + implicit none + integer, intent(in) :: dev + write(*,'(a32,a5)') "Device != 0 is not supported by ", implementation_name + end subroutine set_device + + subroutine alloc(array_size) + implicit none + integer(kind=StreamIntKind) :: array_size + integer :: err + N = array_size + allocate( A(1:N), B(1:N), C(1:N), stat=err) + if (err .ne. 0) then + write(*,'(a20,i3)') 'allocate returned ',err + stop 1 + endif + end subroutine alloc + + subroutine dealloc() + implicit none + integer :: err + deallocate( A, B, C, stat=err) + if (err .ne. 0) then + write(*,'(a20,i3)') 'deallocate returned ',err + stop 1 + endif + end subroutine dealloc + + subroutine init_arrays(initA, initB, initC) + implicit none + real(kind=REAL64), intent(in) :: initA, initB, initC + A = initA + B = initB + C = initC + end subroutine init_arrays + + subroutine read_arrays(h_A, h_B, h_C) + implicit none + real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:) + h_A = A + h_B = B + h_C = C + end subroutine read_arrays + + subroutine copy() + implicit none + C = A + end subroutine copy + + subroutine add() + implicit none + C = A + B + end subroutine add + + subroutine mul(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + scalar = startScalar + B = scalar * C + end subroutine mul + + subroutine triad(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + scalar = startScalar + A = B + scalar * C + end subroutine triad + + subroutine nstream(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + scalar = startScalar + A = A + B + scalar * C + end subroutine nstream + + function dot() result(s) + implicit none + real(kind=REAL64) :: s + s = dot_product(A,B) + end function dot + +end module ArrayStream diff --git a/src/fortran/BabelStreamTypes.F90 b/src/fortran/BabelStreamTypes.F90 new file mode 100644 index 0000000..dd01d35 --- /dev/null +++ b/src/fortran/BabelStreamTypes.F90 @@ -0,0 +1,21 @@ +module BabelStreamTypes + use, intrinsic :: ISO_Fortran_env, only: REAL64,REAL32,INT64,INT32 + + implicit none + +#ifdef USE_FLOAT + integer, parameter :: StreamRealKind = REAL32 + character(len=6) :: StreamRealName = "REAL32" +#else + integer, parameter :: StreamRealKind = REAL64 + character(len=6) :: StreamRealName = "REAL64" +#endif + +#ifdef USE_INT32 +#warning There is no checking for overflowing INT32, so be careful. + integer, parameter :: StreamIntKind = INT32 +#else + integer, parameter :: StreamIntKind = INT64 +#endif + +end module BabelStreamTypes diff --git a/src/fortran/CUDAKernelStream.F90 b/src/fortran/CUDAKernelStream.F90 new file mode 100644 index 0000000..01668ea --- /dev/null +++ b/src/fortran/CUDAKernelStream.F90 @@ -0,0 +1,230 @@ +module CUDAKernelStream + use, intrinsic :: ISO_Fortran_env + use BabelStreamTypes + + implicit none + + character(len=10), parameter :: implementation_name = "CUDAKernel" + + integer(kind=StreamIntKind) :: N + +#ifdef USE_MANAGED + real(kind=REAL64), allocatable, managed :: A(:), B(:), C(:) +#else + real(kind=REAL64), allocatable, device :: A(:), B(:), C(:) +#endif + + contains + + subroutine list_devices() + use cudafor + implicit none + integer :: num, err + err = cudaGetDeviceCount(num) + if (err.ne.0) then + write(*,'(a)') "cudaGetDeviceCount failed" + write(*,'(a)') cudaGetErrorString(err) + stop + else if (num.eq.0) then + write(*,'(a17)') "No devices found." + else + write(*,'(a10,i1,a8)') "There are ",num," devices." + end if + end subroutine list_devices + + subroutine set_device(dev) + use cudafor + implicit none + integer, intent(in) :: dev + integer :: num, err + err = cudaGetDeviceCount(num) + if (err.ne.0) then + write(*,'(a)') "cudaGetDeviceCount failed" + write(*,'(a)') cudaGetErrorString(err) + stop + else if (num.eq.0) then + write(*,'(a17)') "No devices found." + stop + else if (dev.ge.num) then + write(*,'(a21)') "Invalid device index." + stop + else + err = cudaSetDevice(dev) + if (err.ne.0) then + write(*,'(a)') "cudaSetDevice failed" + write(*,'(a)') cudaGetErrorString(err) + stop + end if + end if + end subroutine set_device + + subroutine alloc(array_size) + implicit none + integer(kind=StreamIntKind) :: array_size + integer :: err + N = array_size + allocate( A(1:N), B(1:N), C(1:N), stat=err) + if (err .ne. 0) then + write(*,'(a20,i3)') 'allocate returned ',err + stop 1 + endif + end subroutine alloc + + subroutine dealloc() + implicit none + integer :: err + deallocate( A, B, C, stat=err) + if (err .ne. 0) then + write(*,'(a20,i3)') 'deallocate returned ',err + stop 1 + endif + end subroutine dealloc + + subroutine init_arrays(initA, initB, initC) + use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString + implicit none + real(kind=REAL64), intent(in) :: initA, initB, initC + integer(kind=StreamIntKind) :: i + integer :: err + A = initA + B = initB + C = initC + err = cudaDeviceSynchronize() + if (err.ne.0) then + write(*,'(a)') "cudaDeviceSynchronize failed" + write(*,'(a)') cudaGetErrorString(err) + stop + endif + end subroutine init_arrays + + subroutine read_arrays(h_A, h_B, h_C) + use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString + implicit none + real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:) + integer(kind=StreamIntKind) :: i + integer :: err + h_A = A + h_B = B + h_C = C + err = cudaDeviceSynchronize() + if (err.ne.0) then + write(*,'(a)') "cudaDeviceSynchronize failed" + write(*,'(a)') cudaGetErrorString(err) + stop + endif + end subroutine read_arrays + + subroutine copy() + use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString + implicit none + integer(kind=StreamIntKind) :: i + integer :: err + !$cuf kernel do <<< *, * >>> + do i=1,N + C(i) = A(i) + end do + err = cudaDeviceSynchronize() + if (err.ne.0) then + write(*,'(a)') "cudaDeviceSynchronize failed" + write(*,'(a)') cudaGetErrorString(err) + stop + endif + end subroutine copy + + subroutine add() + use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString + implicit none + integer(kind=StreamIntKind) :: i + integer :: err + !$cuf kernel do <<< *, * >>> + do i=1,N + C(i) = A(i) + B(i) + end do + err = cudaDeviceSynchronize() + if (err.ne.0) then + write(*,'(a)') "cudaDeviceSynchronize failed" + write(*,'(a)') cudaGetErrorString(err) + stop + endif + end subroutine add + + subroutine mul(startScalar) + use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + integer :: err + scalar = startScalar + !$cuf kernel do <<< *, * >>> + do i=1,N + B(i) = scalar * C(i) + end do + err = cudaDeviceSynchronize() + if (err.ne.0) then + write(*,'(a)') "cudaDeviceSynchronize failed" + write(*,'(a)') cudaGetErrorString(err) + stop + endif + end subroutine mul + + subroutine triad(startScalar) + use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + integer :: err + scalar = startScalar + !$cuf kernel do <<< *, * >>> + do i=1,N + A(i) = B(i) + scalar * C(i) + end do + err = cudaDeviceSynchronize() + if (err.ne.0) then + write(*,'(a)') "cudaDeviceSynchronize failed" + write(*,'(a)') cudaGetErrorString(err) + stop + endif + end subroutine triad + + subroutine nstream(startScalar) + use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + integer :: err + scalar = startScalar + !$cuf kernel do <<< *, * >>> + do i=1,N + A(i) = A(i) + B(i) + scalar * C(i) + end do + err = cudaDeviceSynchronize() + if (err.ne.0) then + write(*,'(a)') "cudaDeviceSynchronize failed" + write(*,'(a)') cudaGetErrorString(err) + stop + endif + end subroutine nstream + + function dot() result(r) + use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString + implicit none + real(kind=REAL64) :: r + integer(kind=StreamIntKind) :: i + integer :: err + r = real(0,kind=REAL64) + !$cuf kernel do <<< *, * >>> + do i=1,N + r = r + A(i) * B(i) + end do + err = cudaDeviceSynchronize() + if (err.ne.0) then + write(*,'(a)') "cudaDeviceSynchronize failed" + write(*,'(a)') cudaGetErrorString(err) + stop + endif + end function dot + +end module CUDAKernelStream diff --git a/src/fortran/CUDAStream.F90 b/src/fortran/CUDAStream.F90 new file mode 100644 index 0000000..208f1aa --- /dev/null +++ b/src/fortran/CUDAStream.F90 @@ -0,0 +1,309 @@ +module CUDAFortranKernels + use, intrinsic :: ISO_Fortran_env + use BabelStreamTypes + + contains + + attributes(global) subroutine do_copy(n,A,C) + implicit none + integer(kind=StreamIntKind), intent(in), value :: n + real(kind=REAL64), intent(in) :: A(n) + real(kind=REAL64), intent(out) :: C(n) + integer(kind=StreamIntKind) :: i + i = blockDim%x * (blockIdx%x - 1) + threadIdx%x + if (i <= N) then + C(i) = A(i) + endif + end subroutine do_copy + + attributes(global) subroutine do_add(n,A,B,C) + implicit none + integer(kind=StreamIntKind), intent(in), value :: n + real(kind=REAL64), intent(in) :: A(n), B(n) + real(kind=REAL64), intent(out) :: C(n) + integer(kind=StreamIntKind) :: i + i = blockDim%x * (blockIdx%x - 1) + threadIdx%x + if (i <= N) then + C(i) = A(i) + B(i) + endif + end subroutine do_add + + attributes(global) subroutine do_mul(n,scalar,B,C) + implicit none + integer(kind=StreamIntKind), intent(in), value :: n + real(kind=REAL64), intent(in), value :: scalar + real(kind=REAL64), intent(out) :: B(n) + real(kind=REAL64), intent(in) :: C(n) + integer(kind=StreamIntKind) :: i + i = blockDim%x * (blockIdx%x - 1) + threadIdx%x + if (i <= N) then + B(i) = scalar * C(i) + endif + end subroutine do_mul + + attributes(global) subroutine do_triad(n,scalar,A,B,C) + implicit none + integer(kind=StreamIntKind), intent(in), value :: n + real(kind=REAL64), intent(in), value :: scalar + real(kind=REAL64), intent(out) :: A(n) + real(kind=REAL64), intent(in) :: B(n), C(n) + integer(kind=StreamIntKind) :: i + i = blockDim%x * (blockIdx%x - 1) + threadIdx%x + if (i <= N) then + A(i) = B(i) + scalar * C(i) + endif + end subroutine do_triad + + attributes(global) subroutine do_nstream(n,scalar,A,B,C) + implicit none + integer(kind=StreamIntKind), intent(in), value :: n + real(kind=REAL64), intent(in), value :: scalar + real(kind=REAL64), intent(inout) :: A(n) + real(kind=REAL64), intent(in) :: B(n), C(n) + integer(kind=StreamIntKind) :: i + i = blockDim%x * (blockIdx%x - 1) + threadIdx%x + if (i <= N) then + A(i) = A(i) + B(i) + scalar * C(i) + endif + end subroutine do_nstream + +#if 0 + attributes(global) subroutine do_dot(n,A,B,r) + implicit none + integer(kind=StreamIntKind), intent(in), value :: n + real(kind=REAL64), intent(in) :: A(n), B(n) + real(kind=REAL64), intent(out) :: r + integer(kind=StreamIntKind) :: i + r = real(0,kind=REAL64) + !$cuf kernel do <<< *, * >>> + do i=1,N + r = r + A(i) * B(i) + end do + end subroutine do_dot +#endif + +end module CUDAFortranKernels + +module CUDAStream + use, intrinsic :: ISO_Fortran_env + use BabelStreamTypes + use cudafor, only: dim3 + + implicit none + + character(len=4), parameter :: implementation_name = "CUDA" + + integer(kind=StreamIntKind) :: N + +#ifdef USE_MANAGED + real(kind=REAL64), allocatable, managed :: A(:), B(:), C(:) +#else + real(kind=REAL64), allocatable, device :: A(:), B(:), C(:) +#endif + + type(dim3) :: grid, tblock + + contains + + subroutine list_devices() + use cudafor + implicit none + integer :: num, err + err = cudaGetDeviceCount(num) + if (err.ne.0) then + write(*,'(a)') "cudaGetDeviceCount failed" + write(*,'(a)') cudaGetErrorString(err) + stop + else if (num.eq.0) then + write(*,'(a17)') "No devices found." + else + write(*,'(a10,i1,a8)') "There are ",num," devices." + end if + end subroutine list_devices + + subroutine set_device(dev) + use cudafor + implicit none + integer, intent(in) :: dev + integer :: num, err + err = cudaGetDeviceCount(num) + if (err.ne.0) then + write(*,'(a)') "cudaGetDeviceCount failed" + write(*,'(a)') cudaGetErrorString(err) + stop + else if (num.eq.0) then + write(*,'(a17)') "No devices found." + stop + else if (dev.ge.num) then + write(*,'(a21)') "Invalid device index." + stop + else + err = cudaSetDevice(dev) + if (err.ne.0) then + write(*,'(a)') "cudaSetDevice failed" + write(*,'(a)') cudaGetErrorString(err) + stop + end if + end if + end subroutine set_device + + subroutine alloc(array_size) + implicit none + integer(kind=StreamIntKind) :: array_size + integer :: err + N = array_size + allocate( A(1:N), B(1:N), C(1:N), stat=err) + if (err .ne. 0) then + write(*,'(a20,i3)') 'allocate returned ',err + stop 1 + endif + ! move to separate subroutine later + tblock = dim3(128,1,1) + grid = dim3(ceiling(real(N)/tblock%x),1,1) + end subroutine alloc + + subroutine dealloc() + implicit none + integer :: err + deallocate( A, B, C, stat=err) + if (err .ne. 0) then + write(*,'(a20,i3)') 'deallocate returned ',err + stop 1 + endif + end subroutine dealloc + + subroutine init_arrays(initA, initB, initC) + use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString + implicit none + real(kind=REAL64), intent(in) :: initA, initB, initC + integer(kind=StreamIntKind) :: i + integer :: err + A = initA + B = initB + C = initC + err = cudaDeviceSynchronize() + if (err.ne.0) then + write(*,'(a)') "cudaDeviceSynchronize failed" + write(*,'(a)') cudaGetErrorString(err) + stop + endif + end subroutine init_arrays + + subroutine read_arrays(h_A, h_B, h_C) + use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString + implicit none + real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:) + integer(kind=StreamIntKind) :: i + integer :: err + h_A = A + h_B = B + h_C = C + err = cudaDeviceSynchronize() + if (err.ne.0) then + write(*,'(a)') "cudaDeviceSynchronize failed" + write(*,'(a)') cudaGetErrorString(err) + stop + endif + end subroutine read_arrays + + subroutine copy() + use CUDAFortranKernels, only: do_copy + use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString + implicit none + integer :: err + call do_copy<<>>(N, A, C) + err = cudaDeviceSynchronize() + if (err.ne.0) then + write(*,'(a)') "cudaDeviceSynchronize failed" + write(*,'(a)') cudaGetErrorString(err) + stop + endif + end subroutine copy + + subroutine add() + use CUDAFortranKernels, only: do_add + use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString + implicit none + integer :: err + call do_add<<>>(N, A, B, C) + err = cudaDeviceSynchronize() + if (err.ne.0) then + write(*,'(a)') "cudaDeviceSynchronize failed" + write(*,'(a)') cudaGetErrorString(err) + stop + endif + end subroutine add + + subroutine mul(startScalar) + use CUDAFortranKernels, only: do_mul + use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer :: err + scalar = startScalar + call do_mul<<>>(N, scalar, B, C) + err = cudaDeviceSynchronize() + if (err.ne.0) then + write(*,'(a)') "cudaDeviceSynchronize failed" + write(*,'(a)') cudaGetErrorString(err) + stop + endif + end subroutine mul + + subroutine triad(startScalar) + use CUDAFortranKernels, only: do_triad + use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer :: err + scalar = startScalar + call do_triad<<>>(N, scalar, A, B, C) + err = cudaDeviceSynchronize() + if (err.ne.0) then + write(*,'(a)') "cudaDeviceSynchronize failed" + write(*,'(a)') cudaGetErrorString(err) + stop + endif + end subroutine triad + + subroutine nstream(startScalar) + use CUDAFortranKernels, only: do_nstream + use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer :: err + scalar = startScalar + call do_nstream<<>>(N, scalar, A, B, C) + err = cudaDeviceSynchronize() + if (err.ne.0) then + write(*,'(a)') "cudaDeviceSynchronize failed" + write(*,'(a)') cudaGetErrorString(err) + stop + endif + end subroutine nstream + + function dot() result(r) + !use CUDAFortranKernels, only: do_dot + use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString + implicit none + real(kind=REAL64) :: r + integer :: err + integer(kind=StreamIntKind) :: i + !call do_dot<<>>(N, B, C, r) + r = real(0,kind=REAL64) + !$cuf kernel do <<< *, * >>> + do i=1,N + r = r + A(i) * B(i) + end do + err = cudaDeviceSynchronize() + if (err.ne.0) then + write(*,'(a)') "cudaDeviceSynchronize failed" + write(*,'(a)') cudaGetErrorString(err) + stop + endif + end function dot + +end module CUDAStream diff --git a/src/fortran/DoConcurrentStream.F90 b/src/fortran/DoConcurrentStream.F90 new file mode 100644 index 0000000..781210d --- /dev/null +++ b/src/fortran/DoConcurrentStream.F90 @@ -0,0 +1,139 @@ +module DoConcurrentStream + use, intrinsic :: ISO_Fortran_env + use BabelStreamTypes + + implicit none + + character(len=12), parameter :: implementation_name = "DoConcurrent" + + integer(kind=StreamIntKind) :: N + +#ifdef USE_DEVICE + real(kind=REAL64), allocatable, device :: A(:), B(:), C(:) +#else + real(kind=REAL64), allocatable :: A(:), B(:), C(:) +#endif + + contains + + subroutine list_devices() + implicit none + integer :: num + write(*,'(a36,a12)') "Listing devices is not supported by ", implementation_name + end subroutine list_devices + + subroutine set_device(dev) + implicit none + integer, intent(in) :: dev + write(*,'(a32,a12)') "Device != 0 is not supported by ", implementation_name + end subroutine set_device + + subroutine alloc(array_size) + implicit none + integer(kind=StreamIntKind) :: array_size + integer :: err + N = array_size + allocate( A(1:N), B(1:N), C(1:N), stat=err) + if (err .ne. 0) then + write(*,'(a20,i3)') 'allocate returned ',err + stop 1 + endif + end subroutine alloc + + subroutine dealloc() + implicit none + integer :: err + deallocate( A, B, C, stat=err) + if (err .ne. 0) then + write(*,'(a20,i3)') 'deallocate returned ',err + stop 1 + endif + end subroutine dealloc + + subroutine init_arrays(initA, initB, initC) + implicit none + real(kind=REAL64), intent(in) :: initA, initB, initC + integer(kind=StreamIntKind) :: i + do concurrent (i=1:N) + A(i) = initA + B(i) = initB + C(i) = initC + end do + end subroutine init_arrays + + subroutine read_arrays(h_A, h_B, h_C) + implicit none + real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:) + integer(kind=StreamIntKind) :: i + do concurrent (i=1:N) !shared(A,B,C) + h_A(i) = A(i) + h_B(i) = B(i) + h_C(i) = C(i) + end do + end subroutine read_arrays + + subroutine copy() + implicit none + integer(kind=StreamIntKind) :: i + do concurrent (i=1:N) !shared(A,C) + C(i) = A(i) + end do + end subroutine copy + + subroutine add() + implicit none + integer(kind=StreamIntKind) :: i + do concurrent (i=1:N) !shared(A,B,C) + C(i) = A(i) + B(i) + end do + end subroutine add + + subroutine mul(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + scalar = startScalar + do concurrent (i=1:N) !shared(B,C) + B(i) = scalar * C(i) + end do + end subroutine mul + + subroutine triad(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + scalar = startScalar + do concurrent (i=1:N) !shared(A,B,C) + A(i) = B(i) + scalar * C(i) + end do + end subroutine triad + + subroutine nstream(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + scalar = startScalar + do concurrent (i=1:N) !shared(A,B,C) + A(i) = A(i) + B(i) + scalar * C(i) + end do + end subroutine nstream + + function dot() result(s) + implicit none + real(kind=REAL64) :: s + integer(kind=StreamIntKind) :: i + ! reduction omitted because NVF infers it and other compilers do not support + s = real(0,kind=REAL64) +#ifdef CRAY_THREAD_DOCONCURRENT + do i=1,N +#else + do concurrent (i=1:N) !shared(A,B) +#endif + s = s + A(i) * B(i) + end do + end function dot + +end module DoConcurrentStream diff --git a/src/fortran/Makefile b/src/fortran/Makefile new file mode 100644 index 0000000..18685d4 --- /dev/null +++ b/src/fortran/Makefile @@ -0,0 +1,109 @@ +ifeq ($(COMPILER),nvhpc) + include make.inc.nvhpc +else ifeq ($(COMPILER),oneapi) + include make.inc.oneapi +else ifeq ($(COMPILER),gcc) + include make.inc.gcc +else ifeq ($(COMPILER),amd) + include make.inc.amd +else ifeq ($(COMPILER),arm) + include make.inc.arm +else ifeq ($(COMPILER),cray) + include make.inc.cray +else ifeq ($(COMPILER),fj) + include make.inc.fj +else + $(info Set COMPILER={nvhpc,oneapi,amd,arm,cray,fj,gcc}. Default is gcc.) + include make.inc.gcc + COMPILER=gcc +endif + +FCFLAGS += -DVERSION_STRING="4.0" +#FCFLAGS += -DUSE_INT32 + +ifeq ($(IMPLEMENTATION),DoConcurrent) + FCFLAGS += -DUSE_DOCONCURRENT $(DOCONCURRENT_FLAG) + IMPLEMENTATION_OBJECT = DoConcurrentStream.o + +else ifeq ($(IMPLEMENTATION),Array) + FCFLAGS += -DUSE_ARRAY $(ARRAY_FLAG) + IMPLEMENTATION_OBJECT = ArrayStream.o + +else ifeq ($(IMPLEMENTATION),OpenMP) + FCFLAGS += -DUSE_OPENMP $(OPENMP_FLAG) + IMPLEMENTATION_OBJECT = OpenMPStream.o + +else ifeq ($(IMPLEMENTATION),OpenMPWorkshare) + FCFLAGS += -DUSE_OPENMPWORKSHARE $(OPENMP_FLAG) + IMPLEMENTATION_OBJECT = OpenMPWorkshareStream.o + +else ifeq ($(IMPLEMENTATION),OpenMPTarget) + FCFLAGS += -DUSE_OPENMPTARGET $(OPENMP_FLAG) + IMPLEMENTATION_OBJECT = OpenMPTargetStream.o + +else ifeq ($(IMPLEMENTATION),OpenMPTargetLoop) + FCFLAGS += -DUSE_OPENMPTARGETLOOP $(OPENMP_FLAG) + IMPLEMENTATION_OBJECT = OpenMPTargetLoopStream.o + +else ifeq ($(IMPLEMENTATION),OpenMPTaskloop) + FCFLAGS += -DUSE_OPENMPTASKLOOP $(OPENMP_FLAG) + IMPLEMENTATION_OBJECT = OpenMPTaskloopStream.o + +else ifeq ($(IMPLEMENTATION),OpenACC) + FCFLAGS += -DUSE_OPENACC $(OPENACC_FLAG) + IMPLEMENTATION_OBJECT = OpenACCStream.o + +else ifeq ($(IMPLEMENTATION),OpenACCArray) + FCFLAGS += -DUSE_OPENACCARRAY $(OPENACC_FLAG) + IMPLEMENTATION_OBJECT = OpenACCArrayStream.o + +else ifeq ($(IMPLEMENTATION),CUDA) + FCFLAGS += -DUSE_CUDA $(CUDA_FLAG) + IMPLEMENTATION_OBJECT = CUDAStream.o + +else ifeq ($(IMPLEMENTATION),CUDAKernel) + FCFLAGS += -DUSE_CUDAKERNEL $(CUDA_FLAG) + IMPLEMENTATION_OBJECT = CUDAKernelStream.o + +else ifeq ($(IMPLEMENTATION),Sequential) + FCFLAGS += -DUSE_SEQUENTIAL $(SEQUENTIAL_FLAG) + IMPLEMENTATION_OBJECT = SequentialStream.o + +else + $(info Set IMPLEMENTATION={DoConcurrent,Array,OpenMP,OpenMPWorkshare,OpenMPTarget,OpenMPTargetLoop,OpenMPTaskloop,OpenACC,OpenACCArray,CUDA,CUDAKernel}.) + FCFLAGS += -DUSE_SEQUENTIAL $(SEQUENTIAL_FLAG) + IMPLEMENTATION=Sequential + IMPLEMENTATION_OBJECT = SequentialStream.o + +endif + +all: BabelStream.$(COMPILER).$(IMPLEMENTATION) + +BabelStream.$(COMPILER).$(IMPLEMENTATION): main.F90 $(IMPLEMENTATION_OBJECT) + $(FC) $(FCFLAGS) $^ BabelStreamTypes.o -o $@ + +BabelStreamTypes.o BabelStreamTypes.mod: BabelStreamTypes.F90 + $(FC) $(FCFLAGS) -c $< + +%.o: %.F90 BabelStreamTypes.mod + $(FC) $(FCFLAGS) -c $< + +clean: + -rm -f main.o BabelStreamUtil.mod babelstreamutil.mod + -rm -f BabelStreamTypes.o BabelStreamTypes.mod babelstreamtypes.mod + -rm -f DoConcurrentStream.o DoConcurrentStream.mod doconcurrentstream.mod + -rm -f ArrayStream.o ArrayStream.mod arraystream.mod + -rm -f SequentialStream.o SequentialStream.mod sequentialstream.mod + -rm -f OpenMPStream.o OpenMPStream.mod openmpstream.mod + -rm -f OpenMPWorkshareStream.o OpenMPWorkshareStream.mod openmpworksharestream.mod + -rm -f OpenMPTaskloopStream.o OpenMPTaskloopStream.mod openmptaskloopstream.mod + -rm -f OpenMPTargetStream.o OpenMPTargetStream.mod openmptargetstream.mod + -rm -f OpenMPTargetLoopStream.o OpenMPTargetLoopStream.mod openmptargetloopstream.mod + -rm -f OpenACCStream.o OpenACCStream.mod openaccstream.mod + -rm -f OpenACCArrayStream.o OpenACCArrayStream.mod openaccarraystream.mod + -rm -f CUDAStream.o CUDAStream.mod cudastream.mod CUDAFortranKernels.mod cudafortrankernels.mod + -rm -f CUDAKernelStream.o CUDAKernelStream.mod cudakernelstream.mod + -rm -f *.modmic *.mod *.o *.cub *.ptx + +realclean: clean + -rm -f BabelStream.* diff --git a/src/fortran/OpenACCArrayStream.F90 b/src/fortran/OpenACCArrayStream.F90 new file mode 100644 index 0000000..9225fe7 --- /dev/null +++ b/src/fortran/OpenACCArrayStream.F90 @@ -0,0 +1,144 @@ +module OpenACCArrayStream + use, intrinsic :: ISO_Fortran_env + use BabelStreamTypes + + implicit none + + character(len=12), parameter :: implementation_name = "OpenACCArray" + + integer(kind=StreamIntKind) :: N + + real(kind=REAL64), allocatable :: A(:), B(:), C(:) + + contains + + subroutine list_devices() + use openacc + implicit none + integer :: num + num = acc_get_num_devices(acc_get_device_type()) + if (num.eq.0) then + write(*,'(a17)') "No devices found." + else + write(*,'(a10,i1,a8)') "There are ",num," devices." + end if + end subroutine list_devices + + subroutine set_device(dev) + use openacc + implicit none + integer, intent(in) :: dev + integer :: num + num = acc_get_num_devices(acc_get_device_type()) + if (num.eq.0) then + write(*,'(a17)') "No devices found." + stop + else if (dev.gt.num) then + write(*,'(a21)') "Invalid device index." + stop + else + call acc_set_device_num(dev, acc_get_device_type()) + end if + end subroutine set_device + + subroutine alloc(array_size) + implicit none + integer(kind=StreamIntKind) :: array_size + integer :: err + N = array_size + allocate( A(1:N), B(1:N), C(1:N), stat=err) +#ifndef USE_MANAGED + !$acc enter data create(A,B,C) +#endif + if (err .ne. 0) then + write(*,'(a20,i3)') 'allocate returned ',err + stop 1 + endif + end subroutine alloc + + subroutine dealloc() + implicit none + integer :: err +#ifndef USE_MANAGED + !$acc exit data delete(A,B,C) +#endif + deallocate( A, B, C, stat=err) + if (err .ne. 0) then + write(*,'(a20,i3)') 'deallocate returned ',err + stop 1 + endif + end subroutine dealloc + + subroutine init_arrays(initA, initB, initC) + implicit none + real(kind=REAL64), intent(in) :: initA, initB, initC + !$acc kernels + A = initA + B = initB + C = initC + !$acc end kernels + end subroutine init_arrays + + subroutine read_arrays(h_A, h_B, h_C) + implicit none + real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:) + !$acc kernels + h_A = A + h_B = B + h_C = C + !$acc end kernels + end subroutine read_arrays + + subroutine copy() + implicit none + !$acc kernels + C = A + !$acc end kernels + end subroutine copy + + subroutine add() + implicit none + !$acc kernels + C = A + B + !$acc end kernels + end subroutine add + + subroutine mul(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + scalar = startScalar + !$acc kernels + B = scalar * C + !$acc end kernels + end subroutine mul + + subroutine triad(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + scalar = startScalar + !$acc kernels + A = B + scalar * C + !$acc end kernels + end subroutine triad + + subroutine nstream(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + scalar = startScalar + !$acc kernels + A = A + B + scalar * C + !$acc end kernels + end subroutine nstream + + function dot() result(s) + implicit none + real(kind=REAL64) :: s + !$acc kernels + s = dot_product(A,B) + !$acc end kernels + end function dot + +end module OpenACCArrayStream diff --git a/src/fortran/OpenACCStream.F90 b/src/fortran/OpenACCStream.F90 new file mode 100644 index 0000000..7326f38 --- /dev/null +++ b/src/fortran/OpenACCStream.F90 @@ -0,0 +1,161 @@ +module OpenACCStream + use, intrinsic :: ISO_Fortran_env + use BabelStreamTypes + + implicit none + + character(len=7), parameter :: implementation_name = "OpenACC" + + integer(kind=StreamIntKind) :: N + + real(kind=REAL64), allocatable :: A(:), B(:), C(:) + + contains + + subroutine list_devices() + use openacc + implicit none + integer :: num + num = acc_get_num_devices(acc_get_device_type()) + if (num.eq.0) then + write(*,'(a17)') "No devices found." + else + write(*,'(a10,i1,a8)') "There are ",num," devices." + end if + end subroutine list_devices + + subroutine set_device(dev) + use openacc + implicit none + integer, intent(in) :: dev + integer :: num + num = acc_get_num_devices(acc_get_device_type()) + if (num.eq.0) then + write(*,'(a17)') "No devices found." + stop + else if (dev.gt.num) then + write(*,'(a21)') "Invalid device index." + stop + else + call acc_set_device_num(dev, acc_get_device_type()) + end if + end subroutine set_device + + subroutine alloc(array_size) + implicit none + integer(kind=StreamIntKind) :: array_size + integer :: err + N = array_size + allocate( A(1:N), B(1:N), C(1:N), stat=err) +#ifndef USE_MANAGED + !$acc enter data create(A,B,C) +#endif + if (err .ne. 0) then + write(*,'(a20,i3)') 'allocate returned ',err + stop 1 + endif + end subroutine alloc + + subroutine dealloc() + implicit none + integer :: err +#ifndef USE_MANAGED + !$acc exit data delete(A,B,C) +#endif + deallocate( A, B, C, stat=err) + if (err .ne. 0) then + write(*,'(a20,i3)') 'deallocate returned ',err + stop 1 + endif + end subroutine dealloc + + subroutine init_arrays(initA, initB, initC) + implicit none + real(kind=REAL64), intent(in) :: initA, initB, initC + integer(kind=StreamIntKind) :: i + !$acc parallel loop + do i=1,N + A(i) = initA + B(i) = initB + C(i) = initC + end do + end subroutine init_arrays + + subroutine read_arrays(h_A, h_B, h_C) + implicit none + real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:) + integer(kind=StreamIntKind) :: i + !$acc parallel loop + do i=1,N + h_A(i) = A(i) + h_B(i) = B(i) + h_C(i) = C(i) + end do + end subroutine read_arrays + + subroutine copy() + implicit none + integer(kind=StreamIntKind) :: i + !$acc parallel loop + do i=1,N + C(i) = A(i) + end do + end subroutine copy + + subroutine add() + implicit none + integer(kind=StreamIntKind) :: i + !$acc parallel loop + do i=1,N + C(i) = A(i) + B(i) + end do + end subroutine add + + subroutine mul(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + scalar = startScalar + !$acc parallel loop + do i=1,N + B(i) = scalar * C(i) + end do + end subroutine mul + + subroutine triad(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + scalar = startScalar + !$acc parallel loop + do i=1,N + A(i) = B(i) + scalar * C(i) + end do + end subroutine triad + + subroutine nstream(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + scalar = startScalar + !$acc parallel loop + do i=1,N + A(i) = A(i) + B(i) + scalar * C(i) + end do + end subroutine nstream + + function dot() result(s) + implicit none + real(kind=REAL64) :: s + integer(kind=StreamIntKind) :: i + s = real(0,kind=REAL64) + !$acc parallel loop reduction(+:s) + do i=1,N + s = s + A(i) * B(i) + end do + end function dot + +end module OpenACCStream diff --git a/src/fortran/OpenMPStream.F90 b/src/fortran/OpenMPStream.F90 new file mode 100644 index 0000000..7316d5b --- /dev/null +++ b/src/fortran/OpenMPStream.F90 @@ -0,0 +1,137 @@ +module OpenMPStream + use, intrinsic :: ISO_Fortran_env + use BabelStreamTypes + + implicit none + + character(len=6), parameter :: implementation_name = "OpenMP" + + integer(kind=StreamIntKind) :: N + + real(kind=REAL64), allocatable :: A(:), B(:), C(:) + + contains + + subroutine list_devices() + implicit none + write(*,'(a36,a12)') "Listing devices is not supported by ", implementation_name + end subroutine list_devices + + subroutine set_device(dev) + implicit none + integer, intent(in) :: dev + write(*,'(a32,a12)') "Device != 0 is not supported by ", implementation_name + end subroutine set_device + + subroutine alloc(array_size) + implicit none + integer(kind=StreamIntKind) :: array_size + integer :: err + N = array_size + allocate( A(1:N), B(1:N), C(1:N), stat=err) + if (err .ne. 0) then + write(*,'(a20,i3)') 'allocate returned ',err + stop 1 + endif + end subroutine alloc + + subroutine dealloc() + implicit none + integer :: err + deallocate( A, B, C, stat=err) + if (err .ne. 0) then + write(*,'(a20,i3)') 'deallocate returned ',err + stop 1 + endif + end subroutine dealloc + + subroutine init_arrays(initA, initB, initC) + implicit none + real(kind=REAL64), intent(in) :: initA, initB, initC + integer(kind=StreamIntKind) :: i + !$omp parallel do simd + do i=1,N + A(i) = initA + B(i) = initB + C(i) = initC + end do + end subroutine init_arrays + + subroutine read_arrays(h_A, h_B, h_C) + implicit none + real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:) + integer(kind=StreamIntKind) :: i + !$omp parallel do simd + do i=1,N + h_A(i) = A(i) + h_B(i) = B(i) + h_C(i) = C(i) + end do + end subroutine read_arrays + + subroutine copy() + implicit none + integer(kind=StreamIntKind) :: i + !$omp parallel do simd + do i=1,N + C(i) = A(i) + end do + end subroutine copy + + subroutine add() + implicit none + integer(kind=StreamIntKind) :: i + !$omp parallel do simd + do i=1,N + C(i) = A(i) + B(i) + end do + end subroutine add + + subroutine mul(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + scalar = startScalar + !$omp parallel do simd + do i=1,N + B(i) = scalar * C(i) + end do + end subroutine mul + + subroutine triad(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + scalar = startScalar + !$omp parallel do simd + do i=1,N + A(i) = B(i) + scalar * C(i) + end do + end subroutine triad + + subroutine nstream(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + scalar = startScalar + !$omp parallel do simd + do i=1,N + A(i) = A(i) + B(i) + scalar * C(i) + end do + end subroutine nstream + + function dot() result(s) + implicit none + real(kind=REAL64) :: s + integer(kind=StreamIntKind) :: i + s = real(0,kind=REAL64) + !$omp parallel do simd reduction(+:s) + do i=1,N + s = s + A(i) * B(i) + end do + end function dot + +end module OpenMPStream diff --git a/src/fortran/OpenMPTargetLoopStream.F90 b/src/fortran/OpenMPTargetLoopStream.F90 new file mode 100644 index 0000000..9684ced --- /dev/null +++ b/src/fortran/OpenMPTargetLoopStream.F90 @@ -0,0 +1,162 @@ +module OpenMPTargetLoopStream + use, intrinsic :: ISO_Fortran_env + use BabelStreamTypes + + implicit none + + character(len=16), parameter :: implementation_name = "OpenMPTargetLoop" + + integer(kind=StreamIntKind) :: N + + real(kind=REAL64), allocatable :: A(:), B(:), C(:) + + contains + + subroutine list_devices() + use omp_lib + implicit none + integer :: num + num = omp_get_num_devices() + if (num.eq.0) then + write(*,'(a17)') "No devices found." + else + write(*,'(a10,i1,a8)') "There are ",num," devices." + end if + end subroutine list_devices + + subroutine set_device(dev) + use omp_lib + implicit none + integer, intent(in) :: dev + integer :: num + num = omp_get_num_devices() + if (num.eq.0) then + write(*,'(a17)') "No devices found." + stop + else if (dev.gt.num) then + write(*,'(a21)') "Invalid device index." + stop + else + call omp_set_default_device(dev) + end if + end subroutine set_device + + subroutine alloc(array_size) + implicit none + integer(kind=StreamIntKind) :: array_size + integer :: err + N = array_size + allocate( A(1:N), B(1:N), C(1:N), stat=err) +#ifndef USE_MANAGED + !$omp target enter data map(alloc: A,B,C) +#endif + if (err .ne. 0) then + write(*,'(a20,i3)') 'allocate returned ',err + stop 1 + endif + end subroutine alloc + + subroutine dealloc() + implicit none + integer :: err +#ifndef USE_MANAGED + !$omp target exit data map(delete: A,B,C) +#endif + deallocate( A, B, C, stat=err) + if (err .ne. 0) then + write(*,'(a20,i3)') 'deallocate returned ',err + stop 1 + endif + end subroutine dealloc + + subroutine init_arrays(initA, initB, initC) + implicit none + real(kind=REAL64), intent(in) :: initA, initB, initC + integer(kind=StreamIntKind) :: i + !$omp target teams loop + do i=1,N + A(i) = initA + B(i) = initB + C(i) = initC + end do + end subroutine init_arrays + + subroutine read_arrays(h_A, h_B, h_C) + implicit none + real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:) + integer(kind=StreamIntKind) :: i + ! this might need to use a copy API instead... + !$omp target teams loop + do i=1,N + h_A(i) = A(i) + h_B(i) = B(i) + h_C(i) = C(i) + end do + end subroutine read_arrays + + subroutine copy() + implicit none + integer(kind=StreamIntKind) :: i + !$omp target teams loop + do i=1,N + C(i) = A(i) + end do + end subroutine copy + + subroutine add() + implicit none + integer(kind=StreamIntKind) :: i + !$omp target teams loop + do i=1,N + C(i) = A(i) + B(i) + end do + end subroutine add + + subroutine mul(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + scalar = startScalar + !$omp target teams loop + do i=1,N + B(i) = scalar * C(i) + end do + end subroutine mul + + subroutine triad(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + scalar = startScalar + !$omp target teams loop + do i=1,N + A(i) = B(i) + scalar * C(i) + end do + end subroutine triad + + subroutine nstream(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + scalar = startScalar + !$omp target teams loop + do i=1,N + A(i) = A(i) + B(i) + scalar * C(i) + end do + end subroutine nstream + + function dot() result(s) + implicit none + real(kind=REAL64) :: s + integer(kind=StreamIntKind) :: i + s = real(0,kind=REAL64) + !$omp target teams loop reduction(+:s) + do i=1,N + s = s + A(i) * B(i) + end do + end function dot + +end module OpenMPTargetLoopStream diff --git a/src/fortran/OpenMPTargetStream.F90 b/src/fortran/OpenMPTargetStream.F90 new file mode 100644 index 0000000..0206d78 --- /dev/null +++ b/src/fortran/OpenMPTargetStream.F90 @@ -0,0 +1,163 @@ +module OpenMPTargetStream + use, intrinsic :: ISO_Fortran_env + use BabelStreamTypes + + implicit none + + character(len=12), parameter :: implementation_name = "OpenMPTarget" + + integer(kind=StreamIntKind) :: N + + real(kind=REAL64), allocatable :: A(:), B(:), C(:) + + contains + + subroutine list_devices() + use omp_lib + implicit none + integer :: num + num = omp_get_num_devices() + if (num.eq.0) then + write(*,'(a17)') "No devices found." + else + write(*,'(a10,i1,a8)') "There are ",num," devices." + end if + end subroutine list_devices + + subroutine set_device(dev) + use omp_lib + implicit none + integer, intent(in) :: dev + integer :: num + num = omp_get_num_devices() + if (num.eq.0) then + write(*,'(a17)') "No devices found." + stop + else if (dev.gt.num) then + write(*,'(a21)') "Invalid device index." + stop + else + call omp_set_default_device(dev) + end if + end subroutine set_device + + subroutine alloc(array_size) + implicit none + integer(kind=StreamIntKind) :: array_size + integer :: err + N = array_size + allocate( A(1:N), B(1:N), C(1:N), stat=err) +#ifndef USE_MANAGED + !$omp target enter data map(alloc: A,B,C) +#endif + if (err .ne. 0) then + write(*,'(a20,i3)') 'allocate returned ',err + stop 1 + endif + end subroutine alloc + + subroutine dealloc() + implicit none + integer :: err +#ifndef USE_MANAGED + !$omp target exit data map(delete: A,B,C) +#endif + deallocate( A, B, C, stat=err) + if (err .ne. 0) then + write(*,'(a20,i3)') 'deallocate returned ',err + stop 1 + endif + end subroutine dealloc + + subroutine init_arrays(initA, initB, initC) + implicit none + real(kind=REAL64), intent(in) :: initA, initB, initC + integer(kind=StreamIntKind) :: i + !$omp target teams distribute parallel do simd + do i=1,N + A(i) = initA + B(i) = initB + C(i) = initC + end do + end subroutine init_arrays + + subroutine read_arrays(h_A, h_B, h_C) + implicit none + real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:) + integer(kind=StreamIntKind) :: i + ! this might need to use a copy API instead... + !$omp target teams distribute parallel do simd + do i=1,N + h_A(i) = A(i) + h_B(i) = B(i) + h_C(i) = C(i) + end do + end subroutine read_arrays + + subroutine copy() + implicit none + integer(kind=StreamIntKind) :: i + !$omp target teams distribute parallel do simd + do i=1,N + C(i) = A(i) + end do + !$omp barrier + end subroutine copy + + subroutine add() + implicit none + integer(kind=StreamIntKind) :: i + !$omp target teams distribute parallel do simd + do i=1,N + C(i) = A(i) + B(i) + end do + end subroutine add + + subroutine mul(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + scalar = startScalar + !$omp target teams distribute parallel do simd + do i=1,N + B(i) = scalar * C(i) + end do + end subroutine mul + + subroutine triad(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + scalar = startScalar + !$omp target teams distribute parallel do simd + do i=1,N + A(i) = B(i) + scalar * C(i) + end do + end subroutine triad + + subroutine nstream(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + scalar = startScalar + !$omp target teams distribute parallel do simd + do i=1,N + A(i) = A(i) + B(i) + scalar * C(i) + end do + end subroutine nstream + + function dot() result(s) + implicit none + real(kind=REAL64) :: s + integer(kind=StreamIntKind) :: i + s = real(0,kind=REAL64) + !$omp target teams distribute parallel do simd reduction(+:s) + do i=1,N + s = s + A(i) * B(i) + end do + end function dot + +end module OpenMPTargetStream diff --git a/src/fortran/OpenMPTaskloopStream.F90 b/src/fortran/OpenMPTaskloopStream.F90 new file mode 100644 index 0000000..579a761 --- /dev/null +++ b/src/fortran/OpenMPTaskloopStream.F90 @@ -0,0 +1,169 @@ +module OpenMPTaskloopStream + use, intrinsic :: ISO_Fortran_env + use BabelStreamTypes + + implicit none + + character(len=14), parameter :: implementation_name = "OpenMPTaskloop" + + integer(kind=StreamIntKind) :: N + + real(kind=REAL64), allocatable :: A(:), B(:), C(:) + + contains + + subroutine list_devices() + implicit none + write(*,'(a36,a12)') "Listing devices is not supported by ", implementation_name + end subroutine list_devices + + subroutine set_device(dev) + implicit none + integer, intent(in) :: dev + write(*,'(a32,a12)') "Device != 0 is not supported by ", implementation_name + end subroutine set_device + + subroutine alloc(array_size) + implicit none + integer(kind=StreamIntKind) :: array_size + integer :: err + N = array_size + allocate( A(1:N), B(1:N), C(1:N), stat=err) + if (err .ne. 0) then + write(*,'(a20,i3)') 'allocate returned ',err + stop 1 + endif + end subroutine alloc + + subroutine dealloc() + implicit none + integer :: err + deallocate( A, B, C, stat=err) + if (err .ne. 0) then + write(*,'(a20,i3)') 'deallocate returned ',err + stop 1 + endif + end subroutine dealloc + + subroutine init_arrays(initA, initB, initC) + implicit none + real(kind=REAL64), intent(in) :: initA, initB, initC + integer(kind=StreamIntKind) :: i + !$omp parallel + !$omp master + !$omp taskloop + do i=1,N + A(i) = initA + B(i) = initB + C(i) = initC + end do + !$omp end master + !$omp end parallel + end subroutine init_arrays + + subroutine read_arrays(h_A, h_B, h_C) + implicit none + real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:) + integer(kind=StreamIntKind) :: i + !$omp parallel + !$omp master + !$omp taskloop + do i=1,N + h_A(i) = A(i) + h_B(i) = B(i) + h_C(i) = C(i) + end do + !$omp end master + !$omp end parallel + end subroutine read_arrays + + subroutine copy() + implicit none + integer(kind=StreamIntKind) :: i + !$omp parallel + !$omp master + !$omp taskloop + do i=1,N + C(i) = A(i) + end do + !$omp end master + !$omp end parallel + end subroutine copy + + subroutine add() + implicit none + integer(kind=StreamIntKind) :: i + !$omp parallel + !$omp master + !$omp taskloop + do i=1,N + C(i) = A(i) + B(i) + end do + !$omp end master + !$omp end parallel + end subroutine add + + subroutine mul(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + scalar = startScalar + !$omp parallel + !$omp master + !$omp taskloop + do i=1,N + B(i) = scalar * C(i) + end do + !$omp end master + !$omp end parallel + end subroutine mul + + subroutine triad(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + scalar = startScalar + !$omp parallel + !$omp master + !$omp taskloop + do i=1,N + A(i) = B(i) + scalar * C(i) + end do + !$omp end master + !$omp end parallel + end subroutine triad + + subroutine nstream(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + scalar = startScalar + !$omp parallel + !$omp master + !$omp taskloop + do i=1,N + A(i) = A(i) + B(i) + scalar * C(i) + end do + !$omp end master + !$omp end parallel + end subroutine nstream + + function dot() result(s) + implicit none + real(kind=REAL64) :: s + integer(kind=StreamIntKind) :: i + s = real(0,kind=REAL64) + !$omp parallel + !$omp master + !$omp taskloop reduction(+:s) + do i=1,N + s = s + A(i) * B(i) + end do + !$omp end master + !$omp end parallel + end function dot + +end module OpenMPTaskloopStream diff --git a/src/fortran/OpenMPWorkshareStream.F90 b/src/fortran/OpenMPWorkshareStream.F90 new file mode 100644 index 0000000..fd50f86 --- /dev/null +++ b/src/fortran/OpenMPWorkshareStream.F90 @@ -0,0 +1,120 @@ +module OpenMPWorkshareStream + use, intrinsic :: ISO_Fortran_env + use BabelStreamTypes + + implicit none + + character(len=15), parameter :: implementation_name = "OpenMPWorkshare" + + integer(kind=StreamIntKind) :: N + + real(kind=REAL64), allocatable :: A(:), B(:), C(:) + + contains + + subroutine list_devices() + implicit none + write(*,'(a36,a12)') "Listing devices is not supported by ", implementation_name + end subroutine list_devices + + subroutine set_device(dev) + implicit none + integer, intent(in) :: dev + write(*,'(a32,a12)') "Device != 0 is not supported by ", implementation_name + end subroutine set_device + + subroutine alloc(array_size) + implicit none + integer(kind=StreamIntKind) :: array_size + integer :: err + N = array_size + allocate( A(1:N), B(1:N), C(1:N), stat=err) + if (err .ne. 0) then + write(*,'(a20,i3)') 'allocate returned ',err + stop 1 + endif + end subroutine alloc + + subroutine dealloc() + implicit none + integer :: err + deallocate( A, B, C, stat=err) + if (err .ne. 0) then + write(*,'(a20,i3)') 'deallocate returned ',err + stop 1 + endif + end subroutine dealloc + + subroutine init_arrays(initA, initB, initC) + implicit none + real(kind=REAL64), intent(in) :: initA, initB, initC + !$omp parallel workshare + A = initA + B = initB + C = initC + !$omp end parallel workshare + end subroutine init_arrays + + subroutine read_arrays(h_A, h_B, h_C) + implicit none + real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:) + !$omp parallel workshare + h_A = A + h_B = B + h_C = C + !$omp end parallel workshare + end subroutine read_arrays + + subroutine copy() + implicit none + !$omp parallel workshare + C = A + !$omp end parallel workshare + end subroutine copy + + subroutine add() + implicit none + !$omp parallel workshare + C = A + B + !$omp end parallel workshare + end subroutine add + + subroutine mul(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + scalar = startScalar + !$omp parallel workshare + B = scalar * C + !$omp end parallel workshare + end subroutine mul + + subroutine triad(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + scalar = startScalar + !$omp parallel workshare + A = B + scalar * C + !$omp end parallel workshare + end subroutine triad + + subroutine nstream(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + scalar = startScalar + !$omp parallel workshare + A = A + B + scalar * C + !$omp end parallel workshare + end subroutine nstream + + function dot() result(s) + implicit none + real(kind=REAL64) :: s + !$omp parallel workshare + s = dot_product(A,B) + !$omp end parallel workshare + end function dot + +end module OpenMPWorkshareStream diff --git a/src/fortran/SequentialStream.F90 b/src/fortran/SequentialStream.F90 new file mode 100644 index 0000000..a8f6917 --- /dev/null +++ b/src/fortran/SequentialStream.F90 @@ -0,0 +1,130 @@ +module SequentialStream + use, intrinsic :: ISO_Fortran_env + use BabelStreamTypes + + implicit none + + character(len=10), parameter :: implementation_name = "Sequential" + + integer(kind=StreamIntKind) :: N + + real(kind=REAL64), allocatable :: A(:), B(:), C(:) + + contains + + subroutine list_devices() + implicit none + integer :: num + write(*,'(a36,a10)') "Listing devices is not supported by ", implementation_name + end subroutine list_devices + + subroutine set_device(dev) + implicit none + integer, intent(in) :: dev + write(*,'(a32,a10)') "Device != 0 is not supported by ", implementation_name + end subroutine set_device + + subroutine alloc(array_size) + implicit none + integer(kind=StreamIntKind) :: array_size + integer :: err + N = array_size + allocate( A(1:N), B(1:N), C(1:N), stat=err) + if (err .ne. 0) then + write(*,'(a20,i3)') 'allocate returned ',err + stop 1 + endif + end subroutine alloc + + subroutine dealloc() + implicit none + integer :: err + deallocate( A, B, C, stat=err) + if (err .ne. 0) then + write(*,'(a20,i3)') 'deallocate returned ',err + stop 1 + endif + end subroutine dealloc + + subroutine init_arrays(initA, initB, initC) + implicit none + real(kind=REAL64), intent(in) :: initA, initB, initC + integer(kind=StreamIntKind) :: i + do i=1,N + A(i) = initA + B(i) = initB + C(i) = initC + end do + end subroutine init_arrays + + subroutine read_arrays(h_A, h_B, h_C) + implicit none + real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:) + integer(kind=StreamIntKind) :: i + do i=1,N + h_A(i) = A(i) + h_B(i) = B(i) + h_C(i) = C(i) + end do + end subroutine read_arrays + + subroutine copy() + implicit none + integer(kind=StreamIntKind) :: i + do i=1,N + C(i) = A(i) + end do + end subroutine copy + + subroutine add() + implicit none + integer(kind=StreamIntKind) :: i + do i=1,N + C(i) = A(i) + B(i) + end do + end subroutine add + + subroutine mul(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + scalar = startScalar + do i=1,N + B(i) = scalar * C(i) + end do + end subroutine mul + + subroutine triad(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + scalar = startScalar + do i=1,N + A(i) = B(i) + scalar * C(i) + end do + end subroutine triad + + subroutine nstream(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + scalar = startScalar + do i=1,N + A(i) = A(i) + B(i) + scalar * C(i) + end do + end subroutine nstream + + function dot() result(s) + implicit none + real(kind=REAL64) :: s + integer(kind=StreamIntKind) :: i + s = real(0,kind=REAL64) + do i=1,N + s = s + A(i) * B(i) + end do + end function dot + +end module SequentialStream diff --git a/src/fortran/build.sh b/src/fortran/build.sh new file mode 100755 index 0000000..9343354 --- /dev/null +++ b/src/fortran/build.sh @@ -0,0 +1,54 @@ +#!/bin/bash + +# uncomment to disable GPU targets +#HAS_GPU=0 + +# Orin +#if [ "x${compiler}" == "xgcc" ] ; then +# export MCPU=cortex-a78ae +#fi +#if [ "x${compiler}" == "xarm" ] ; then +# export MCPU=cortex-a78 +#fi + +COMPILERS="gcc" +if [ $(which nvfortran) ] ; then + COMPILERS="${COMPILERS} nvhpc" +fi +if [ $(which crayftn) ] ; then + COMPILERS="${COMPILERS} cray" +fi +if [ $(uname -m) == "aarch64" ] ; then + if [ $(which armflang) ] ; then + COMPILERS="${COMPILERS} arm" + fi + if [ $(which frt) ] ; then + COMPILERS="${COMPILERS} fj" + fi +elif [ $(uname -m) == "x86_64" ] ; then + if [ $(which lscpu >& /dev/null && lscpu | grep GenuineIntel | awk '{print $3}') == "GenuineIntel" ] ; then + COMPILERS="${COMPILERS} oneapi" + if [ -f /opt/intel/oneapi/setvars.sh ] ; then + . /opt/intel/oneapi/setvars.sh >& /dev/null + fi + else + # ^ this detection can be improved + COMPILERS="${COMPILERS} amd" + fi +fi + +for compiler in ${COMPILERS} ; do + TARGETS="DoConcurrent Array OpenMP OpenMPTaskloop OpenMPWorkshare" + if [ "${HAS_GPU}" != "0" ] ; then + TARGETS="${TARGETS} OpenMPTarget OpenMPTargetLoop" + if [ "x${compiler}" == "xnvhpc" ] ; then + TARGETS="${TARGETS} CUDA CUDAKernel" + fi + fi + if [ "x${compiler}" == "xnvhpc" ] || [ "x${compiler}" == "xgcc" ] || [ "x${compiler}" == "xcray" ] ; then + TARGETS="${TARGETS} OpenACC OpenACCArray" + fi + for implementation in ${TARGETS} ; do + make COMPILER=${compiler} IMPLEMENTATION=${implementation} + done +done diff --git a/src/fortran/main.F90 b/src/fortran/main.F90 new file mode 100644 index 0000000..d86e8d4 --- /dev/null +++ b/src/fortran/main.F90 @@ -0,0 +1,666 @@ +module BabelStreamUtil + use, intrinsic :: ISO_Fortran_env, only: REAL64,INT64 + use BabelStreamTypes + + implicit none + + integer(kind=StreamIntKind) :: array_size = 33554432 + integer(kind=StreamIntKind) :: num_times = 100 + logical :: mibibytes = .false. + logical :: use_gigs = .false. + logical :: csv = .false. + character(len=1), parameter :: csv_sep = "," + + ! 1 = All + ! 2 = Triad + ! 3 = Nstream + integer :: selection = 1 + + real(kind=REAL64), parameter :: startA = real(0.1d0,kind=REAL64) + real(kind=REAL64), parameter :: startB = real(0.2d0,kind=REAL64) + real(kind=REAL64), parameter :: startC = real(0.0d0,kind=REAL64) + real(kind=REAL64), parameter :: startScalar = real(0.4d0,kind=REAL64) + + contains + + function get_wtime() result(t) +#if defined(USE_OMP_GET_WTIME) + use omp_lib + implicit none + real(kind=REAL64) :: t + t = omp_get_wtime() +#elif defined(USE_CPU_TIME) + implicit none + real(kind=REAL64) :: t + real :: r + call cpu_time(r) + t = r +#else + implicit none + real(kind=REAL64) :: t + integer(kind=INT64) :: c, r + call system_clock(count = c, count_rate = r) + t = real(c,REAL64) / real(r,REAL64) +#endif + end function get_wtime + + subroutine parseArguments() + use, intrinsic :: ISO_Fortran_env, only: compiler_version, compiler_options +#if defined(USE_DOCONCURRENT) + use DoConcurrentStream, only: list_devices, set_device +#elif defined(USE_ARRAY) + use ArrayStream, only: list_devices, set_device +#elif defined(USE_OPENMP) + use OpenMPStream, only: list_devices, set_device +#elif defined(USE_OPENMPWORKSHARE) + use OpenMPWorkshareStream, only: list_devices, set_device +#elif defined(USE_OPENMPTARGET) + use OpenMPTargetStream, only: list_devices, set_device +#elif defined(USE_OPENMPTARGETLOOP) + use OpenMPTargetLoopStream, only: list_devices, set_device +#elif defined(USE_OPENMPTASKLOOP) + use OpenMPTaskloopStream, only: list_devices, set_device +#elif defined(USE_OPENACC) + use OpenACCStream, only: list_devices, set_device +#elif defined(USE_OPENACCARRAY) + use OpenACCArrayStream, only: list_devices, set_device +#elif defined(USE_CUDA) + use CUDAStream, only: list_devices, set_device +#elif defined(USE_CUDAKERNEL) + use CUDAKernelStream, only: list_devices, set_device +#elif defined(USE_SEQUENTIAL) + use SequentialStream, only: list_devices, set_device +#endif + implicit none + integer :: i, argc + integer :: arglen,err,pos(2) + character(len=64) :: argtmp + argc = command_argument_count() + do i=1,argc + call get_command_argument(i,argtmp,arglen,err) + if (err.eq.0) then + ! + ! list devices + ! + pos(1) = index(argtmp,"--list") + if (pos(1).eq.1) then + call list_devices() + stop + endif + ! + ! set device number + ! + pos(1) = index(argtmp,"--device") + if (pos(1).eq.1) then + if (i+1.gt.argc) then + print*,'You failed to provide a value for ',argtmp + stop + else + call get_command_argument(i+1,argtmp,arglen,err) + block + integer :: dev + read(argtmp,'(i15)') dev + call set_device(dev) + end block + endif + cycle + endif + ! + ! array size + ! + pos(1) = index(argtmp,"--arraysize") + pos(2) = index(argtmp,"-s") + if (any(pos(:).eq.1) ) then + if (i+1.gt.argc) then + print*,'You failed to provide a value for ',argtmp + else + call get_command_argument(i+1,argtmp,arglen,err) + block + integer(kind=INT64) :: big_size + read(argtmp,'(i15)') big_size + if (big_size .gt. HUGE(array_size)) then + print*,'Array size does not fit into integer:' + print*,big_size,'>',HUGE(array_size) + print*,'Stop using USE_INT32' + stop + else + array_size = INT(big_size,kind=StreamIntKind) + endif + end block + endif + cycle + endif + ! + ! number of iterations + ! + pos(1) = index(argtmp,"--numtimes") + pos(2) = index(argtmp,"-n") + if (any(pos(:).eq.1) ) then + if (i+1.gt.argc) then + print*,'You failed to provide a value for ',argtmp + else + call get_command_argument(i+1,argtmp,arglen,err) + read(argtmp,'(i15)') num_times + if (num_times.lt.2) then + write(*,'(a)') "Number of times must be 2 or more" + stop + end if + endif + cycle + endif + ! + ! precision + ! + pos(1) = index(argtmp,"--float") + if (pos(1).eq.1) then + write(*,'(a46,a39)') "Sorry, you have to recompile with -DUSE_FLOAT ", & + "to run BabelStream in single precision." + stop + endif + ! + ! selection (All, Triad, Nstream) + ! + pos(1) = index(argtmp,"--triad-only") + if (pos(1).eq.1) then + selection = 2 + cycle + endif + pos(1) = index(argtmp,"--nstream-only") + if (pos(1).eq.1) then + selection = 3 + cycle + endif + ! + ! CSV + ! + pos(1) = index(argtmp,"--csv") + if (pos(1).eq.1) then + csv = .true. + !write(*,'(a39)') "Sorry, CSV support isn't available yet." + !stop + endif + ! + ! units + ! + pos(1) = index(argtmp,"--mibibytes") + if (pos(1).eq.1) then + mibibytes = .true. + cycle + endif + ! + ! giga/gibi instead of mega/mebi + ! + pos(1) = index(argtmp,"--gigs") + if (pos(1).eq.1) then + use_gigs = .true. + cycle + endif + ! + ! + ! + pos(1) = index(argtmp,"--compiler-info") + if (pos(1).eq.1) then + write(*,'(a)') 'Compiler version: ',compiler_version() + write(*,'(a)') 'Compiler options: ',compiler_options() + stop + endif + ! + ! help + ! + pos(1) = index(argtmp,"--help") + pos(2) = index(argtmp,"-h") + if (any(pos(:).eq.1) ) then + call get_command_argument(0,argtmp,arglen,err) + write(*,'(a7,a,a10)') "Usage: ", trim(argtmp), " [OPTIONS]" + write(*,'(a)') "Options:" + write(*,'(a)') " -h --help Print the message" + write(*,'(a)') " --list List available devices" + write(*,'(a)') " --device INDEX Select device at INDEX" + write(*,'(a)') " -s --arraysize SIZE Use SIZE elements in the array" + write(*,'(a)') " -n --numtimes NUM Run the test NUM times (NUM >= 2)" + !write(*,'(a)') " --float Use floats (rather than doubles)" + write(*,'(a)') " --triad-only Only run triad" + write(*,'(a)') " --nstream-only Only run nstream" + write(*,'(a)') " --csv Output as csv table" + write(*,'(a)') " --mibibytes Use MiB=2^20 for bandwidth calculation (default MB=10^6)" + write(*,'(a)') " --gigs Use GiB=2^30 or GB=10^9 instead of MiB/MB" + write(*,'(a)') " --compiler-info Print information about compiler and flags, then exit." + stop + endif + end if + end do + end subroutine parseArguments + + subroutine run_all(timings, summ) +#if defined(USE_DOCONCURRENT) + use DoConcurrentStream +#elif defined(USE_ARRAY) + use ArrayStream +#elif defined(USE_OPENMP) + use OpenMPStream +#elif defined(USE_OPENMPWORKSHARE) + use OpenMPWorkshareStream +#elif defined(USE_OPENMPTARGET) + use OpenMPTargetStream +#elif defined(USE_OPENMPTARGETLOOP) + use OpenMPTargetLoopStream +#elif defined(USE_OPENMPTASKLOOP) + use OpenMPTaskloopStream +#elif defined(USE_OPENACC) + use OpenACCStream +#elif defined(USE_OPENACCARRAY) + use OpenACCArrayStream +#elif defined(USE_CUDA) + use CUDAStream +#elif defined(USE_CUDAKERNEL) + use CUDAKernelStream +#elif defined(USE_SEQUENTIAL) + use SequentialStream +#endif + implicit none + real(kind=REAL64), intent(inout) :: timings(:,:) + real(kind=REAL64), intent(out) :: summ + real(kind=REAL64) :: t1, t2 + integer(kind=StreamIntKind) :: i + + do i=1,num_times + + t1 = get_wtime() + call copy() + t2 = get_wtime() + timings(1,i) = t2-t1 + + t1 = get_wtime() + call mul(startScalar) + t2 = get_wtime() + timings(2,i) = t2-t1 + + t1 = get_wtime() + call add() + t2 = get_wtime() + timings(3,i) = t2-t1 + + t1 = get_wtime() + call triad(startScalar) + t2 = get_wtime() + timings(4,i) = t2-t1 + + t1 = get_wtime() + summ = dot() + t2 = get_wtime() + timings(5,i) = t2-t1 + + end do + + end subroutine run_all + + subroutine run_triad(timings) +#if defined(USE_DOCONCURRENT) + use DoConcurrentStream +#elif defined(USE_ARRAY) + use ArrayStream +#elif defined(USE_OPENMP) + use OpenMPStream +#elif defined(USE_OPENMPWORKSHARE) + use OpenMPWorkshareStream +#elif defined(USE_OPENMPTARGET) + use OpenMPTargetStream +#elif defined(USE_OPENMPTARGETLOOP) + use OpenMPTargetLoopStream +#elif defined(USE_OPENMPTASKLOOP) + use OpenMPTaskloopStream +#elif defined(USE_OPENACC) + use OpenACCStream +#elif defined(USE_OPENACCARRAY) + use OpenACCArrayStream +#elif defined(USE_CUDA) + use CUDAStream +#elif defined(USE_CUDAKERNEL) + use CUDAKernelStream +#elif defined(USE_SEQUENTIAL) + use SequentialStream +#endif + implicit none + real(kind=REAL64), intent(inout) :: timings(:,:) + real(kind=REAL64) :: t1, t2 + integer(kind=StreamIntKind) :: i + + do i=1,num_times + + t1 = get_wtime() + call triad(startScalar) + t2 = get_wtime() + timings(1,i) = t2-t1 + + end do + + end subroutine run_triad + + subroutine run_nstream(timings) +#if defined(USE_DOCONCURRENT) + use DoConcurrentStream +#elif defined(USE_ARRAY) + use ArrayStream +#elif defined(USE_OPENMP) + use OpenMPStream +#elif defined(USE_OPENMPWORKSHARE) + use OpenMPWorkshareStream +#elif defined(USE_OPENMPTARGET) + use OpenMPTargetStream +#elif defined(USE_OPENMPTARGETLOOP) + use OpenMPTargetLoopStream +#elif defined(USE_OPENMPTASKLOOP) + use OpenMPTaskloopStream +#elif defined(USE_OPENACC) + use OpenACCStream +#elif defined(USE_OPENACCARRAY) + use OpenACCArrayStream +#elif defined(USE_CUDA) + use CUDAStream +#elif defined(USE_CUDAKERNEL) + use CUDAKernelStream +#elif defined(USE_SEQUENTIAL) + use SequentialStream +#endif + implicit none + real(kind=REAL64), intent(inout) :: timings(:,:) + real(kind=REAL64) :: t1, t2 + integer(kind=StreamIntKind) :: i + + do i=1,num_times + + t1 = get_wtime() + call nstream(startScalar) + t2 = get_wtime() + timings(1,i) = t2-t1 + + end do + + end subroutine run_nstream + + subroutine check_solution(A, B, C, summ) + use, intrinsic :: IEEE_Arithmetic, only: IEEE_Is_Normal + implicit none + real(kind=REAL64), intent(in) :: A(:), B(:), C(:) + real(kind=REAL64), intent(in) :: summ + + integer(kind=StreamIntKind) :: i + real(kind=REAL64) :: goldA, goldB, goldC, goldSum + real(kind=REAL64) :: scalar + + ! always use double because of accumulation error + real(kind=REAL64) :: errA, errB, errC, errSum, epsi + logical :: cleanA, cleanB, cleanC, cleanSum + + goldA = startA + goldB = startB + goldC = startC + goldSum = 0.0d0 + + scalar = startScalar + + do i=1,num_times + + if (selection.eq.1) then + goldC = goldA + goldB = scalar * goldC + goldC = goldA + goldB + goldA = goldB + scalar * goldC + else if (selection.eq.2) then + goldA = goldB + scalar * goldC + else if (selection.eq.3) then + goldA = goldA + goldB + scalar * goldC; + endif + + end do + + goldSum = goldA * goldB * array_size + + cleanA = ALL(IEEE_Is_Normal(A)) + cleanB = ALL(IEEE_Is_Normal(B)) + cleanC = ALL(IEEE_Is_Normal(C)) + cleanSum = IEEE_Is_Normal(summ) + + if (.not. cleanA) then + write(*,'(a51)') "Validation failed on A. Contains NaA/Inf/Subnormal." + end if + if (.not. cleanB) then + write(*,'(a51)') "Validation failed on B. Contains NaA/Inf/Subnormal." + end if + if (.not. cleanC) then + write(*,'(a51)') "Validation failed on C. Contains NaA/Inf/Subnormal." + end if + if (.not. cleanSum) then + write(*,'(a54,e20.12)') "Validation failed on Sum. Contains NaA/Inf/Subnormal: ",summ + end if + + errA = SUM( ABS( A - goldA ) ) / array_size + errB = SUM( ABS( B - goldB ) ) / array_size + errC = SUM( ABS( C - goldC ) ) / array_size + errSum = ABS( (summ - goldSum) / goldSum) + + epsi = epsilon(real(0,kind=StreamRealKind)) * 100.0d0 + + if (errA .gt. epsi) then + write(*,'(a38,e20.12)') "Validation failed on A. Average error ", errA + end if + if (errB .gt. epsi) then + write(*,'(a38,e20.12)') "Validation failed on B. Average error ", errB + end if + if (errC .gt. epsi) then + write(*,'(a38,e20.12)') "Validation failed on C. Average error ", errC + end if + + if (selection.eq.1) then + if (errSum .gt. 1.0e-8) then + write(*,'(a38,e20.12)') "Validation failed on Sum. Error ", errSum + write(*,'(a8,e20.12,a15,e20.12)') "Sum was ",summ, " but should be ", errSum + end if + endif + + end subroutine check_solution + +end module BabelStreamUtil + +program BabelStream + use BabelStreamUtil +#if defined(USE_DOCONCURRENT) + use DoConcurrentStream +#elif defined(USE_ARRAY) + use ArrayStream +#elif defined(USE_OPENMP) + use OpenMPStream +#elif defined(USE_OPENMPWORKSHARE) + use OpenMPWorkshareStream +#elif defined(USE_OPENMPTARGET) + use OpenMPTargetStream +#elif defined(USE_OPENMPTARGETLOOP) + use OpenMPTargetLoopStream +#elif defined(USE_OPENMPTASKLOOP) + use OpenMPTaskloopStream +#elif defined(USE_OPENACC) + use OpenACCStream +#elif defined(USE_OPENACCARRAY) + use OpenACCArrayStream +#elif defined(USE_CUDA) + use CUDAStream +#elif defined(USE_CUDAKERNEL) + use CUDAKernelStream +#elif defined(USE_SEQUENTIAL) + use SequentialStream +#endif + implicit none + integer :: element_size, err + real(kind=REAL64) :: scaling + character(len=3) :: label + real(kind=REAL64), allocatable :: timings(:,:) + real(kind=REAL64), allocatable :: h_A(:), h_B(:), h_C(:) + real(kind=REAL64) :: summ + + call parseArguments() + + element_size = storage_size(real(0,kind=StreamRealKind)) / 8 + + if (mibibytes) then + if (use_gigs) then + scaling = 2.0d0**(-30) + label = "GiB" + else + scaling = 2.0d0**(-20) + label = "MiB" + endif + else + if (use_gigs) then + scaling = 1.0d-9 + label = "GB" + else + scaling = 1.0d-6 + label = "MB" + endif + endif + + if (.not.csv) then + + write(*,'(a)') "BabelStream Fortran" + write(*,'(a9,f4.1)') "Version: ", VERSION_STRING + write(*,'(a16,a)') "Implementation: ", implementation_name + + block + character(len=32) :: printout + write(printout,'(i9,1x,a5)') num_times,'times' + write(*,'(a16,a)') 'Running kernels ',ADJUSTL(printout) + end block + write(*,'(a11,a6)') 'Precision: ',ADJUSTL(StreamRealName) + + write(*,'(a12,f9.1,a3)') 'Array size: ',1.0d0 * element_size * (array_size * scaling), label + write(*,'(a12,f9.1,a3)') 'Total size: ',3.0d0 * element_size * (array_size * scaling), label + + endif ! csv + + allocate( timings(5,num_times) ) + + call alloc(array_size) + + call init_arrays(startA, startB, startC) + summ = 0.0d0 + + timings = -1.0d0 + if (selection.eq.1) then + call run_all(timings, summ) + else if (selection.eq.2) then + call run_triad(timings) + else if (selection.eq.3) then + call run_nstream(timings) + endif + + allocate( h_A(1:array_size), h_B(1:array_size), h_C(1:array_size), stat=err) + if (err .ne. 0) then + write(*,'(a20,i3)') 'allocate returned ',err + stop 1 + endif + + call read_arrays(h_A, h_B, h_C) + call check_solution(h_A, h_B, h_C, summ) + + block + character(len=20) :: printout(8) + real(kind=REAL64) :: tmin,tmax,tavg,nbytes + + if (csv) then + write(*,'(a,a1)',advance='no') 'function', csv_sep + write(*,'(a,a1)',advance='no') 'num_times', csv_sep + write(*,'(a,a1)',advance='no') 'n_elements',csv_sep + write(*,'(a,a1)',advance='no') 'sizeof', csv_sep + if (mibibytes) then + write(*,'(a,a1)',advance='no') 'max_mibytes_per_sec',csv_sep + else + write(*,'(a,a1)',advance='no') 'max_mbytes_per_sec', csv_sep + endif + write(*,'(a,a1)',advance='no') 'min_runtime',csv_sep + write(*,'(a,a1)',advance='no') 'max_runtime',csv_sep + write(*,'(a,a1)',advance='yes') 'avg_runtime' + else + write(printout(1),'(a8)') 'Function' + write(printout(2),'(a3,a8)') TRIM(label),'ytes/sec' + write(printout(3),'(a9)') 'Min (sec)' + write(printout(4),'(a3)') 'Max' + write(printout(5),'(a7)') 'Average' + write(*,'(5a12)') ADJUSTL(printout(1:5)) + endif ! csv + + if (selection.eq.1) then + block + integer, parameter :: sizes(5) = [2,2,3,3,2] + character(len=5), parameter :: labels(5) = ["Copy ", "Mul ", "Add ", "Triad", "Dot "] + integer :: i + do i=1,5 + tmin = MINVAL(timings(i,2:num_times)) + tmax = MAXVAL(timings(i,2:num_times)) + tavg = SUM(timings(i,2:num_times)) / (num_times-1) + nbytes = element_size * REAL(array_size,kind=REAL64) * sizes(i) + write(printout(1),'(a)') labels(i) + if (csv) then + write(printout(2),'(i20)') num_times + write(printout(3),'(i20)') array_size + write(printout(4),'(i20)') element_size + write(printout(5),'(i20)') INT(scaling*nbytes/tmin) + write(printout(6),'(f20.8)') tmin + write(printout(7),'(f20.8)') tmax + write(printout(8),'(f20.8)') tavg + write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(1))),csv_sep + write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(2))),csv_sep + write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(3))),csv_sep + write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(4))),csv_sep + write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(5))),csv_sep + write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(6))),csv_sep + write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(7))),csv_sep + write(*,'(a,a1)',advance='yes') TRIM(ADJUSTL(printout(8))) + else + write(printout(2),'(f12.3)') scaling*nbytes/tmin + write(printout(3),'(f12.5)') tmin + write(printout(4),'(f12.5)') tmax + write(printout(5),'(f12.5)') tavg + write(*,'(5a12)') ADJUSTL(printout(1:5)) + endif + enddo + end block + else if ((selection.eq.2).or.(selection.eq.3)) then + tmin = MINVAL(timings(1,2:num_times)) + tmax = MAXVAL(timings(1,2:num_times)) + tavg = SUM(timings(1,2:num_times)) / (num_times-1) + if (selection.eq.2) then + nbytes = element_size * REAL(array_size,kind=REAL64) * 3 + write(printout(1),'(a12)') "Triad" + else if (selection.eq.3) then + nbytes = element_size * REAL(array_size,kind=REAL64) * 4 + write(printout(1),'(a12)') "Nstream" + endif + if (csv) then + write(printout(2),'(i20)') num_times + write(printout(3),'(i20)') array_size + write(printout(4),'(i20)') element_size + write(printout(5),'(i20)') INT(scaling*nbytes/tmin) + write(printout(6),'(f20.8)') tmin + write(printout(7),'(f20.8)') tmax + write(printout(8),'(f20.8)') tavg + write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(1))),csv_sep + write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(2))),csv_sep + write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(3))),csv_sep + write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(4))),csv_sep + write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(5))),csv_sep + write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(6))),csv_sep + write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(7))),csv_sep + write(*,'(a,a1)',advance='yes') TRIM(ADJUSTL(printout(8))) + else + write(printout(2),'(f12.3)') scaling*nbytes/tmin + write(printout(3),'(f12.5)') tmin + write(printout(4),'(f12.5)') tmax + write(printout(5),'(f12.5)') tavg + write(*,'(5a12)') ADJUSTL(printout(1:5)) + endif + endif + end block + + call dealloc() + +end program BabelStream diff --git a/src/fortran/make.inc.amd b/src/fortran/make.inc.amd new file mode 100644 index 0000000..a863de8 --- /dev/null +++ b/src/fortran/make.inc.amd @@ -0,0 +1,25 @@ +FC := /opt/rocm/llvm/bin/flang +FC := /global/u1/j/jhammond/AMD/aocc-compiler-3.2.0/bin/flang +FCFLAGS := -std=f2018 -O3 +FCFLAGS += -Wall -Wno-unused-variable + +ifdef MARCH +FCFLAGS += -march=$(MARCH) +else +FCFLAGS += -march=native +endif + +DOCONCURRENT_FLAG = -fopenmp # libomp.so required +ARRAY_FLAG = -fopenmp # libomp.so required +OPENMP_FLAG = -fopenmp +#OPENMP_FLAG += -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx908 +OPENACC_FLAG = -fopenacc +CUDA_FLAG = +SEQUENTIAL_FLAG = + +ifeq ($(IMPLEMENTATION),CUDA) + $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.) +endif +ifeq ($(IMPLEMENTATION),CUDAKernels) + $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.) +endif diff --git a/src/fortran/make.inc.arm b/src/fortran/make.inc.arm new file mode 100644 index 0000000..a3e2a67 --- /dev/null +++ b/src/fortran/make.inc.arm @@ -0,0 +1,39 @@ +FC = armflang +FCFLAGS = -std=f2018 -O3 +FCFLAGS += -Wall -Wno-unused-variable + +# MARCH=neoverse-v1,neoverse-n1,icelake-server,znver3,cortex-a78 +ARCH=$(shell uname -m) +ifeq ($(ARCH),aarch64) + ifdef MCPU + FCFLAGS += -mcpu=$(MCPU) + else + FCFLAGS += -mcpu=native + endif +else + ifdef MARCH + FCFLAGS += -march=$(MARCH) + else + FCFLAGS += -march=native + endif +endif + +DOCONCURRENT_FLAG = -fopenmp +ARRAY_FLAG = -fopenmp +OPENMP_FLAG = -fopenmp +OPENACC_FLAG = -fopenacc +CUDA_FLAG = +SEQUENTIAL_FLAG = + +ifeq ($(IMPLEMENTATION),OpenACC) + $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.) +endif +ifeq ($(IMPLEMENTATION),OpenACCArray) + $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.) +endif +ifeq ($(IMPLEMENTATION),CUDA) + $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.) +endif +ifeq ($(IMPLEMENTATION),CUDAKernels) + $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.) +endif diff --git a/src/fortran/make.inc.cray b/src/fortran/make.inc.cray new file mode 100644 index 0000000..dae4e75 --- /dev/null +++ b/src/fortran/make.inc.cray @@ -0,0 +1,18 @@ +FC := ftn +FCFLAGS = -e F -O3 + +DOCONCURRENT_FLAG = -h thread_do_concurrent -DCRAY_THREAD_DOCONCURRENT +ARRAY_FLAG = -h autothread +OPENMP_FLAG = -h omp +OPENACC_FLAG = -h acc +# CPU only +OPENACC_FLAG += -h omp +CUDA_FLAG = +SEQUENTIAL_FLAG = + +ifeq ($(IMPLEMENTATION),CUDA) + $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.) +endif +ifeq ($(IMPLEMENTATION),CUDAKernels) + $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.) +endif diff --git a/src/fortran/make.inc.fj b/src/fortran/make.inc.fj new file mode 100644 index 0000000..b4761e5 --- /dev/null +++ b/src/fortran/make.inc.fj @@ -0,0 +1,21 @@ +FC := frt +FCFLAGS = -X08 -Kfast -KA64FX -KSVE -KARMV8_3_A -Kzfill=100 -Kprefetch_sequential=soft -Kprefetch_line=8 -Kprefetch_line_L2=16 -Koptmsg=2 -Keval -DUSE_OMP_GET_WTIME=1 # FJ Fortran system_clock is low resolution + +DOCONCURRENT_FLAG = -Kparallel,reduction -DNOTSHARED +ARRAY_FLAG = -Kparallel,reduction +OPENMP_FLAG = -fopenmp +OPENACC_FLAG = +# CPU only +OPENACC_FLAG += +CUDA_FLAG = +SEQUENTIAL_FLAG = + +ifeq ($(IMPLEMENTATION),OPENACC) + $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.) +endif +ifeq ($(IMPLEMENTATION),CUDA) + $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.) +endif +ifeq ($(IMPLEMENTATION),CUDAKernels) + $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.) +endif diff --git a/src/fortran/make.inc.gcc b/src/fortran/make.inc.gcc new file mode 100644 index 0000000..f59c8bb --- /dev/null +++ b/src/fortran/make.inc.gcc @@ -0,0 +1,33 @@ +FC = gfortran +FCFLAGS = -std=f2018 -O3 +FCFLAGS += -Wall -Wno-unused-dummy-argument -Wno-unused-variable + +# MARCH=neoverse-v1,neoverse-n1,icelake-server,znver3,cortex-a78ae +ARCH=$(shell uname -m) +ifeq ($(ARCH),aarch64) + ifdef MCPU + FCFLAGS += -mcpu=$(MCPU) + else + FCFLAGS += -mcpu=native + endif +else + ifdef MARCH + FCFLAGS += -march=$(MARCH) + else + FCFLAGS += -march=native + endif +endif + +DOCONCURRENT_FLAG = -ftree-parallelize-loops=4 +ARRAY_FLAG = +OPENMP_FLAG = -fopenmp +OPENACC_FLAG = -fopenacc +CUDA_FLAG = +SEQUENTIAL_FLAG = + +ifeq ($(IMPLEMENTATION),CUDA) + $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.) +endif +ifeq ($(IMPLEMENTATION),CUDAKernels) + $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.) +endif diff --git a/src/fortran/make.inc.nvhpc b/src/fortran/make.inc.nvhpc new file mode 100644 index 0000000..dd4c442 --- /dev/null +++ b/src/fortran/make.inc.nvhpc @@ -0,0 +1,70 @@ +FC := nvfortran +#FCFLAGS := -O3 -Minform=inform -Minfo=all +FCFLAGS := -O3 -Minform=warn + +#TARGET=gpu +TARGET=multicore + +NVARCH=$(shell which nvidia-smi > /dev/null && nvidia-smi -q | grep "Product Architecture") +ifeq ($(findstring Ampere,$(NVARCH)),Ampere) + $(info Ampere detected) + GPU = cc80 +endif +ifeq ($(findstring Turing,$(NVARCH)),Turing) + $(info Turing detected) + GPU = cc75 +endif +ifeq ($(findstring Volta,$(NVARCH)),Volta) + $(info Volta detected) + GPU = cc70 +endif +ifeq ($(findstring Pascal,$(NVARCH)),Pascal) + $(info Pascal detected) + GPU = cc60,cc61 +endif +ifeq ($(shell which jetson_clocks > /dev/null && echo 1),1) + $(info Jetson AGX Orin detected) + GPU = ccn87,cc86 + # figure out Xavier later + #GPU = cc72 +endif +ifeq ($(GPU),) + $(error Your GPU architecture could not be detected. Set it manually.) +endif +GPUFLAG = -gpu=$(GPU) + +# MARCH=neoverse-v1,neoverse-n1,zen3 +ARCH=$(shell uname -m) +ifdef MARCH + ifeq ($(ARCH),aarch64) + ifeq ($(MARCH),neoverse-n1) + FCFLAGS += -tp=$(MARCH) + else + ifeq ($(MARCH),neoverse-v1) + FCFLAGS += -tp=$(MARCH) + else + FCFLAGS += -tp=native + endif + endif + else + FCFLAGS += -tp=$(MARCH) + endif +else + FCFLAGS += -tp=native +endif + +# this is to allow apples-to-apples comparison with DC in non-DC GPU impls +# set exactly one of these! +#MANAGED = -DUSE_MANAGED -gpu=managed +#DEVICE = -DUSE_DEVICE -cuda -gpu=nomanaged + +DOCONCURRENT_FLAG = $(GPUFLAG) -stdpar=$(TARGET) $(DEVICE) +ARRAY_FLAG = $(GPUFLAG) -stdpar=$(TARGET) $(MANAGED) +OPENMP_FLAG = $(GPUFLAG) -mp=$(TARGET) $(MANAGED) +OPENACC_FLAG = $(GPUFLAG) -acc=$(TARGET) $(MANAGED) +CUDA_FLAG = $(GPUFLAG) -cuda -acc=gpu $(MANAGED) +SEQUENTIAL_FLAG = + +ifeq ($(IMPLEMENTATION),OpenMPTaskloop) + $(error IMPLEMENTATION=OpenMPTaskloop is not supported by this compiler.) +endif diff --git a/src/fortran/make.inc.oneapi b/src/fortran/make.inc.oneapi new file mode 100644 index 0000000..b7e003c --- /dev/null +++ b/src/fortran/make.inc.oneapi @@ -0,0 +1,32 @@ +FC := ifx +FCFLAGS = -std18 +FCFLAGS += -Ofast -xHOST +FCFLAGS += -qopt-zmm-usage=low + +ifeq ($(FC),ifort) + FCFLAGS += -qopt-streaming-stores=always + PARALLEL = -parallel +endif + +DOCONCURRENT_FLAG = -qopenmp $(PARALLEL) +ARRAY_FLAG = -qopenmp $(PARALLEL) +OPENMP_FLAG = -qopenmp +ifeq ($(FC),ifx) + OPENMP_FLAG += -fopenmp-targets=spir64 -DUSE_FLOAT=1 +endif +OPENACC_FLAG = +CUDA_FLAG = +SEQUENTIAL_FLAG = + +ifeq ($(IMPLEMENTATION),OpenACC) + $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.) +endif +ifeq ($(IMPLEMENTATION),OpenACCArray) + $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.) +endif +ifeq ($(IMPLEMENTATION),CUDA) + $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.) +endif +ifeq ($(IMPLEMENTATION),CUDAKernels) + $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.) +endif diff --git a/src/fortran/run.sh b/src/fortran/run.sh new file mode 100755 index 0000000..2b41bab --- /dev/null +++ b/src/fortran/run.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +cat ./run.sh + +if [ `uname -s` == Darwin ] ; then + NUM_HWTHREADS=`sysctl -n hw.ncpu` + MEMORY_BYTES=`sysctl -n hw.memsize` +else + NUM_HWTHREADS=`nproc` + MEMORY_KILOS=`grep MemTotal /proc/meminfo | awk '{print $2}'` +fi + +M=128 + +export OMP_NUM_THREADS=8 +export OMP_PROC_BIND=close +export OMP_PLACES=threads + +export ACC_NUM_CORES=${OMP_NUM_THREADS} + +AFFCONTROL="numactl -N 0 -m 0 -C `seq -s "," 0 $((${OMP_NUM_THREADS}-1))`" + +for compiler in gcc nvhpc cray oneapi arm amd fj ; do + #if [ "x$compiler" == "xgcc" ] ; then + # export LD_PRELOAD=/usr/lib/gcc/aarch64-linux-gnu/11/libgomp.so + #fi + for implementation in OpenMP OpenMPTaskloop OpenMPWorkshare DoConcurrent Array OpenACC OpenACCArray CUDA CUDAKernel ; do + if [ -f BabelStream.${compiler}.${implementation} ] ; then + echo "BabelStream.${compiler}.${implementation}" + ldd BabelStream.${compiler}.${implementation} + time $AFFCONTROL \ + ./BabelStream.${compiler}.${implementation} -s $((1024*1024*${M})) + fi + done +done diff --git a/src/thrust/ThrustStream.cu b/src/thrust/ThrustStream.cu index 3a57ab0..f15a392 100644 --- a/src/thrust/ThrustStream.cu +++ b/src/thrust/ThrustStream.cu @@ -145,7 +145,7 @@ T ThrustStream::dot() #if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA || \ (defined(THRUST_DEVICE_SYSTEM_HIP) && THRUST_DEVICE_SYSTEM_HIP == THRUST_DEVICE_SYSTEM) -#ifdef __NVCC__ +#if defined(__NVCC__) || defined(__NVCOMPILER_CUDA__) #define IMPL_FN__(fn) cuda ## fn #define IMPL_TYPE__(tpe) cuda ## tpe #elif defined(__HIP_PLATFORM_HCC__)