diff --git a/.gitignore b/.gitignore
index 012d0e8..59ea5db 100644
--- a/.gitignore
+++ b/.gitignore
@@ -10,12 +10,18 @@ sycl-stream
 hip-stream
 tbb-stream
 
+src/fortran/BabelStream
+src/fortran/BabelStream.*
+
 *.o
 *.bc
 *.sycl
 *.tar
 *.gz
 *.a
+*.mod
+*.cub
+*.ptx
 
 KokkosCore_config.*
 
diff --git a/src/.gitignore b/src/.gitignore
index 568a953..9d8b17b 100644
--- a/src/.gitignore
+++ b/src/.gitignore
@@ -16,6 +16,8 @@
 **/*.gz
 **/*.a
 
+**/*.swp
+
 **/KokkosCore_Config_*
 
 **/.DS_Store
@@ -26,4 +28,4 @@ cmake-build-*/
 CMakeFiles/
 .idea/
 .vscode/
-.directory
\ No newline at end of file
+.directory
diff --git a/src/fortran/ArrayStream.F90 b/src/fortran/ArrayStream.F90
new file mode 100644
index 0000000..5a8d5bc
--- /dev/null
+++ b/src/fortran/ArrayStream.F90
@@ -0,0 +1,105 @@
+module ArrayStream
+    use, intrinsic :: ISO_Fortran_env
+    use BabelStreamTypes
+
+    implicit none
+
+    character(len=5), parameter :: implementation_name = "Array"
+
+    integer(kind=StreamIntKind) :: N
+
+    real(kind=REAL64), allocatable :: A(:), B(:), C(:)
+
+    contains
+
+        subroutine list_devices()
+            implicit none
+            integer :: num
+            write(*,'(a36,a5)') "Listing devices is not supported by ", implementation_name
+        end subroutine list_devices
+
+        subroutine set_device(dev)
+            implicit none
+            integer, intent(in) :: dev
+            write(*,'(a32,a5)') "Device != 0 is not supported by ", implementation_name
+        end subroutine set_device
+
+        subroutine alloc(array_size)
+            implicit none
+            integer(kind=StreamIntKind) :: array_size
+            integer :: err
+            N = array_size
+            allocate( A(1:N), B(1:N), C(1:N), stat=err)
+            if (err .ne. 0) then
+              write(*,'(a20,i3)') 'allocate returned ',err
+              stop 1
+            endif
+        end subroutine alloc
+
+        subroutine dealloc()
+            implicit none
+            integer :: err
+            deallocate( A, B, C, stat=err)
+            if (err .ne. 0) then
+              write(*,'(a20,i3)') 'deallocate returned ',err
+              stop 1
+            endif
+        end subroutine dealloc
+
+        subroutine init_arrays(initA, initB, initC)
+            implicit none
+            real(kind=REAL64), intent(in) :: initA, initB, initC
+            A = initA
+            B = initB
+            C = initC
+        end subroutine init_arrays
+
+        subroutine read_arrays(h_A, h_B, h_C)
+            implicit none
+            real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
+            h_A = A
+            h_B = B
+            h_C = C
+        end subroutine read_arrays
+
+        subroutine copy()
+            implicit none
+            C = A
+        end subroutine copy
+
+        subroutine add()
+            implicit none
+            C = A + B
+        end subroutine add
+
+        subroutine mul(startScalar)
+            implicit none
+            real(kind=REAL64), intent(in) :: startScalar
+            real(kind=REAL64) :: scalar
+            scalar = startScalar
+            B = scalar * C
+        end subroutine mul
+
+        subroutine triad(startScalar)
+            implicit none
+            real(kind=REAL64), intent(in) :: startScalar
+            real(kind=REAL64) :: scalar
+            scalar = startScalar
+            A = B + scalar * C
+        end subroutine triad
+
+        subroutine nstream(startScalar)
+            implicit none
+            real(kind=REAL64), intent(in) :: startScalar
+            real(kind=REAL64) :: scalar
+            scalar = startScalar
+            A = A + B + scalar * C
+        end subroutine nstream
+
+        function dot() result(s)
+            implicit none
+            real(kind=REAL64) :: s
+            s = dot_product(A,B)
+        end function dot
+
+end module ArrayStream
diff --git a/src/fortran/BabelStreamTypes.F90 b/src/fortran/BabelStreamTypes.F90
new file mode 100644
index 0000000..dd01d35
--- /dev/null
+++ b/src/fortran/BabelStreamTypes.F90
@@ -0,0 +1,21 @@
+module BabelStreamTypes
+    use, intrinsic :: ISO_Fortran_env, only: REAL64,REAL32,INT64,INT32
+
+    implicit none
+
+#ifdef USE_FLOAT
+    integer, parameter :: StreamRealKind = REAL32
+    character(len=6)   :: StreamRealName = "REAL32"
+#else
+    integer, parameter :: StreamRealKind = REAL64
+    character(len=6)   :: StreamRealName = "REAL64"
+#endif
+
+#ifdef USE_INT32
+#warning There is no checking for overflowing INT32, so be careful.
+    integer, parameter :: StreamIntKind  = INT32
+#else
+    integer, parameter :: StreamIntKind  = INT64
+#endif
+
+end module BabelStreamTypes
diff --git a/src/fortran/CUDAKernelStream.F90 b/src/fortran/CUDAKernelStream.F90
new file mode 100644
index 0000000..01668ea
--- /dev/null
+++ b/src/fortran/CUDAKernelStream.F90
@@ -0,0 +1,230 @@
+module CUDAKernelStream
+    use, intrinsic :: ISO_Fortran_env
+    use BabelStreamTypes
+
+    implicit none
+
+    character(len=10), parameter :: implementation_name = "CUDAKernel"
+
+    integer(kind=StreamIntKind) :: N
+
+#ifdef USE_MANAGED
+    real(kind=REAL64), allocatable, managed :: A(:), B(:), C(:)
+#else
+    real(kind=REAL64), allocatable, device :: A(:), B(:), C(:)
+#endif
+
+    contains
+
+        subroutine list_devices()
+            use cudafor
+            implicit none
+            integer :: num, err
+            err = cudaGetDeviceCount(num)
+            if (err.ne.0) then
+              write(*,'(a)') "cudaGetDeviceCount failed"
+              write(*,'(a)') cudaGetErrorString(err)
+              stop
+            else if (num.eq.0) then
+              write(*,'(a17)') "No devices found."
+            else
+              write(*,'(a10,i1,a8)') "There are ",num," devices."
+            end if
+        end subroutine list_devices
+
+        subroutine set_device(dev)
+            use cudafor
+            implicit none
+            integer, intent(in) :: dev
+            integer :: num, err
+            err = cudaGetDeviceCount(num)
+            if (err.ne.0) then
+              write(*,'(a)') "cudaGetDeviceCount failed"
+              write(*,'(a)') cudaGetErrorString(err)
+              stop
+            else if (num.eq.0) then
+              write(*,'(a17)') "No devices found."
+              stop
+            else if (dev.ge.num) then
+              write(*,'(a21)') "Invalid device index."
+              stop
+            else
+              err = cudaSetDevice(dev)
+              if (err.ne.0) then
+                write(*,'(a)') "cudaSetDevice failed"
+                write(*,'(a)') cudaGetErrorString(err)
+                stop
+              end if
+            end if
+        end subroutine set_device
+
+        subroutine alloc(array_size)
+            implicit none
+            integer(kind=StreamIntKind) :: array_size
+            integer :: err
+            N = array_size
+            allocate( A(1:N), B(1:N), C(1:N), stat=err)
+            if (err .ne. 0) then
+              write(*,'(a20,i3)') 'allocate returned ',err
+              stop 1
+            endif
+        end subroutine alloc
+
+        subroutine dealloc()
+            implicit none
+            integer :: err
+            deallocate( A, B, C, stat=err)
+            if (err .ne. 0) then
+              write(*,'(a20,i3)') 'deallocate returned ',err
+              stop 1
+            endif
+        end subroutine dealloc
+
+        subroutine init_arrays(initA, initB, initC)
+            use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
+            implicit none
+            real(kind=REAL64), intent(in) :: initA, initB, initC
+            integer(kind=StreamIntKind) :: i
+            integer :: err
+            A = initA
+            B = initB
+            C = initC
+            err = cudaDeviceSynchronize()
+            if (err.ne.0) then
+              write(*,'(a)') "cudaDeviceSynchronize failed"
+              write(*,'(a)') cudaGetErrorString(err)
+              stop
+            endif
+        end subroutine init_arrays
+
+        subroutine read_arrays(h_A, h_B, h_C)
+            use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
+            implicit none
+            real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
+            integer(kind=StreamIntKind) :: i
+            integer :: err
+            h_A = A
+            h_B = B
+            h_C = C
+            err = cudaDeviceSynchronize()
+            if (err.ne.0) then
+              write(*,'(a)') "cudaDeviceSynchronize failed"
+              write(*,'(a)') cudaGetErrorString(err)
+              stop
+            endif
+        end subroutine read_arrays
+
+        subroutine copy()
+            use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
+            implicit none
+            integer(kind=StreamIntKind) :: i
+            integer :: err
+            !$cuf kernel do <<< *, * >>>
+            do i=1,N
+               C(i) = A(i)
+            end do
+            err = cudaDeviceSynchronize()
+            if (err.ne.0) then
+              write(*,'(a)') "cudaDeviceSynchronize failed"
+              write(*,'(a)') cudaGetErrorString(err)
+              stop
+            endif
+        end subroutine copy
+
+        subroutine add()
+            use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
+            implicit none
+            integer(kind=StreamIntKind) :: i
+            integer :: err
+            !$cuf kernel do <<< *, * >>>
+            do i=1,N
+               C(i) = A(i) + B(i)
+            end do
+            err = cudaDeviceSynchronize()
+            if (err.ne.0) then
+              write(*,'(a)') "cudaDeviceSynchronize failed"
+              write(*,'(a)') cudaGetErrorString(err)
+              stop
+            endif
+        end subroutine add
+
+        subroutine mul(startScalar)
+            use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
+            implicit none
+            real(kind=REAL64), intent(in) :: startScalar
+            real(kind=REAL64) :: scalar
+            integer(kind=StreamIntKind) :: i
+            integer :: err
+            scalar = startScalar
+            !$cuf kernel do <<< *, * >>>
+            do i=1,N
+               B(i) = scalar * C(i)
+            end do
+            err = cudaDeviceSynchronize()
+            if (err.ne.0) then
+              write(*,'(a)') "cudaDeviceSynchronize failed"
+              write(*,'(a)') cudaGetErrorString(err)
+              stop
+            endif
+        end subroutine mul
+
+        subroutine triad(startScalar)
+            use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
+            implicit none
+            real(kind=REAL64), intent(in) :: startScalar
+            real(kind=REAL64) :: scalar
+            integer(kind=StreamIntKind) :: i
+            integer :: err
+            scalar = startScalar
+            !$cuf kernel do <<< *, * >>>
+            do i=1,N
+               A(i) = B(i) + scalar * C(i)
+            end do
+            err = cudaDeviceSynchronize()
+            if (err.ne.0) then
+              write(*,'(a)') "cudaDeviceSynchronize failed"
+              write(*,'(a)') cudaGetErrorString(err)
+              stop
+            endif
+        end subroutine triad
+
+        subroutine nstream(startScalar)
+            use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
+            implicit none
+            real(kind=REAL64), intent(in) :: startScalar
+            real(kind=REAL64) :: scalar
+            integer(kind=StreamIntKind) :: i
+            integer :: err
+            scalar = startScalar
+            !$cuf kernel do <<< *, * >>>
+            do i=1,N
+               A(i) = A(i) + B(i) + scalar * C(i)
+            end do
+            err = cudaDeviceSynchronize()
+            if (err.ne.0) then
+              write(*,'(a)') "cudaDeviceSynchronize failed"
+              write(*,'(a)') cudaGetErrorString(err)
+              stop
+            endif
+        end subroutine nstream
+
+        function dot() result(r)
+            use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
+            implicit none
+            real(kind=REAL64) :: r
+            integer(kind=StreamIntKind) :: i
+            integer :: err
+            r = real(0,kind=REAL64)
+            !$cuf kernel do <<< *, * >>>
+            do i=1,N
+               r = r + A(i) * B(i)
+            end do
+            err = cudaDeviceSynchronize()
+            if (err.ne.0) then
+              write(*,'(a)') "cudaDeviceSynchronize failed"
+              write(*,'(a)') cudaGetErrorString(err)
+              stop
+            endif
+        end function dot
+
+end module CUDAKernelStream
diff --git a/src/fortran/CUDAStream.F90 b/src/fortran/CUDAStream.F90
new file mode 100644
index 0000000..208f1aa
--- /dev/null
+++ b/src/fortran/CUDAStream.F90
@@ -0,0 +1,309 @@
+module CUDAFortranKernels
+    use, intrinsic :: ISO_Fortran_env
+    use BabelStreamTypes
+
+    contains
+
+        attributes(global) subroutine do_copy(n,A,C)
+            implicit none
+            integer(kind=StreamIntKind), intent(in), value :: n
+            real(kind=REAL64), intent(in)  :: A(n)
+            real(kind=REAL64), intent(out) :: C(n)
+            integer(kind=StreamIntKind) :: i
+            i = blockDim%x * (blockIdx%x - 1) + threadIdx%x
+            if (i <= N) then
+               C(i) = A(i)
+            endif
+        end subroutine do_copy
+
+        attributes(global) subroutine do_add(n,A,B,C)
+            implicit none
+            integer(kind=StreamIntKind), intent(in), value :: n
+            real(kind=REAL64), intent(in)  :: A(n), B(n)
+            real(kind=REAL64), intent(out) :: C(n)
+            integer(kind=StreamIntKind) :: i
+            i = blockDim%x * (blockIdx%x - 1) + threadIdx%x
+            if (i <= N) then
+               C(i) = A(i) + B(i)
+            endif
+        end subroutine do_add
+
+        attributes(global) subroutine do_mul(n,scalar,B,C)
+            implicit none
+            integer(kind=StreamIntKind), intent(in), value :: n
+            real(kind=REAL64), intent(in), value :: scalar
+            real(kind=REAL64), intent(out) :: B(n)
+            real(kind=REAL64), intent(in)  :: C(n)
+            integer(kind=StreamIntKind) :: i
+            i = blockDim%x * (blockIdx%x - 1) + threadIdx%x
+            if (i <= N) then
+               B(i) = scalar * C(i)
+            endif
+        end subroutine do_mul
+
+        attributes(global) subroutine do_triad(n,scalar,A,B,C)
+            implicit none
+            integer(kind=StreamIntKind), intent(in), value :: n
+            real(kind=REAL64), intent(in), value :: scalar
+            real(kind=REAL64), intent(out) :: A(n)
+            real(kind=REAL64), intent(in)  :: B(n), C(n)
+            integer(kind=StreamIntKind) :: i
+            i = blockDim%x * (blockIdx%x - 1) + threadIdx%x
+            if (i <= N) then
+               A(i) = B(i) + scalar * C(i)
+            endif
+        end subroutine do_triad
+
+        attributes(global) subroutine do_nstream(n,scalar,A,B,C)
+            implicit none
+            integer(kind=StreamIntKind), intent(in), value :: n
+            real(kind=REAL64), intent(in), value :: scalar
+            real(kind=REAL64), intent(inout) :: A(n)
+            real(kind=REAL64), intent(in)    :: B(n), C(n)
+            integer(kind=StreamIntKind) :: i
+            i = blockDim%x * (blockIdx%x - 1) + threadIdx%x
+            if (i <= N) then
+               A(i) = A(i) + B(i) + scalar * C(i)
+            endif
+        end subroutine do_nstream
+
+#if 0
+        attributes(global) subroutine do_dot(n,A,B,r)
+            implicit none
+            integer(kind=StreamIntKind), intent(in), value :: n
+            real(kind=REAL64), intent(in) :: A(n), B(n) 
+            real(kind=REAL64), intent(out) :: r
+            integer(kind=StreamIntKind) :: i
+            r = real(0,kind=REAL64)
+            !$cuf kernel do <<< *, * >>>
+            do i=1,N
+               r = r + A(i) * B(i)
+            end do
+        end subroutine do_dot
+#endif
+
+end module CUDAFortranKernels
+
+module CUDAStream
+    use, intrinsic :: ISO_Fortran_env
+    use BabelStreamTypes
+    use cudafor, only: dim3
+
+    implicit none
+
+    character(len=4), parameter :: implementation_name = "CUDA"
+
+    integer(kind=StreamIntKind) :: N
+
+#ifdef USE_MANAGED
+    real(kind=REAL64), allocatable, managed :: A(:), B(:), C(:)
+#else
+    real(kind=REAL64), allocatable, device :: A(:), B(:), C(:)
+#endif
+
+    type(dim3) :: grid, tblock
+
+    contains
+
+        subroutine list_devices()
+            use cudafor
+            implicit none
+            integer :: num, err
+            err = cudaGetDeviceCount(num)
+            if (err.ne.0) then
+              write(*,'(a)') "cudaGetDeviceCount failed"
+              write(*,'(a)') cudaGetErrorString(err)
+              stop
+            else if (num.eq.0) then
+              write(*,'(a17)') "No devices found."
+            else
+              write(*,'(a10,i1,a8)') "There are ",num," devices."
+            end if
+        end subroutine list_devices
+
+        subroutine set_device(dev)
+            use cudafor
+            implicit none
+            integer, intent(in) :: dev
+            integer :: num, err
+            err = cudaGetDeviceCount(num)
+            if (err.ne.0) then
+              write(*,'(a)') "cudaGetDeviceCount failed"
+              write(*,'(a)') cudaGetErrorString(err)
+              stop
+            else if (num.eq.0) then
+              write(*,'(a17)') "No devices found."
+              stop
+            else if (dev.ge.num) then
+              write(*,'(a21)') "Invalid device index."
+              stop
+            else
+              err = cudaSetDevice(dev)
+              if (err.ne.0) then
+                write(*,'(a)') "cudaSetDevice failed"
+                write(*,'(a)') cudaGetErrorString(err)
+                stop
+              end if
+            end if
+        end subroutine set_device
+
+        subroutine alloc(array_size)
+            implicit none
+            integer(kind=StreamIntKind) :: array_size
+            integer :: err
+            N = array_size
+            allocate( A(1:N), B(1:N), C(1:N), stat=err)
+            if (err .ne. 0) then
+              write(*,'(a20,i3)') 'allocate returned ',err
+              stop 1
+            endif
+            ! move to separate subroutine later
+            tblock = dim3(128,1,1)
+            grid = dim3(ceiling(real(N)/tblock%x),1,1)
+        end subroutine alloc
+
+        subroutine dealloc()
+            implicit none
+            integer :: err
+            deallocate( A, B, C, stat=err)
+            if (err .ne. 0) then
+              write(*,'(a20,i3)') 'deallocate returned ',err
+              stop 1
+            endif
+        end subroutine dealloc
+
+        subroutine init_arrays(initA, initB, initC)
+            use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
+            implicit none
+            real(kind=REAL64), intent(in) :: initA, initB, initC
+            integer(kind=StreamIntKind) :: i
+            integer :: err
+            A = initA
+            B = initB
+            C = initC
+            err = cudaDeviceSynchronize()
+            if (err.ne.0) then
+              write(*,'(a)') "cudaDeviceSynchronize failed"
+              write(*,'(a)') cudaGetErrorString(err)
+              stop
+            endif
+        end subroutine init_arrays
+
+        subroutine read_arrays(h_A, h_B, h_C)
+            use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
+            implicit none
+            real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
+            integer(kind=StreamIntKind) :: i
+            integer :: err
+            h_A = A
+            h_B = B
+            h_C = C
+            err = cudaDeviceSynchronize()
+            if (err.ne.0) then
+              write(*,'(a)') "cudaDeviceSynchronize failed"
+              write(*,'(a)') cudaGetErrorString(err)
+              stop
+            endif
+        end subroutine read_arrays
+
+        subroutine copy()
+            use CUDAFortranKernels, only: do_copy
+            use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
+            implicit none
+            integer :: err
+            call do_copy<<<grid, tblock>>>(N, A, C)
+            err = cudaDeviceSynchronize()
+            if (err.ne.0) then
+              write(*,'(a)') "cudaDeviceSynchronize failed"
+              write(*,'(a)') cudaGetErrorString(err)
+              stop
+            endif
+        end subroutine copy
+
+        subroutine add()
+            use CUDAFortranKernels, only: do_add
+            use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
+            implicit none
+            integer :: err
+            call do_add<<<grid, tblock>>>(N, A, B, C)
+            err = cudaDeviceSynchronize()
+            if (err.ne.0) then
+              write(*,'(a)') "cudaDeviceSynchronize failed"
+              write(*,'(a)') cudaGetErrorString(err)
+              stop
+            endif
+        end subroutine add
+
+        subroutine mul(startScalar)
+            use CUDAFortranKernels, only: do_mul
+            use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
+            implicit none
+            real(kind=REAL64), intent(in) :: startScalar
+            real(kind=REAL64) :: scalar
+            integer :: err
+            scalar = startScalar
+            call do_mul<<<grid, tblock>>>(N, scalar, B, C)
+            err = cudaDeviceSynchronize()
+            if (err.ne.0) then
+              write(*,'(a)') "cudaDeviceSynchronize failed"
+              write(*,'(a)') cudaGetErrorString(err)
+              stop
+            endif
+        end subroutine mul
+
+        subroutine triad(startScalar)
+            use CUDAFortranKernels, only: do_triad
+            use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
+            implicit none
+            real(kind=REAL64), intent(in) :: startScalar
+            real(kind=REAL64) :: scalar
+            integer :: err
+            scalar = startScalar
+            call do_triad<<<grid, tblock>>>(N, scalar, A, B, C)
+            err = cudaDeviceSynchronize()
+            if (err.ne.0) then
+              write(*,'(a)') "cudaDeviceSynchronize failed"
+              write(*,'(a)') cudaGetErrorString(err)
+              stop
+            endif
+        end subroutine triad
+
+        subroutine nstream(startScalar)
+            use CUDAFortranKernels, only: do_nstream
+            use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
+            implicit none
+            real(kind=REAL64), intent(in) :: startScalar
+            real(kind=REAL64) :: scalar
+            integer :: err
+            scalar = startScalar
+            call do_nstream<<<grid, tblock>>>(N, scalar, A, B, C)
+            err = cudaDeviceSynchronize()
+            if (err.ne.0) then
+              write(*,'(a)') "cudaDeviceSynchronize failed"
+              write(*,'(a)') cudaGetErrorString(err)
+              stop
+            endif
+        end subroutine nstream
+
+        function dot() result(r)
+            !use CUDAFortranKernels, only: do_dot
+            use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString
+            implicit none
+            real(kind=REAL64) :: r
+            integer :: err
+            integer(kind=StreamIntKind) :: i
+            !call do_dot<<<grid, tblock>>>(N, B, C, r)
+            r = real(0,kind=REAL64)
+            !$cuf kernel do <<< *, * >>>
+            do i=1,N
+               r = r + A(i) * B(i)
+            end do
+            err = cudaDeviceSynchronize()
+            if (err.ne.0) then
+              write(*,'(a)') "cudaDeviceSynchronize failed"
+              write(*,'(a)') cudaGetErrorString(err)
+              stop
+            endif
+        end function dot
+
+end module CUDAStream
diff --git a/src/fortran/DoConcurrentStream.F90 b/src/fortran/DoConcurrentStream.F90
new file mode 100644
index 0000000..781210d
--- /dev/null
+++ b/src/fortran/DoConcurrentStream.F90
@@ -0,0 +1,139 @@
+module DoConcurrentStream
+    use, intrinsic :: ISO_Fortran_env
+    use BabelStreamTypes
+
+    implicit none
+
+    character(len=12), parameter :: implementation_name = "DoConcurrent"
+
+    integer(kind=StreamIntKind) :: N
+
+#ifdef USE_DEVICE
+    real(kind=REAL64), allocatable, device :: A(:), B(:), C(:)
+#else
+    real(kind=REAL64), allocatable :: A(:), B(:), C(:)
+#endif
+
+    contains
+
+        subroutine list_devices()
+            implicit none
+            integer :: num
+            write(*,'(a36,a12)') "Listing devices is not supported by ", implementation_name
+        end subroutine list_devices
+
+        subroutine set_device(dev)
+            implicit none
+            integer, intent(in) :: dev
+            write(*,'(a32,a12)') "Device != 0 is not supported by ", implementation_name
+        end subroutine set_device
+
+        subroutine alloc(array_size)
+            implicit none
+            integer(kind=StreamIntKind) :: array_size
+            integer :: err
+            N = array_size
+            allocate( A(1:N), B(1:N), C(1:N), stat=err)
+            if (err .ne. 0) then
+              write(*,'(a20,i3)') 'allocate returned ',err
+              stop 1
+            endif
+        end subroutine alloc
+
+        subroutine dealloc()
+            implicit none
+            integer :: err
+            deallocate( A, B, C, stat=err)
+            if (err .ne. 0) then
+              write(*,'(a20,i3)') 'deallocate returned ',err
+              stop 1
+            endif
+        end subroutine dealloc
+
+        subroutine init_arrays(initA, initB, initC)
+            implicit none
+            real(kind=REAL64), intent(in) :: initA, initB, initC
+            integer(kind=StreamIntKind) :: i
+            do concurrent (i=1:N)
+               A(i) = initA
+               B(i) = initB
+               C(i) = initC
+            end do
+        end subroutine init_arrays
+
+        subroutine read_arrays(h_A, h_B, h_C)
+            implicit none
+            real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
+            integer(kind=StreamIntKind) :: i
+            do concurrent (i=1:N) !shared(A,B,C)
+               h_A(i) = A(i)
+               h_B(i) = B(i)
+               h_C(i) = C(i)
+            end do
+        end subroutine read_arrays
+
+        subroutine copy()
+            implicit none
+            integer(kind=StreamIntKind) :: i
+            do concurrent (i=1:N) !shared(A,C)
+               C(i) = A(i)
+            end do
+        end subroutine copy
+
+        subroutine add()
+            implicit none
+            integer(kind=StreamIntKind) :: i
+            do concurrent (i=1:N) !shared(A,B,C)
+               C(i) = A(i) + B(i)
+            end do
+        end subroutine add
+
+        subroutine mul(startScalar)
+            implicit none
+            real(kind=REAL64), intent(in) :: startScalar
+            real(kind=REAL64) :: scalar
+            integer(kind=StreamIntKind) :: i
+            scalar = startScalar
+            do concurrent (i=1:N) !shared(B,C)
+               B(i) = scalar * C(i)
+            end do
+        end subroutine mul
+
+        subroutine triad(startScalar)
+            implicit none
+            real(kind=REAL64), intent(in) :: startScalar
+            real(kind=REAL64) :: scalar
+            integer(kind=StreamIntKind) :: i
+            scalar = startScalar
+            do concurrent (i=1:N) !shared(A,B,C)
+               A(i) = B(i) + scalar * C(i)
+            end do
+        end subroutine triad
+
+        subroutine nstream(startScalar)
+            implicit none
+            real(kind=REAL64), intent(in) :: startScalar
+            real(kind=REAL64) :: scalar
+            integer(kind=StreamIntKind) :: i
+            scalar = startScalar
+            do concurrent (i=1:N) !shared(A,B,C)
+               A(i) = A(i) + B(i) + scalar * C(i)
+            end do
+        end subroutine nstream
+
+        function dot() result(s)
+            implicit none
+            real(kind=REAL64) :: s
+            integer(kind=StreamIntKind) :: i
+            ! reduction omitted because NVF infers it and other compilers do not support
+            s = real(0,kind=REAL64)
+#ifdef CRAY_THREAD_DOCONCURRENT
+            do i=1,N
+#else
+            do concurrent (i=1:N) !shared(A,B)
+#endif
+               s = s + A(i) * B(i)
+            end do
+        end function dot
+
+end module DoConcurrentStream
diff --git a/src/fortran/Makefile b/src/fortran/Makefile
new file mode 100644
index 0000000..18685d4
--- /dev/null
+++ b/src/fortran/Makefile
@@ -0,0 +1,109 @@
+ifeq ($(COMPILER),nvhpc)
+    include make.inc.nvhpc
+else ifeq ($(COMPILER),oneapi)
+    include make.inc.oneapi
+else ifeq ($(COMPILER),gcc)
+    include make.inc.gcc
+else ifeq ($(COMPILER),amd)
+    include make.inc.amd
+else ifeq ($(COMPILER),arm)
+    include make.inc.arm
+else ifeq ($(COMPILER),cray)
+    include make.inc.cray
+else ifeq ($(COMPILER),fj)
+    include make.inc.fj
+else
+    $(info Set COMPILER={nvhpc,oneapi,amd,arm,cray,fj,gcc}. Default is gcc.)
+    include make.inc.gcc
+    COMPILER=gcc
+endif
+
+FCFLAGS += -DVERSION_STRING="4.0"
+#FCFLAGS += -DUSE_INT32
+
+ifeq ($(IMPLEMENTATION),DoConcurrent)
+    FCFLAGS += -DUSE_DOCONCURRENT $(DOCONCURRENT_FLAG)
+    IMPLEMENTATION_OBJECT = DoConcurrentStream.o
+
+else ifeq ($(IMPLEMENTATION),Array)
+    FCFLAGS += -DUSE_ARRAY $(ARRAY_FLAG)
+    IMPLEMENTATION_OBJECT = ArrayStream.o
+
+else ifeq ($(IMPLEMENTATION),OpenMP)
+    FCFLAGS += -DUSE_OPENMP $(OPENMP_FLAG)
+    IMPLEMENTATION_OBJECT = OpenMPStream.o
+
+else ifeq ($(IMPLEMENTATION),OpenMPWorkshare)
+    FCFLAGS += -DUSE_OPENMPWORKSHARE $(OPENMP_FLAG)
+    IMPLEMENTATION_OBJECT = OpenMPWorkshareStream.o
+
+else ifeq ($(IMPLEMENTATION),OpenMPTarget)
+    FCFLAGS += -DUSE_OPENMPTARGET $(OPENMP_FLAG)
+    IMPLEMENTATION_OBJECT = OpenMPTargetStream.o
+
+else ifeq ($(IMPLEMENTATION),OpenMPTargetLoop)
+    FCFLAGS += -DUSE_OPENMPTARGETLOOP $(OPENMP_FLAG)
+    IMPLEMENTATION_OBJECT = OpenMPTargetLoopStream.o
+
+else ifeq ($(IMPLEMENTATION),OpenMPTaskloop)
+    FCFLAGS += -DUSE_OPENMPTASKLOOP $(OPENMP_FLAG)
+    IMPLEMENTATION_OBJECT = OpenMPTaskloopStream.o
+
+else ifeq ($(IMPLEMENTATION),OpenACC)
+    FCFLAGS += -DUSE_OPENACC $(OPENACC_FLAG)
+    IMPLEMENTATION_OBJECT = OpenACCStream.o
+
+else ifeq ($(IMPLEMENTATION),OpenACCArray)
+    FCFLAGS += -DUSE_OPENACCARRAY $(OPENACC_FLAG)
+    IMPLEMENTATION_OBJECT = OpenACCArrayStream.o
+
+else ifeq ($(IMPLEMENTATION),CUDA)
+    FCFLAGS += -DUSE_CUDA $(CUDA_FLAG)
+    IMPLEMENTATION_OBJECT = CUDAStream.o
+
+else ifeq ($(IMPLEMENTATION),CUDAKernel)
+    FCFLAGS += -DUSE_CUDAKERNEL $(CUDA_FLAG)
+    IMPLEMENTATION_OBJECT = CUDAKernelStream.o
+
+else ifeq ($(IMPLEMENTATION),Sequential)
+    FCFLAGS += -DUSE_SEQUENTIAL $(SEQUENTIAL_FLAG)
+    IMPLEMENTATION_OBJECT = SequentialStream.o
+
+else
+    $(info Set IMPLEMENTATION={DoConcurrent,Array,OpenMP,OpenMPWorkshare,OpenMPTarget,OpenMPTargetLoop,OpenMPTaskloop,OpenACC,OpenACCArray,CUDA,CUDAKernel}.)
+    FCFLAGS += -DUSE_SEQUENTIAL $(SEQUENTIAL_FLAG)
+    IMPLEMENTATION=Sequential
+    IMPLEMENTATION_OBJECT = SequentialStream.o
+
+endif
+
+all: BabelStream.$(COMPILER).$(IMPLEMENTATION)
+
+BabelStream.$(COMPILER).$(IMPLEMENTATION): main.F90 $(IMPLEMENTATION_OBJECT)
+	$(FC) $(FCFLAGS) $^ BabelStreamTypes.o -o $@
+
+BabelStreamTypes.o BabelStreamTypes.mod: BabelStreamTypes.F90
+	$(FC) $(FCFLAGS) -c $<
+
+%.o: %.F90 BabelStreamTypes.mod
+	$(FC) $(FCFLAGS) -c $<
+
+clean:
+	-rm -f main.o BabelStreamUtil.mod babelstreamutil.mod
+	-rm -f BabelStreamTypes.o BabelStreamTypes.mod babelstreamtypes.mod
+	-rm -f DoConcurrentStream.o DoConcurrentStream.mod doconcurrentstream.mod
+	-rm -f ArrayStream.o ArrayStream.mod arraystream.mod
+	-rm -f SequentialStream.o SequentialStream.mod sequentialstream.mod
+	-rm -f OpenMPStream.o OpenMPStream.mod openmpstream.mod
+	-rm -f OpenMPWorkshareStream.o OpenMPWorkshareStream.mod openmpworksharestream.mod
+	-rm -f OpenMPTaskloopStream.o OpenMPTaskloopStream.mod openmptaskloopstream.mod
+	-rm -f OpenMPTargetStream.o OpenMPTargetStream.mod openmptargetstream.mod
+	-rm -f OpenMPTargetLoopStream.o OpenMPTargetLoopStream.mod openmptargetloopstream.mod
+	-rm -f OpenACCStream.o OpenACCStream.mod openaccstream.mod
+	-rm -f OpenACCArrayStream.o OpenACCArrayStream.mod openaccarraystream.mod
+	-rm -f CUDAStream.o CUDAStream.mod cudastream.mod CUDAFortranKernels.mod cudafortrankernels.mod
+	-rm -f CUDAKernelStream.o CUDAKernelStream.mod cudakernelstream.mod
+	-rm -f *.modmic *.mod *.o *.cub *.ptx
+
+realclean: clean
+	-rm -f BabelStream.*
diff --git a/src/fortran/OpenACCArrayStream.F90 b/src/fortran/OpenACCArrayStream.F90
new file mode 100644
index 0000000..9225fe7
--- /dev/null
+++ b/src/fortran/OpenACCArrayStream.F90
@@ -0,0 +1,144 @@
+module OpenACCArrayStream
+    use, intrinsic :: ISO_Fortran_env
+    use BabelStreamTypes
+
+    implicit none
+
+    character(len=12), parameter :: implementation_name = "OpenACCArray"
+
+    integer(kind=StreamIntKind) :: N
+
+    real(kind=REAL64), allocatable :: A(:), B(:), C(:)
+
+    contains
+
+        subroutine list_devices()
+            use openacc
+            implicit none
+            integer :: num
+            num = acc_get_num_devices(acc_get_device_type())
+            if (num.eq.0) then
+              write(*,'(a17)') "No devices found."
+            else
+              write(*,'(a10,i1,a8)') "There are ",num," devices."
+            end if
+        end subroutine list_devices
+
+        subroutine set_device(dev)
+            use openacc
+            implicit none
+            integer, intent(in) :: dev
+            integer :: num
+            num = acc_get_num_devices(acc_get_device_type())
+            if (num.eq.0) then
+              write(*,'(a17)') "No devices found."
+              stop
+            else if (dev.gt.num) then
+              write(*,'(a21)') "Invalid device index."
+              stop
+            else
+              call acc_set_device_num(dev, acc_get_device_type())
+            end if
+        end subroutine set_device
+
+        subroutine alloc(array_size)
+            implicit none
+            integer(kind=StreamIntKind) :: array_size
+            integer :: err
+            N = array_size
+            allocate( A(1:N), B(1:N), C(1:N), stat=err)
+#ifndef USE_MANAGED
+            !$acc enter data create(A,B,C)
+#endif
+            if (err .ne. 0) then
+              write(*,'(a20,i3)') 'allocate returned ',err
+              stop 1
+            endif
+        end subroutine alloc
+
+        subroutine dealloc()
+            implicit none
+            integer :: err
+#ifndef USE_MANAGED
+            !$acc exit data delete(A,B,C)
+#endif
+            deallocate( A, B, C, stat=err)
+            if (err .ne. 0) then
+              write(*,'(a20,i3)') 'deallocate returned ',err
+              stop 1
+            endif
+        end subroutine dealloc
+
+        subroutine init_arrays(initA, initB, initC)
+            implicit none
+            real(kind=REAL64), intent(in) :: initA, initB, initC
+            !$acc kernels
+            A = initA
+            B = initB
+            C = initC
+            !$acc end kernels
+        end subroutine init_arrays
+
+        subroutine read_arrays(h_A, h_B, h_C)
+            implicit none
+            real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
+            !$acc kernels
+            h_A = A
+            h_B = B
+            h_C = C
+            !$acc end kernels
+        end subroutine read_arrays
+
+        subroutine copy()
+            implicit none
+            !$acc kernels
+            C = A
+            !$acc end kernels
+        end subroutine copy
+
+        subroutine add()
+            implicit none
+            !$acc kernels
+            C = A + B
+            !$acc end kernels
+        end subroutine add
+
+        subroutine mul(startScalar)
+            implicit none
+            real(kind=REAL64), intent(in) :: startScalar
+            real(kind=REAL64) :: scalar
+            scalar = startScalar
+            !$acc kernels
+            B = scalar * C
+            !$acc end kernels
+        end subroutine mul
+
+        subroutine triad(startScalar)
+            implicit none
+            real(kind=REAL64), intent(in) :: startScalar
+            real(kind=REAL64) :: scalar
+            scalar = startScalar
+            !$acc kernels
+            A = B + scalar * C
+            !$acc end kernels
+        end subroutine triad
+
+        subroutine nstream(startScalar)
+            implicit none
+            real(kind=REAL64), intent(in) :: startScalar
+            real(kind=REAL64) :: scalar
+            scalar = startScalar
+            !$acc kernels
+            A = A + B + scalar * C
+            !$acc end kernels
+        end subroutine nstream
+
+        function dot() result(s)
+            implicit none
+            real(kind=REAL64) :: s
+            !$acc kernels
+            s = dot_product(A,B)
+            !$acc end kernels
+        end function dot
+
+end module OpenACCArrayStream
diff --git a/src/fortran/OpenACCStream.F90 b/src/fortran/OpenACCStream.F90
new file mode 100644
index 0000000..7326f38
--- /dev/null
+++ b/src/fortran/OpenACCStream.F90
@@ -0,0 +1,161 @@
+module OpenACCStream
+    use, intrinsic :: ISO_Fortran_env
+    use BabelStreamTypes
+
+    implicit none
+
+    character(len=7), parameter :: implementation_name = "OpenACC"
+
+    integer(kind=StreamIntKind) :: N
+
+    real(kind=REAL64), allocatable :: A(:), B(:), C(:)
+
+    contains
+
+        subroutine list_devices()
+            use openacc
+            implicit none
+            integer :: num
+            num = acc_get_num_devices(acc_get_device_type())
+            if (num.eq.0) then
+              write(*,'(a17)') "No devices found."
+            else
+              write(*,'(a10,i1,a8)') "There are ",num," devices."
+            end if
+        end subroutine list_devices
+
+        subroutine set_device(dev)
+            use openacc
+            implicit none
+            integer, intent(in) :: dev
+            integer :: num
+            num = acc_get_num_devices(acc_get_device_type())
+            if (num.eq.0) then
+              write(*,'(a17)') "No devices found."
+              stop
+            else if (dev.gt.num) then
+              write(*,'(a21)') "Invalid device index."
+              stop
+            else
+              call acc_set_device_num(dev, acc_get_device_type())
+            end if
+        end subroutine set_device
+
+        subroutine alloc(array_size)
+            implicit none
+            integer(kind=StreamIntKind) :: array_size
+            integer :: err
+            N = array_size
+            allocate( A(1:N), B(1:N), C(1:N), stat=err)
+#ifndef USE_MANAGED
+            !$acc enter data create(A,B,C)
+#endif
+            if (err .ne. 0) then
+              write(*,'(a20,i3)') 'allocate returned ',err
+              stop 1
+            endif
+        end subroutine alloc
+
+        subroutine dealloc()
+            implicit none
+            integer :: err
+#ifndef USE_MANAGED
+            !$acc exit data delete(A,B,C)
+#endif
+            deallocate( A, B, C, stat=err)
+            if (err .ne. 0) then
+              write(*,'(a20,i3)') 'deallocate returned ',err
+              stop 1
+            endif
+        end subroutine dealloc
+
+        subroutine init_arrays(initA, initB, initC)
+            implicit none
+            real(kind=REAL64), intent(in) :: initA, initB, initC
+            integer(kind=StreamIntKind) :: i
+            !$acc parallel loop
+            do i=1,N
+               A(i) = initA
+               B(i) = initB
+               C(i) = initC
+            end do
+        end subroutine init_arrays
+
+        subroutine read_arrays(h_A, h_B, h_C)
+            implicit none
+            real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
+            integer(kind=StreamIntKind) :: i
+            !$acc parallel loop
+            do i=1,N
+               h_A(i) = A(i)
+               h_B(i) = B(i)
+               h_C(i) = C(i)
+            end do
+        end subroutine read_arrays
+
+        subroutine copy()
+            implicit none
+            integer(kind=StreamIntKind) :: i
+            !$acc parallel loop
+            do i=1,N
+               C(i) = A(i)
+            end do
+        end subroutine copy
+
+        subroutine add()
+            implicit none
+            integer(kind=StreamIntKind) :: i
+            !$acc parallel loop
+            do i=1,N
+               C(i) = A(i) + B(i)
+            end do
+        end subroutine add
+
+        subroutine mul(startScalar)
+            implicit none
+            real(kind=REAL64), intent(in) :: startScalar
+            real(kind=REAL64) :: scalar
+            integer(kind=StreamIntKind) :: i
+            scalar = startScalar
+            !$acc parallel loop
+            do i=1,N
+               B(i) = scalar * C(i)
+            end do
+        end subroutine mul
+
+        subroutine triad(startScalar)
+            implicit none
+            real(kind=REAL64), intent(in) :: startScalar
+            real(kind=REAL64) :: scalar
+            integer(kind=StreamIntKind) :: i
+            scalar = startScalar
+            !$acc parallel loop
+            do i=1,N
+               A(i) = B(i) + scalar * C(i)
+            end do
+        end subroutine triad
+
+        subroutine nstream(startScalar)
+            implicit none
+            real(kind=REAL64), intent(in) :: startScalar
+            real(kind=REAL64) :: scalar
+            integer(kind=StreamIntKind) :: i
+            scalar = startScalar
+            !$acc parallel loop
+            do i=1,N
+               A(i) = A(i) + B(i) + scalar * C(i)
+            end do
+        end subroutine nstream
+
+        function dot() result(s)
+            implicit none
+            real(kind=REAL64) :: s
+            integer(kind=StreamIntKind) :: i
+            s = real(0,kind=REAL64)
+            !$acc parallel loop reduction(+:s)
+            do i=1,N
+               s = s + A(i) * B(i)
+            end do
+        end function dot
+
+end module OpenACCStream
diff --git a/src/fortran/OpenMPStream.F90 b/src/fortran/OpenMPStream.F90
new file mode 100644
index 0000000..7316d5b
--- /dev/null
+++ b/src/fortran/OpenMPStream.F90
@@ -0,0 +1,137 @@
+module OpenMPStream
+    use, intrinsic :: ISO_Fortran_env
+    use BabelStreamTypes
+
+    implicit none
+
+    character(len=6), parameter :: implementation_name = "OpenMP"
+
+    integer(kind=StreamIntKind) :: N
+
+    real(kind=REAL64), allocatable :: A(:), B(:), C(:)
+
+    contains
+
+        subroutine list_devices()
+            implicit none
+            write(*,'(a36,a12)') "Listing devices is not supported by ", implementation_name
+        end subroutine list_devices
+
+        subroutine set_device(dev)
+            implicit none
+            integer, intent(in) :: dev
+            write(*,'(a32,a12)') "Device != 0 is not supported by ", implementation_name
+        end subroutine set_device
+
+        subroutine alloc(array_size)
+            implicit none
+            integer(kind=StreamIntKind) :: array_size
+            integer :: err
+            N = array_size
+            allocate( A(1:N), B(1:N), C(1:N), stat=err)
+            if (err .ne. 0) then
+              write(*,'(a20,i3)') 'allocate returned ',err
+              stop 1
+            endif
+        end subroutine alloc
+
+        subroutine dealloc()
+            implicit none
+            integer :: err
+            deallocate( A, B, C, stat=err)
+            if (err .ne. 0) then
+              write(*,'(a20,i3)') 'deallocate returned ',err
+              stop 1
+            endif
+        end subroutine dealloc
+
+        subroutine init_arrays(initA, initB, initC)
+            implicit none
+            real(kind=REAL64), intent(in) :: initA, initB, initC
+            integer(kind=StreamIntKind) :: i
+            !$omp parallel do simd
+            do i=1,N
+               A(i) = initA
+               B(i) = initB
+               C(i) = initC
+            end do
+        end subroutine init_arrays
+
+        subroutine read_arrays(h_A, h_B, h_C)
+            implicit none
+            real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
+            integer(kind=StreamIntKind) :: i
+            !$omp parallel do simd
+            do i=1,N
+               h_A(i) = A(i)
+               h_B(i) = B(i)
+               h_C(i) = C(i)
+            end do
+        end subroutine read_arrays
+
+        subroutine copy()
+            implicit none
+            integer(kind=StreamIntKind) :: i
+            !$omp parallel do simd
+            do i=1,N
+               C(i) = A(i)
+            end do
+        end subroutine copy
+
+        subroutine add()
+            implicit none
+            integer(kind=StreamIntKind) :: i
+            !$omp parallel do simd
+            do i=1,N
+               C(i) = A(i) + B(i)
+            end do
+        end subroutine add
+
+        subroutine mul(startScalar)
+            implicit none
+            real(kind=REAL64), intent(in) :: startScalar
+            real(kind=REAL64) :: scalar
+            integer(kind=StreamIntKind) :: i
+            scalar = startScalar
+            !$omp parallel do simd
+            do i=1,N
+               B(i) = scalar * C(i)
+            end do
+        end subroutine mul
+
+        subroutine triad(startScalar)
+            implicit none
+            real(kind=REAL64), intent(in) :: startScalar
+            real(kind=REAL64) :: scalar
+            integer(kind=StreamIntKind) :: i
+            scalar = startScalar
+            !$omp parallel do simd
+            do i=1,N
+               A(i) = B(i) + scalar * C(i)
+            end do
+        end subroutine triad
+
+        subroutine nstream(startScalar)
+            implicit none
+            real(kind=REAL64), intent(in) :: startScalar
+            real(kind=REAL64) :: scalar
+            integer(kind=StreamIntKind) :: i
+            scalar = startScalar
+            !$omp parallel do simd
+            do i=1,N
+               A(i) = A(i) + B(i) + scalar * C(i)
+            end do
+        end subroutine nstream
+
+        function dot() result(s)
+            implicit none
+            real(kind=REAL64) :: s
+            integer(kind=StreamIntKind) :: i
+            s = real(0,kind=REAL64)
+            !$omp parallel do simd reduction(+:s)
+            do i=1,N
+               s = s + A(i) * B(i)
+            end do
+        end function dot
+
+end module OpenMPStream
diff --git a/src/fortran/OpenMPTargetLoopStream.F90 b/src/fortran/OpenMPTargetLoopStream.F90
new file mode 100644
index 0000000..9684ced
--- /dev/null
+++ b/src/fortran/OpenMPTargetLoopStream.F90
@@ -0,0 +1,162 @@
+module OpenMPTargetLoopStream
+    use, intrinsic :: ISO_Fortran_env
+    use BabelStreamTypes
+
+    implicit none
+
+    character(len=16), parameter :: implementation_name = "OpenMPTargetLoop"
+
+    integer(kind=StreamIntKind) :: N
+
+    real(kind=REAL64), allocatable :: A(:), B(:), C(:)
+
+    contains
+
+        subroutine list_devices()
+            use omp_lib
+            implicit none
+            integer :: num
+            num = omp_get_num_devices()
+            if (num.eq.0) then
+              write(*,'(a17)') "No devices found."
+            else
+              write(*,'(a10,i1,a8)') "There are ",num," devices."
+            end if
+        end subroutine list_devices
+
+        subroutine set_device(dev)
+            use omp_lib
+            implicit none
+            integer, intent(in) :: dev
+            integer :: num
+            num = omp_get_num_devices()
+            if (num.eq.0) then
+              write(*,'(a17)') "No devices found."
+              stop
+            else if (dev.gt.num) then
+              write(*,'(a21)') "Invalid device index."
+              stop
+            else
+              call omp_set_default_device(dev)
+            end if
+        end subroutine set_device
+
+        subroutine alloc(array_size)
+            implicit none
+            integer(kind=StreamIntKind) :: array_size
+            integer :: err
+            N = array_size
+            allocate( A(1:N), B(1:N), C(1:N), stat=err)
+#ifndef USE_MANAGED
+            !$omp target enter data map(alloc: A,B,C)
+#endif
+            if (err .ne. 0) then
+              write(*,'(a20,i3)') 'allocate returned ',err
+              stop 1
+            endif
+        end subroutine alloc
+
+        subroutine dealloc()
+            implicit none
+            integer :: err
+#ifndef USE_MANAGED
+            !$omp target exit data map(delete: A,B,C)
+#endif
+            deallocate( A, B, C, stat=err)
+            if (err .ne. 0) then
+              write(*,'(a20,i3)') 'deallocate returned ',err
+              stop 1
+            endif
+        end subroutine dealloc
+
+        subroutine init_arrays(initA, initB, initC)
+            implicit none
+            real(kind=REAL64), intent(in) :: initA, initB, initC
+            integer(kind=StreamIntKind) :: i
+            !$omp target teams loop
+            do i=1,N
+               A(i) = initA
+               B(i) = initB
+               C(i) = initC
+            end do
+        end subroutine init_arrays
+
+        subroutine read_arrays(h_A, h_B, h_C)
+            implicit none
+            real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
+            integer(kind=StreamIntKind) :: i
+            ! this might need to use a copy API instead...
+            !$omp target teams loop
+            do i=1,N
+               h_A(i) = A(i)
+               h_B(i) = B(i)
+               h_C(i) = C(i)
+            end do
+        end subroutine read_arrays
+
+        subroutine copy()
+            implicit none
+            integer(kind=StreamIntKind) :: i
+            !$omp target teams loop
+            do i=1,N
+               C(i) = A(i)
+            end do
+        end subroutine copy
+
+        subroutine add()
+            implicit none
+            integer(kind=StreamIntKind) :: i
+            !$omp target teams loop
+            do i=1,N
+               C(i) = A(i) + B(i)
+            end do
+        end subroutine add
+
+        subroutine mul(startScalar)
+            implicit none
+            real(kind=REAL64), intent(in) :: startScalar
+            real(kind=REAL64) :: scalar
+            integer(kind=StreamIntKind) :: i
+            scalar = startScalar
+            !$omp target teams loop
+            do i=1,N
+               B(i) = scalar * C(i)
+            end do
+        end subroutine mul
+
+        subroutine triad(startScalar)
+            implicit none
+            real(kind=REAL64), intent(in) :: startScalar
+            real(kind=REAL64) :: scalar
+            integer(kind=StreamIntKind) :: i
+            scalar = startScalar
+            !$omp target teams loop
+            do i=1,N
+               A(i) = B(i) + scalar * C(i)
+            end do
+        end subroutine triad
+
+        subroutine nstream(startScalar)
+            implicit none
+            real(kind=REAL64), intent(in) :: startScalar
+            real(kind=REAL64) :: scalar
+            integer(kind=StreamIntKind) :: i
+            scalar = startScalar
+            !$omp target teams loop
+            do i=1,N
+               A(i) = A(i) + B(i) + scalar * C(i)
+            end do
+        end subroutine nstream
+
+        function dot() result(s)
+            implicit none
+            real(kind=REAL64) :: s
+            integer(kind=StreamIntKind) :: i
+            s = real(0,kind=REAL64)
+            !$omp target teams loop reduction(+:s)
+            do i=1,N
+               s = s + A(i) * B(i)
+            end do
+        end function dot
+
+end module OpenMPTargetLoopStream
diff --git a/src/fortran/OpenMPTargetStream.F90 b/src/fortran/OpenMPTargetStream.F90
new file mode 100644
index 0000000..0206d78
--- /dev/null
+++ b/src/fortran/OpenMPTargetStream.F90
@@ -0,0 +1,163 @@
+module OpenMPTargetStream
+    use, intrinsic :: ISO_Fortran_env
+    use BabelStreamTypes
+
+    implicit none
+
+    character(len=12), parameter :: implementation_name = "OpenMPTarget"
+
+    integer(kind=StreamIntKind) :: N
+
+    real(kind=REAL64), allocatable :: A(:), B(:), C(:)
+
+    contains
+
+        subroutine list_devices()
+            use omp_lib
+            implicit none
+            integer :: num
+            num = omp_get_num_devices()
+            if (num.eq.0) then
+              write(*,'(a17)') "No devices found."
+            else
+              write(*,'(a10,i1,a8)') "There are ",num," devices."
+            end if
+        end subroutine list_devices
+
+        subroutine set_device(dev)
+            use omp_lib
+            implicit none
+            integer, intent(in) :: dev
+            integer :: num
+            num = omp_get_num_devices()
+            if (num.eq.0) then
+              write(*,'(a17)') "No devices found."
+              stop
+            else if (dev.gt.num) then
+              write(*,'(a21)') "Invalid device index."
+              stop
+            else
+              call omp_set_default_device(dev)
+            end if
+        end subroutine set_device
+
+        subroutine alloc(array_size)
+            implicit none
+            integer(kind=StreamIntKind) :: array_size
+            integer :: err
+            N = array_size
+            allocate( A(1:N), B(1:N), C(1:N), stat=err)
+#ifndef USE_MANAGED
+            !$omp target enter data map(alloc: A,B,C)
+#endif
+            if (err .ne. 0) then
+              write(*,'(a20,i3)') 'allocate returned ',err
+              stop 1
+            endif
+        end subroutine alloc
+
+        subroutine dealloc()
+            implicit none
+            integer :: err
+#ifndef USE_MANAGED
+            !$omp target exit data map(delete: A,B,C)
+#endif
+            deallocate( A, B, C, stat=err)
+            if (err .ne. 0) then
+              write(*,'(a20,i3)') 'deallocate returned ',err
+              stop 1
+            endif
+        end subroutine dealloc
+
+        subroutine init_arrays(initA, initB, initC)
+            implicit none
+            real(kind=REAL64), intent(in) :: initA, initB, initC
+            integer(kind=StreamIntKind) :: i
+            !$omp target teams distribute parallel do simd
+            do i=1,N
+               A(i) = initA
+               B(i) = initB
+               C(i) = initC
+            end do
+        end subroutine init_arrays
+
+        subroutine read_arrays(h_A, h_B, h_C)
+            implicit none
+            real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
+            integer(kind=StreamIntKind) :: i
+            ! this might need to use a copy API instead...
+            !$omp target teams distribute parallel do simd
+            do i=1,N
+               h_A(i) = A(i)
+               h_B(i) = B(i)
+               h_C(i) = C(i)
+            end do
+        end subroutine read_arrays
+
+        subroutine copy()
+            implicit none
+            integer(kind=StreamIntKind) :: i
+            !$omp target teams distribute parallel do simd
+            do i=1,N
+               C(i) = A(i)
+            end do
+            !$omp barrier
+        end subroutine copy
+
+        subroutine add()
+            implicit none
+            integer(kind=StreamIntKind) :: i
+            !$omp target teams distribute parallel do simd
+            do i=1,N
+               C(i) = A(i) + B(i)
+            end do
+        end subroutine add
+
+        subroutine mul(startScalar)
+            implicit none
+            real(kind=REAL64), intent(in) :: startScalar
+            real(kind=REAL64) :: scalar
+            integer(kind=StreamIntKind) :: i
+            scalar = startScalar
+            !$omp target teams distribute parallel do simd
+            do i=1,N
+               B(i) = scalar * C(i)
+            end do
+        end subroutine mul
+
+        subroutine triad(startScalar)
+            implicit none
+            real(kind=REAL64), intent(in) :: startScalar
+            real(kind=REAL64) :: scalar
+            integer(kind=StreamIntKind) :: i
+            scalar = startScalar
+            !$omp target teams distribute parallel do simd
+            do i=1,N
+               A(i) = B(i) + scalar * C(i)
+            end do
+        end subroutine triad
+
+        subroutine nstream(startScalar)
+            implicit none
+            real(kind=REAL64), intent(in) :: startScalar
+            real(kind=REAL64) :: scalar
+            integer(kind=StreamIntKind) :: i
+            scalar = startScalar
+            !$omp target teams distribute parallel do simd
+            do i=1,N
+               A(i) = A(i) + B(i) + scalar * C(i)
+            end do
+        end subroutine nstream
+
+        function dot() result(s)
+            implicit none
+            real(kind=REAL64) :: s
+            integer(kind=StreamIntKind) :: i
+            s = real(0,kind=REAL64)
+            !$omp target teams distribute parallel do simd reduction(+:s)
+            do i=1,N
+               s = s + A(i) * B(i)
+            end do
+        end function dot
+
+end module OpenMPTargetStream
diff --git a/src/fortran/OpenMPTaskloopStream.F90 b/src/fortran/OpenMPTaskloopStream.F90
new file mode 100644
index 0000000..579a761
--- /dev/null
+++ b/src/fortran/OpenMPTaskloopStream.F90
@@ -0,0 +1,169 @@
+module OpenMPTaskloopStream
+    use, intrinsic :: ISO_Fortran_env
+    use BabelStreamTypes
+
+    implicit none
+
+    character(len=14), parameter :: implementation_name = "OpenMPTaskloop"
+
+    integer(kind=StreamIntKind) :: N
+
+    real(kind=REAL64), allocatable :: A(:), B(:), C(:)
+
+    contains
+
+        subroutine list_devices()
+            implicit none
+            write(*,'(a36,a12)') "Listing devices is not supported by ", implementation_name
+        end subroutine list_devices
+
+        subroutine set_device(dev)
+            implicit none
+            integer, intent(in) :: dev
+            write(*,'(a32,a12)') "Device != 0 is not supported by ", implementation_name
+        end subroutine set_device
+
+        subroutine alloc(array_size)
+            implicit none
+            integer(kind=StreamIntKind) :: array_size
+            integer :: err
+            N = array_size
+            allocate( A(1:N), B(1:N), C(1:N), stat=err)
+            if (err .ne. 0) then
+              write(*,'(a20,i3)') 'allocate returned ',err
+              stop 1
+            endif
+        end subroutine alloc
+
+        subroutine dealloc()
+            implicit none
+            integer :: err
+            deallocate( A, B, C, stat=err)
+            if (err .ne. 0) then
+              write(*,'(a20,i3)') 'deallocate returned ',err
+              stop 1
+            endif
+        end subroutine dealloc
+
+        subroutine init_arrays(initA, initB, initC)
+            implicit none
+            real(kind=REAL64), intent(in) :: initA, initB, initC
+            integer(kind=StreamIntKind) :: i
+            !$omp parallel
+            !$omp master
+            !$omp taskloop
+            do i=1,N
+               A(i) = initA
+               B(i) = initB
+               C(i) = initC
+            end do
+            !$omp end master
+            !$omp end parallel
+        end subroutine init_arrays
+
+        subroutine read_arrays(h_A, h_B, h_C)
+            implicit none
+            real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
+            integer(kind=StreamIntKind) :: i
+            !$omp parallel
+            !$omp master
+            !$omp taskloop
+            do i=1,N
+               h_A(i) = A(i)
+               h_B(i) = B(i)
+               h_C(i) = C(i)
+            end do
+            !$omp end master
+            !$omp end parallel
+        end subroutine read_arrays
+
+        subroutine copy()
+            implicit none
+            integer(kind=StreamIntKind) :: i
+            !$omp parallel
+            !$omp master
+            !$omp taskloop
+            do i=1,N
+               C(i) = A(i)
+            end do
+            !$omp end master
+            !$omp end parallel
+        end subroutine copy
+
+        subroutine add()
+            implicit none
+            integer(kind=StreamIntKind) :: i
+            !$omp parallel
+            !$omp master
+            !$omp taskloop
+            do i=1,N
+               C(i) = A(i) + B(i)
+            end do
+            !$omp end master
+            !$omp end parallel
+        end subroutine add
+
+        subroutine mul(startScalar)
+            implicit none
+            real(kind=REAL64), intent(in) :: startScalar
+            real(kind=REAL64) :: scalar
+            integer(kind=StreamIntKind) :: i
+            scalar = startScalar
+            !$omp parallel
+            !$omp master
+            !$omp taskloop
+            do i=1,N
+               B(i) = scalar * C(i)
+            end do
+            !$omp end master
+            !$omp end parallel
+        end subroutine mul
+
+        subroutine triad(startScalar)
+            implicit none
+            real(kind=REAL64), intent(in) :: startScalar
+            real(kind=REAL64) :: scalar
+            integer(kind=StreamIntKind) :: i
+            scalar = startScalar
+            !$omp parallel
+            !$omp master
+            !$omp taskloop
+            do i=1,N
+               A(i) = B(i) + scalar * C(i)
+            end do
+            !$omp end master
+            !$omp end parallel
+        end subroutine triad
+
+        subroutine nstream(startScalar)
+            implicit none
+            real(kind=REAL64), intent(in) :: startScalar
+            real(kind=REAL64) :: scalar
+            integer(kind=StreamIntKind) :: i
+            scalar = startScalar
+            !$omp parallel
+            !$omp master
+            !$omp taskloop
+            do i=1,N
+               A(i) = A(i) + B(i) + scalar * C(i)
+            end do
+            !$omp end master
+            !$omp end parallel
+        end subroutine nstream
+
+        function dot() result(s)
+            implicit none
+            real(kind=REAL64) :: s
+            integer(kind=StreamIntKind) :: i
+            s = real(0,kind=REAL64)
+            !$omp parallel
+            !$omp master
+            !$omp taskloop reduction(+:s)
+            do i=1,N
+               s = s + A(i) * B(i)
+            end do
+            !$omp end master
+            !$omp end parallel
+        end function dot
+
+end module OpenMPTaskloopStream
diff --git a/src/fortran/OpenMPWorkshareStream.F90 b/src/fortran/OpenMPWorkshareStream.F90
new file mode 100644
index 0000000..fd50f86
--- /dev/null
+++ b/src/fortran/OpenMPWorkshareStream.F90
@@ -0,0 +1,120 @@
+module OpenMPWorkshareStream
+    use, intrinsic :: ISO_Fortran_env
+    use BabelStreamTypes
+
+    implicit none
+
+    character(len=15), parameter :: implementation_name = "OpenMPWorkshare"
+
+    integer(kind=StreamIntKind) :: N
+
+    real(kind=REAL64), allocatable :: A(:), B(:), C(:)
+
+    contains
+
+        subroutine list_devices()
+            implicit none
+            write(*,'(a36,a12)') "Listing devices is not supported by ", implementation_name
+        end subroutine list_devices
+
+        subroutine set_device(dev)
+            implicit none
+            integer, intent(in) :: dev
+            write(*,'(a32,a12)') "Device != 0 is not supported by ", implementation_name
+        end subroutine set_device
+
+        subroutine alloc(array_size)
+            implicit none
+            integer(kind=StreamIntKind) :: array_size
+            integer :: err
+            N = array_size
+            allocate( A(1:N), B(1:N), C(1:N), stat=err)
+            if (err .ne. 0) then
+              write(*,'(a20,i3)') 'allocate returned ',err
+              stop 1
+            endif
+        end subroutine alloc
+
+        subroutine dealloc()
+            implicit none
+            integer :: err
+            deallocate( A, B, C, stat=err)
+            if (err .ne. 0) then
+              write(*,'(a20,i3)') 'deallocate returned ',err
+              stop 1
+            endif
+        end subroutine dealloc
+
+        subroutine init_arrays(initA, initB, initC)
+            implicit none
+            real(kind=REAL64), intent(in) :: initA, initB, initC
+            !$omp parallel workshare
+            A = initA
+            B = initB
+            C = initC
+            !$omp end parallel workshare
+        end subroutine init_arrays
+
+        subroutine read_arrays(h_A, h_B, h_C)
+            implicit none
+            real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
+            !$omp parallel workshare
+            h_A = A
+            h_B = B
+            h_C = C
+            !$omp end parallel workshare
+        end subroutine read_arrays
+
+        subroutine copy()
+            implicit none
+            !$omp parallel workshare
+            C = A
+            !$omp end parallel workshare
+        end subroutine copy
+
+        subroutine add()
+            implicit none
+            !$omp parallel workshare
+            C = A + B
+            !$omp end parallel workshare
+        end subroutine add
+
+        subroutine mul(startScalar)
+            implicit none
+            real(kind=REAL64), intent(in) :: startScalar
+            real(kind=REAL64) :: scalar
+            scalar = startScalar
+            !$omp parallel workshare
+            B = scalar * C
+            !$omp end parallel workshare
+        end subroutine mul
+
+        subroutine triad(startScalar)
+            implicit none
+            real(kind=REAL64), intent(in) :: startScalar
+            real(kind=REAL64) :: scalar
+            scalar = startScalar
+            !$omp parallel workshare
+            A = B + scalar * C
+            !$omp end parallel workshare
+        end subroutine triad
+
+        subroutine nstream(startScalar)
+            implicit none
+            real(kind=REAL64), intent(in) :: startScalar
+            real(kind=REAL64) :: scalar
+            scalar = startScalar
+            !$omp parallel workshare
+            A = A + B + scalar * C
+            !$omp end parallel workshare
+        end subroutine nstream
+
+        function dot() result(s)
+            implicit none
+            real(kind=REAL64) :: s
+            !$omp parallel workshare
+            s = dot_product(A,B)
+            !$omp end parallel workshare
+        end function dot
+
+end module OpenMPWorkshareStream
diff --git a/src/fortran/SequentialStream.F90 b/src/fortran/SequentialStream.F90
new file mode 100644
index 0000000..a8f6917
--- /dev/null
+++ b/src/fortran/SequentialStream.F90
@@ -0,0 +1,130 @@
+module SequentialStream
+    use, intrinsic :: ISO_Fortran_env
+    use BabelStreamTypes
+
+    implicit none
+
+    character(len=10), parameter :: implementation_name = "Sequential"
+
+    integer(kind=StreamIntKind) :: N
+
+    real(kind=REAL64), allocatable :: A(:), B(:), C(:)
+
+    contains
+
+        subroutine list_devices()
+            implicit none
+            integer :: num
+            write(*,'(a36,a10)') "Listing devices is not supported by ", implementation_name
+        end subroutine list_devices
+
+        subroutine set_device(dev)
+            implicit none
+            integer, intent(in) :: dev
+            write(*,'(a32,a10)') "Device != 0 is not supported by ", implementation_name
+        end subroutine set_device
+
+        subroutine alloc(array_size)
+            implicit none
+            integer(kind=StreamIntKind) :: array_size
+            integer :: err
+            N = array_size
+            allocate( A(1:N), B(1:N), C(1:N), stat=err)
+            if (err .ne. 0) then
+              write(*,'(a20,i3)') 'allocate returned ',err
+              stop 1
+            endif
+        end subroutine alloc
+
+        subroutine dealloc()
+            implicit none
+            integer :: err
+            deallocate( A, B, C, stat=err)
+            if (err .ne. 0) then
+              write(*,'(a20,i3)') 'deallocate returned ',err
+              stop 1
+            endif
+        end subroutine dealloc
+
+        subroutine init_arrays(initA, initB, initC)
+            implicit none
+            real(kind=REAL64), intent(in) :: initA, initB, initC
+            integer(kind=StreamIntKind) :: i
+            do i=1,N
+               A(i) = initA
+               B(i) = initB
+               C(i) = initC
+            end do
+        end subroutine init_arrays
+
+        subroutine read_arrays(h_A, h_B, h_C)
+            implicit none
+            real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:)
+            integer(kind=StreamIntKind) :: i
+            do i=1,N
+               h_A(i) = A(i)
+               h_B(i) = B(i)
+               h_C(i) = C(i)
+            end do
+        end subroutine read_arrays
+
+        subroutine copy()
+            implicit none
+            integer(kind=StreamIntKind) :: i
+            do i=1,N
+               C(i) = A(i)
+            end do
+        end subroutine copy
+
+        subroutine add()
+            implicit none
+            integer(kind=StreamIntKind) :: i
+            do i=1,N
+               C(i) = A(i) + B(i)
+            end do
+        end subroutine add
+
+        subroutine mul(startScalar)
+            implicit none
+            real(kind=REAL64), intent(in) :: startScalar
+            real(kind=REAL64) :: scalar
+            integer(kind=StreamIntKind) :: i
+            scalar = startScalar
+            do i=1,N
+               B(i) = scalar * C(i)
+            end do
+        end subroutine mul
+
+        subroutine triad(startScalar)
+            implicit none
+            real(kind=REAL64), intent(in) :: startScalar
+            real(kind=REAL64) :: scalar
+            integer(kind=StreamIntKind) :: i
+            scalar = startScalar
+            do i=1,N
+               A(i) = B(i) + scalar * C(i)
+            end do
+        end subroutine triad
+
+        subroutine nstream(startScalar)
+            implicit none
+            real(kind=REAL64), intent(in) :: startScalar
+            real(kind=REAL64) :: scalar
+            integer(kind=StreamIntKind) :: i
+            scalar = startScalar
+            do i=1,N
+               A(i) = A(i) + B(i) + scalar * C(i)
+            end do
+        end subroutine nstream
+
+        function dot() result(s)
+            implicit none
+            real(kind=REAL64) :: s
+            integer(kind=StreamIntKind) :: i
+            s = real(0,kind=REAL64)
+            do i=1,N
+               s = s + A(i) * B(i)
+            end do
+        end function dot
+
+end module SequentialStream
diff --git a/src/fortran/build.sh b/src/fortran/build.sh
new file mode 100755
index 0000000..9343354
--- /dev/null
+++ b/src/fortran/build.sh
@@ -0,0 +1,54 @@
+#!/bin/bash
+
+# uncomment to disable GPU targets
+#HAS_GPU=0
+
+# Orin
+#if [ "x${compiler}" == "xgcc" ] ; then
+#    export MCPU=cortex-a78ae
+#fi
+#if [ "x${compiler}" == "xarm" ] ; then
+#    export MCPU=cortex-a78
+#fi
+
+COMPILERS="gcc"
+if [ $(which nvfortran) ] ; then
+    COMPILERS="${COMPILERS} nvhpc"
+fi
+if [ $(which crayftn) ] ; then
+    COMPILERS="${COMPILERS} cray"
+fi
+if [ $(uname -m) == "aarch64" ] ; then
+    if [ $(which armflang) ] ; then
+        COMPILERS="${COMPILERS} arm"
+    fi
+    if [ $(which frt) ] ; then
+        COMPILERS="${COMPILERS} fj"
+    fi
+elif [ $(uname -m) == "x86_64" ] ; then
+    if [ $(which lscpu >& /dev/null && lscpu | grep GenuineIntel | awk '{print $3}') == "GenuineIntel" ] ; then
+        COMPILERS="${COMPILERS} oneapi"
+        if [ -f /opt/intel/oneapi/setvars.sh ] ; then
+            . /opt/intel/oneapi/setvars.sh >& /dev/null
+        fi
+    else
+        # ^ this detection can be improved
+        COMPILERS="${COMPILERS} amd"
+    fi
+fi
+
+for compiler in ${COMPILERS} ; do
+    TARGETS="DoConcurrent Array OpenMP OpenMPTaskloop OpenMPWorkshare"
+    if [ "${HAS_GPU}" != "0" ] ; then
+        TARGETS="${TARGETS} OpenMPTarget OpenMPTargetLoop"
+        if [ "x${compiler}" == "xnvhpc" ] ; then
+            TARGETS="${TARGETS} CUDA CUDAKernel"
+        fi
+    fi
+    if [ "x${compiler}" == "xnvhpc" ] || [ "x${compiler}" == "xgcc" ] || [ "x${compiler}" == "xcray" ] ; then
+        TARGETS="${TARGETS} OpenACC OpenACCArray"
+    fi
+    for implementation in ${TARGETS} ; do
+        make COMPILER=${compiler} IMPLEMENTATION=${implementation}
+    done
+done
diff --git a/src/fortran/main.F90 b/src/fortran/main.F90
new file mode 100644
index 0000000..d86e8d4
--- /dev/null
+++ b/src/fortran/main.F90
@@ -0,0 +1,666 @@
+module BabelStreamUtil
+    use, intrinsic :: ISO_Fortran_env, only: REAL64,INT64
+    use BabelStreamTypes
+
+    implicit none
+
+    integer(kind=StreamIntKind) :: array_size = 33554432
+    integer(kind=StreamIntKind) :: num_times  = 100
+    logical                     :: mibibytes  = .false.
+    logical                     :: use_gigs   = .false.
+    logical                     :: csv        = .false.
+    character(len=1), parameter :: csv_sep    = ","
+
+    ! 1 = All
+    ! 2 = Triad
+    ! 3 = Nstream
+    integer                     :: selection  = 1
+
+    real(kind=REAL64), parameter :: startA      = real(0.1d0,kind=REAL64)
+    real(kind=REAL64), parameter :: startB      = real(0.2d0,kind=REAL64)
+    real(kind=REAL64), parameter :: startC      = real(0.0d0,kind=REAL64)
+    real(kind=REAL64), parameter :: startScalar = real(0.4d0,kind=REAL64)
+
+    contains
+
+        function get_wtime() result(t)
+#if defined(USE_OMP_GET_WTIME)
+          use omp_lib
+          implicit none
+          real(kind=REAL64) :: t
+          t = omp_get_wtime()
+#elif  defined(USE_CPU_TIME)
+          implicit none
+          real(kind=REAL64) :: t
+          real :: r
+          call cpu_time(r)
+          t = r
+#else
+          implicit none
+          real(kind=REAL64) :: t
+          integer(kind=INT64) :: c, r
+          call system_clock(count = c, count_rate = r)
+          t = real(c,REAL64) / real(r,REAL64)
+#endif
+        end function get_wtime
+
+        subroutine parseArguments()
+            use, intrinsic :: ISO_Fortran_env, only: compiler_version, compiler_options
+#if defined(USE_DOCONCURRENT)
+            use DoConcurrentStream, only: list_devices, set_device
+#elif defined(USE_ARRAY)
+            use ArrayStream, only: list_devices, set_device
+#elif defined(USE_OPENMP)
+            use OpenMPStream, only: list_devices, set_device
+#elif defined(USE_OPENMPWORKSHARE)
+            use OpenMPWorkshareStream, only: list_devices, set_device
+#elif defined(USE_OPENMPTARGET)
+            use OpenMPTargetStream, only: list_devices, set_device
+#elif defined(USE_OPENMPTARGETLOOP)
+            use OpenMPTargetLoopStream, only: list_devices, set_device
+#elif defined(USE_OPENMPTASKLOOP)
+            use OpenMPTaskloopStream, only: list_devices, set_device
+#elif defined(USE_OPENACC)
+            use OpenACCStream, only: list_devices, set_device
+#elif defined(USE_OPENACCARRAY)
+            use OpenACCArrayStream, only: list_devices, set_device
+#elif defined(USE_CUDA)
+            use CUDAStream, only: list_devices, set_device
+#elif defined(USE_CUDAKERNEL)
+            use CUDAKernelStream, only: list_devices, set_device
+#elif defined(USE_SEQUENTIAL)
+            use SequentialStream, only: list_devices, set_device
+#endif
+            implicit none
+            integer :: i, argc
+            integer :: arglen,err,pos(2)
+            character(len=64) :: argtmp
+            argc = command_argument_count()
+            do i=1,argc
+                call get_command_argument(i,argtmp,arglen,err)
+                if (err.eq.0) then
+                    !
+                    ! list devices
+                    !
+                    pos(1) = index(argtmp,"--list")
+                    if (pos(1).eq.1) then
+                        call list_devices()
+                        stop
+                    endif
+                    !
+                    ! set device number
+                    !
+                    pos(1) = index(argtmp,"--device")
+                    if (pos(1).eq.1) then
+                        if (i+1.gt.argc) then
+                            print*,'You failed to provide a value for ',argtmp
+                            stop
+                        else
+                            call get_command_argument(i+1,argtmp,arglen,err)
+                            block
+                                integer :: dev
+                                read(argtmp,'(i15)') dev
+                                call set_device(dev)
+                            end block
+                        endif
+                        cycle
+                    endif
+                    !
+                    ! array size
+                    !
+                    pos(1) = index(argtmp,"--arraysize")
+                    pos(2) = index(argtmp,"-s")
+                    if (any(pos(:).eq.1) ) then
+                        if (i+1.gt.argc) then
+                            print*,'You failed to provide a value for ',argtmp
+                        else
+                            call get_command_argument(i+1,argtmp,arglen,err)
+                            block
+                              integer(kind=INT64) :: big_size
+                              read(argtmp,'(i15)') big_size
+                              if (big_size .gt. HUGE(array_size)) then
+                                print*,'Array size does not fit into integer:'
+                                print*,big_size,'>',HUGE(array_size)
+                                print*,'Stop using USE_INT32'
+                                stop
+                              else
+                                array_size = INT(big_size,kind=StreamIntKind)
+                              endif
+                            end block
+                        endif
+                        cycle
+                    endif
+                    !
+                    ! number of iterations
+                    !
+                    pos(1) = index(argtmp,"--numtimes")
+                    pos(2) = index(argtmp,"-n")
+                    if (any(pos(:).eq.1) ) then
+                        if (i+1.gt.argc) then
+                            print*,'You failed to provide a value for ',argtmp
+                        else
+                            call get_command_argument(i+1,argtmp,arglen,err)
+                            read(argtmp,'(i15)') num_times
+                            if (num_times.lt.2) then
+                                write(*,'(a)') "Number of times must be 2 or more"
+                                stop
+                            end if
+                        endif
+                        cycle
+                    endif
+                    !
+                    ! precision
+                    !
+                    pos(1) = index(argtmp,"--float")
+                    if (pos(1).eq.1) then
+                        write(*,'(a46,a39)') "Sorry, you have to recompile with -DUSE_FLOAT ", &
+                                             "to run BabelStream in single precision."
+                        stop
+                    endif
+                    !
+                    ! selection (All, Triad, Nstream)
+                    !
+                    pos(1) = index(argtmp,"--triad-only")
+                    if (pos(1).eq.1) then
+                        selection = 2
+                        cycle
+                    endif
+                    pos(1) = index(argtmp,"--nstream-only")
+                    if (pos(1).eq.1) then
+                        selection = 3
+                        cycle
+                    endif
+                    !
+                    ! CSV
+                    !
+                    pos(1) = index(argtmp,"--csv")
+                    if (pos(1).eq.1) then
+                        csv = .true.
+                        !write(*,'(a39)') "Sorry, CSV support isn't available yet."
+                        !stop
+                    endif
+                    !
+                    ! units
+                    !
+                    pos(1) = index(argtmp,"--mibibytes")
+                    if (pos(1).eq.1) then
+                        mibibytes = .true.
+                        cycle
+                    endif
+                    !
+                    ! giga/gibi instead of mega/mebi
+                    !
+                    pos(1) = index(argtmp,"--gigs")
+                    if (pos(1).eq.1) then
+                        use_gigs = .true.
+                        cycle
+                    endif
+                    !
+                    !
+                    !
+                    pos(1) = index(argtmp,"--compiler-info")
+                    if (pos(1).eq.1) then
+                        write(*,'(a)') 'Compiler version: ',compiler_version()
+                        write(*,'(a)') 'Compiler options: ',compiler_options()
+                        stop
+                    endif
+                    !
+                    ! help
+                    !
+                    pos(1) = index(argtmp,"--help")
+                    pos(2) = index(argtmp,"-h")
+                    if (any(pos(:).eq.1) ) then
+                        call get_command_argument(0,argtmp,arglen,err)
+                        write(*,'(a7,a,a10)') "Usage: ", trim(argtmp), " [OPTIONS]"
+                        write(*,'(a)') "Options:"
+                        write(*,'(a)') "  -h  --help               Print the message"
+                        write(*,'(a)') "      --list               List available devices"
+                        write(*,'(a)') "      --device     INDEX   Select device at INDEX"
+                        write(*,'(a)') "  -s  --arraysize  SIZE    Use SIZE elements in the array"
+                        write(*,'(a)') "  -n  --numtimes   NUM     Run the test NUM times (NUM >= 2)"
+                        !write(*,'(a)') "      --float              Use floats (rather than doubles)"
+                        write(*,'(a)') "      --triad-only         Only run triad"
+                        write(*,'(a)') "      --nstream-only       Only run nstream"
+                        write(*,'(a)') "      --csv                Output as csv table"
+                        write(*,'(a)') "      --mibibytes          Use MiB=2^20 for bandwidth calculation (default MB=10^6)"
+                        write(*,'(a)') "      --gigs               Use GiB=2^30 or GB=10^9 instead of MiB/MB"
+                        write(*,'(a)') "      --compiler-info      Print information about compiler and flags, then exit."
+                        stop
+                    endif
+                end if
+            end do
+        end subroutine parseArguments
+
+        subroutine run_all(timings, summ)
+#if defined(USE_DOCONCURRENT)
+            use DoConcurrentStream
+#elif defined(USE_ARRAY)
+            use ArrayStream
+#elif defined(USE_OPENMP)
+            use OpenMPStream
+#elif defined(USE_OPENMPWORKSHARE)
+            use OpenMPWorkshareStream
+#elif defined(USE_OPENMPTARGET)
+            use OpenMPTargetStream
+#elif defined(USE_OPENMPTARGETLOOP)
+            use OpenMPTargetLoopStream
+#elif defined(USE_OPENMPTASKLOOP)
+            use OpenMPTaskloopStream
+#elif defined(USE_OPENACC)
+            use OpenACCStream
+#elif defined(USE_OPENACCARRAY)
+            use OpenACCArrayStream
+#elif defined(USE_CUDA)
+            use CUDAStream
+#elif defined(USE_CUDAKERNEL)
+            use CUDAKernelStream
+#elif defined(USE_SEQUENTIAL)
+            use SequentialStream
+#endif
+            implicit none
+            real(kind=REAL64), intent(inout) :: timings(:,:)
+            real(kind=REAL64), intent(out) :: summ
+            real(kind=REAL64) :: t1, t2
+            integer(kind=StreamIntKind) :: i
+
+            do i=1,num_times
+
+                t1 = get_wtime()
+                call copy()
+                t2 = get_wtime()
+                timings(1,i) = t2-t1
+
+                t1 = get_wtime()
+                call mul(startScalar)
+                t2 = get_wtime()
+                timings(2,i) = t2-t1
+
+                t1 = get_wtime()
+                call add()
+                t2 = get_wtime()
+                timings(3,i) = t2-t1
+
+                t1 = get_wtime()
+                call triad(startScalar)
+                t2 = get_wtime()
+                timings(4,i) = t2-t1
+
+                t1 = get_wtime()
+                summ = dot()
+                t2 = get_wtime()
+                timings(5,i) = t2-t1
+
+            end do
+
+        end subroutine run_all
+
+        subroutine run_triad(timings)
+#if defined(USE_DOCONCURRENT)
+            use DoConcurrentStream
+#elif defined(USE_ARRAY)
+            use ArrayStream
+#elif defined(USE_OPENMP)
+            use OpenMPStream
+#elif defined(USE_OPENMPWORKSHARE)
+            use OpenMPWorkshareStream
+#elif defined(USE_OPENMPTARGET)
+            use OpenMPTargetStream
+#elif defined(USE_OPENMPTARGETLOOP)
+            use OpenMPTargetLoopStream
+#elif defined(USE_OPENMPTASKLOOP)
+            use OpenMPTaskloopStream
+#elif defined(USE_OPENACC)
+            use OpenACCStream
+#elif defined(USE_OPENACCARRAY)
+            use OpenACCArrayStream
+#elif defined(USE_CUDA)
+            use CUDAStream
+#elif defined(USE_CUDAKERNEL)
+            use CUDAKernelStream
+#elif defined(USE_SEQUENTIAL)
+            use SequentialStream
+#endif
+            implicit none
+            real(kind=REAL64), intent(inout) :: timings(:,:)
+            real(kind=REAL64) :: t1, t2
+            integer(kind=StreamIntKind) :: i
+
+            do i=1,num_times
+
+                t1 = get_wtime()
+                call triad(startScalar)
+                t2 = get_wtime()
+                timings(1,i) = t2-t1
+
+            end do
+
+        end subroutine run_triad
+
+        subroutine run_nstream(timings)
+#if defined(USE_DOCONCURRENT)
+            use DoConcurrentStream
+#elif defined(USE_ARRAY)
+            use ArrayStream
+#elif defined(USE_OPENMP)
+            use OpenMPStream
+#elif defined(USE_OPENMPWORKSHARE)
+            use OpenMPWorkshareStream
+#elif defined(USE_OPENMPTARGET)
+            use OpenMPTargetStream
+#elif defined(USE_OPENMPTARGETLOOP)
+            use OpenMPTargetLoopStream
+#elif defined(USE_OPENMPTASKLOOP)
+            use OpenMPTaskloopStream
+#elif defined(USE_OPENACC)
+            use OpenACCStream
+#elif defined(USE_OPENACCARRAY)
+            use OpenACCArrayStream
+#elif defined(USE_CUDA)
+            use CUDAStream
+#elif defined(USE_CUDAKERNEL)
+            use CUDAKernelStream
+#elif defined(USE_SEQUENTIAL)
+            use SequentialStream
+#endif
+            implicit none
+            real(kind=REAL64), intent(inout) :: timings(:,:)
+            real(kind=REAL64) :: t1, t2
+            integer(kind=StreamIntKind) :: i
+
+            do i=1,num_times
+
+                t1 = get_wtime()
+                call nstream(startScalar)
+                t2 = get_wtime()
+                timings(1,i) = t2-t1
+
+            end do
+
+        end subroutine run_nstream
+
+        subroutine check_solution(A, B, C, summ)
+            use, intrinsic :: IEEE_Arithmetic, only: IEEE_Is_Normal
+            implicit none
+            real(kind=REAL64), intent(in) :: A(:), B(:), C(:)
+            real(kind=REAL64), intent(in) :: summ
+
+            integer(kind=StreamIntKind) :: i
+            real(kind=REAL64) :: goldA, goldB, goldC, goldSum
+            real(kind=REAL64) :: scalar
+
+            ! always use double because of accumulation error
+            real(kind=REAL64) :: errA, errB, errC, errSum, epsi
+            logical :: cleanA, cleanB, cleanC, cleanSum
+
+            goldA = startA
+            goldB = startB
+            goldC = startC
+            goldSum = 0.0d0
+
+            scalar = startScalar
+
+            do i=1,num_times
+
+                if (selection.eq.1) then
+                    goldC = goldA
+                    goldB = scalar * goldC
+                    goldC = goldA + goldB
+                    goldA = goldB + scalar * goldC
+                else if (selection.eq.2) then
+                    goldA = goldB + scalar * goldC
+                else if (selection.eq.3) then
+                    goldA = goldA + goldB + scalar * goldC;
+                endif
+
+            end do
+
+            goldSum = goldA * goldB * array_size
+
+            cleanA = ALL(IEEE_Is_Normal(A))
+            cleanB = ALL(IEEE_Is_Normal(B))
+            cleanC = ALL(IEEE_Is_Normal(C))
+            cleanSum = IEEE_Is_Normal(summ)
+
+            if (.not. cleanA) then
+                write(*,'(a51)') "Validation failed on A. Contains NaA/Inf/Subnormal."
+            end if
+            if (.not. cleanB) then
+                write(*,'(a51)') "Validation failed on B. Contains NaA/Inf/Subnormal."
+            end if
+            if (.not. cleanC) then
+                write(*,'(a51)') "Validation failed on C. Contains NaA/Inf/Subnormal."
+            end if
+            if (.not. cleanSum) then
+                write(*,'(a54,e20.12)') "Validation failed on Sum. Contains NaA/Inf/Subnormal: ",summ
+            end if
+
+            errA = SUM( ABS( A - goldA ) ) / array_size
+            errB = SUM( ABS( B - goldB ) ) / array_size
+            errC = SUM( ABS( C - goldC ) ) / array_size
+            errSum = ABS( (summ - goldSum) /  goldSum)
+
+            epsi = epsilon(real(0,kind=StreamRealKind)) * 100.0d0
+
+            if (errA .gt. epsi) then
+                write(*,'(a38,e20.12)') "Validation failed on A. Average error ", errA
+            end if
+            if (errB .gt. epsi) then
+                write(*,'(a38,e20.12)') "Validation failed on B. Average error ", errB
+            end if
+            if (errC .gt. epsi) then
+                write(*,'(a38,e20.12)') "Validation failed on C. Average error ", errC
+            end if
+
+            if (selection.eq.1) then
+                if (errSum .gt. 1.0e-8) then
+                    write(*,'(a38,e20.12)') "Validation failed on Sum. Error ", errSum
+                    write(*,'(a8,e20.12,a15,e20.12)') "Sum was ",summ, " but should be ", errSum
+                end if
+            endif
+
+        end subroutine check_solution
+
+end module BabelStreamUtil
+
+program BabelStream
+    use BabelStreamUtil
+#if defined(USE_DOCONCURRENT)
+    use DoConcurrentStream
+#elif defined(USE_ARRAY)
+    use ArrayStream
+#elif defined(USE_OPENMP)
+    use OpenMPStream
+#elif defined(USE_OPENMPWORKSHARE)
+    use OpenMPWorkshareStream
+#elif defined(USE_OPENMPTARGET)
+    use OpenMPTargetStream
+#elif defined(USE_OPENMPTARGETLOOP)
+    use OpenMPTargetLoopStream
+#elif defined(USE_OPENMPTASKLOOP)
+    use OpenMPTaskloopStream
+#elif defined(USE_OPENACC)
+    use OpenACCStream
+#elif defined(USE_OPENACCARRAY)
+    use OpenACCArrayStream
+#elif defined(USE_CUDA)
+    use CUDAStream
+#elif defined(USE_CUDAKERNEL)
+    use CUDAKernelStream
+#elif defined(USE_SEQUENTIAL)
+    use SequentialStream
+#endif
+    implicit none
+    integer :: element_size, err
+    real(kind=REAL64) :: scaling
+    character(len=3) :: label
+    real(kind=REAL64), allocatable :: timings(:,:)
+    real(kind=REAL64), allocatable :: h_A(:), h_B(:), h_C(:)
+    real(kind=REAL64) :: summ
+
+    call parseArguments()
+
+    element_size = storage_size(real(0,kind=StreamRealKind)) / 8
+
+    if (mibibytes) then
+      if (use_gigs) then
+        scaling = 2.0d0**(-30)
+        label   = "GiB"
+      else
+        scaling = 2.0d0**(-20)
+        label   = "MiB"
+      endif
+    else
+      if (use_gigs) then
+        scaling = 1.0d-9
+        label   = "GB"
+      else
+        scaling = 1.0d-6
+        label   = "MB"
+      endif
+    endif
+
+    if (.not.csv) then
+
+      write(*,'(a)')        "BabelStream Fortran"
+      write(*,'(a9,f4.1)')  "Version: ", VERSION_STRING
+      write(*,'(a16,a)')    "Implementation: ", implementation_name
+
+      block
+        character(len=32) :: printout
+        write(printout,'(i9,1x,a5)') num_times,'times'
+        write(*,'(a16,a)') 'Running kernels ',ADJUSTL(printout)
+      end block
+      write(*,'(a11,a6)') 'Precision: ',ADJUSTL(StreamRealName)
+
+      write(*,'(a12,f9.1,a3)') 'Array size: ',1.0d0 * element_size * (array_size * scaling), label
+      write(*,'(a12,f9.1,a3)') 'Total size: ',3.0d0 * element_size * (array_size * scaling), label
+
+    endif ! csv
+
+    allocate( timings(5,num_times) )
+
+    call alloc(array_size)
+
+    call init_arrays(startA, startB, startC)
+    summ = 0.0d0
+
+    timings = -1.0d0
+    if (selection.eq.1) then
+        call run_all(timings, summ)
+    else if (selection.eq.2) then
+        call run_triad(timings)
+    else if (selection.eq.3) then
+        call run_nstream(timings)
+    endif
+
+    allocate( h_A(1:array_size), h_B(1:array_size), h_C(1:array_size), stat=err)
+    if (err .ne. 0) then
+      write(*,'(a20,i3)') 'allocate returned ',err
+      stop 1
+    endif
+
+    call read_arrays(h_A, h_B, h_C)
+    call check_solution(h_A, h_B, h_C, summ)
+
+    block
+      character(len=20) :: printout(8)
+      real(kind=REAL64) :: tmin,tmax,tavg,nbytes
+      
+      if (csv) then
+        write(*,'(a,a1)',advance='no')  'function',  csv_sep
+        write(*,'(a,a1)',advance='no')  'num_times', csv_sep
+        write(*,'(a,a1)',advance='no')  'n_elements',csv_sep
+        write(*,'(a,a1)',advance='no')  'sizeof',    csv_sep
+        if (mibibytes) then
+          write(*,'(a,a1)',advance='no')  'max_mibytes_per_sec',csv_sep
+        else
+          write(*,'(a,a1)',advance='no')  'max_mbytes_per_sec', csv_sep
+        endif
+        write(*,'(a,a1)',advance='no')  'min_runtime',csv_sep
+        write(*,'(a,a1)',advance='no')  'max_runtime',csv_sep
+        write(*,'(a,a1)',advance='yes') 'avg_runtime'
+      else
+        write(printout(1),'(a8)')   'Function'
+        write(printout(2),'(a3,a8)') TRIM(label),'ytes/sec'
+        write(printout(3),'(a9)')   'Min (sec)'
+        write(printout(4),'(a3)')   'Max'
+        write(printout(5),'(a7)')   'Average'
+        write(*,'(5a12)') ADJUSTL(printout(1:5))
+      endif ! csv
+    
+      if (selection.eq.1) then
+        block
+          integer, parameter :: sizes(5) = [2,2,3,3,2]
+          character(len=5), parameter :: labels(5) = ["Copy ", "Mul  ", "Add  ", "Triad", "Dot  "]
+          integer :: i
+          do i=1,5
+            tmin = MINVAL(timings(i,2:num_times))
+            tmax = MAXVAL(timings(i,2:num_times))
+            tavg = SUM(timings(i,2:num_times)) / (num_times-1)
+            nbytes = element_size * REAL(array_size,kind=REAL64) * sizes(i)
+            write(printout(1),'(a)')     labels(i)
+            if (csv) then
+              write(printout(2),'(i20)')   num_times
+              write(printout(3),'(i20)')   array_size
+              write(printout(4),'(i20)')   element_size
+              write(printout(5),'(i20)')   INT(scaling*nbytes/tmin)
+              write(printout(6),'(f20.8)') tmin
+              write(printout(7),'(f20.8)') tmax
+              write(printout(8),'(f20.8)') tavg
+              write(*,'(a,a1)',advance='no')  TRIM(ADJUSTL(printout(1))),csv_sep
+              write(*,'(a,a1)',advance='no')  TRIM(ADJUSTL(printout(2))),csv_sep
+              write(*,'(a,a1)',advance='no')  TRIM(ADJUSTL(printout(3))),csv_sep
+              write(*,'(a,a1)',advance='no')  TRIM(ADJUSTL(printout(4))),csv_sep
+              write(*,'(a,a1)',advance='no')  TRIM(ADJUSTL(printout(5))),csv_sep
+              write(*,'(a,a1)',advance='no')  TRIM(ADJUSTL(printout(6))),csv_sep
+              write(*,'(a,a1)',advance='no')  TRIM(ADJUSTL(printout(7))),csv_sep
+              write(*,'(a,a1)',advance='yes') TRIM(ADJUSTL(printout(8)))
+            else
+              write(printout(2),'(f12.3)') scaling*nbytes/tmin
+              write(printout(3),'(f12.5)') tmin
+              write(printout(4),'(f12.5)') tmax
+              write(printout(5),'(f12.5)') tavg
+              write(*,'(5a12)') ADJUSTL(printout(1:5))
+            endif
+          enddo
+        end block
+      else if ((selection.eq.2).or.(selection.eq.3)) then
+        tmin = MINVAL(timings(1,2:num_times))
+        tmax = MAXVAL(timings(1,2:num_times))
+        tavg = SUM(timings(1,2:num_times)) / (num_times-1)
+        if (selection.eq.2) then
+          nbytes = element_size * REAL(array_size,kind=REAL64) * 3
+          write(printout(1),'(a12)')   "Triad"
+        else if (selection.eq.3) then
+          nbytes = element_size * REAL(array_size,kind=REAL64) * 4
+          write(printout(1),'(a12)')   "Nstream"
+        endif
+        if (csv) then
+          write(printout(2),'(i20)')   num_times
+          write(printout(3),'(i20)')   array_size
+          write(printout(4),'(i20)')   element_size
+          write(printout(5),'(i20)')   INT(scaling*nbytes/tmin)
+          write(printout(6),'(f20.8)') tmin
+          write(printout(7),'(f20.8)') tmax
+          write(printout(8),'(f20.8)') tavg
+          write(*,'(a,a1)',advance='no')  TRIM(ADJUSTL(printout(1))),csv_sep
+          write(*,'(a,a1)',advance='no')  TRIM(ADJUSTL(printout(2))),csv_sep
+          write(*,'(a,a1)',advance='no')  TRIM(ADJUSTL(printout(3))),csv_sep
+          write(*,'(a,a1)',advance='no')  TRIM(ADJUSTL(printout(4))),csv_sep
+          write(*,'(a,a1)',advance='no')  TRIM(ADJUSTL(printout(5))),csv_sep
+          write(*,'(a,a1)',advance='no')  TRIM(ADJUSTL(printout(6))),csv_sep
+          write(*,'(a,a1)',advance='no')  TRIM(ADJUSTL(printout(7))),csv_sep
+          write(*,'(a,a1)',advance='yes') TRIM(ADJUSTL(printout(8)))
+        else
+          write(printout(2),'(f12.3)') scaling*nbytes/tmin
+          write(printout(3),'(f12.5)') tmin
+          write(printout(4),'(f12.5)') tmax
+          write(printout(5),'(f12.5)') tavg
+          write(*,'(5a12)') ADJUSTL(printout(1:5))
+        endif
+      endif
+    end block
+
+    call dealloc()
+
+end program BabelStream
diff --git a/src/fortran/make.inc.amd b/src/fortran/make.inc.amd
new file mode 100644
index 0000000..a863de8
--- /dev/null
+++ b/src/fortran/make.inc.amd
@@ -0,0 +1,25 @@
+FC  := /opt/rocm/llvm/bin/flang
+FC	:= /global/u1/j/jhammond/AMD/aocc-compiler-3.2.0/bin/flang
+FCFLAGS	:= -std=f2018 -O3
+FCFLAGS += -Wall -Wno-unused-variable
+
+ifdef MARCH
+FCFLAGS += -march=$(MARCH)
+else
+FCFLAGS += -march=native
+endif
+
+DOCONCURRENT_FLAG   = -fopenmp # libomp.so required
+ARRAY_FLAG          = -fopenmp # libomp.so required
+OPENMP_FLAG         = -fopenmp
+#OPENMP_FLAG        += -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx908
+OPENACC_FLAG        = -fopenacc
+CUDA_FLAG           =
+SEQUENTIAL_FLAG     =
+
+ifeq ($(IMPLEMENTATION),CUDA)
+    $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
+endif
+ifeq ($(IMPLEMENTATION),CUDAKernels)
+    $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
+endif
diff --git a/src/fortran/make.inc.arm b/src/fortran/make.inc.arm
new file mode 100644
index 0000000..a3e2a67
--- /dev/null
+++ b/src/fortran/make.inc.arm
@@ -0,0 +1,39 @@
+FC       = armflang
+FCFLAGS  = -std=f2018 -O3
+FCFLAGS += -Wall -Wno-unused-variable
+
+# MARCH=neoverse-v1,neoverse-n1,icelake-server,znver3,cortex-a78
+ARCH=$(shell uname -m)
+ifeq ($(ARCH),aarch64)
+     ifdef MCPU
+         FCFLAGS += -mcpu=$(MCPU)
+     else
+         FCFLAGS += -mcpu=native
+     endif
+else
+     ifdef MARCH
+         FCFLAGS += -march=$(MARCH)
+     else
+         FCFLAGS += -march=native
+     endif
+endif
+
+DOCONCURRENT_FLAG   = -fopenmp
+ARRAY_FLAG          = -fopenmp
+OPENMP_FLAG         = -fopenmp
+OPENACC_FLAG        = -fopenacc
+CUDA_FLAG           =
+SEQUENTIAL_FLAG     =
+
+ifeq ($(IMPLEMENTATION),OpenACC)
+    $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
+endif
+ifeq ($(IMPLEMENTATION),OpenACCArray)
+    $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
+endif
+ifeq ($(IMPLEMENTATION),CUDA)
+    $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
+endif
+ifeq ($(IMPLEMENTATION),CUDAKernels)
+    $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
+endif
diff --git a/src/fortran/make.inc.cray b/src/fortran/make.inc.cray
new file mode 100644
index 0000000..dae4e75
--- /dev/null
+++ b/src/fortran/make.inc.cray
@@ -0,0 +1,18 @@
+FC	:= ftn
+FCFLAGS	 = -e F -O3
+
+DOCONCURRENT_FLAG   = -h thread_do_concurrent -DCRAY_THREAD_DOCONCURRENT
+ARRAY_FLAG          = -h autothread
+OPENMP_FLAG         = -h omp
+OPENACC_FLAG        = -h acc
+# CPU only
+OPENACC_FLAG       += -h omp
+CUDA_FLAG           =
+SEQUENTIAL_FLAG     =
+
+ifeq ($(IMPLEMENTATION),CUDA)
+    $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
+endif
+ifeq ($(IMPLEMENTATION),CUDAKernels)
+    $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
+endif
diff --git a/src/fortran/make.inc.fj b/src/fortran/make.inc.fj
new file mode 100644
index 0000000..b4761e5
--- /dev/null
+++ b/src/fortran/make.inc.fj
@@ -0,0 +1,21 @@
+FC	:= frt
+FCFLAGS	 = -X08 -Kfast -KA64FX -KSVE -KARMV8_3_A -Kzfill=100 -Kprefetch_sequential=soft -Kprefetch_line=8 -Kprefetch_line_L2=16 -Koptmsg=2 -Keval -DUSE_OMP_GET_WTIME=1 # FJ Fortran system_clock is low resolution
+
+DOCONCURRENT_FLAG   = -Kparallel,reduction -DNOTSHARED
+ARRAY_FLAG          = -Kparallel,reduction
+OPENMP_FLAG         = -fopenmp
+OPENACC_FLAG        = 
+# CPU only
+OPENACC_FLAG       += 
+CUDA_FLAG           =
+SEQUENTIAL_FLAG     =
+
+ifeq ($(IMPLEMENTATION),OPENACC)
+    $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
+endif
+ifeq ($(IMPLEMENTATION),CUDA)
+    $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
+endif
+ifeq ($(IMPLEMENTATION),CUDAKernels)
+    $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
+endif
diff --git a/src/fortran/make.inc.gcc b/src/fortran/make.inc.gcc
new file mode 100644
index 0000000..f59c8bb
--- /dev/null
+++ b/src/fortran/make.inc.gcc
@@ -0,0 +1,33 @@
+FC       = gfortran
+FCFLAGS  = -std=f2018 -O3
+FCFLAGS += -Wall -Wno-unused-dummy-argument -Wno-unused-variable
+
+# MARCH=neoverse-v1,neoverse-n1,icelake-server,znver3,cortex-a78ae
+ARCH=$(shell uname -m)
+ifeq ($(ARCH),aarch64)
+     ifdef MCPU
+         FCFLAGS += -mcpu=$(MCPU)
+     else
+         FCFLAGS += -mcpu=native
+     endif
+else
+     ifdef MARCH
+         FCFLAGS += -march=$(MARCH)
+     else
+         FCFLAGS += -march=native
+     endif
+endif
+
+DOCONCURRENT_FLAG   = -ftree-parallelize-loops=4
+ARRAY_FLAG          =
+OPENMP_FLAG         = -fopenmp
+OPENACC_FLAG        = -fopenacc
+CUDA_FLAG           =
+SEQUENTIAL_FLAG     =
+
+ifeq ($(IMPLEMENTATION),CUDA)
+    $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
+endif
+ifeq ($(IMPLEMENTATION),CUDAKernels)
+    $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
+endif
diff --git a/src/fortran/make.inc.nvhpc b/src/fortran/make.inc.nvhpc
new file mode 100644
index 0000000..dd4c442
--- /dev/null
+++ b/src/fortran/make.inc.nvhpc
@@ -0,0 +1,70 @@
+FC	:= nvfortran
+#FCFLAGS	:= -O3 -Minform=inform -Minfo=all
+FCFLAGS	:= -O3 -Minform=warn
+
+#TARGET=gpu
+TARGET=multicore
+
+NVARCH=$(shell which nvidia-smi > /dev/null && nvidia-smi -q | grep "Product Architecture")
+ifeq ($(findstring Ampere,$(NVARCH)),Ampere)
+    $(info Ampere detected)
+    GPU = cc80
+endif
+ifeq ($(findstring Turing,$(NVARCH)),Turing)
+    $(info Turing detected)
+    GPU = cc75
+endif
+ifeq ($(findstring Volta,$(NVARCH)),Volta)
+    $(info Volta detected)
+    GPU = cc70
+endif
+ifeq ($(findstring Pascal,$(NVARCH)),Pascal)
+    $(info Pascal detected)
+    GPU = cc60,cc61
+endif
+ifeq ($(shell which jetson_clocks > /dev/null  && echo 1),1)
+    $(info Jetson AGX Orin detected)
+    GPU = ccn87,cc86
+    # figure out Xavier later
+    #GPU = cc72
+endif
+ifeq ($(GPU),)
+    $(error Your GPU architecture could not be detected. Set it manually.)
+endif
+GPUFLAG = -gpu=$(GPU)
+
+# MARCH=neoverse-v1,neoverse-n1,zen3
+ARCH=$(shell uname -m)
+ifdef MARCH
+  ifeq ($(ARCH),aarch64)
+    ifeq ($(MARCH),neoverse-n1)
+      FCFLAGS += -tp=$(MARCH)
+    else
+      ifeq ($(MARCH),neoverse-v1)
+        FCFLAGS += -tp=$(MARCH)
+      else
+        FCFLAGS += -tp=native
+      endif
+    endif
+  else
+    FCFLAGS += -tp=$(MARCH)
+  endif
+else
+  FCFLAGS += -tp=native
+endif
+
+# this is to allow apples-to-apples comparison with DC in non-DC GPU impls
+# set exactly one of these!
+#MANAGED = -DUSE_MANAGED -gpu=managed
+#DEVICE  = -DUSE_DEVICE -cuda -gpu=nomanaged
+
+DOCONCURRENT_FLAG   = $(GPUFLAG) -stdpar=$(TARGET) $(DEVICE)
+ARRAY_FLAG          = $(GPUFLAG) -stdpar=$(TARGET) $(MANAGED)
+OPENMP_FLAG         = $(GPUFLAG) -mp=$(TARGET)     $(MANAGED)
+OPENACC_FLAG        = $(GPUFLAG) -acc=$(TARGET)    $(MANAGED)
+CUDA_FLAG           = $(GPUFLAG) -cuda -acc=gpu    $(MANAGED)
+SEQUENTIAL_FLAG     =
+
+ifeq ($(IMPLEMENTATION),OpenMPTaskloop)
+    $(error IMPLEMENTATION=OpenMPTaskloop is not supported by this compiler.)
+endif
diff --git a/src/fortran/make.inc.oneapi b/src/fortran/make.inc.oneapi
new file mode 100644
index 0000000..b7e003c
--- /dev/null
+++ b/src/fortran/make.inc.oneapi
@@ -0,0 +1,32 @@
+FC	:= ifx
+FCFLAGS	 = -std18
+FCFLAGS += -Ofast -xHOST
+FCFLAGS += -qopt-zmm-usage=low
+
+ifeq ($(FC),ifort)
+    FCFLAGS += -qopt-streaming-stores=always
+    PARALLEL = -parallel
+endif
+
+DOCONCURRENT_FLAG   = -qopenmp $(PARALLEL)
+ARRAY_FLAG          = -qopenmp $(PARALLEL)
+OPENMP_FLAG         = -qopenmp
+ifeq ($(FC),ifx)
+    OPENMP_FLAG += -fopenmp-targets=spir64 -DUSE_FLOAT=1
+endif
+OPENACC_FLAG        =
+CUDA_FLAG           =
+SEQUENTIAL_FLAG     =
+
+ifeq ($(IMPLEMENTATION),OpenACC)
+    $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
+endif
+ifeq ($(IMPLEMENTATION),OpenACCArray)
+    $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
+endif
+ifeq ($(IMPLEMENTATION),CUDA)
+    $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
+endif
+ifeq ($(IMPLEMENTATION),CUDAKernels)
+    $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
+endif
diff --git a/src/fortran/run.sh b/src/fortran/run.sh
new file mode 100755
index 0000000..2b41bab
--- /dev/null
+++ b/src/fortran/run.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+
+cat ./run.sh
+
+if [ `uname -s` == Darwin ] ; then
+    NUM_HWTHREADS=`sysctl -n hw.ncpu`
+    MEMORY_BYTES=`sysctl -n hw.memsize`
+else
+    NUM_HWTHREADS=`nproc`
+    MEMORY_KILOS=`grep MemTotal /proc/meminfo | awk '{print $2}'`
+fi
+
+M=128
+
+export OMP_NUM_THREADS=8
+export OMP_PROC_BIND=close
+export OMP_PLACES=threads
+
+export ACC_NUM_CORES=${OMP_NUM_THREADS}
+
+AFFCONTROL="numactl -N 0 -m 0 -C `seq -s "," 0 $((${OMP_NUM_THREADS}-1))`"
+
+for compiler in gcc nvhpc cray oneapi arm amd fj ; do
+    #if [ "x$compiler" == "xgcc" ] ; then
+    #   export LD_PRELOAD=/usr/lib/gcc/aarch64-linux-gnu/11/libgomp.so
+    #fi
+    for implementation in OpenMP OpenMPTaskloop OpenMPWorkshare DoConcurrent Array OpenACC OpenACCArray CUDA CUDAKernel ; do
+        if [ -f BabelStream.${compiler}.${implementation} ] ; then
+            echo "BabelStream.${compiler}.${implementation}"
+            ldd BabelStream.${compiler}.${implementation}
+            time $AFFCONTROL \
+            ./BabelStream.${compiler}.${implementation} -s $((1024*1024*${M}))
+        fi
+    done
+done
diff --git a/src/thrust/ThrustStream.cu b/src/thrust/ThrustStream.cu
index 3a57ab0..f15a392 100644
--- a/src/thrust/ThrustStream.cu
+++ b/src/thrust/ThrustStream.cu
@@ -145,7 +145,7 @@ T ThrustStream<T>::dot()
 #if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA || \
     (defined(THRUST_DEVICE_SYSTEM_HIP) && THRUST_DEVICE_SYSTEM_HIP == THRUST_DEVICE_SYSTEM)
 
-#ifdef __NVCC__
+#if defined(__NVCC__) || defined(__NVCOMPILER_CUDA__)
 #define IMPL_FN__(fn) cuda ## fn
 #define IMPL_TYPE__(tpe) cuda ## tpe
 #elif defined(__HIP_PLATFORM_HCC__)