diff --git a/.gitignore b/.gitignore index 012d0e8..59ea5db 100644 --- a/.gitignore +++ b/.gitignore @@ -10,12 +10,18 @@ sycl-stream hip-stream tbb-stream +src/fortran/BabelStream +src/fortran/BabelStream.* + *.o *.bc *.sycl *.tar *.gz *.a +*.mod +*.cub +*.ptx KokkosCore_config.* diff --git a/CHANGELOG.md b/CHANGELOG.md index cc135f9..3b8aa1b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,8 +2,16 @@ All notable changes to this project will be documented in this file. ## Unreleased +### Added +- Ability to build Kokkos and RAJA versions against existing packages. + +### Changed +- RAJA CUDA CMake build issues resolved. - Fix CUDA memory limit check. - Use long double for `check_solution` in case of large problem size. +- OneAPI DPCPP compiler is deprecated in favour of ICPX, so added new build option to SYCL 2020 version. +- Updates to the HIP kernels and API usage. +- Number of thread-blocks in CUDA dot kernel implementation changed to 1024. ## [v4.0] - 2021-12-22 diff --git a/CMakeLists.txt b/CMakeLists.txt index 7c13746..4321e86 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -186,7 +186,7 @@ message(STATUS "Default ${CMAKE_BUILD_TYPE} flags are `${DEFAULT_${BUILD_TYPE}_F # setup common build flag defaults if there are no overrides if (NOT DEFINED ${BUILD_TYPE}_FLAGS) set(ACTUAL_${BUILD_TYPE}_FLAGS ${DEFAULT_${BUILD_TYPE}_FLAGS}) -elseif () +else () set(ACTUAL_${BUILD_TYPE}_FLAGS ${${BUILD_TYPE}_FLAGS}) endif () diff --git a/README.md b/README.md index df95582..e027d69 100644 --- a/README.md +++ b/README.md @@ -100,7 +100,7 @@ The source for each model's implementations are located in `./src/`. Currently available models are: ``` -omp;ocl;std;std20;hip;cuda;kokkos;sycl;sycl2020;acc;raja;tbb;thrust +omp;ocl;std-data;std-indices;std-ranges;hip;cuda;kokkos;sycl;sycl2020;acc;raja;tbb;thrust ``` #### Overriding default flags @@ -163,11 +163,11 @@ Pull Requests should be made against the `develop` branch. Please cite BabelStream via this reference: -Deakin T, Price J, Martineau M, McIntosh-Smith S. GPU-STREAM v2.0: Benchmarking the achievable memory bandwidth of many-core processors across diverse parallel programming models. 2016. Paper presented at P^3MA Workshop at ISC High Performance, Frankfurt, Germany. DOI: 10.1007/978- 3-319-46079-6_34 +Deakin T, Price J, Martineau M, McIntosh-Smith S. Evaluating attainable memory bandwidth of parallel programming models via BabelStream. International Journal of Computational Science and Engineering. Special issue. Vol. 17, No. 3, pp. 247–262. 2018. DOI: 10.1504/IJCSE.2018.095847 ### Other BabelStream publications -* Deakin T, Price J, Martineau M, McIntosh-Smith S. Evaluating attainable memory bandwidth of parallel programming models via BabelStream. International Journal of Computational Science and Engineering. Special issue. Vol. 17, No. 3, pp. 247–262. 2018.DOI: 10.1504/IJCSE.2018.095847 +* Deakin T, Price J, Martineau M, McIntosh-Smith S. GPU-STREAM v2.0: Benchmarking the achievable memory bandwidth of many-core processors across diverse parallel programming models. 2016. Paper presented at P^3MA Workshop at ISC High Performance, Frankfurt, Germany. DOI: 10.1007/978- 3-319-46079-6_34 * Deakin T, McIntosh-Smith S. GPU-STREAM: Benchmarking the achievable memory bandwidth of Graphics Processing Units. 2015. Poster session presented at IEEE/ACM SuperComputing, Austin, United States. You can view the [Poster and Extended Abstract](http://sc15.supercomputing.org/sites/all/themes/SC15images/tech_poster/tech_poster_pages/post150.html). diff --git a/src/.gitignore b/src/.gitignore index 568a953..9d8b17b 100644 --- a/src/.gitignore +++ b/src/.gitignore @@ -16,6 +16,8 @@ **/*.gz **/*.a +**/*.swp + **/KokkosCore_Config_* **/.DS_Store @@ -26,4 +28,4 @@ cmake-build-*/ CMakeFiles/ .idea/ .vscode/ -.directory \ No newline at end of file +.directory diff --git a/src/cuda/CUDAStream.h b/src/cuda/CUDAStream.h index 83b8c66..bb3f866 100644 --- a/src/cuda/CUDAStream.h +++ b/src/cuda/CUDAStream.h @@ -22,7 +22,7 @@ #endif #define TBSIZE 1024 -#define DOT_NUM_BLOCKS 256 +#define DOT_NUM_BLOCKS 1024 template class CUDAStream : public Stream diff --git a/src/fortran/ArrayStream.F90 b/src/fortran/ArrayStream.F90 new file mode 100644 index 0000000..5a8d5bc --- /dev/null +++ b/src/fortran/ArrayStream.F90 @@ -0,0 +1,105 @@ +module ArrayStream + use, intrinsic :: ISO_Fortran_env + use BabelStreamTypes + + implicit none + + character(len=5), parameter :: implementation_name = "Array" + + integer(kind=StreamIntKind) :: N + + real(kind=REAL64), allocatable :: A(:), B(:), C(:) + + contains + + subroutine list_devices() + implicit none + integer :: num + write(*,'(a36,a5)') "Listing devices is not supported by ", implementation_name + end subroutine list_devices + + subroutine set_device(dev) + implicit none + integer, intent(in) :: dev + write(*,'(a32,a5)') "Device != 0 is not supported by ", implementation_name + end subroutine set_device + + subroutine alloc(array_size) + implicit none + integer(kind=StreamIntKind) :: array_size + integer :: err + N = array_size + allocate( A(1:N), B(1:N), C(1:N), stat=err) + if (err .ne. 0) then + write(*,'(a20,i3)') 'allocate returned ',err + stop 1 + endif + end subroutine alloc + + subroutine dealloc() + implicit none + integer :: err + deallocate( A, B, C, stat=err) + if (err .ne. 0) then + write(*,'(a20,i3)') 'deallocate returned ',err + stop 1 + endif + end subroutine dealloc + + subroutine init_arrays(initA, initB, initC) + implicit none + real(kind=REAL64), intent(in) :: initA, initB, initC + A = initA + B = initB + C = initC + end subroutine init_arrays + + subroutine read_arrays(h_A, h_B, h_C) + implicit none + real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:) + h_A = A + h_B = B + h_C = C + end subroutine read_arrays + + subroutine copy() + implicit none + C = A + end subroutine copy + + subroutine add() + implicit none + C = A + B + end subroutine add + + subroutine mul(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + scalar = startScalar + B = scalar * C + end subroutine mul + + subroutine triad(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + scalar = startScalar + A = B + scalar * C + end subroutine triad + + subroutine nstream(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + scalar = startScalar + A = A + B + scalar * C + end subroutine nstream + + function dot() result(s) + implicit none + real(kind=REAL64) :: s + s = dot_product(A,B) + end function dot + +end module ArrayStream diff --git a/src/fortran/BabelStreamTypes.F90 b/src/fortran/BabelStreamTypes.F90 new file mode 100644 index 0000000..dd01d35 --- /dev/null +++ b/src/fortran/BabelStreamTypes.F90 @@ -0,0 +1,21 @@ +module BabelStreamTypes + use, intrinsic :: ISO_Fortran_env, only: REAL64,REAL32,INT64,INT32 + + implicit none + +#ifdef USE_FLOAT + integer, parameter :: StreamRealKind = REAL32 + character(len=6) :: StreamRealName = "REAL32" +#else + integer, parameter :: StreamRealKind = REAL64 + character(len=6) :: StreamRealName = "REAL64" +#endif + +#ifdef USE_INT32 +#warning There is no checking for overflowing INT32, so be careful. + integer, parameter :: StreamIntKind = INT32 +#else + integer, parameter :: StreamIntKind = INT64 +#endif + +end module BabelStreamTypes diff --git a/src/fortran/CUDAKernelStream.F90 b/src/fortran/CUDAKernelStream.F90 new file mode 100644 index 0000000..01668ea --- /dev/null +++ b/src/fortran/CUDAKernelStream.F90 @@ -0,0 +1,230 @@ +module CUDAKernelStream + use, intrinsic :: ISO_Fortran_env + use BabelStreamTypes + + implicit none + + character(len=10), parameter :: implementation_name = "CUDAKernel" + + integer(kind=StreamIntKind) :: N + +#ifdef USE_MANAGED + real(kind=REAL64), allocatable, managed :: A(:), B(:), C(:) +#else + real(kind=REAL64), allocatable, device :: A(:), B(:), C(:) +#endif + + contains + + subroutine list_devices() + use cudafor + implicit none + integer :: num, err + err = cudaGetDeviceCount(num) + if (err.ne.0) then + write(*,'(a)') "cudaGetDeviceCount failed" + write(*,'(a)') cudaGetErrorString(err) + stop + else if (num.eq.0) then + write(*,'(a17)') "No devices found." + else + write(*,'(a10,i1,a8)') "There are ",num," devices." + end if + end subroutine list_devices + + subroutine set_device(dev) + use cudafor + implicit none + integer, intent(in) :: dev + integer :: num, err + err = cudaGetDeviceCount(num) + if (err.ne.0) then + write(*,'(a)') "cudaGetDeviceCount failed" + write(*,'(a)') cudaGetErrorString(err) + stop + else if (num.eq.0) then + write(*,'(a17)') "No devices found." + stop + else if (dev.ge.num) then + write(*,'(a21)') "Invalid device index." + stop + else + err = cudaSetDevice(dev) + if (err.ne.0) then + write(*,'(a)') "cudaSetDevice failed" + write(*,'(a)') cudaGetErrorString(err) + stop + end if + end if + end subroutine set_device + + subroutine alloc(array_size) + implicit none + integer(kind=StreamIntKind) :: array_size + integer :: err + N = array_size + allocate( A(1:N), B(1:N), C(1:N), stat=err) + if (err .ne. 0) then + write(*,'(a20,i3)') 'allocate returned ',err + stop 1 + endif + end subroutine alloc + + subroutine dealloc() + implicit none + integer :: err + deallocate( A, B, C, stat=err) + if (err .ne. 0) then + write(*,'(a20,i3)') 'deallocate returned ',err + stop 1 + endif + end subroutine dealloc + + subroutine init_arrays(initA, initB, initC) + use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString + implicit none + real(kind=REAL64), intent(in) :: initA, initB, initC + integer(kind=StreamIntKind) :: i + integer :: err + A = initA + B = initB + C = initC + err = cudaDeviceSynchronize() + if (err.ne.0) then + write(*,'(a)') "cudaDeviceSynchronize failed" + write(*,'(a)') cudaGetErrorString(err) + stop + endif + end subroutine init_arrays + + subroutine read_arrays(h_A, h_B, h_C) + use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString + implicit none + real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:) + integer(kind=StreamIntKind) :: i + integer :: err + h_A = A + h_B = B + h_C = C + err = cudaDeviceSynchronize() + if (err.ne.0) then + write(*,'(a)') "cudaDeviceSynchronize failed" + write(*,'(a)') cudaGetErrorString(err) + stop + endif + end subroutine read_arrays + + subroutine copy() + use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString + implicit none + integer(kind=StreamIntKind) :: i + integer :: err + !$cuf kernel do <<< *, * >>> + do i=1,N + C(i) = A(i) + end do + err = cudaDeviceSynchronize() + if (err.ne.0) then + write(*,'(a)') "cudaDeviceSynchronize failed" + write(*,'(a)') cudaGetErrorString(err) + stop + endif + end subroutine copy + + subroutine add() + use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString + implicit none + integer(kind=StreamIntKind) :: i + integer :: err + !$cuf kernel do <<< *, * >>> + do i=1,N + C(i) = A(i) + B(i) + end do + err = cudaDeviceSynchronize() + if (err.ne.0) then + write(*,'(a)') "cudaDeviceSynchronize failed" + write(*,'(a)') cudaGetErrorString(err) + stop + endif + end subroutine add + + subroutine mul(startScalar) + use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + integer :: err + scalar = startScalar + !$cuf kernel do <<< *, * >>> + do i=1,N + B(i) = scalar * C(i) + end do + err = cudaDeviceSynchronize() + if (err.ne.0) then + write(*,'(a)') "cudaDeviceSynchronize failed" + write(*,'(a)') cudaGetErrorString(err) + stop + endif + end subroutine mul + + subroutine triad(startScalar) + use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + integer :: err + scalar = startScalar + !$cuf kernel do <<< *, * >>> + do i=1,N + A(i) = B(i) + scalar * C(i) + end do + err = cudaDeviceSynchronize() + if (err.ne.0) then + write(*,'(a)') "cudaDeviceSynchronize failed" + write(*,'(a)') cudaGetErrorString(err) + stop + endif + end subroutine triad + + subroutine nstream(startScalar) + use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + integer :: err + scalar = startScalar + !$cuf kernel do <<< *, * >>> + do i=1,N + A(i) = A(i) + B(i) + scalar * C(i) + end do + err = cudaDeviceSynchronize() + if (err.ne.0) then + write(*,'(a)') "cudaDeviceSynchronize failed" + write(*,'(a)') cudaGetErrorString(err) + stop + endif + end subroutine nstream + + function dot() result(r) + use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString + implicit none + real(kind=REAL64) :: r + integer(kind=StreamIntKind) :: i + integer :: err + r = real(0,kind=REAL64) + !$cuf kernel do <<< *, * >>> + do i=1,N + r = r + A(i) * B(i) + end do + err = cudaDeviceSynchronize() + if (err.ne.0) then + write(*,'(a)') "cudaDeviceSynchronize failed" + write(*,'(a)') cudaGetErrorString(err) + stop + endif + end function dot + +end module CUDAKernelStream diff --git a/src/fortran/CUDAStream.F90 b/src/fortran/CUDAStream.F90 new file mode 100644 index 0000000..208f1aa --- /dev/null +++ b/src/fortran/CUDAStream.F90 @@ -0,0 +1,309 @@ +module CUDAFortranKernels + use, intrinsic :: ISO_Fortran_env + use BabelStreamTypes + + contains + + attributes(global) subroutine do_copy(n,A,C) + implicit none + integer(kind=StreamIntKind), intent(in), value :: n + real(kind=REAL64), intent(in) :: A(n) + real(kind=REAL64), intent(out) :: C(n) + integer(kind=StreamIntKind) :: i + i = blockDim%x * (blockIdx%x - 1) + threadIdx%x + if (i <= N) then + C(i) = A(i) + endif + end subroutine do_copy + + attributes(global) subroutine do_add(n,A,B,C) + implicit none + integer(kind=StreamIntKind), intent(in), value :: n + real(kind=REAL64), intent(in) :: A(n), B(n) + real(kind=REAL64), intent(out) :: C(n) + integer(kind=StreamIntKind) :: i + i = blockDim%x * (blockIdx%x - 1) + threadIdx%x + if (i <= N) then + C(i) = A(i) + B(i) + endif + end subroutine do_add + + attributes(global) subroutine do_mul(n,scalar,B,C) + implicit none + integer(kind=StreamIntKind), intent(in), value :: n + real(kind=REAL64), intent(in), value :: scalar + real(kind=REAL64), intent(out) :: B(n) + real(kind=REAL64), intent(in) :: C(n) + integer(kind=StreamIntKind) :: i + i = blockDim%x * (blockIdx%x - 1) + threadIdx%x + if (i <= N) then + B(i) = scalar * C(i) + endif + end subroutine do_mul + + attributes(global) subroutine do_triad(n,scalar,A,B,C) + implicit none + integer(kind=StreamIntKind), intent(in), value :: n + real(kind=REAL64), intent(in), value :: scalar + real(kind=REAL64), intent(out) :: A(n) + real(kind=REAL64), intent(in) :: B(n), C(n) + integer(kind=StreamIntKind) :: i + i = blockDim%x * (blockIdx%x - 1) + threadIdx%x + if (i <= N) then + A(i) = B(i) + scalar * C(i) + endif + end subroutine do_triad + + attributes(global) subroutine do_nstream(n,scalar,A,B,C) + implicit none + integer(kind=StreamIntKind), intent(in), value :: n + real(kind=REAL64), intent(in), value :: scalar + real(kind=REAL64), intent(inout) :: A(n) + real(kind=REAL64), intent(in) :: B(n), C(n) + integer(kind=StreamIntKind) :: i + i = blockDim%x * (blockIdx%x - 1) + threadIdx%x + if (i <= N) then + A(i) = A(i) + B(i) + scalar * C(i) + endif + end subroutine do_nstream + +#if 0 + attributes(global) subroutine do_dot(n,A,B,r) + implicit none + integer(kind=StreamIntKind), intent(in), value :: n + real(kind=REAL64), intent(in) :: A(n), B(n) + real(kind=REAL64), intent(out) :: r + integer(kind=StreamIntKind) :: i + r = real(0,kind=REAL64) + !$cuf kernel do <<< *, * >>> + do i=1,N + r = r + A(i) * B(i) + end do + end subroutine do_dot +#endif + +end module CUDAFortranKernels + +module CUDAStream + use, intrinsic :: ISO_Fortran_env + use BabelStreamTypes + use cudafor, only: dim3 + + implicit none + + character(len=4), parameter :: implementation_name = "CUDA" + + integer(kind=StreamIntKind) :: N + +#ifdef USE_MANAGED + real(kind=REAL64), allocatable, managed :: A(:), B(:), C(:) +#else + real(kind=REAL64), allocatable, device :: A(:), B(:), C(:) +#endif + + type(dim3) :: grid, tblock + + contains + + subroutine list_devices() + use cudafor + implicit none + integer :: num, err + err = cudaGetDeviceCount(num) + if (err.ne.0) then + write(*,'(a)') "cudaGetDeviceCount failed" + write(*,'(a)') cudaGetErrorString(err) + stop + else if (num.eq.0) then + write(*,'(a17)') "No devices found." + else + write(*,'(a10,i1,a8)') "There are ",num," devices." + end if + end subroutine list_devices + + subroutine set_device(dev) + use cudafor + implicit none + integer, intent(in) :: dev + integer :: num, err + err = cudaGetDeviceCount(num) + if (err.ne.0) then + write(*,'(a)') "cudaGetDeviceCount failed" + write(*,'(a)') cudaGetErrorString(err) + stop + else if (num.eq.0) then + write(*,'(a17)') "No devices found." + stop + else if (dev.ge.num) then + write(*,'(a21)') "Invalid device index." + stop + else + err = cudaSetDevice(dev) + if (err.ne.0) then + write(*,'(a)') "cudaSetDevice failed" + write(*,'(a)') cudaGetErrorString(err) + stop + end if + end if + end subroutine set_device + + subroutine alloc(array_size) + implicit none + integer(kind=StreamIntKind) :: array_size + integer :: err + N = array_size + allocate( A(1:N), B(1:N), C(1:N), stat=err) + if (err .ne. 0) then + write(*,'(a20,i3)') 'allocate returned ',err + stop 1 + endif + ! move to separate subroutine later + tblock = dim3(128,1,1) + grid = dim3(ceiling(real(N)/tblock%x),1,1) + end subroutine alloc + + subroutine dealloc() + implicit none + integer :: err + deallocate( A, B, C, stat=err) + if (err .ne. 0) then + write(*,'(a20,i3)') 'deallocate returned ',err + stop 1 + endif + end subroutine dealloc + + subroutine init_arrays(initA, initB, initC) + use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString + implicit none + real(kind=REAL64), intent(in) :: initA, initB, initC + integer(kind=StreamIntKind) :: i + integer :: err + A = initA + B = initB + C = initC + err = cudaDeviceSynchronize() + if (err.ne.0) then + write(*,'(a)') "cudaDeviceSynchronize failed" + write(*,'(a)') cudaGetErrorString(err) + stop + endif + end subroutine init_arrays + + subroutine read_arrays(h_A, h_B, h_C) + use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString + implicit none + real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:) + integer(kind=StreamIntKind) :: i + integer :: err + h_A = A + h_B = B + h_C = C + err = cudaDeviceSynchronize() + if (err.ne.0) then + write(*,'(a)') "cudaDeviceSynchronize failed" + write(*,'(a)') cudaGetErrorString(err) + stop + endif + end subroutine read_arrays + + subroutine copy() + use CUDAFortranKernels, only: do_copy + use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString + implicit none + integer :: err + call do_copy<<>>(N, A, C) + err = cudaDeviceSynchronize() + if (err.ne.0) then + write(*,'(a)') "cudaDeviceSynchronize failed" + write(*,'(a)') cudaGetErrorString(err) + stop + endif + end subroutine copy + + subroutine add() + use CUDAFortranKernels, only: do_add + use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString + implicit none + integer :: err + call do_add<<>>(N, A, B, C) + err = cudaDeviceSynchronize() + if (err.ne.0) then + write(*,'(a)') "cudaDeviceSynchronize failed" + write(*,'(a)') cudaGetErrorString(err) + stop + endif + end subroutine add + + subroutine mul(startScalar) + use CUDAFortranKernels, only: do_mul + use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer :: err + scalar = startScalar + call do_mul<<>>(N, scalar, B, C) + err = cudaDeviceSynchronize() + if (err.ne.0) then + write(*,'(a)') "cudaDeviceSynchronize failed" + write(*,'(a)') cudaGetErrorString(err) + stop + endif + end subroutine mul + + subroutine triad(startScalar) + use CUDAFortranKernels, only: do_triad + use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer :: err + scalar = startScalar + call do_triad<<>>(N, scalar, A, B, C) + err = cudaDeviceSynchronize() + if (err.ne.0) then + write(*,'(a)') "cudaDeviceSynchronize failed" + write(*,'(a)') cudaGetErrorString(err) + stop + endif + end subroutine triad + + subroutine nstream(startScalar) + use CUDAFortranKernels, only: do_nstream + use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer :: err + scalar = startScalar + call do_nstream<<>>(N, scalar, A, B, C) + err = cudaDeviceSynchronize() + if (err.ne.0) then + write(*,'(a)') "cudaDeviceSynchronize failed" + write(*,'(a)') cudaGetErrorString(err) + stop + endif + end subroutine nstream + + function dot() result(r) + !use CUDAFortranKernels, only: do_dot + use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString + implicit none + real(kind=REAL64) :: r + integer :: err + integer(kind=StreamIntKind) :: i + !call do_dot<<>>(N, B, C, r) + r = real(0,kind=REAL64) + !$cuf kernel do <<< *, * >>> + do i=1,N + r = r + A(i) * B(i) + end do + err = cudaDeviceSynchronize() + if (err.ne.0) then + write(*,'(a)') "cudaDeviceSynchronize failed" + write(*,'(a)') cudaGetErrorString(err) + stop + endif + end function dot + +end module CUDAStream diff --git a/src/fortran/DoConcurrentStream.F90 b/src/fortran/DoConcurrentStream.F90 new file mode 100644 index 0000000..781210d --- /dev/null +++ b/src/fortran/DoConcurrentStream.F90 @@ -0,0 +1,139 @@ +module DoConcurrentStream + use, intrinsic :: ISO_Fortran_env + use BabelStreamTypes + + implicit none + + character(len=12), parameter :: implementation_name = "DoConcurrent" + + integer(kind=StreamIntKind) :: N + +#ifdef USE_DEVICE + real(kind=REAL64), allocatable, device :: A(:), B(:), C(:) +#else + real(kind=REAL64), allocatable :: A(:), B(:), C(:) +#endif + + contains + + subroutine list_devices() + implicit none + integer :: num + write(*,'(a36,a12)') "Listing devices is not supported by ", implementation_name + end subroutine list_devices + + subroutine set_device(dev) + implicit none + integer, intent(in) :: dev + write(*,'(a32,a12)') "Device != 0 is not supported by ", implementation_name + end subroutine set_device + + subroutine alloc(array_size) + implicit none + integer(kind=StreamIntKind) :: array_size + integer :: err + N = array_size + allocate( A(1:N), B(1:N), C(1:N), stat=err) + if (err .ne. 0) then + write(*,'(a20,i3)') 'allocate returned ',err + stop 1 + endif + end subroutine alloc + + subroutine dealloc() + implicit none + integer :: err + deallocate( A, B, C, stat=err) + if (err .ne. 0) then + write(*,'(a20,i3)') 'deallocate returned ',err + stop 1 + endif + end subroutine dealloc + + subroutine init_arrays(initA, initB, initC) + implicit none + real(kind=REAL64), intent(in) :: initA, initB, initC + integer(kind=StreamIntKind) :: i + do concurrent (i=1:N) + A(i) = initA + B(i) = initB + C(i) = initC + end do + end subroutine init_arrays + + subroutine read_arrays(h_A, h_B, h_C) + implicit none + real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:) + integer(kind=StreamIntKind) :: i + do concurrent (i=1:N) !shared(A,B,C) + h_A(i) = A(i) + h_B(i) = B(i) + h_C(i) = C(i) + end do + end subroutine read_arrays + + subroutine copy() + implicit none + integer(kind=StreamIntKind) :: i + do concurrent (i=1:N) !shared(A,C) + C(i) = A(i) + end do + end subroutine copy + + subroutine add() + implicit none + integer(kind=StreamIntKind) :: i + do concurrent (i=1:N) !shared(A,B,C) + C(i) = A(i) + B(i) + end do + end subroutine add + + subroutine mul(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + scalar = startScalar + do concurrent (i=1:N) !shared(B,C) + B(i) = scalar * C(i) + end do + end subroutine mul + + subroutine triad(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + scalar = startScalar + do concurrent (i=1:N) !shared(A,B,C) + A(i) = B(i) + scalar * C(i) + end do + end subroutine triad + + subroutine nstream(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + scalar = startScalar + do concurrent (i=1:N) !shared(A,B,C) + A(i) = A(i) + B(i) + scalar * C(i) + end do + end subroutine nstream + + function dot() result(s) + implicit none + real(kind=REAL64) :: s + integer(kind=StreamIntKind) :: i + ! reduction omitted because NVF infers it and other compilers do not support + s = real(0,kind=REAL64) +#ifdef CRAY_THREAD_DOCONCURRENT + do i=1,N +#else + do concurrent (i=1:N) !shared(A,B) +#endif + s = s + A(i) * B(i) + end do + end function dot + +end module DoConcurrentStream diff --git a/src/fortran/Makefile b/src/fortran/Makefile new file mode 100644 index 0000000..18685d4 --- /dev/null +++ b/src/fortran/Makefile @@ -0,0 +1,109 @@ +ifeq ($(COMPILER),nvhpc) + include make.inc.nvhpc +else ifeq ($(COMPILER),oneapi) + include make.inc.oneapi +else ifeq ($(COMPILER),gcc) + include make.inc.gcc +else ifeq ($(COMPILER),amd) + include make.inc.amd +else ifeq ($(COMPILER),arm) + include make.inc.arm +else ifeq ($(COMPILER),cray) + include make.inc.cray +else ifeq ($(COMPILER),fj) + include make.inc.fj +else + $(info Set COMPILER={nvhpc,oneapi,amd,arm,cray,fj,gcc}. Default is gcc.) + include make.inc.gcc + COMPILER=gcc +endif + +FCFLAGS += -DVERSION_STRING="4.0" +#FCFLAGS += -DUSE_INT32 + +ifeq ($(IMPLEMENTATION),DoConcurrent) + FCFLAGS += -DUSE_DOCONCURRENT $(DOCONCURRENT_FLAG) + IMPLEMENTATION_OBJECT = DoConcurrentStream.o + +else ifeq ($(IMPLEMENTATION),Array) + FCFLAGS += -DUSE_ARRAY $(ARRAY_FLAG) + IMPLEMENTATION_OBJECT = ArrayStream.o + +else ifeq ($(IMPLEMENTATION),OpenMP) + FCFLAGS += -DUSE_OPENMP $(OPENMP_FLAG) + IMPLEMENTATION_OBJECT = OpenMPStream.o + +else ifeq ($(IMPLEMENTATION),OpenMPWorkshare) + FCFLAGS += -DUSE_OPENMPWORKSHARE $(OPENMP_FLAG) + IMPLEMENTATION_OBJECT = OpenMPWorkshareStream.o + +else ifeq ($(IMPLEMENTATION),OpenMPTarget) + FCFLAGS += -DUSE_OPENMPTARGET $(OPENMP_FLAG) + IMPLEMENTATION_OBJECT = OpenMPTargetStream.o + +else ifeq ($(IMPLEMENTATION),OpenMPTargetLoop) + FCFLAGS += -DUSE_OPENMPTARGETLOOP $(OPENMP_FLAG) + IMPLEMENTATION_OBJECT = OpenMPTargetLoopStream.o + +else ifeq ($(IMPLEMENTATION),OpenMPTaskloop) + FCFLAGS += -DUSE_OPENMPTASKLOOP $(OPENMP_FLAG) + IMPLEMENTATION_OBJECT = OpenMPTaskloopStream.o + +else ifeq ($(IMPLEMENTATION),OpenACC) + FCFLAGS += -DUSE_OPENACC $(OPENACC_FLAG) + IMPLEMENTATION_OBJECT = OpenACCStream.o + +else ifeq ($(IMPLEMENTATION),OpenACCArray) + FCFLAGS += -DUSE_OPENACCARRAY $(OPENACC_FLAG) + IMPLEMENTATION_OBJECT = OpenACCArrayStream.o + +else ifeq ($(IMPLEMENTATION),CUDA) + FCFLAGS += -DUSE_CUDA $(CUDA_FLAG) + IMPLEMENTATION_OBJECT = CUDAStream.o + +else ifeq ($(IMPLEMENTATION),CUDAKernel) + FCFLAGS += -DUSE_CUDAKERNEL $(CUDA_FLAG) + IMPLEMENTATION_OBJECT = CUDAKernelStream.o + +else ifeq ($(IMPLEMENTATION),Sequential) + FCFLAGS += -DUSE_SEQUENTIAL $(SEQUENTIAL_FLAG) + IMPLEMENTATION_OBJECT = SequentialStream.o + +else + $(info Set IMPLEMENTATION={DoConcurrent,Array,OpenMP,OpenMPWorkshare,OpenMPTarget,OpenMPTargetLoop,OpenMPTaskloop,OpenACC,OpenACCArray,CUDA,CUDAKernel}.) + FCFLAGS += -DUSE_SEQUENTIAL $(SEQUENTIAL_FLAG) + IMPLEMENTATION=Sequential + IMPLEMENTATION_OBJECT = SequentialStream.o + +endif + +all: BabelStream.$(COMPILER).$(IMPLEMENTATION) + +BabelStream.$(COMPILER).$(IMPLEMENTATION): main.F90 $(IMPLEMENTATION_OBJECT) + $(FC) $(FCFLAGS) $^ BabelStreamTypes.o -o $@ + +BabelStreamTypes.o BabelStreamTypes.mod: BabelStreamTypes.F90 + $(FC) $(FCFLAGS) -c $< + +%.o: %.F90 BabelStreamTypes.mod + $(FC) $(FCFLAGS) -c $< + +clean: + -rm -f main.o BabelStreamUtil.mod babelstreamutil.mod + -rm -f BabelStreamTypes.o BabelStreamTypes.mod babelstreamtypes.mod + -rm -f DoConcurrentStream.o DoConcurrentStream.mod doconcurrentstream.mod + -rm -f ArrayStream.o ArrayStream.mod arraystream.mod + -rm -f SequentialStream.o SequentialStream.mod sequentialstream.mod + -rm -f OpenMPStream.o OpenMPStream.mod openmpstream.mod + -rm -f OpenMPWorkshareStream.o OpenMPWorkshareStream.mod openmpworksharestream.mod + -rm -f OpenMPTaskloopStream.o OpenMPTaskloopStream.mod openmptaskloopstream.mod + -rm -f OpenMPTargetStream.o OpenMPTargetStream.mod openmptargetstream.mod + -rm -f OpenMPTargetLoopStream.o OpenMPTargetLoopStream.mod openmptargetloopstream.mod + -rm -f OpenACCStream.o OpenACCStream.mod openaccstream.mod + -rm -f OpenACCArrayStream.o OpenACCArrayStream.mod openaccarraystream.mod + -rm -f CUDAStream.o CUDAStream.mod cudastream.mod CUDAFortranKernels.mod cudafortrankernels.mod + -rm -f CUDAKernelStream.o CUDAKernelStream.mod cudakernelstream.mod + -rm -f *.modmic *.mod *.o *.cub *.ptx + +realclean: clean + -rm -f BabelStream.* diff --git a/src/fortran/OpenACCArrayStream.F90 b/src/fortran/OpenACCArrayStream.F90 new file mode 100644 index 0000000..9225fe7 --- /dev/null +++ b/src/fortran/OpenACCArrayStream.F90 @@ -0,0 +1,144 @@ +module OpenACCArrayStream + use, intrinsic :: ISO_Fortran_env + use BabelStreamTypes + + implicit none + + character(len=12), parameter :: implementation_name = "OpenACCArray" + + integer(kind=StreamIntKind) :: N + + real(kind=REAL64), allocatable :: A(:), B(:), C(:) + + contains + + subroutine list_devices() + use openacc + implicit none + integer :: num + num = acc_get_num_devices(acc_get_device_type()) + if (num.eq.0) then + write(*,'(a17)') "No devices found." + else + write(*,'(a10,i1,a8)') "There are ",num," devices." + end if + end subroutine list_devices + + subroutine set_device(dev) + use openacc + implicit none + integer, intent(in) :: dev + integer :: num + num = acc_get_num_devices(acc_get_device_type()) + if (num.eq.0) then + write(*,'(a17)') "No devices found." + stop + else if (dev.gt.num) then + write(*,'(a21)') "Invalid device index." + stop + else + call acc_set_device_num(dev, acc_get_device_type()) + end if + end subroutine set_device + + subroutine alloc(array_size) + implicit none + integer(kind=StreamIntKind) :: array_size + integer :: err + N = array_size + allocate( A(1:N), B(1:N), C(1:N), stat=err) +#ifndef USE_MANAGED + !$acc enter data create(A,B,C) +#endif + if (err .ne. 0) then + write(*,'(a20,i3)') 'allocate returned ',err + stop 1 + endif + end subroutine alloc + + subroutine dealloc() + implicit none + integer :: err +#ifndef USE_MANAGED + !$acc exit data delete(A,B,C) +#endif + deallocate( A, B, C, stat=err) + if (err .ne. 0) then + write(*,'(a20,i3)') 'deallocate returned ',err + stop 1 + endif + end subroutine dealloc + + subroutine init_arrays(initA, initB, initC) + implicit none + real(kind=REAL64), intent(in) :: initA, initB, initC + !$acc kernels + A = initA + B = initB + C = initC + !$acc end kernels + end subroutine init_arrays + + subroutine read_arrays(h_A, h_B, h_C) + implicit none + real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:) + !$acc kernels + h_A = A + h_B = B + h_C = C + !$acc end kernels + end subroutine read_arrays + + subroutine copy() + implicit none + !$acc kernels + C = A + !$acc end kernels + end subroutine copy + + subroutine add() + implicit none + !$acc kernels + C = A + B + !$acc end kernels + end subroutine add + + subroutine mul(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + scalar = startScalar + !$acc kernels + B = scalar * C + !$acc end kernels + end subroutine mul + + subroutine triad(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + scalar = startScalar + !$acc kernels + A = B + scalar * C + !$acc end kernels + end subroutine triad + + subroutine nstream(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + scalar = startScalar + !$acc kernels + A = A + B + scalar * C + !$acc end kernels + end subroutine nstream + + function dot() result(s) + implicit none + real(kind=REAL64) :: s + !$acc kernels + s = dot_product(A,B) + !$acc end kernels + end function dot + +end module OpenACCArrayStream diff --git a/src/fortran/OpenACCStream.F90 b/src/fortran/OpenACCStream.F90 new file mode 100644 index 0000000..7326f38 --- /dev/null +++ b/src/fortran/OpenACCStream.F90 @@ -0,0 +1,161 @@ +module OpenACCStream + use, intrinsic :: ISO_Fortran_env + use BabelStreamTypes + + implicit none + + character(len=7), parameter :: implementation_name = "OpenACC" + + integer(kind=StreamIntKind) :: N + + real(kind=REAL64), allocatable :: A(:), B(:), C(:) + + contains + + subroutine list_devices() + use openacc + implicit none + integer :: num + num = acc_get_num_devices(acc_get_device_type()) + if (num.eq.0) then + write(*,'(a17)') "No devices found." + else + write(*,'(a10,i1,a8)') "There are ",num," devices." + end if + end subroutine list_devices + + subroutine set_device(dev) + use openacc + implicit none + integer, intent(in) :: dev + integer :: num + num = acc_get_num_devices(acc_get_device_type()) + if (num.eq.0) then + write(*,'(a17)') "No devices found." + stop + else if (dev.gt.num) then + write(*,'(a21)') "Invalid device index." + stop + else + call acc_set_device_num(dev, acc_get_device_type()) + end if + end subroutine set_device + + subroutine alloc(array_size) + implicit none + integer(kind=StreamIntKind) :: array_size + integer :: err + N = array_size + allocate( A(1:N), B(1:N), C(1:N), stat=err) +#ifndef USE_MANAGED + !$acc enter data create(A,B,C) +#endif + if (err .ne. 0) then + write(*,'(a20,i3)') 'allocate returned ',err + stop 1 + endif + end subroutine alloc + + subroutine dealloc() + implicit none + integer :: err +#ifndef USE_MANAGED + !$acc exit data delete(A,B,C) +#endif + deallocate( A, B, C, stat=err) + if (err .ne. 0) then + write(*,'(a20,i3)') 'deallocate returned ',err + stop 1 + endif + end subroutine dealloc + + subroutine init_arrays(initA, initB, initC) + implicit none + real(kind=REAL64), intent(in) :: initA, initB, initC + integer(kind=StreamIntKind) :: i + !$acc parallel loop + do i=1,N + A(i) = initA + B(i) = initB + C(i) = initC + end do + end subroutine init_arrays + + subroutine read_arrays(h_A, h_B, h_C) + implicit none + real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:) + integer(kind=StreamIntKind) :: i + !$acc parallel loop + do i=1,N + h_A(i) = A(i) + h_B(i) = B(i) + h_C(i) = C(i) + end do + end subroutine read_arrays + + subroutine copy() + implicit none + integer(kind=StreamIntKind) :: i + !$acc parallel loop + do i=1,N + C(i) = A(i) + end do + end subroutine copy + + subroutine add() + implicit none + integer(kind=StreamIntKind) :: i + !$acc parallel loop + do i=1,N + C(i) = A(i) + B(i) + end do + end subroutine add + + subroutine mul(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + scalar = startScalar + !$acc parallel loop + do i=1,N + B(i) = scalar * C(i) + end do + end subroutine mul + + subroutine triad(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + scalar = startScalar + !$acc parallel loop + do i=1,N + A(i) = B(i) + scalar * C(i) + end do + end subroutine triad + + subroutine nstream(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + scalar = startScalar + !$acc parallel loop + do i=1,N + A(i) = A(i) + B(i) + scalar * C(i) + end do + end subroutine nstream + + function dot() result(s) + implicit none + real(kind=REAL64) :: s + integer(kind=StreamIntKind) :: i + s = real(0,kind=REAL64) + !$acc parallel loop reduction(+:s) + do i=1,N + s = s + A(i) * B(i) + end do + end function dot + +end module OpenACCStream diff --git a/src/fortran/OpenMPStream.F90 b/src/fortran/OpenMPStream.F90 new file mode 100644 index 0000000..7316d5b --- /dev/null +++ b/src/fortran/OpenMPStream.F90 @@ -0,0 +1,137 @@ +module OpenMPStream + use, intrinsic :: ISO_Fortran_env + use BabelStreamTypes + + implicit none + + character(len=6), parameter :: implementation_name = "OpenMP" + + integer(kind=StreamIntKind) :: N + + real(kind=REAL64), allocatable :: A(:), B(:), C(:) + + contains + + subroutine list_devices() + implicit none + write(*,'(a36,a12)') "Listing devices is not supported by ", implementation_name + end subroutine list_devices + + subroutine set_device(dev) + implicit none + integer, intent(in) :: dev + write(*,'(a32,a12)') "Device != 0 is not supported by ", implementation_name + end subroutine set_device + + subroutine alloc(array_size) + implicit none + integer(kind=StreamIntKind) :: array_size + integer :: err + N = array_size + allocate( A(1:N), B(1:N), C(1:N), stat=err) + if (err .ne. 0) then + write(*,'(a20,i3)') 'allocate returned ',err + stop 1 + endif + end subroutine alloc + + subroutine dealloc() + implicit none + integer :: err + deallocate( A, B, C, stat=err) + if (err .ne. 0) then + write(*,'(a20,i3)') 'deallocate returned ',err + stop 1 + endif + end subroutine dealloc + + subroutine init_arrays(initA, initB, initC) + implicit none + real(kind=REAL64), intent(in) :: initA, initB, initC + integer(kind=StreamIntKind) :: i + !$omp parallel do simd + do i=1,N + A(i) = initA + B(i) = initB + C(i) = initC + end do + end subroutine init_arrays + + subroutine read_arrays(h_A, h_B, h_C) + implicit none + real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:) + integer(kind=StreamIntKind) :: i + !$omp parallel do simd + do i=1,N + h_A(i) = A(i) + h_B(i) = B(i) + h_C(i) = C(i) + end do + end subroutine read_arrays + + subroutine copy() + implicit none + integer(kind=StreamIntKind) :: i + !$omp parallel do simd + do i=1,N + C(i) = A(i) + end do + end subroutine copy + + subroutine add() + implicit none + integer(kind=StreamIntKind) :: i + !$omp parallel do simd + do i=1,N + C(i) = A(i) + B(i) + end do + end subroutine add + + subroutine mul(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + scalar = startScalar + !$omp parallel do simd + do i=1,N + B(i) = scalar * C(i) + end do + end subroutine mul + + subroutine triad(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + scalar = startScalar + !$omp parallel do simd + do i=1,N + A(i) = B(i) + scalar * C(i) + end do + end subroutine triad + + subroutine nstream(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + scalar = startScalar + !$omp parallel do simd + do i=1,N + A(i) = A(i) + B(i) + scalar * C(i) + end do + end subroutine nstream + + function dot() result(s) + implicit none + real(kind=REAL64) :: s + integer(kind=StreamIntKind) :: i + s = real(0,kind=REAL64) + !$omp parallel do simd reduction(+:s) + do i=1,N + s = s + A(i) * B(i) + end do + end function dot + +end module OpenMPStream diff --git a/src/fortran/OpenMPTargetLoopStream.F90 b/src/fortran/OpenMPTargetLoopStream.F90 new file mode 100644 index 0000000..9684ced --- /dev/null +++ b/src/fortran/OpenMPTargetLoopStream.F90 @@ -0,0 +1,162 @@ +module OpenMPTargetLoopStream + use, intrinsic :: ISO_Fortran_env + use BabelStreamTypes + + implicit none + + character(len=16), parameter :: implementation_name = "OpenMPTargetLoop" + + integer(kind=StreamIntKind) :: N + + real(kind=REAL64), allocatable :: A(:), B(:), C(:) + + contains + + subroutine list_devices() + use omp_lib + implicit none + integer :: num + num = omp_get_num_devices() + if (num.eq.0) then + write(*,'(a17)') "No devices found." + else + write(*,'(a10,i1,a8)') "There are ",num," devices." + end if + end subroutine list_devices + + subroutine set_device(dev) + use omp_lib + implicit none + integer, intent(in) :: dev + integer :: num + num = omp_get_num_devices() + if (num.eq.0) then + write(*,'(a17)') "No devices found." + stop + else if (dev.gt.num) then + write(*,'(a21)') "Invalid device index." + stop + else + call omp_set_default_device(dev) + end if + end subroutine set_device + + subroutine alloc(array_size) + implicit none + integer(kind=StreamIntKind) :: array_size + integer :: err + N = array_size + allocate( A(1:N), B(1:N), C(1:N), stat=err) +#ifndef USE_MANAGED + !$omp target enter data map(alloc: A,B,C) +#endif + if (err .ne. 0) then + write(*,'(a20,i3)') 'allocate returned ',err + stop 1 + endif + end subroutine alloc + + subroutine dealloc() + implicit none + integer :: err +#ifndef USE_MANAGED + !$omp target exit data map(delete: A,B,C) +#endif + deallocate( A, B, C, stat=err) + if (err .ne. 0) then + write(*,'(a20,i3)') 'deallocate returned ',err + stop 1 + endif + end subroutine dealloc + + subroutine init_arrays(initA, initB, initC) + implicit none + real(kind=REAL64), intent(in) :: initA, initB, initC + integer(kind=StreamIntKind) :: i + !$omp target teams loop + do i=1,N + A(i) = initA + B(i) = initB + C(i) = initC + end do + end subroutine init_arrays + + subroutine read_arrays(h_A, h_B, h_C) + implicit none + real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:) + integer(kind=StreamIntKind) :: i + ! this might need to use a copy API instead... + !$omp target teams loop + do i=1,N + h_A(i) = A(i) + h_B(i) = B(i) + h_C(i) = C(i) + end do + end subroutine read_arrays + + subroutine copy() + implicit none + integer(kind=StreamIntKind) :: i + !$omp target teams loop + do i=1,N + C(i) = A(i) + end do + end subroutine copy + + subroutine add() + implicit none + integer(kind=StreamIntKind) :: i + !$omp target teams loop + do i=1,N + C(i) = A(i) + B(i) + end do + end subroutine add + + subroutine mul(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + scalar = startScalar + !$omp target teams loop + do i=1,N + B(i) = scalar * C(i) + end do + end subroutine mul + + subroutine triad(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + scalar = startScalar + !$omp target teams loop + do i=1,N + A(i) = B(i) + scalar * C(i) + end do + end subroutine triad + + subroutine nstream(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + scalar = startScalar + !$omp target teams loop + do i=1,N + A(i) = A(i) + B(i) + scalar * C(i) + end do + end subroutine nstream + + function dot() result(s) + implicit none + real(kind=REAL64) :: s + integer(kind=StreamIntKind) :: i + s = real(0,kind=REAL64) + !$omp target teams loop reduction(+:s) + do i=1,N + s = s + A(i) * B(i) + end do + end function dot + +end module OpenMPTargetLoopStream diff --git a/src/fortran/OpenMPTargetStream.F90 b/src/fortran/OpenMPTargetStream.F90 new file mode 100644 index 0000000..0206d78 --- /dev/null +++ b/src/fortran/OpenMPTargetStream.F90 @@ -0,0 +1,163 @@ +module OpenMPTargetStream + use, intrinsic :: ISO_Fortran_env + use BabelStreamTypes + + implicit none + + character(len=12), parameter :: implementation_name = "OpenMPTarget" + + integer(kind=StreamIntKind) :: N + + real(kind=REAL64), allocatable :: A(:), B(:), C(:) + + contains + + subroutine list_devices() + use omp_lib + implicit none + integer :: num + num = omp_get_num_devices() + if (num.eq.0) then + write(*,'(a17)') "No devices found." + else + write(*,'(a10,i1,a8)') "There are ",num," devices." + end if + end subroutine list_devices + + subroutine set_device(dev) + use omp_lib + implicit none + integer, intent(in) :: dev + integer :: num + num = omp_get_num_devices() + if (num.eq.0) then + write(*,'(a17)') "No devices found." + stop + else if (dev.gt.num) then + write(*,'(a21)') "Invalid device index." + stop + else + call omp_set_default_device(dev) + end if + end subroutine set_device + + subroutine alloc(array_size) + implicit none + integer(kind=StreamIntKind) :: array_size + integer :: err + N = array_size + allocate( A(1:N), B(1:N), C(1:N), stat=err) +#ifndef USE_MANAGED + !$omp target enter data map(alloc: A,B,C) +#endif + if (err .ne. 0) then + write(*,'(a20,i3)') 'allocate returned ',err + stop 1 + endif + end subroutine alloc + + subroutine dealloc() + implicit none + integer :: err +#ifndef USE_MANAGED + !$omp target exit data map(delete: A,B,C) +#endif + deallocate( A, B, C, stat=err) + if (err .ne. 0) then + write(*,'(a20,i3)') 'deallocate returned ',err + stop 1 + endif + end subroutine dealloc + + subroutine init_arrays(initA, initB, initC) + implicit none + real(kind=REAL64), intent(in) :: initA, initB, initC + integer(kind=StreamIntKind) :: i + !$omp target teams distribute parallel do simd + do i=1,N + A(i) = initA + B(i) = initB + C(i) = initC + end do + end subroutine init_arrays + + subroutine read_arrays(h_A, h_B, h_C) + implicit none + real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:) + integer(kind=StreamIntKind) :: i + ! this might need to use a copy API instead... + !$omp target teams distribute parallel do simd + do i=1,N + h_A(i) = A(i) + h_B(i) = B(i) + h_C(i) = C(i) + end do + end subroutine read_arrays + + subroutine copy() + implicit none + integer(kind=StreamIntKind) :: i + !$omp target teams distribute parallel do simd + do i=1,N + C(i) = A(i) + end do + !$omp barrier + end subroutine copy + + subroutine add() + implicit none + integer(kind=StreamIntKind) :: i + !$omp target teams distribute parallel do simd + do i=1,N + C(i) = A(i) + B(i) + end do + end subroutine add + + subroutine mul(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + scalar = startScalar + !$omp target teams distribute parallel do simd + do i=1,N + B(i) = scalar * C(i) + end do + end subroutine mul + + subroutine triad(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + scalar = startScalar + !$omp target teams distribute parallel do simd + do i=1,N + A(i) = B(i) + scalar * C(i) + end do + end subroutine triad + + subroutine nstream(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + scalar = startScalar + !$omp target teams distribute parallel do simd + do i=1,N + A(i) = A(i) + B(i) + scalar * C(i) + end do + end subroutine nstream + + function dot() result(s) + implicit none + real(kind=REAL64) :: s + integer(kind=StreamIntKind) :: i + s = real(0,kind=REAL64) + !$omp target teams distribute parallel do simd reduction(+:s) + do i=1,N + s = s + A(i) * B(i) + end do + end function dot + +end module OpenMPTargetStream diff --git a/src/fortran/OpenMPTaskloopStream.F90 b/src/fortran/OpenMPTaskloopStream.F90 new file mode 100644 index 0000000..579a761 --- /dev/null +++ b/src/fortran/OpenMPTaskloopStream.F90 @@ -0,0 +1,169 @@ +module OpenMPTaskloopStream + use, intrinsic :: ISO_Fortran_env + use BabelStreamTypes + + implicit none + + character(len=14), parameter :: implementation_name = "OpenMPTaskloop" + + integer(kind=StreamIntKind) :: N + + real(kind=REAL64), allocatable :: A(:), B(:), C(:) + + contains + + subroutine list_devices() + implicit none + write(*,'(a36,a12)') "Listing devices is not supported by ", implementation_name + end subroutine list_devices + + subroutine set_device(dev) + implicit none + integer, intent(in) :: dev + write(*,'(a32,a12)') "Device != 0 is not supported by ", implementation_name + end subroutine set_device + + subroutine alloc(array_size) + implicit none + integer(kind=StreamIntKind) :: array_size + integer :: err + N = array_size + allocate( A(1:N), B(1:N), C(1:N), stat=err) + if (err .ne. 0) then + write(*,'(a20,i3)') 'allocate returned ',err + stop 1 + endif + end subroutine alloc + + subroutine dealloc() + implicit none + integer :: err + deallocate( A, B, C, stat=err) + if (err .ne. 0) then + write(*,'(a20,i3)') 'deallocate returned ',err + stop 1 + endif + end subroutine dealloc + + subroutine init_arrays(initA, initB, initC) + implicit none + real(kind=REAL64), intent(in) :: initA, initB, initC + integer(kind=StreamIntKind) :: i + !$omp parallel + !$omp master + !$omp taskloop + do i=1,N + A(i) = initA + B(i) = initB + C(i) = initC + end do + !$omp end master + !$omp end parallel + end subroutine init_arrays + + subroutine read_arrays(h_A, h_B, h_C) + implicit none + real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:) + integer(kind=StreamIntKind) :: i + !$omp parallel + !$omp master + !$omp taskloop + do i=1,N + h_A(i) = A(i) + h_B(i) = B(i) + h_C(i) = C(i) + end do + !$omp end master + !$omp end parallel + end subroutine read_arrays + + subroutine copy() + implicit none + integer(kind=StreamIntKind) :: i + !$omp parallel + !$omp master + !$omp taskloop + do i=1,N + C(i) = A(i) + end do + !$omp end master + !$omp end parallel + end subroutine copy + + subroutine add() + implicit none + integer(kind=StreamIntKind) :: i + !$omp parallel + !$omp master + !$omp taskloop + do i=1,N + C(i) = A(i) + B(i) + end do + !$omp end master + !$omp end parallel + end subroutine add + + subroutine mul(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + scalar = startScalar + !$omp parallel + !$omp master + !$omp taskloop + do i=1,N + B(i) = scalar * C(i) + end do + !$omp end master + !$omp end parallel + end subroutine mul + + subroutine triad(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + scalar = startScalar + !$omp parallel + !$omp master + !$omp taskloop + do i=1,N + A(i) = B(i) + scalar * C(i) + end do + !$omp end master + !$omp end parallel + end subroutine triad + + subroutine nstream(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + scalar = startScalar + !$omp parallel + !$omp master + !$omp taskloop + do i=1,N + A(i) = A(i) + B(i) + scalar * C(i) + end do + !$omp end master + !$omp end parallel + end subroutine nstream + + function dot() result(s) + implicit none + real(kind=REAL64) :: s + integer(kind=StreamIntKind) :: i + s = real(0,kind=REAL64) + !$omp parallel + !$omp master + !$omp taskloop reduction(+:s) + do i=1,N + s = s + A(i) * B(i) + end do + !$omp end master + !$omp end parallel + end function dot + +end module OpenMPTaskloopStream diff --git a/src/fortran/OpenMPWorkshareStream.F90 b/src/fortran/OpenMPWorkshareStream.F90 new file mode 100644 index 0000000..fd50f86 --- /dev/null +++ b/src/fortran/OpenMPWorkshareStream.F90 @@ -0,0 +1,120 @@ +module OpenMPWorkshareStream + use, intrinsic :: ISO_Fortran_env + use BabelStreamTypes + + implicit none + + character(len=15), parameter :: implementation_name = "OpenMPWorkshare" + + integer(kind=StreamIntKind) :: N + + real(kind=REAL64), allocatable :: A(:), B(:), C(:) + + contains + + subroutine list_devices() + implicit none + write(*,'(a36,a12)') "Listing devices is not supported by ", implementation_name + end subroutine list_devices + + subroutine set_device(dev) + implicit none + integer, intent(in) :: dev + write(*,'(a32,a12)') "Device != 0 is not supported by ", implementation_name + end subroutine set_device + + subroutine alloc(array_size) + implicit none + integer(kind=StreamIntKind) :: array_size + integer :: err + N = array_size + allocate( A(1:N), B(1:N), C(1:N), stat=err) + if (err .ne. 0) then + write(*,'(a20,i3)') 'allocate returned ',err + stop 1 + endif + end subroutine alloc + + subroutine dealloc() + implicit none + integer :: err + deallocate( A, B, C, stat=err) + if (err .ne. 0) then + write(*,'(a20,i3)') 'deallocate returned ',err + stop 1 + endif + end subroutine dealloc + + subroutine init_arrays(initA, initB, initC) + implicit none + real(kind=REAL64), intent(in) :: initA, initB, initC + !$omp parallel workshare + A = initA + B = initB + C = initC + !$omp end parallel workshare + end subroutine init_arrays + + subroutine read_arrays(h_A, h_B, h_C) + implicit none + real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:) + !$omp parallel workshare + h_A = A + h_B = B + h_C = C + !$omp end parallel workshare + end subroutine read_arrays + + subroutine copy() + implicit none + !$omp parallel workshare + C = A + !$omp end parallel workshare + end subroutine copy + + subroutine add() + implicit none + !$omp parallel workshare + C = A + B + !$omp end parallel workshare + end subroutine add + + subroutine mul(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + scalar = startScalar + !$omp parallel workshare + B = scalar * C + !$omp end parallel workshare + end subroutine mul + + subroutine triad(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + scalar = startScalar + !$omp parallel workshare + A = B + scalar * C + !$omp end parallel workshare + end subroutine triad + + subroutine nstream(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + scalar = startScalar + !$omp parallel workshare + A = A + B + scalar * C + !$omp end parallel workshare + end subroutine nstream + + function dot() result(s) + implicit none + real(kind=REAL64) :: s + !$omp parallel workshare + s = dot_product(A,B) + !$omp end parallel workshare + end function dot + +end module OpenMPWorkshareStream diff --git a/src/fortran/SequentialStream.F90 b/src/fortran/SequentialStream.F90 new file mode 100644 index 0000000..a8f6917 --- /dev/null +++ b/src/fortran/SequentialStream.F90 @@ -0,0 +1,130 @@ +module SequentialStream + use, intrinsic :: ISO_Fortran_env + use BabelStreamTypes + + implicit none + + character(len=10), parameter :: implementation_name = "Sequential" + + integer(kind=StreamIntKind) :: N + + real(kind=REAL64), allocatable :: A(:), B(:), C(:) + + contains + + subroutine list_devices() + implicit none + integer :: num + write(*,'(a36,a10)') "Listing devices is not supported by ", implementation_name + end subroutine list_devices + + subroutine set_device(dev) + implicit none + integer, intent(in) :: dev + write(*,'(a32,a10)') "Device != 0 is not supported by ", implementation_name + end subroutine set_device + + subroutine alloc(array_size) + implicit none + integer(kind=StreamIntKind) :: array_size + integer :: err + N = array_size + allocate( A(1:N), B(1:N), C(1:N), stat=err) + if (err .ne. 0) then + write(*,'(a20,i3)') 'allocate returned ',err + stop 1 + endif + end subroutine alloc + + subroutine dealloc() + implicit none + integer :: err + deallocate( A, B, C, stat=err) + if (err .ne. 0) then + write(*,'(a20,i3)') 'deallocate returned ',err + stop 1 + endif + end subroutine dealloc + + subroutine init_arrays(initA, initB, initC) + implicit none + real(kind=REAL64), intent(in) :: initA, initB, initC + integer(kind=StreamIntKind) :: i + do i=1,N + A(i) = initA + B(i) = initB + C(i) = initC + end do + end subroutine init_arrays + + subroutine read_arrays(h_A, h_B, h_C) + implicit none + real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:) + integer(kind=StreamIntKind) :: i + do i=1,N + h_A(i) = A(i) + h_B(i) = B(i) + h_C(i) = C(i) + end do + end subroutine read_arrays + + subroutine copy() + implicit none + integer(kind=StreamIntKind) :: i + do i=1,N + C(i) = A(i) + end do + end subroutine copy + + subroutine add() + implicit none + integer(kind=StreamIntKind) :: i + do i=1,N + C(i) = A(i) + B(i) + end do + end subroutine add + + subroutine mul(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + scalar = startScalar + do i=1,N + B(i) = scalar * C(i) + end do + end subroutine mul + + subroutine triad(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + scalar = startScalar + do i=1,N + A(i) = B(i) + scalar * C(i) + end do + end subroutine triad + + subroutine nstream(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + scalar = startScalar + do i=1,N + A(i) = A(i) + B(i) + scalar * C(i) + end do + end subroutine nstream + + function dot() result(s) + implicit none + real(kind=REAL64) :: s + integer(kind=StreamIntKind) :: i + s = real(0,kind=REAL64) + do i=1,N + s = s + A(i) * B(i) + end do + end function dot + +end module SequentialStream diff --git a/src/fortran/build.sh b/src/fortran/build.sh new file mode 100755 index 0000000..9343354 --- /dev/null +++ b/src/fortran/build.sh @@ -0,0 +1,54 @@ +#!/bin/bash + +# uncomment to disable GPU targets +#HAS_GPU=0 + +# Orin +#if [ "x${compiler}" == "xgcc" ] ; then +# export MCPU=cortex-a78ae +#fi +#if [ "x${compiler}" == "xarm" ] ; then +# export MCPU=cortex-a78 +#fi + +COMPILERS="gcc" +if [ $(which nvfortran) ] ; then + COMPILERS="${COMPILERS} nvhpc" +fi +if [ $(which crayftn) ] ; then + COMPILERS="${COMPILERS} cray" +fi +if [ $(uname -m) == "aarch64" ] ; then + if [ $(which armflang) ] ; then + COMPILERS="${COMPILERS} arm" + fi + if [ $(which frt) ] ; then + COMPILERS="${COMPILERS} fj" + fi +elif [ $(uname -m) == "x86_64" ] ; then + if [ $(which lscpu >& /dev/null && lscpu | grep GenuineIntel | awk '{print $3}') == "GenuineIntel" ] ; then + COMPILERS="${COMPILERS} oneapi" + if [ -f /opt/intel/oneapi/setvars.sh ] ; then + . /opt/intel/oneapi/setvars.sh >& /dev/null + fi + else + # ^ this detection can be improved + COMPILERS="${COMPILERS} amd" + fi +fi + +for compiler in ${COMPILERS} ; do + TARGETS="DoConcurrent Array OpenMP OpenMPTaskloop OpenMPWorkshare" + if [ "${HAS_GPU}" != "0" ] ; then + TARGETS="${TARGETS} OpenMPTarget OpenMPTargetLoop" + if [ "x${compiler}" == "xnvhpc" ] ; then + TARGETS="${TARGETS} CUDA CUDAKernel" + fi + fi + if [ "x${compiler}" == "xnvhpc" ] || [ "x${compiler}" == "xgcc" ] || [ "x${compiler}" == "xcray" ] ; then + TARGETS="${TARGETS} OpenACC OpenACCArray" + fi + for implementation in ${TARGETS} ; do + make COMPILER=${compiler} IMPLEMENTATION=${implementation} + done +done diff --git a/src/fortran/main.F90 b/src/fortran/main.F90 new file mode 100644 index 0000000..d86e8d4 --- /dev/null +++ b/src/fortran/main.F90 @@ -0,0 +1,666 @@ +module BabelStreamUtil + use, intrinsic :: ISO_Fortran_env, only: REAL64,INT64 + use BabelStreamTypes + + implicit none + + integer(kind=StreamIntKind) :: array_size = 33554432 + integer(kind=StreamIntKind) :: num_times = 100 + logical :: mibibytes = .false. + logical :: use_gigs = .false. + logical :: csv = .false. + character(len=1), parameter :: csv_sep = "," + + ! 1 = All + ! 2 = Triad + ! 3 = Nstream + integer :: selection = 1 + + real(kind=REAL64), parameter :: startA = real(0.1d0,kind=REAL64) + real(kind=REAL64), parameter :: startB = real(0.2d0,kind=REAL64) + real(kind=REAL64), parameter :: startC = real(0.0d0,kind=REAL64) + real(kind=REAL64), parameter :: startScalar = real(0.4d0,kind=REAL64) + + contains + + function get_wtime() result(t) +#if defined(USE_OMP_GET_WTIME) + use omp_lib + implicit none + real(kind=REAL64) :: t + t = omp_get_wtime() +#elif defined(USE_CPU_TIME) + implicit none + real(kind=REAL64) :: t + real :: r + call cpu_time(r) + t = r +#else + implicit none + real(kind=REAL64) :: t + integer(kind=INT64) :: c, r + call system_clock(count = c, count_rate = r) + t = real(c,REAL64) / real(r,REAL64) +#endif + end function get_wtime + + subroutine parseArguments() + use, intrinsic :: ISO_Fortran_env, only: compiler_version, compiler_options +#if defined(USE_DOCONCURRENT) + use DoConcurrentStream, only: list_devices, set_device +#elif defined(USE_ARRAY) + use ArrayStream, only: list_devices, set_device +#elif defined(USE_OPENMP) + use OpenMPStream, only: list_devices, set_device +#elif defined(USE_OPENMPWORKSHARE) + use OpenMPWorkshareStream, only: list_devices, set_device +#elif defined(USE_OPENMPTARGET) + use OpenMPTargetStream, only: list_devices, set_device +#elif defined(USE_OPENMPTARGETLOOP) + use OpenMPTargetLoopStream, only: list_devices, set_device +#elif defined(USE_OPENMPTASKLOOP) + use OpenMPTaskloopStream, only: list_devices, set_device +#elif defined(USE_OPENACC) + use OpenACCStream, only: list_devices, set_device +#elif defined(USE_OPENACCARRAY) + use OpenACCArrayStream, only: list_devices, set_device +#elif defined(USE_CUDA) + use CUDAStream, only: list_devices, set_device +#elif defined(USE_CUDAKERNEL) + use CUDAKernelStream, only: list_devices, set_device +#elif defined(USE_SEQUENTIAL) + use SequentialStream, only: list_devices, set_device +#endif + implicit none + integer :: i, argc + integer :: arglen,err,pos(2) + character(len=64) :: argtmp + argc = command_argument_count() + do i=1,argc + call get_command_argument(i,argtmp,arglen,err) + if (err.eq.0) then + ! + ! list devices + ! + pos(1) = index(argtmp,"--list") + if (pos(1).eq.1) then + call list_devices() + stop + endif + ! + ! set device number + ! + pos(1) = index(argtmp,"--device") + if (pos(1).eq.1) then + if (i+1.gt.argc) then + print*,'You failed to provide a value for ',argtmp + stop + else + call get_command_argument(i+1,argtmp,arglen,err) + block + integer :: dev + read(argtmp,'(i15)') dev + call set_device(dev) + end block + endif + cycle + endif + ! + ! array size + ! + pos(1) = index(argtmp,"--arraysize") + pos(2) = index(argtmp,"-s") + if (any(pos(:).eq.1) ) then + if (i+1.gt.argc) then + print*,'You failed to provide a value for ',argtmp + else + call get_command_argument(i+1,argtmp,arglen,err) + block + integer(kind=INT64) :: big_size + read(argtmp,'(i15)') big_size + if (big_size .gt. HUGE(array_size)) then + print*,'Array size does not fit into integer:' + print*,big_size,'>',HUGE(array_size) + print*,'Stop using USE_INT32' + stop + else + array_size = INT(big_size,kind=StreamIntKind) + endif + end block + endif + cycle + endif + ! + ! number of iterations + ! + pos(1) = index(argtmp,"--numtimes") + pos(2) = index(argtmp,"-n") + if (any(pos(:).eq.1) ) then + if (i+1.gt.argc) then + print*,'You failed to provide a value for ',argtmp + else + call get_command_argument(i+1,argtmp,arglen,err) + read(argtmp,'(i15)') num_times + if (num_times.lt.2) then + write(*,'(a)') "Number of times must be 2 or more" + stop + end if + endif + cycle + endif + ! + ! precision + ! + pos(1) = index(argtmp,"--float") + if (pos(1).eq.1) then + write(*,'(a46,a39)') "Sorry, you have to recompile with -DUSE_FLOAT ", & + "to run BabelStream in single precision." + stop + endif + ! + ! selection (All, Triad, Nstream) + ! + pos(1) = index(argtmp,"--triad-only") + if (pos(1).eq.1) then + selection = 2 + cycle + endif + pos(1) = index(argtmp,"--nstream-only") + if (pos(1).eq.1) then + selection = 3 + cycle + endif + ! + ! CSV + ! + pos(1) = index(argtmp,"--csv") + if (pos(1).eq.1) then + csv = .true. + !write(*,'(a39)') "Sorry, CSV support isn't available yet." + !stop + endif + ! + ! units + ! + pos(1) = index(argtmp,"--mibibytes") + if (pos(1).eq.1) then + mibibytes = .true. + cycle + endif + ! + ! giga/gibi instead of mega/mebi + ! + pos(1) = index(argtmp,"--gigs") + if (pos(1).eq.1) then + use_gigs = .true. + cycle + endif + ! + ! + ! + pos(1) = index(argtmp,"--compiler-info") + if (pos(1).eq.1) then + write(*,'(a)') 'Compiler version: ',compiler_version() + write(*,'(a)') 'Compiler options: ',compiler_options() + stop + endif + ! + ! help + ! + pos(1) = index(argtmp,"--help") + pos(2) = index(argtmp,"-h") + if (any(pos(:).eq.1) ) then + call get_command_argument(0,argtmp,arglen,err) + write(*,'(a7,a,a10)') "Usage: ", trim(argtmp), " [OPTIONS]" + write(*,'(a)') "Options:" + write(*,'(a)') " -h --help Print the message" + write(*,'(a)') " --list List available devices" + write(*,'(a)') " --device INDEX Select device at INDEX" + write(*,'(a)') " -s --arraysize SIZE Use SIZE elements in the array" + write(*,'(a)') " -n --numtimes NUM Run the test NUM times (NUM >= 2)" + !write(*,'(a)') " --float Use floats (rather than doubles)" + write(*,'(a)') " --triad-only Only run triad" + write(*,'(a)') " --nstream-only Only run nstream" + write(*,'(a)') " --csv Output as csv table" + write(*,'(a)') " --mibibytes Use MiB=2^20 for bandwidth calculation (default MB=10^6)" + write(*,'(a)') " --gigs Use GiB=2^30 or GB=10^9 instead of MiB/MB" + write(*,'(a)') " --compiler-info Print information about compiler and flags, then exit." + stop + endif + end if + end do + end subroutine parseArguments + + subroutine run_all(timings, summ) +#if defined(USE_DOCONCURRENT) + use DoConcurrentStream +#elif defined(USE_ARRAY) + use ArrayStream +#elif defined(USE_OPENMP) + use OpenMPStream +#elif defined(USE_OPENMPWORKSHARE) + use OpenMPWorkshareStream +#elif defined(USE_OPENMPTARGET) + use OpenMPTargetStream +#elif defined(USE_OPENMPTARGETLOOP) + use OpenMPTargetLoopStream +#elif defined(USE_OPENMPTASKLOOP) + use OpenMPTaskloopStream +#elif defined(USE_OPENACC) + use OpenACCStream +#elif defined(USE_OPENACCARRAY) + use OpenACCArrayStream +#elif defined(USE_CUDA) + use CUDAStream +#elif defined(USE_CUDAKERNEL) + use CUDAKernelStream +#elif defined(USE_SEQUENTIAL) + use SequentialStream +#endif + implicit none + real(kind=REAL64), intent(inout) :: timings(:,:) + real(kind=REAL64), intent(out) :: summ + real(kind=REAL64) :: t1, t2 + integer(kind=StreamIntKind) :: i + + do i=1,num_times + + t1 = get_wtime() + call copy() + t2 = get_wtime() + timings(1,i) = t2-t1 + + t1 = get_wtime() + call mul(startScalar) + t2 = get_wtime() + timings(2,i) = t2-t1 + + t1 = get_wtime() + call add() + t2 = get_wtime() + timings(3,i) = t2-t1 + + t1 = get_wtime() + call triad(startScalar) + t2 = get_wtime() + timings(4,i) = t2-t1 + + t1 = get_wtime() + summ = dot() + t2 = get_wtime() + timings(5,i) = t2-t1 + + end do + + end subroutine run_all + + subroutine run_triad(timings) +#if defined(USE_DOCONCURRENT) + use DoConcurrentStream +#elif defined(USE_ARRAY) + use ArrayStream +#elif defined(USE_OPENMP) + use OpenMPStream +#elif defined(USE_OPENMPWORKSHARE) + use OpenMPWorkshareStream +#elif defined(USE_OPENMPTARGET) + use OpenMPTargetStream +#elif defined(USE_OPENMPTARGETLOOP) + use OpenMPTargetLoopStream +#elif defined(USE_OPENMPTASKLOOP) + use OpenMPTaskloopStream +#elif defined(USE_OPENACC) + use OpenACCStream +#elif defined(USE_OPENACCARRAY) + use OpenACCArrayStream +#elif defined(USE_CUDA) + use CUDAStream +#elif defined(USE_CUDAKERNEL) + use CUDAKernelStream +#elif defined(USE_SEQUENTIAL) + use SequentialStream +#endif + implicit none + real(kind=REAL64), intent(inout) :: timings(:,:) + real(kind=REAL64) :: t1, t2 + integer(kind=StreamIntKind) :: i + + do i=1,num_times + + t1 = get_wtime() + call triad(startScalar) + t2 = get_wtime() + timings(1,i) = t2-t1 + + end do + + end subroutine run_triad + + subroutine run_nstream(timings) +#if defined(USE_DOCONCURRENT) + use DoConcurrentStream +#elif defined(USE_ARRAY) + use ArrayStream +#elif defined(USE_OPENMP) + use OpenMPStream +#elif defined(USE_OPENMPWORKSHARE) + use OpenMPWorkshareStream +#elif defined(USE_OPENMPTARGET) + use OpenMPTargetStream +#elif defined(USE_OPENMPTARGETLOOP) + use OpenMPTargetLoopStream +#elif defined(USE_OPENMPTASKLOOP) + use OpenMPTaskloopStream +#elif defined(USE_OPENACC) + use OpenACCStream +#elif defined(USE_OPENACCARRAY) + use OpenACCArrayStream +#elif defined(USE_CUDA) + use CUDAStream +#elif defined(USE_CUDAKERNEL) + use CUDAKernelStream +#elif defined(USE_SEQUENTIAL) + use SequentialStream +#endif + implicit none + real(kind=REAL64), intent(inout) :: timings(:,:) + real(kind=REAL64) :: t1, t2 + integer(kind=StreamIntKind) :: i + + do i=1,num_times + + t1 = get_wtime() + call nstream(startScalar) + t2 = get_wtime() + timings(1,i) = t2-t1 + + end do + + end subroutine run_nstream + + subroutine check_solution(A, B, C, summ) + use, intrinsic :: IEEE_Arithmetic, only: IEEE_Is_Normal + implicit none + real(kind=REAL64), intent(in) :: A(:), B(:), C(:) + real(kind=REAL64), intent(in) :: summ + + integer(kind=StreamIntKind) :: i + real(kind=REAL64) :: goldA, goldB, goldC, goldSum + real(kind=REAL64) :: scalar + + ! always use double because of accumulation error + real(kind=REAL64) :: errA, errB, errC, errSum, epsi + logical :: cleanA, cleanB, cleanC, cleanSum + + goldA = startA + goldB = startB + goldC = startC + goldSum = 0.0d0 + + scalar = startScalar + + do i=1,num_times + + if (selection.eq.1) then + goldC = goldA + goldB = scalar * goldC + goldC = goldA + goldB + goldA = goldB + scalar * goldC + else if (selection.eq.2) then + goldA = goldB + scalar * goldC + else if (selection.eq.3) then + goldA = goldA + goldB + scalar * goldC; + endif + + end do + + goldSum = goldA * goldB * array_size + + cleanA = ALL(IEEE_Is_Normal(A)) + cleanB = ALL(IEEE_Is_Normal(B)) + cleanC = ALL(IEEE_Is_Normal(C)) + cleanSum = IEEE_Is_Normal(summ) + + if (.not. cleanA) then + write(*,'(a51)') "Validation failed on A. Contains NaA/Inf/Subnormal." + end if + if (.not. cleanB) then + write(*,'(a51)') "Validation failed on B. Contains NaA/Inf/Subnormal." + end if + if (.not. cleanC) then + write(*,'(a51)') "Validation failed on C. Contains NaA/Inf/Subnormal." + end if + if (.not. cleanSum) then + write(*,'(a54,e20.12)') "Validation failed on Sum. Contains NaA/Inf/Subnormal: ",summ + end if + + errA = SUM( ABS( A - goldA ) ) / array_size + errB = SUM( ABS( B - goldB ) ) / array_size + errC = SUM( ABS( C - goldC ) ) / array_size + errSum = ABS( (summ - goldSum) / goldSum) + + epsi = epsilon(real(0,kind=StreamRealKind)) * 100.0d0 + + if (errA .gt. epsi) then + write(*,'(a38,e20.12)') "Validation failed on A. Average error ", errA + end if + if (errB .gt. epsi) then + write(*,'(a38,e20.12)') "Validation failed on B. Average error ", errB + end if + if (errC .gt. epsi) then + write(*,'(a38,e20.12)') "Validation failed on C. Average error ", errC + end if + + if (selection.eq.1) then + if (errSum .gt. 1.0e-8) then + write(*,'(a38,e20.12)') "Validation failed on Sum. Error ", errSum + write(*,'(a8,e20.12,a15,e20.12)') "Sum was ",summ, " but should be ", errSum + end if + endif + + end subroutine check_solution + +end module BabelStreamUtil + +program BabelStream + use BabelStreamUtil +#if defined(USE_DOCONCURRENT) + use DoConcurrentStream +#elif defined(USE_ARRAY) + use ArrayStream +#elif defined(USE_OPENMP) + use OpenMPStream +#elif defined(USE_OPENMPWORKSHARE) + use OpenMPWorkshareStream +#elif defined(USE_OPENMPTARGET) + use OpenMPTargetStream +#elif defined(USE_OPENMPTARGETLOOP) + use OpenMPTargetLoopStream +#elif defined(USE_OPENMPTASKLOOP) + use OpenMPTaskloopStream +#elif defined(USE_OPENACC) + use OpenACCStream +#elif defined(USE_OPENACCARRAY) + use OpenACCArrayStream +#elif defined(USE_CUDA) + use CUDAStream +#elif defined(USE_CUDAKERNEL) + use CUDAKernelStream +#elif defined(USE_SEQUENTIAL) + use SequentialStream +#endif + implicit none + integer :: element_size, err + real(kind=REAL64) :: scaling + character(len=3) :: label + real(kind=REAL64), allocatable :: timings(:,:) + real(kind=REAL64), allocatable :: h_A(:), h_B(:), h_C(:) + real(kind=REAL64) :: summ + + call parseArguments() + + element_size = storage_size(real(0,kind=StreamRealKind)) / 8 + + if (mibibytes) then + if (use_gigs) then + scaling = 2.0d0**(-30) + label = "GiB" + else + scaling = 2.0d0**(-20) + label = "MiB" + endif + else + if (use_gigs) then + scaling = 1.0d-9 + label = "GB" + else + scaling = 1.0d-6 + label = "MB" + endif + endif + + if (.not.csv) then + + write(*,'(a)') "BabelStream Fortran" + write(*,'(a9,f4.1)') "Version: ", VERSION_STRING + write(*,'(a16,a)') "Implementation: ", implementation_name + + block + character(len=32) :: printout + write(printout,'(i9,1x,a5)') num_times,'times' + write(*,'(a16,a)') 'Running kernels ',ADJUSTL(printout) + end block + write(*,'(a11,a6)') 'Precision: ',ADJUSTL(StreamRealName) + + write(*,'(a12,f9.1,a3)') 'Array size: ',1.0d0 * element_size * (array_size * scaling), label + write(*,'(a12,f9.1,a3)') 'Total size: ',3.0d0 * element_size * (array_size * scaling), label + + endif ! csv + + allocate( timings(5,num_times) ) + + call alloc(array_size) + + call init_arrays(startA, startB, startC) + summ = 0.0d0 + + timings = -1.0d0 + if (selection.eq.1) then + call run_all(timings, summ) + else if (selection.eq.2) then + call run_triad(timings) + else if (selection.eq.3) then + call run_nstream(timings) + endif + + allocate( h_A(1:array_size), h_B(1:array_size), h_C(1:array_size), stat=err) + if (err .ne. 0) then + write(*,'(a20,i3)') 'allocate returned ',err + stop 1 + endif + + call read_arrays(h_A, h_B, h_C) + call check_solution(h_A, h_B, h_C, summ) + + block + character(len=20) :: printout(8) + real(kind=REAL64) :: tmin,tmax,tavg,nbytes + + if (csv) then + write(*,'(a,a1)',advance='no') 'function', csv_sep + write(*,'(a,a1)',advance='no') 'num_times', csv_sep + write(*,'(a,a1)',advance='no') 'n_elements',csv_sep + write(*,'(a,a1)',advance='no') 'sizeof', csv_sep + if (mibibytes) then + write(*,'(a,a1)',advance='no') 'max_mibytes_per_sec',csv_sep + else + write(*,'(a,a1)',advance='no') 'max_mbytes_per_sec', csv_sep + endif + write(*,'(a,a1)',advance='no') 'min_runtime',csv_sep + write(*,'(a,a1)',advance='no') 'max_runtime',csv_sep + write(*,'(a,a1)',advance='yes') 'avg_runtime' + else + write(printout(1),'(a8)') 'Function' + write(printout(2),'(a3,a8)') TRIM(label),'ytes/sec' + write(printout(3),'(a9)') 'Min (sec)' + write(printout(4),'(a3)') 'Max' + write(printout(5),'(a7)') 'Average' + write(*,'(5a12)') ADJUSTL(printout(1:5)) + endif ! csv + + if (selection.eq.1) then + block + integer, parameter :: sizes(5) = [2,2,3,3,2] + character(len=5), parameter :: labels(5) = ["Copy ", "Mul ", "Add ", "Triad", "Dot "] + integer :: i + do i=1,5 + tmin = MINVAL(timings(i,2:num_times)) + tmax = MAXVAL(timings(i,2:num_times)) + tavg = SUM(timings(i,2:num_times)) / (num_times-1) + nbytes = element_size * REAL(array_size,kind=REAL64) * sizes(i) + write(printout(1),'(a)') labels(i) + if (csv) then + write(printout(2),'(i20)') num_times + write(printout(3),'(i20)') array_size + write(printout(4),'(i20)') element_size + write(printout(5),'(i20)') INT(scaling*nbytes/tmin) + write(printout(6),'(f20.8)') tmin + write(printout(7),'(f20.8)') tmax + write(printout(8),'(f20.8)') tavg + write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(1))),csv_sep + write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(2))),csv_sep + write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(3))),csv_sep + write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(4))),csv_sep + write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(5))),csv_sep + write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(6))),csv_sep + write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(7))),csv_sep + write(*,'(a,a1)',advance='yes') TRIM(ADJUSTL(printout(8))) + else + write(printout(2),'(f12.3)') scaling*nbytes/tmin + write(printout(3),'(f12.5)') tmin + write(printout(4),'(f12.5)') tmax + write(printout(5),'(f12.5)') tavg + write(*,'(5a12)') ADJUSTL(printout(1:5)) + endif + enddo + end block + else if ((selection.eq.2).or.(selection.eq.3)) then + tmin = MINVAL(timings(1,2:num_times)) + tmax = MAXVAL(timings(1,2:num_times)) + tavg = SUM(timings(1,2:num_times)) / (num_times-1) + if (selection.eq.2) then + nbytes = element_size * REAL(array_size,kind=REAL64) * 3 + write(printout(1),'(a12)') "Triad" + else if (selection.eq.3) then + nbytes = element_size * REAL(array_size,kind=REAL64) * 4 + write(printout(1),'(a12)') "Nstream" + endif + if (csv) then + write(printout(2),'(i20)') num_times + write(printout(3),'(i20)') array_size + write(printout(4),'(i20)') element_size + write(printout(5),'(i20)') INT(scaling*nbytes/tmin) + write(printout(6),'(f20.8)') tmin + write(printout(7),'(f20.8)') tmax + write(printout(8),'(f20.8)') tavg + write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(1))),csv_sep + write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(2))),csv_sep + write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(3))),csv_sep + write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(4))),csv_sep + write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(5))),csv_sep + write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(6))),csv_sep + write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(7))),csv_sep + write(*,'(a,a1)',advance='yes') TRIM(ADJUSTL(printout(8))) + else + write(printout(2),'(f12.3)') scaling*nbytes/tmin + write(printout(3),'(f12.5)') tmin + write(printout(4),'(f12.5)') tmax + write(printout(5),'(f12.5)') tavg + write(*,'(5a12)') ADJUSTL(printout(1:5)) + endif + endif + end block + + call dealloc() + +end program BabelStream diff --git a/src/fortran/make.inc.amd b/src/fortran/make.inc.amd new file mode 100644 index 0000000..a863de8 --- /dev/null +++ b/src/fortran/make.inc.amd @@ -0,0 +1,25 @@ +FC := /opt/rocm/llvm/bin/flang +FC := /global/u1/j/jhammond/AMD/aocc-compiler-3.2.0/bin/flang +FCFLAGS := -std=f2018 -O3 +FCFLAGS += -Wall -Wno-unused-variable + +ifdef MARCH +FCFLAGS += -march=$(MARCH) +else +FCFLAGS += -march=native +endif + +DOCONCURRENT_FLAG = -fopenmp # libomp.so required +ARRAY_FLAG = -fopenmp # libomp.so required +OPENMP_FLAG = -fopenmp +#OPENMP_FLAG += -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx908 +OPENACC_FLAG = -fopenacc +CUDA_FLAG = +SEQUENTIAL_FLAG = + +ifeq ($(IMPLEMENTATION),CUDA) + $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.) +endif +ifeq ($(IMPLEMENTATION),CUDAKernels) + $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.) +endif diff --git a/src/fortran/make.inc.arm b/src/fortran/make.inc.arm new file mode 100644 index 0000000..a3e2a67 --- /dev/null +++ b/src/fortran/make.inc.arm @@ -0,0 +1,39 @@ +FC = armflang +FCFLAGS = -std=f2018 -O3 +FCFLAGS += -Wall -Wno-unused-variable + +# MARCH=neoverse-v1,neoverse-n1,icelake-server,znver3,cortex-a78 +ARCH=$(shell uname -m) +ifeq ($(ARCH),aarch64) + ifdef MCPU + FCFLAGS += -mcpu=$(MCPU) + else + FCFLAGS += -mcpu=native + endif +else + ifdef MARCH + FCFLAGS += -march=$(MARCH) + else + FCFLAGS += -march=native + endif +endif + +DOCONCURRENT_FLAG = -fopenmp +ARRAY_FLAG = -fopenmp +OPENMP_FLAG = -fopenmp +OPENACC_FLAG = -fopenacc +CUDA_FLAG = +SEQUENTIAL_FLAG = + +ifeq ($(IMPLEMENTATION),OpenACC) + $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.) +endif +ifeq ($(IMPLEMENTATION),OpenACCArray) + $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.) +endif +ifeq ($(IMPLEMENTATION),CUDA) + $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.) +endif +ifeq ($(IMPLEMENTATION),CUDAKernels) + $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.) +endif diff --git a/src/fortran/make.inc.cray b/src/fortran/make.inc.cray new file mode 100644 index 0000000..dae4e75 --- /dev/null +++ b/src/fortran/make.inc.cray @@ -0,0 +1,18 @@ +FC := ftn +FCFLAGS = -e F -O3 + +DOCONCURRENT_FLAG = -h thread_do_concurrent -DCRAY_THREAD_DOCONCURRENT +ARRAY_FLAG = -h autothread +OPENMP_FLAG = -h omp +OPENACC_FLAG = -h acc +# CPU only +OPENACC_FLAG += -h omp +CUDA_FLAG = +SEQUENTIAL_FLAG = + +ifeq ($(IMPLEMENTATION),CUDA) + $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.) +endif +ifeq ($(IMPLEMENTATION),CUDAKernels) + $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.) +endif diff --git a/src/fortran/make.inc.fj b/src/fortran/make.inc.fj new file mode 100644 index 0000000..b4761e5 --- /dev/null +++ b/src/fortran/make.inc.fj @@ -0,0 +1,21 @@ +FC := frt +FCFLAGS = -X08 -Kfast -KA64FX -KSVE -KARMV8_3_A -Kzfill=100 -Kprefetch_sequential=soft -Kprefetch_line=8 -Kprefetch_line_L2=16 -Koptmsg=2 -Keval -DUSE_OMP_GET_WTIME=1 # FJ Fortran system_clock is low resolution + +DOCONCURRENT_FLAG = -Kparallel,reduction -DNOTSHARED +ARRAY_FLAG = -Kparallel,reduction +OPENMP_FLAG = -fopenmp +OPENACC_FLAG = +# CPU only +OPENACC_FLAG += +CUDA_FLAG = +SEQUENTIAL_FLAG = + +ifeq ($(IMPLEMENTATION),OPENACC) + $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.) +endif +ifeq ($(IMPLEMENTATION),CUDA) + $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.) +endif +ifeq ($(IMPLEMENTATION),CUDAKernels) + $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.) +endif diff --git a/src/fortran/make.inc.gcc b/src/fortran/make.inc.gcc new file mode 100644 index 0000000..f59c8bb --- /dev/null +++ b/src/fortran/make.inc.gcc @@ -0,0 +1,33 @@ +FC = gfortran +FCFLAGS = -std=f2018 -O3 +FCFLAGS += -Wall -Wno-unused-dummy-argument -Wno-unused-variable + +# MARCH=neoverse-v1,neoverse-n1,icelake-server,znver3,cortex-a78ae +ARCH=$(shell uname -m) +ifeq ($(ARCH),aarch64) + ifdef MCPU + FCFLAGS += -mcpu=$(MCPU) + else + FCFLAGS += -mcpu=native + endif +else + ifdef MARCH + FCFLAGS += -march=$(MARCH) + else + FCFLAGS += -march=native + endif +endif + +DOCONCURRENT_FLAG = -ftree-parallelize-loops=4 +ARRAY_FLAG = +OPENMP_FLAG = -fopenmp +OPENACC_FLAG = -fopenacc +CUDA_FLAG = +SEQUENTIAL_FLAG = + +ifeq ($(IMPLEMENTATION),CUDA) + $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.) +endif +ifeq ($(IMPLEMENTATION),CUDAKernels) + $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.) +endif diff --git a/src/fortran/make.inc.nvhpc b/src/fortran/make.inc.nvhpc new file mode 100644 index 0000000..dd4c442 --- /dev/null +++ b/src/fortran/make.inc.nvhpc @@ -0,0 +1,70 @@ +FC := nvfortran +#FCFLAGS := -O3 -Minform=inform -Minfo=all +FCFLAGS := -O3 -Minform=warn + +#TARGET=gpu +TARGET=multicore + +NVARCH=$(shell which nvidia-smi > /dev/null && nvidia-smi -q | grep "Product Architecture") +ifeq ($(findstring Ampere,$(NVARCH)),Ampere) + $(info Ampere detected) + GPU = cc80 +endif +ifeq ($(findstring Turing,$(NVARCH)),Turing) + $(info Turing detected) + GPU = cc75 +endif +ifeq ($(findstring Volta,$(NVARCH)),Volta) + $(info Volta detected) + GPU = cc70 +endif +ifeq ($(findstring Pascal,$(NVARCH)),Pascal) + $(info Pascal detected) + GPU = cc60,cc61 +endif +ifeq ($(shell which jetson_clocks > /dev/null && echo 1),1) + $(info Jetson AGX Orin detected) + GPU = ccn87,cc86 + # figure out Xavier later + #GPU = cc72 +endif +ifeq ($(GPU),) + $(error Your GPU architecture could not be detected. Set it manually.) +endif +GPUFLAG = -gpu=$(GPU) + +# MARCH=neoverse-v1,neoverse-n1,zen3 +ARCH=$(shell uname -m) +ifdef MARCH + ifeq ($(ARCH),aarch64) + ifeq ($(MARCH),neoverse-n1) + FCFLAGS += -tp=$(MARCH) + else + ifeq ($(MARCH),neoverse-v1) + FCFLAGS += -tp=$(MARCH) + else + FCFLAGS += -tp=native + endif + endif + else + FCFLAGS += -tp=$(MARCH) + endif +else + FCFLAGS += -tp=native +endif + +# this is to allow apples-to-apples comparison with DC in non-DC GPU impls +# set exactly one of these! +#MANAGED = -DUSE_MANAGED -gpu=managed +#DEVICE = -DUSE_DEVICE -cuda -gpu=nomanaged + +DOCONCURRENT_FLAG = $(GPUFLAG) -stdpar=$(TARGET) $(DEVICE) +ARRAY_FLAG = $(GPUFLAG) -stdpar=$(TARGET) $(MANAGED) +OPENMP_FLAG = $(GPUFLAG) -mp=$(TARGET) $(MANAGED) +OPENACC_FLAG = $(GPUFLAG) -acc=$(TARGET) $(MANAGED) +CUDA_FLAG = $(GPUFLAG) -cuda -acc=gpu $(MANAGED) +SEQUENTIAL_FLAG = + +ifeq ($(IMPLEMENTATION),OpenMPTaskloop) + $(error IMPLEMENTATION=OpenMPTaskloop is not supported by this compiler.) +endif diff --git a/src/fortran/make.inc.oneapi b/src/fortran/make.inc.oneapi new file mode 100644 index 0000000..b7e003c --- /dev/null +++ b/src/fortran/make.inc.oneapi @@ -0,0 +1,32 @@ +FC := ifx +FCFLAGS = -std18 +FCFLAGS += -Ofast -xHOST +FCFLAGS += -qopt-zmm-usage=low + +ifeq ($(FC),ifort) + FCFLAGS += -qopt-streaming-stores=always + PARALLEL = -parallel +endif + +DOCONCURRENT_FLAG = -qopenmp $(PARALLEL) +ARRAY_FLAG = -qopenmp $(PARALLEL) +OPENMP_FLAG = -qopenmp +ifeq ($(FC),ifx) + OPENMP_FLAG += -fopenmp-targets=spir64 -DUSE_FLOAT=1 +endif +OPENACC_FLAG = +CUDA_FLAG = +SEQUENTIAL_FLAG = + +ifeq ($(IMPLEMENTATION),OpenACC) + $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.) +endif +ifeq ($(IMPLEMENTATION),OpenACCArray) + $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.) +endif +ifeq ($(IMPLEMENTATION),CUDA) + $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.) +endif +ifeq ($(IMPLEMENTATION),CUDAKernels) + $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.) +endif diff --git a/src/fortran/run.sh b/src/fortran/run.sh new file mode 100755 index 0000000..2b41bab --- /dev/null +++ b/src/fortran/run.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +cat ./run.sh + +if [ `uname -s` == Darwin ] ; then + NUM_HWTHREADS=`sysctl -n hw.ncpu` + MEMORY_BYTES=`sysctl -n hw.memsize` +else + NUM_HWTHREADS=`nproc` + MEMORY_KILOS=`grep MemTotal /proc/meminfo | awk '{print $2}'` +fi + +M=128 + +export OMP_NUM_THREADS=8 +export OMP_PROC_BIND=close +export OMP_PLACES=threads + +export ACC_NUM_CORES=${OMP_NUM_THREADS} + +AFFCONTROL="numactl -N 0 -m 0 -C `seq -s "," 0 $((${OMP_NUM_THREADS}-1))`" + +for compiler in gcc nvhpc cray oneapi arm amd fj ; do + #if [ "x$compiler" == "xgcc" ] ; then + # export LD_PRELOAD=/usr/lib/gcc/aarch64-linux-gnu/11/libgomp.so + #fi + for implementation in OpenMP OpenMPTaskloop OpenMPWorkshare DoConcurrent Array OpenACC OpenACCArray CUDA CUDAKernel ; do + if [ -f BabelStream.${compiler}.${implementation} ] ; then + echo "BabelStream.${compiler}.${implementation}" + ldd BabelStream.${compiler}.${implementation} + time $AFFCONTROL \ + ./BabelStream.${compiler}.${implementation} -s $((1024*1024*${M})) + fi + done +done diff --git a/src/hip/HIPStream.cpp b/src/hip/HIPStream.cpp index 6aed1ee..0db8485 100644 --- a/src/hip/HIPStream.cpp +++ b/src/hip/HIPStream.cpp @@ -9,7 +9,7 @@ #include "hip/hip_runtime.h" #define TBSIZE 1024 -#define DOT_NUM_BLOCKS 256 + void check_error(void) { @@ -47,9 +47,15 @@ HIPStream::HIPStream(const int ARRAY_SIZE, const int device_index) std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl; array_size = ARRAY_SIZE; + // Round dot_num_blocks up to next multiple of (TBSIZE * dot_elements_per_lane) + dot_num_blocks = (array_size + (TBSIZE * dot_elements_per_lane - 1)) / (TBSIZE * dot_elements_per_lane); - // Allocate the host array for partial sums for dot kernels - sums = (T*)malloc(sizeof(T) * DOT_NUM_BLOCKS); + // Allocate the host array for partial sums for dot kernels using hipHostMalloc. + // This creates an array on the host which is visible to the device. However, it requires + // synchronization (e.g. hipDeviceSynchronize) for the result to be available on the host + // after it has been passed through to a kernel. + hipHostMalloc(&sums, sizeof(T) * dot_num_blocks, hipHostMallocNonCoherent); + check_error(); // Check buffers fit on the device hipDeviceProp_t props; @@ -64,15 +70,14 @@ HIPStream::HIPStream(const int ARRAY_SIZE, const int device_index) check_error(); hipMalloc(&d_c, ARRAY_SIZE*sizeof(T)); check_error(); - hipMalloc(&d_sum, DOT_NUM_BLOCKS*sizeof(T)); - check_error(); } template HIPStream::~HIPStream() { - free(sums); + hipHostFree(sums); + check_error(); hipFree(d_a); check_error(); @@ -80,15 +85,13 @@ HIPStream::~HIPStream() check_error(); hipFree(d_c); check_error(); - hipFree(d_sum); - check_error(); } template __global__ void init_kernel(T * a, T * b, T * c, T initA, T initB, T initC) { - const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + const size_t i = blockDim.x * blockIdx.x + threadIdx.x; a[i] = initA; b[i] = initB; c[i] = initC; @@ -97,7 +100,7 @@ __global__ void init_kernel(T * a, T * b, T * c, T initA, T initB, T initC) template void HIPStream::init_arrays(T initA, T initB, T initC) { - hipLaunchKernelGGL(HIP_KERNEL_NAME(init_kernel), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_a, d_b, d_c, initA, initB, initC); + init_kernel<<>>(d_a, d_b, d_c, initA, initB, initC); check_error(); hipDeviceSynchronize(); check_error(); @@ -115,18 +118,17 @@ void HIPStream::read_arrays(std::vector& a, std::vector& b, std::vector check_error(); } - template __global__ void copy_kernel(const T * a, T * c) { - const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + const size_t i = threadIdx.x + blockIdx.x * blockDim.x; c[i] = a[i]; } template void HIPStream::copy() { - hipLaunchKernelGGL(HIP_KERNEL_NAME(copy_kernel), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_a, d_c); + copy_kernel<<>>(d_a, d_c); check_error(); hipDeviceSynchronize(); check_error(); @@ -136,14 +138,14 @@ template __global__ void mul_kernel(T * b, const T * c) { const T scalar = startScalar; - const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + const size_t i = threadIdx.x + blockIdx.x * blockDim.x; b[i] = scalar * c[i]; } template void HIPStream::mul() { - hipLaunchKernelGGL(HIP_KERNEL_NAME(mul_kernel), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_b, d_c); + mul_kernel<<>>(d_b, d_c); check_error(); hipDeviceSynchronize(); check_error(); @@ -152,14 +154,14 @@ void HIPStream::mul() template __global__ void add_kernel(const T * a, const T * b, T * c) { - const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + const size_t i = threadIdx.x + blockIdx.x * blockDim.x; c[i] = a[i] + b[i]; } template void HIPStream::add() { - hipLaunchKernelGGL(HIP_KERNEL_NAME(add_kernel), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_a, d_b, d_c); + add_kernel<<>>(d_a, d_b, d_c); check_error(); hipDeviceSynchronize(); check_error(); @@ -169,14 +171,14 @@ template __global__ void triad_kernel(T * a, const T * b, const T * c) { const T scalar = startScalar; - const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + const size_t i = threadIdx.x + blockIdx.x * blockDim.x; a[i] = b[i] + scalar * c[i]; } template void HIPStream::triad() { - hipLaunchKernelGGL(HIP_KERNEL_NAME(triad_kernel), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_a, d_b, d_c); + triad_kernel<<>>(d_a, d_b, d_c); check_error(); hipDeviceSynchronize(); check_error(); @@ -186,32 +188,32 @@ template __global__ void nstream_kernel(T * a, const T * b, const T * c) { const T scalar = startScalar; - const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + const size_t i = threadIdx.x + blockIdx.x * blockDim.x; a[i] += b[i] + scalar * c[i]; } template void HIPStream::nstream() { - hipLaunchKernelGGL(HIP_KERNEL_NAME(nstream_kernel), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_a, d_b, d_c); + nstream_kernel<<>>(d_a, d_b, d_c); check_error(); hipDeviceSynchronize(); check_error(); } -template +template __global__ void dot_kernel(const T * a, const T * b, T * sum, int array_size) { __shared__ T tb_sum[TBSIZE]; - int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; - const size_t local_i = hipThreadIdx_x; + const size_t local_i = threadIdx.x; + size_t i = blockDim.x * blockIdx.x + local_i; tb_sum[local_i] = 0.0; - for (; i < array_size; i += hipBlockDim_x*hipGridDim_x) + for (; i < array_size; i += blockDim.x*gridDim.x) tb_sum[local_i] += a[i] * b[i]; - for (int offset = hipBlockDim_x / 2; offset > 0; offset /= 2) + for (size_t offset = blockDim.x / 2; offset > 0; offset /= 2) { __syncthreads(); if (local_i < offset) @@ -221,20 +223,19 @@ __global__ void dot_kernel(const T * a, const T * b, T * sum, int array_size) } if (local_i == 0) - sum[hipBlockIdx_x] = tb_sum[local_i]; + sum[blockIdx.x] = tb_sum[local_i]; } template T HIPStream::dot() { - hipLaunchKernelGGL(HIP_KERNEL_NAME(dot_kernel), dim3(DOT_NUM_BLOCKS), dim3(TBSIZE), 0, 0, d_a, d_b, d_sum, array_size); + dot_kernel<<>>(d_a, d_b, sums, array_size); check_error(); - - hipMemcpy(sums, d_sum, DOT_NUM_BLOCKS*sizeof(T), hipMemcpyDeviceToHost); + hipDeviceSynchronize(); check_error(); T sum = 0.0; - for (int i = 0; i < DOT_NUM_BLOCKS; i++) + for (int i = 0; i < dot_num_blocks; i++) sum += sums[i]; return sum; diff --git a/src/hip/HIPStream.h b/src/hip/HIPStream.h index 44a2893..3c603e0 100644 --- a/src/hip/HIPStream.h +++ b/src/hip/HIPStream.h @@ -14,13 +14,31 @@ #include "Stream.h" #define IMPLEMENTATION_STRING "HIP" +#define DOT_READ_DWORDS_PER_LANE 4 + template class HIPStream : public Stream { + // Make sure that either: + // DOT_READ_DWORDS_PER_LANE is less than sizeof(T), in which case we default to 1 element + // or + // DOT_READ_DWORDS_PER_LANE is divisible by sizeof(T) + static_assert((DOT_READ_DWORDS_PER_LANE * sizeof(unsigned int) < sizeof(T)) || + (DOT_READ_DWORDS_PER_LANE * sizeof(unsigned int) % sizeof(T) == 0), + "DOT_READ_DWORDS_PER_LANE not divisible by sizeof(element_type)"); + + // Take into account the datatype size + // That is, for 4 DOT_READ_DWORDS_PER_LANE, this is 2 FP64 elements + // and 4 FP32 elements + static constexpr unsigned int dot_elements_per_lane{ + (DOT_READ_DWORDS_PER_LANE * sizeof(unsigned int)) < sizeof(T) ? 1 : ( + DOT_READ_DWORDS_PER_LANE * sizeof(unsigned int) / sizeof(T))}; + protected: // Size of arrays int array_size; + int dot_num_blocks; // Host array for partial sums for dot kernel T *sums; @@ -29,7 +47,6 @@ class HIPStream : public Stream T *d_a; T *d_b; T *d_c; - T *d_sum; public: diff --git a/src/kokkos/KokkosStream.cpp b/src/kokkos/KokkosStream.cpp index 04e0daf..158e109 100644 --- a/src/kokkos/KokkosStream.cpp +++ b/src/kokkos/KokkosStream.cpp @@ -1,4 +1,4 @@ -// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// Copyright (c) 2015-23 Tom Deakin, Simon McIntosh-Smith, Wei-Chen (Tom) Lin // University of Bristol HPC // // For full license terms please see the LICENSE file distributed with this diff --git a/src/kokkos/KokkosStream.hpp b/src/kokkos/KokkosStream.hpp index d7333a7..a410a86 100644 --- a/src/kokkos/KokkosStream.hpp +++ b/src/kokkos/KokkosStream.hpp @@ -10,7 +10,6 @@ #include #include - #include "Stream.h" #define IMPLEMENTATION_STRING "Kokkos" diff --git a/src/kokkos/model.cmake b/src/kokkos/model.cmake index 46c773d..7457eeb 100644 --- a/src/kokkos/model.cmake +++ b/src/kokkos/model.cmake @@ -1,32 +1,38 @@ - register_flag_optional(CMAKE_CXX_COMPILER "Any CXX compiler that is supported by CMake detection and RAJA. See https://github.com/kokkos/kokkos#primary-tested-compilers-on-x86-are" "c++") -register_flag_required(KOKKOS_IN_TREE +register_flag_optional(KOKKOS_IN_TREE "Absolute path to the *source* distribution directory of Kokkos. Remember to append Kokkos specific flags as well, for example: - -DKOKKOS_IN_TREE=... -DKokkos_ENABLE_OPENMP=ON -DKokkos_ARCH_ZEN=ON ... + See https://github.com/kokkos/kokkos/blob/master/BUILD.md for all available options" "") - See https://github.com/kokkos/kokkos/blob/master/BUILD.md for all available options") +register_flag_optional(KOKKOS_IN_PACKAGE + "Absolute path to package R-Path containing Kokkos libs. + Use this instead of KOKKOS_IN_TREE if Kokkos is from a package manager like Spack." "") # compiler vendor and arch specific flags set(KOKKOS_FLAGS_CPU_INTEL -qopt-streaming-stores=always) macro(setup) - set(CMAKE_CXX_STANDARD 17) + set(CMAKE_CXX_STANDARD 17) # Kokkos 4+ requires CXX >= 17 cmake_policy(SET CMP0074 NEW) #see https://github.com/kokkos/kokkos/blob/master/BUILD.md - message(STATUS "Building using in-tree Kokkos source at `${KOKKOS_IN_TREE}`") if (EXISTS "${KOKKOS_IN_TREE}") + message(STATUS "Build using in-tree Kokkos source at `${KOKKOS_IN_TREE}`") add_subdirectory(${KOKKOS_IN_TREE} ${CMAKE_BINARY_DIR}/kokkos) register_link_library(Kokkos::kokkos) - else () - message(FATAL_ERROR "`${KOKKOS_IN_TREE}` does not exist") + elseif (EXISTS "${KOKKOS_IN_PACKAGE}") + message(STATUS "Build using packaged Kokkos at `${KOKKOS_IN_PACKAGE}`") + set (Kokkos_DIR "${KOKKOS_IN_PACKAGE}/lib64/cmake/Kokkos") + find_package(Kokkos REQUIRED) + register_link_library(Kokkos::kokkos) + else() + message(FATAL_ERROR "Neither `KOKKOS_IN_TREE`, or `KOKKOS_IN_PACKAGE` was set!") endif () register_append_compiler_and_arch_specific_cxx_flags( @@ -36,5 +42,3 @@ macro(setup) ) endmacro() - - diff --git a/src/raja/model.cmake b/src/raja/model.cmake index 7e5f284..b1e7750 100644 --- a/src/raja/model.cmake +++ b/src/raja/model.cmake @@ -1,18 +1,19 @@ - register_flag_optional(CMAKE_CXX_COMPILER "Any CXX compiler that is supported by CMake detection and RAJA. See https://raja.readthedocs.io/en/main/getting_started.html#build-and-install" "c++") -register_flag_required(RAJA_IN_TREE +register_flag_optional(RAJA_IN_TREE "Absolute path to the *source* distribution directory of RAJA. Make sure to use the release version of RAJA or clone RAJA recursively with submodules. Remember to append RAJA specific flags as well, for example: - -DRAJA_IN_TREE=... -DENABLE_OPENMP=ON -DENABLE_CUDA=ON ... - See https://github.com/LLNL/RAJA/blob/08cbbafd2d21589ebf341f7275c229412d0fe903/CMakeLists.txt#L44 for all available options -") +" "") + +register_flag_optional(RAJA_IN_PACKAGE + "Use if Raja is part of a package dependency: + Path to installation" "") register_flag_optional(TARGET "Target offload device, implemented values are CPU, NVIDIA" @@ -76,16 +77,22 @@ macro(setup) register_link_library(RAJA) # RAJA's cmake screws with where the binary will end up, resetting it here: set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) + + elseif (EXISTS "${RAJA_IN_PACKAGE}") + message(STATUS "Building using packaged Raja at `${RAJA_IN_PACKAGE}`") + find_package(RAJA REQUIRED) + register_link_library(RAJA) + else () - message(FATAL_ERROR "`${RAJA_IN_TREE}` does not exist") + message(FATAL_ERROR "Neither `${RAJA_IN_TREE}` or `${RAJA_IN_PACKAGE}` exists") endif () if (ENABLE_CUDA) # RAJA needs the codebase to be compiled with nvcc, so we tell cmake to treat sources as *.cu enable_language(CUDA) - set_source_files_properties(RAJAStream.cpp PROPERTIES LANGUAGE CUDA) - set_source_files_properties(main.cpp PROPERTIES LANGUAGE CUDA) + set_source_files_properties(src/raja/RAJAStream.cpp PROPERTIES LANGUAGE CUDA) + set_source_files_properties(src/main.cpp PROPERTIES LANGUAGE CUDA) endif () diff --git a/src/rust/rust-stream/Cargo.lock b/src/rust/rust-stream/Cargo.lock index 5f225f0..723849a 100644 --- a/src/rust/rust-stream/Cargo.lock +++ b/src/rust/rust-stream/Cargo.lock @@ -4,13 +4,131 @@ version = 3 [[package]] name = "ansi_term" -version = "0.11.0" +version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ee49baf6cb617b853aa8d93bf420db2383fab46d314482ca2803b40d5fde979b" +checksum = "d52a9bb7ec0cf484c551830a7ce27bd20d67eac647e1befb56b0be4ee39a55d2" dependencies = [ "winapi 0.3.9", ] +[[package]] +name = "async-attributes" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a3203e79f4dd9bdda415ed03cf14dae5a2bf775c683a00f94e9cd1faf0f596e5" +dependencies = [ + "quote", + "syn", +] + +[[package]] +name = "async-channel" +version = "1.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e14485364214912d3b19cc3435dde4df66065127f05fa0d75c712f36f12c2f28" +dependencies = [ + "concurrent-queue", + "event-listener", + "futures-core", +] + +[[package]] +name = "async-executor" +version = "1.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "871f9bb5e0a22eeb7e8cf16641feb87c9dc67032ccf8ff49e772eb9941d3a965" +dependencies = [ + "async-task", + "concurrent-queue", + "fastrand", + "futures-lite", + "once_cell", + "slab", +] + +[[package]] +name = "async-global-executor" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0da5b41ee986eed3f524c380e6d64965aea573882a8907682ad100f7859305ca" +dependencies = [ + "async-channel", + "async-executor", + "async-io", + "async-lock", + "blocking", + "futures-lite", + "once_cell", +] + +[[package]] +name = "async-io" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "83e21f3a490c72b3b0cf44962180e60045de2925d8dff97918f7ee43c8f637c7" +dependencies = [ + "autocfg", + "concurrent-queue", + "futures-lite", + "libc", + "log", + "once_cell", + "parking", + "polling", + "slab", + "socket2", + "waker-fn", + "winapi 0.3.9", +] + +[[package]] +name = "async-lock" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e97a171d191782fba31bb902b14ad94e24a68145032b7eedf871ab0bc0d077b6" +dependencies = [ + "event-listener", +] + +[[package]] +name = "async-std" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62565bb4402e926b29953c785397c6dc0391b7b446e45008b0049eb43cec6f5d" +dependencies = [ + "async-attributes", + "async-channel", + "async-global-executor", + "async-io", + "async-lock", + "crossbeam-utils", + "futures-channel", + "futures-core", + "futures-io", + "futures-lite", + "gloo-timers", + "kv-log-macro", + "log", + "memchr", + "once_cell", + "pin-project-lite", + "pin-utils", + "slab", + "wasm-bindgen-futures", +] + +[[package]] +name = "async-task" +version = "4.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a40729d2133846d9ed0ea60a8b9541bccddab49cd30f0715a1da672fe9a2524" + +[[package]] +name = "atomic-waker" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "065374052e7df7ee4047b1160cca5e1467a12351a40b3da123c870ba0b8eda2a" + [[package]] name = "atty" version = "0.2.14" @@ -24,9 +142,9 @@ dependencies = [ [[package]] name = "autocfg" -version = "1.0.1" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cdb031dd78e28731d87d56cc8ffef4a8f36ca26c38fe2de700543e627f8a464a" +checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" [[package]] name = "bitflags" @@ -34,6 +152,38 @@ version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" +[[package]] +name = "blocking" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c6ccb65d468978a086b69884437ded69a90faab3bbe6e67f242173ea728acccc" +dependencies = [ + "async-channel", + "async-task", + "atomic-waker", + "fastrand", + "futures-lite", + "once_cell", +] + +[[package]] +name = "bumpalo" +version = "3.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1ad822118d20d2c234f427000d5acc36eabe1e29a348c89b63dd60b13f28e5d" + +[[package]] +name = "cache-padded" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1db59621ec70f09c5e9b597b220c7a2b43611f4710dc03ceb8748637775692c" + +[[package]] +name = "cc" +version = "1.0.73" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2fff2a6927b3bb87f9595d67196a70493f627687a71d87a0d692242c33f58c11" + [[package]] name = "cfg-if" version = "1.0.0" @@ -42,9 +192,9 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "clap" -version = "2.33.3" +version = "2.34.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "37e58ac78573c40708d45522f0d80fa2f01cc4f9b4e2bf749807255454312002" +checksum = "a0610544180c38b88101fecf2dd634b174a62eef6946f84dfc6a7127512b381c" dependencies = [ "ansi_term", "atty", @@ -64,6 +214,15 @@ dependencies = [ "crossterm", ] +[[package]] +name = "concurrent-queue" +version = "1.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af4780a44ab5696ea9e28294517f1fffb421a83a25af521333c838635509db9c" +dependencies = [ + "cache-padded", +] + [[package]] name = "core_affinity" version = "0.5.10" @@ -78,9 +237,9 @@ dependencies = [ [[package]] name = "crossbeam" -version = "0.8.1" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4ae5588f6b3c3cb05239e90bd110f257254aecd01e4635400391aeae07497845" +checksum = "2801af0d36612ae591caa9568261fddce32ce6e08a7275ea334a06a4ad021a2c" dependencies = [ "cfg-if", "crossbeam-channel", @@ -92,9 +251,9 @@ dependencies = [ [[package]] name = "crossbeam-channel" -version = "0.5.1" +version = "0.5.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "06ed27e177f16d65f0f0c22a213e17c696ace5dd64b14258b52f9417ccb52db4" +checksum = "c2dd04ddaf88237dc3b8d8f9a3c1004b506b54b3313403944054d23c0870c521" dependencies = [ "cfg-if", "crossbeam-utils", @@ -102,9 +261,9 @@ dependencies = [ [[package]] name = "crossbeam-deque" -version = "0.8.1" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6455c0ca19f0d2fbf751b908d5c55c1f5cbc65e03c4225427254b46890bdde1e" +checksum = "715e8152b692bba2d374b53d4875445368fdf21a94751410af607a5ac677d1fc" dependencies = [ "cfg-if", "crossbeam-epoch", @@ -113,22 +272,22 @@ dependencies = [ [[package]] name = "crossbeam-epoch" -version = "0.9.5" +version = "0.9.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4ec02e091aa634e2c3ada4a392989e7c3116673ef0ac5b72232439094d73b7fd" +checksum = "f916dfc5d356b0ed9dae65f1db9fc9770aa2851d2662b988ccf4fe3516e86348" dependencies = [ + "autocfg", "cfg-if", "crossbeam-utils", - "lazy_static", "memoffset", "scopeguard", ] [[package]] name = "crossbeam-queue" -version = "0.3.2" +version = "0.3.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b10ddc024425c88c2ad148c1b0fd53f4c6d38db9697c9f1588381212fa657c9" +checksum = "1cd42583b04998a5363558e5f9291ee5a5ff6b49944332103f251e7479a82aa7" dependencies = [ "cfg-if", "crossbeam-utils", @@ -136,12 +295,11 @@ dependencies = [ [[package]] name = "crossbeam-utils" -version = "0.8.5" +version = "0.8.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d82cfc11ce7f2c3faef78d8a684447b40d503d9681acebed6cb728d45940c4db" +checksum = "edbafec5fa1f196ca66527c1b12c2ec4745ca14b50f1ad8f9f6f720b55d11fac" dependencies = [ "cfg-if", - "lazy_static", ] [[package]] @@ -170,10 +328,157 @@ dependencies = [ ] [[package]] -name = "either" -version = "1.6.1" +name = "ctor" +version = "0.1.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457" +checksum = "cdffe87e1d521a10f9696f833fe502293ea446d7f256c06128293a4119bdf4cb" +dependencies = [ + "quote", + "syn", +] + +[[package]] +name = "either" +version = "1.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90e5c1c8368803113bf0c9584fc495a58b86dc8a29edbf8fe877d21d9507e797" + +[[package]] +name = "event-listener" +version = "2.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0206175f82b8d6bf6652ff7d71a1e27fd2e4efde587fd368662814d6ec1d9ce0" + +[[package]] +name = "fastrand" +version = "1.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7a407cfaa3385c4ae6b23e84623d48c2798d06e3e6a1878f7f59f17b3f86499" +dependencies = [ + "instant", +] + +[[package]] +name = "futures" +version = "0.3.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f21eda599937fba36daeb58a22e8f5cee2d14c4a17b5b7739c7c8e5e3b8230c" +dependencies = [ + "futures-channel", + "futures-core", + "futures-executor", + "futures-io", + "futures-sink", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-channel" +version = "0.3.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "30bdd20c28fadd505d0fd6712cdfcb0d4b5648baf45faef7f852afb2399bb050" +dependencies = [ + "futures-core", + "futures-sink", +] + +[[package]] +name = "futures-core" +version = "0.3.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4e5aa3de05362c3fb88de6531e6296e85cde7739cccad4b9dfeeb7f6ebce56bf" + +[[package]] +name = "futures-executor" +version = "0.3.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ff63c23854bee61b6e9cd331d523909f238fc7636290b96826e9cfa5faa00ab" +dependencies = [ + "futures-core", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-io" +version = "0.3.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbf4d2a7a308fd4578637c0b17c7e1c7ba127b8f6ba00b29f717e9655d85eb68" + +[[package]] +name = "futures-lite" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7694489acd39452c77daa48516b894c153f192c3578d5a839b62c58099fcbf48" +dependencies = [ + "fastrand", + "futures-core", + "futures-io", + "memchr", + "parking", + "pin-project-lite", + "waker-fn", +] + +[[package]] +name = "futures-macro" +version = "0.3.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42cd15d1c7456c04dbdf7e88bcd69760d74f3a798d6444e16974b505b0e62f17" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "futures-sink" +version = "0.3.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "21b20ba5a92e727ba30e72834706623d94ac93a725410b6a6b6fbc1b07f7ba56" + +[[package]] +name = "futures-task" +version = "0.3.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6508c467c73851293f390476d4491cf4d227dbabcd4170f3bb6044959b294f1" + +[[package]] +name = "futures-timer" +version = "3.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e64b03909df88034c26dc1547e8970b91f98bdb65165d6a4e9110d94263dbb2c" + +[[package]] +name = "futures-util" +version = "0.3.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "44fb6cb1be61cc1d2e43b262516aafcf63b241cffdb1d3fa115f91d9c7b09c90" +dependencies = [ + "futures-channel", + "futures-core", + "futures-io", + "futures-macro", + "futures-sink", + "futures-task", + "memchr", + "pin-project-lite", + "pin-utils", + "slab", +] + +[[package]] +name = "gloo-timers" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fb7d06c1c8cc2a29bee7ec961009a0b2caa0793ee4900c2ffb348734ba1c8f9" +dependencies = [ + "futures-channel", + "futures-core", + "js-sys", + "wasm-bindgen", +] [[package]] name = "heck" @@ -202,6 +507,15 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "js-sys" +version = "0.3.60" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49409df3e3bf0856b916e2ceaca09ee28e6871cf7d9ce97a692cacfdb2a25a47" +dependencies = [ + "wasm-bindgen", +] + [[package]] name = "kernel32-sys" version = "0.2.2" @@ -212,6 +526,15 @@ dependencies = [ "winapi-build", ] +[[package]] +name = "kv-log-macro" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0de8b303297635ad57c9f5059fd9cee7a47f8e8daa09df0fcd07dd39fb22977f" +dependencies = [ + "log", +] + [[package]] name = "lazy_static" version = "1.4.0" @@ -220,33 +543,41 @@ checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" [[package]] name = "libc" -version = "0.2.108" +version = "0.2.134" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8521a1b57e76b1ec69af7599e75e38e7b7fad6610f037db8c79b127201b5d119" +checksum = "329c933548736bc49fd575ee68c89e8be4d260064184389a5b77517cddd99ffb" [[package]] name = "lock_api" -version = "0.4.5" +version = "0.4.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "712a4d093c9976e24e7dbca41db895dabcbac38eb5f4045393d17a95bdfb1109" +checksum = "435011366fe56583b16cf956f9df0095b405b82d76425bc8981c0e22e60ec4df" dependencies = [ + "autocfg", "scopeguard", ] [[package]] name = "log" -version = "0.4.14" +version = "0.4.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "51b9bbe6c47d51fc3e1a9b945965946b4c44142ab8792c50835a980d362c2710" +checksum = "abb12e687cfb44aa40f41fc3978ef76448f9b6038cad6aef4259d3c095a2382e" dependencies = [ "cfg-if", + "value-bag", ] [[package]] -name = "memoffset" -version = "0.6.4" +name = "memchr" +version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "59accc507f1338036a0477ef61afdae33cde60840f4dfe481319ce3ad116ddf9" +checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" + +[[package]] +name = "memoffset" +version = "0.6.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5aa361d4faea93603064a027415f07bd8e1d5c88c9fbf68bf56a285428fd79ce" dependencies = [ "autocfg", ] @@ -275,32 +606,44 @@ dependencies = [ [[package]] name = "ntapi" -version = "0.3.6" +version = "0.3.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f6bb902e437b6d86e03cce10a7e2af662292c5dfef23b65899ea3ac9354ad44" +checksum = "c28774a7fd2fbb4f0babd8237ce554b73af68021b5f695a3cebd6c59bac0980f" dependencies = [ "winapi 0.3.9", ] [[package]] name = "num-traits" -version = "0.2.14" +version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a64b1ec5cda2586e284722486d802acf1f7dbdc623e2bfc57e65ca1cd099290" +checksum = "578ede34cf02f8924ab9447f50c28075b4d3e5b269972345e7e0372b38c6cdcd" dependencies = [ "autocfg", ] [[package]] name = "num_cpus" -version = "1.13.0" +version = "1.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05499f3756671c15885fee9034446956fff3f243d6077b91e5767df161f766b3" +checksum = "19e64526ebdee182341572e50e9ad03965aa510cd94427a4549448f285e957a1" dependencies = [ "hermit-abi", "libc", ] +[[package]] +name = "once_cell" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e82dad04139b71a90c080c8463fe0dc7902db5192d939bd0950f074d014339e1" + +[[package]] +name = "parking" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "427c3892f9e783d91cc128285287e70a59e206ca452770ece88a76f7a3eddd72" + [[package]] name = "parking_lot" version = "0.11.2" @@ -327,12 +670,29 @@ dependencies = [ ] [[package]] -name = "pest" -version = "2.1.3" +name = "pin-project-lite" +version = "0.2.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "10f4872ae94d7b90ae48754df22fd42ad52ce740b8f370b03da4835417403e53" +checksum = "e0a7ae3ac2f1173085d398531c705756c94a4c56843785df85a60c1a0afac116" + +[[package]] +name = "pin-utils" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" + +[[package]] +name = "polling" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "899b00b9c8ab553c743b3e11e87c5c7d423b2a2de229ba95b24a756344748011" dependencies = [ - "ucd-trie", + "autocfg", + "cfg-if", + "libc", + "log", + "wepoll-ffi", + "winapi 0.3.9", ] [[package]] @@ -361,27 +721,27 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.32" +version = "1.0.46" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba508cc11742c0dc5c1659771673afbab7a0efab23aa17e854cbab0837ed0b43" +checksum = "94e2ef8dbfc347b10c094890f778ee2e36ca9bb4262e86dc99cd217e35f3470b" dependencies = [ - "unicode-xid", + "unicode-ident", ] [[package]] name = "quote" -version = "1.0.10" +version = "1.0.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38bc8cc6a5f2e3655e0899c1b848643b2562f853f114bfec7be120678e3ace05" +checksum = "bbe448f377a7d6961e30f5955f9b8d106c3f5e449d493ee1b125c1d43c2b5179" dependencies = [ "proc-macro2", ] [[package]] name = "rayon" -version = "1.5.1" +version = "1.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c06aca804d41dbc8ba42dfd964f0d01334eceb64314b9ecf7c5fad5188a06d90" +checksum = "bd99e5772ead8baa5215278c9b15bf92087709e9c1b2d1f97cdb5a183c933a7d" dependencies = [ "autocfg", "crossbeam-deque", @@ -391,31 +751,43 @@ dependencies = [ [[package]] name = "rayon-core" -version = "1.9.1" +version = "1.9.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d78120e2c850279833f1dd3582f730c4ab53ed95aeaaaa862a2a5c71b1656d8e" +checksum = "258bcdb5ac6dad48491bb2992db6b7cf74878b0384908af124823d118c99683f" dependencies = [ "crossbeam-channel", "crossbeam-deque", "crossbeam-utils", - "lazy_static", "num_cpus", ] [[package]] name = "redox_syscall" -version = "0.2.10" +version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8383f39639269cde97d255a32bdb68c047337295414940c68bdd30c2e13203ff" +checksum = "fb5a58c1855b4b6819d59012155603f0b22ad30cad752600aadfcb695265519a" dependencies = [ "bitflags", ] [[package]] name = "rstest" -version = "0.10.0" +version = "0.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "041bb0202c14f6a158bbbf086afb03d0c6e975c2dec7d4912f8061ed44f290af" +checksum = "b939295f93cb1d12bc1a83cf9ee963199b133fb8a79832dd51b68bb9f59a04dc" +dependencies = [ + "async-std", + "futures", + "futures-timer", + "rstest_macros", + "rustc_version", +] + +[[package]] +name = "rstest_macros" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f78aba848123782ba59340928ec7d876ebe745aa0365d6af8a630f19a5c16116" dependencies = [ "cfg-if", "proc-macro2", @@ -426,7 +798,7 @@ dependencies = [ [[package]] name = "rust-stream" -version = "3.4.0" +version = "4.0.0" dependencies = [ "colour", "core_affinity", @@ -443,18 +815,18 @@ dependencies = [ [[package]] name = "rustc_version" -version = "0.3.3" +version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f0dfe2087c51c460008730de8b57e6a320782fbfb312e1f4d520e6c6fae155ee" +checksum = "bfa0f585226d2e68097d4f95d113b15b83a82e819ab25717ec0590d9584ef366" dependencies = [ "semver", ] [[package]] name = "rustversion" -version = "1.0.5" +version = "1.0.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61b3909d758bb75c79f23d4736fac9433868679d3ad2ea7a61e3c25cfda9a088" +checksum = "97477e48b4cf8603ad5f7aaf897467cf42ab4218a38ef76fb14c2d6773a6d6a8" [[package]] name = "scopeguard" @@ -464,21 +836,9 @@ checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" [[package]] name = "semver" -version = "0.11.0" +version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f301af10236f6df4160f7c3f04eec6dbc70ace82d23326abad5edee88801c6b6" -dependencies = [ - "semver-parser", -] - -[[package]] -name = "semver-parser" -version = "0.10.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "00b0bef5b7f9e0df16536d3961cfb6e84331c065b4066afb39768d0e319411f7" -dependencies = [ - "pest", -] +checksum = "e25dfac463d778e353db5be2449d1cce89bd6fd23c9f1ea21310ce6e5a1b29c4" [[package]] name = "signal-hook" @@ -501,10 +861,29 @@ dependencies = [ ] [[package]] -name = "smallvec" -version = "1.7.0" +name = "slab" +version = "0.4.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ecab6c735a6bb4139c0caafd0cc3635748bbb3acf4550e8138122099251f309" +checksum = "4614a76b2a8be0058caa9dbbaf66d988527d86d003c11a94fbd335d7661edcef" +dependencies = [ + "autocfg", +] + +[[package]] +name = "smallvec" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2fd0db749597d91ff862fd1d55ea87f7855a744a8425a64695b6fca237d1dad1" + +[[package]] +name = "socket2" +version = "0.4.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "02e2d2db9033d13a1567121ddd7a095ee144db4e1ca1b1bda3419bc0da294ebd" +dependencies = [ + "libc", + "winapi 0.3.9", +] [[package]] name = "strsim" @@ -514,9 +893,9 @@ checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a" [[package]] name = "structopt" -version = "0.3.25" +version = "0.3.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "40b9788f4202aa75c240ecc9c15c65185e6a39ccdeb0fd5d008b98825464c87c" +checksum = "0c6b5c64445ba8094a6ab0c3cd2ad323e07171012d9c98b0b15651daf1787a10" dependencies = [ "clap", "lazy_static", @@ -538,20 +917,20 @@ dependencies = [ [[package]] name = "syn" -version = "1.0.82" +version = "1.0.101" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8daf5dd0bb60cbd4137b1b587d2fc0ae729bc07cf01cd70b36a1ed5ade3b9d59" +checksum = "e90cde112c4b9690b8cbe810cba9ddd8bc1d7472e2cae317b69e9438c1cba7d2" dependencies = [ "proc-macro2", "quote", - "unicode-xid", + "unicode-ident", ] [[package]] name = "tabular" -version = "0.1.4" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e7e35bee02dcefe64a74065b6b869d241eab1a02fea0d65e6074ce4e51894c3b" +checksum = "d9a2882c514780a1973df90de9d68adcd8871bacc9a6331c3f28e6d2ff91a3d1" dependencies = [ "unicode-width", ] @@ -566,28 +945,32 @@ dependencies = [ ] [[package]] -name = "ucd-trie" -version = "0.1.3" +name = "unicode-ident" +version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "56dee185309b50d1f11bfedef0fe6d036842e3fb77413abef29f8f8d1c5d4c1c" +checksum = "dcc811dc4066ac62f84f11307873c4850cb653bfa9b1719cee2bd2204a4bc5dd" [[package]] name = "unicode-segmentation" -version = "1.8.0" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8895849a949e7845e06bd6dc1aa51731a103c42707010a5b591c0038fb73385b" +checksum = "0fdbf052a0783de01e944a6ce7a8cb939e295b1e7be835a1112c3b9a7f047a5a" [[package]] name = "unicode-width" -version = "0.1.9" +version = "0.1.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3ed742d4ea2bd1176e236172c8429aaf54486e7ac098db29ffe6529e0ce50973" +checksum = "c0edd1e5b14653f783770bce4a4dabb4a5108a5370a5f5d8cfe8710c361f6c8b" [[package]] -name = "unicode-xid" -version = "0.2.2" +name = "value-bag" +version = "1.0.0-alpha.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ccb82d61f80a663efe1f787a51b16b5a51e3314d6ac365b08639f52387b33f3" +checksum = "2209b78d1249f7e6f3293657c9779fe31ced465df091bbd433a1cf88e916ec55" +dependencies = [ + "ctor", + "version_check", +] [[package]] name = "vec_map" @@ -597,9 +980,100 @@ checksum = "f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191" [[package]] name = "version_check" -version = "0.9.3" +version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5fecdca9a5291cc2b8dcf7dc02453fee791a280f3743cb0905f8822ae463b3fe" +checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" + +[[package]] +name = "waker-fn" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d5b2c62b4012a3e1eca5a7e077d13b3bf498c4073e33ccd58626607748ceeca" + +[[package]] +name = "wasm-bindgen" +version = "0.2.83" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eaf9f5aceeec8be17c128b2e93e031fb8a4d469bb9c4ae2d7dc1888b26887268" +dependencies = [ + "cfg-if", + "wasm-bindgen-macro", +] + +[[package]] +name = "wasm-bindgen-backend" +version = "0.2.83" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c8ffb332579b0557b52d268b91feab8df3615f265d5270fec2a8c95b17c1142" +dependencies = [ + "bumpalo", + "log", + "once_cell", + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-futures" +version = "0.4.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "23639446165ca5a5de86ae1d8896b737ae80319560fbaa4c2887b7da6e7ebd7d" +dependencies = [ + "cfg-if", + "js-sys", + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.83" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "052be0f94026e6cbc75cdefc9bae13fd6052cdcaf532fa6c45e7ae33a1e6c810" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.83" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07bc0c051dc5f23e307b13285f9d75df86bfdf816c5721e573dec1f9b8aa193c" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-backend", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.83" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1c38c045535d93ec4f0b4defec448e4291638ee608530863b1e2ba115d4fff7f" + +[[package]] +name = "web-sys" +version = "0.3.60" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bcda906d8be16e728fd5adc5b729afad4e444e106ab28cd1c7256e54fa61510f" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "wepoll-ffi" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d743fdedc5c64377b5fc2bc036b01c7fd642205a0d96356034ae3404d49eb7fb" +dependencies = [ + "cc", +] [[package]] name = "winapi" diff --git a/src/rust/rust-stream/Cargo.toml b/src/rust/rust-stream/Cargo.toml index 8ac456f..d93a84f 100644 --- a/src/rust/rust-stream/Cargo.toml +++ b/src/rust/rust-stream/Cargo.toml @@ -7,19 +7,19 @@ edition = "2018" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -num-traits = "0.2.14" -structopt = "0.3.13" -tabular = "0.1.4" -rayon = "1.5.1" -crossbeam = "0.8.1" -num_cpus = "1.13.0" -rustversion = "1.0" -libc = "0.2.97" +num-traits = "0.2.15" +structopt = "0.3.26" +tabular = "0.2.0" +rayon = "1.5.3" +crossbeam = "0.8.2" +num_cpus = "1.13.1" +rustversion = "1.0.9" +libc = "0.2.134" core_affinity = "0.5.10" colour = "0.6.0" [dev-dependencies] -rstest = "0.10.0" +rstest = "0.13.0" [build-dependencies] rustversion = "1.0" diff --git a/src/sycl2020/model.cmake b/src/sycl2020/model.cmake index e7b5a1c..c60b9c9 100644 --- a/src/sycl2020/model.cmake +++ b/src/sycl2020/model.cmake @@ -7,6 +7,7 @@ register_flag_required(SYCL_COMPILER "Compile using the specified SYCL compiler implementation Supported values are ONEAPI-DPCPP - dpc++ that is part of an oneAPI Base Toolkit distribution (https://software.intel.com/content/www/us/en/develop/tools/oneapi/base-toolkit.html) + ONEAPI-ICPX - icpx as a standalone compiler DPCPP - dpc++ as a standalone compiler (https://github.com/intel/llvm) HIPSYCL - hipSYCL compiler (https://github.com/illuhad/hipSYCL) COMPUTECPP - ComputeCpp compiler (https://developer.codeplay.com/products/computecpp/ce/home)") @@ -14,6 +15,7 @@ register_flag_required(SYCL_COMPILER register_flag_optional(SYCL_COMPILER_DIR "Absolute path to the selected SYCL compiler directory, most are packaged differently so set the path according to `SYCL_COMPILER`: ONEAPI-DPCPP - not required but `dpcpp` must be on PATH, load oneAPI as per documentation (i.e `source /opt/intel/oneapi/setvars.sh` first) + ONEAPI-ICPX - `icpx` must be used for OneAPI 2023 and later on releases (i.e `source /opt/intel/oneapi/setvars.sh` first) HIPSYCL|DPCPP|COMPUTECPP - set to the root of the binary distribution that contains at least `bin/`, `include/`, and `lib/`" "") @@ -65,6 +67,12 @@ macro(setup) elseif (${SYCL_COMPILER} STREQUAL "ONEAPI-DPCPP") set(CMAKE_CXX_COMPILER dpcpp) register_definitions(CL_TARGET_OPENCL_VERSION=220) + elseif (${SYCL_COMPILER} STREQUAL "ONEAPI-ICPX") + set(CMAKE_CXX_COMPILER icpx) + include_directories(${SYCL_COMPILER_DIR}/include/sycl) + register_definitions(CL_TARGET_OPENCL_VERSION=220) + register_append_cxx_flags(ANY -fsycl) + register_append_link_flags(-fsycl) else () message(FATAL_ERROR "SYCL_COMPILER=${SYCL_COMPILER} is unsupported") endif () diff --git a/src/thrust/ThrustStream.cu b/src/thrust/ThrustStream.cu index 3a57ab0..f15a392 100644 --- a/src/thrust/ThrustStream.cu +++ b/src/thrust/ThrustStream.cu @@ -145,7 +145,7 @@ T ThrustStream::dot() #if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA || \ (defined(THRUST_DEVICE_SYSTEM_HIP) && THRUST_DEVICE_SYSTEM_HIP == THRUST_DEVICE_SYSTEM) -#ifdef __NVCC__ +#if defined(__NVCC__) || defined(__NVCOMPILER_CUDA__) #define IMPL_FN__(fn) cuda ## fn #define IMPL_TYPE__(tpe) cuda ## tpe #elif defined(__HIP_PLATFORM_HCC__)