diff --git a/results/v2.0/acc-cray/K20X.txt b/results/v2.0/acc-cray/K20X.txt new file mode 100644 index 0000000..123006f --- /dev/null +++ b/results/v2.0/acc-cray/K20X.txt @@ -0,0 +1,13 @@ +GPU-STREAM +Version: 2.0 +Implementation: OpenACC +Running kernels 100 times +Precision: double +Array size: 268.4 MB (=0.3 GB) +Total size: 805.3 MB (=0.8 GB) +Function MBytes/sec Min (sec) Max Average +Copy 116702.138 0.00460 0.00462 0.00461 +Mul 174325.378 0.00308 0.00310 0.00309 +Add 261591.864 0.00308 0.00310 0.00309 +Triad 175206.996 0.00460 0.00462 0.00461 +Application 1396462 resources: utime ~2s, stime ~1s, Rss ~876780, inblocks ~601, outblocks ~323 diff --git a/results/v2.0/acc-cray/K40.txt b/results/v2.0/acc-cray/K40.txt new file mode 100644 index 0000000..8e85c0c --- /dev/null +++ b/results/v2.0/acc-cray/K40.txt @@ -0,0 +1,12 @@ +GPU-STREAM +Version: 2.0 +Implementation: OpenACC +Running kernels 100 times +Precision: double +Array size: 268.4 MB (=0.3 GB) +Total size: 805.3 MB (=0.8 GB) +Function MBytes/sec Min (sec) Max Average +Copy 120256.706 0.00446 0.00458 0.00448 +Mul 185251.125 0.00290 0.00292 0.00291 +Add 277727.285 0.00290 0.00388 0.00292 +Triad 181094.123 0.00445 0.00448 0.00446 diff --git a/results/v2.0/acc-cray/K80.txt b/results/v2.0/acc-cray/K80.txt new file mode 100644 index 0000000..e0f40cf --- /dev/null +++ b/results/v2.0/acc-cray/K80.txt @@ -0,0 +1,12 @@ +GPU-STREAM +Version: 2.0 +Implementation: OpenACC +Running kernels 100 times +Precision: double +Array size: 268.4 MB (=0.3 GB) +Total size: 805.3 MB (=0.8 GB) +Function MBytes/sec Min (sec) Max Average +Copy 113606.114 0.00473 0.00560 0.00476 +Mul 175729.637 0.00306 0.00367 0.00309 +Add 263518.810 0.00306 0.00369 0.00309 +Triad 170709.791 0.00472 0.00573 0.00477 diff --git a/results/v2.0/acc-pgi-kernel/broadwell.txt b/results/v2.0/acc-pgi-kernel/broadwell.txt new file mode 100644 index 0000000..19d4a9e --- /dev/null +++ b/results/v2.0/acc-pgi-kernel/broadwell.txt @@ -0,0 +1,13 @@ +GPU-STREAM +Version: 2.0 +Implementation: OpenACC +Running kernels 100 times +Precision: double +Array size: 268.4 MB (=0.3 GB) +Total size: 805.3 MB (=0.8 GB) +Function MBytes/sec Min (sec) Max Average +Copy 37575.728 0.01429 0.01440 0.01434 +Mul 37848.869 0.01418 0.01433 0.01425 +Add 45589.244 0.01766 0.01781 0.01772 +Triad 46657.287 0.01726 0.01736 0.01730 +Application 1454136 resources: utime ~287s, stime ~1s, Rss ~789384, inblocks ~374, outblocks ~387 diff --git a/results/v2.0/acc-pgi-kernel/haswell.txt b/results/v2.0/acc-pgi-kernel/haswell.txt new file mode 100644 index 0000000..6c99db6 --- /dev/null +++ b/results/v2.0/acc-pgi-kernel/haswell.txt @@ -0,0 +1,13 @@ +GPU-STREAM +Version: 2.0 +Implementation: OpenACC +Running kernels 100 times +Precision: double +Array size: 268.4 MB (=0.3 GB) +Total size: 805.3 MB (=0.8 GB) +Function MBytes/sec Min (sec) Max Average +Copy 50148.428 0.01071 0.01077 0.01073 +Mul 51051.215 0.01052 0.01059 0.01055 +Add 55360.902 0.01455 0.01462 0.01458 +Triad 54556.116 0.01476 0.01486 0.01481 +Application 1454139 resources: utime ~167s, stime ~1s, Rss ~789120, inblocks ~316, outblocks ~387 diff --git a/results/v2.0/acc-pgi-kernel/ivybridge.txt b/results/v2.0/acc-pgi-kernel/ivybridge.txt new file mode 100644 index 0000000..1757eb5 --- /dev/null +++ b/results/v2.0/acc-pgi-kernel/ivybridge.txt @@ -0,0 +1,12 @@ +GPU-STREAM +Version: 2.0 +Implementation: OpenACC +Running kernels 100 times +Precision: double +Array size: 268.4 MB (=0.3 GB) +Total size: 805.3 MB (=0.8 GB) +Function MBytes/sec Min (sec) Max Average +Copy 82520.672 0.00651 0.01660 0.00835 +Mul 80283.532 0.00669 0.01680 0.00862 +Add 84622.881 0.00952 0.02310 0.01205 +Triad 83561.609 0.00964 0.02359 0.01253 diff --git a/results/v2.0/acc-pgi-kernel/sandybridge.txt b/results/v2.0/acc-pgi-kernel/sandybridge.txt new file mode 100644 index 0000000..02800a2 --- /dev/null +++ b/results/v2.0/acc-pgi-kernel/sandybridge.txt @@ -0,0 +1,12 @@ +GPU-STREAM +Version: 2.0 +Implementation: OpenACC +Running kernels 100 times +Precision: double +Array size: 268.4 MB (=0.3 GB) +Total size: 805.3 MB (=0.8 GB) +Function MBytes/sec Min (sec) Max Average +Copy 28718.686 0.01869 0.03758 0.01919 +Mul 27994.499 0.01918 0.02873 0.01984 +Add 28014.895 0.02875 0.03684 0.02941 +Triad 28070.552 0.02869 0.04073 0.02952 diff --git a/results/v2.0/acc-pgi-loops/980Ti.txt b/results/v2.0/acc-pgi-loops/980Ti.txt new file mode 100644 index 0000000..f9ae34d --- /dev/null +++ b/results/v2.0/acc-pgi-loops/980Ti.txt @@ -0,0 +1,12 @@ +GPU-STREAM +Version: 2.0 +Implementation: OpenACC +Running kernels 100 times +Precision: double +Array size: 268.4 MB (=0.3 GB) +Total size: 805.3 MB (=0.8 GB) +Function MBytes/sec Min (sec) Max Average +Copy 258666.277 0.00208 0.00208 0.00208 +Mul 258730.975 0.00208 0.00208 0.00208 +Add 265497.286 0.00303 0.00304 0.00304 +Triad 266836.306 0.00302 0.00303 0.00302 diff --git a/results/v2.0/acc-pgi-loops/IvyBridge.txt b/results/v2.0/acc-pgi-loops/IvyBridge.txt new file mode 100644 index 0000000..0b68e04 --- /dev/null +++ b/results/v2.0/acc-pgi-loops/IvyBridge.txt @@ -0,0 +1,12 @@ +GPU-STREAM +Version: 2.0 +Implementation: OpenACC +Running kernels 100 times +Precision: double +Array size: 268.4 MB (=0.3 GB) +Total size: 805.3 MB (=0.8 GB) +Function MBytes/sec Min (sec) Max Average +Copy 77879.649 0.00689 0.02005 0.00893 +Mul 59127.697 0.00908 0.01900 0.01069 +Add 63608.223 0.01266 0.02617 0.01511 +Triad 64017.868 0.01258 0.02615 0.01513 diff --git a/results/v2.0/acc-pgi-loops/S9150.txt b/results/v2.0/acc-pgi-loops/S9150.txt new file mode 100644 index 0000000..98e1f05 --- /dev/null +++ b/results/v2.0/acc-pgi-loops/S9150.txt @@ -0,0 +1,12 @@ +GPU-STREAM +Version: 2.0 +Implementation: OpenACC +Running kernels 100 times +Precision: double +Array size: 268.4 MB (=0.3 GB) +Total size: 805.3 MB (=0.8 GB) +Function MBytes/sec Min (sec) Max Average +Copy 256996.540 0.00209 0.00217 0.00212 +Mul 258109.030 0.00208 0.00215 0.00212 +Add 265716.643 0.00303 0.00315 0.00306 +Triad 265499.387 0.00303 0.00307 0.00306 diff --git a/results/v2.0/acc-pgi-loops/SandyBridge.txt b/results/v2.0/acc-pgi-loops/SandyBridge.txt new file mode 100644 index 0000000..c51a93c --- /dev/null +++ b/results/v2.0/acc-pgi-loops/SandyBridge.txt @@ -0,0 +1,12 @@ +GPU-STREAM +Version: 2.0 +Implementation: OpenACC +Running kernels 100 times +Precision: double +Array size: 268.4 MB (=0.3 GB) +Total size: 805.3 MB (=0.8 GB) +Function MBytes/sec Min (sec) Max Average +Copy 28017.895 0.01916 0.01995 0.01942 +Mul 22199.944 0.02418 0.02536 0.02469 +Add 23458.271 0.03433 0.03588 0.03497 +Triad 23644.549 0.03406 0.03532 0.03465 diff --git a/results/v2.0/acc-pgi-loops/broadwell.txt b/results/v2.0/acc-pgi-loops/broadwell.txt new file mode 100644 index 0000000..a84c91a --- /dev/null +++ b/results/v2.0/acc-pgi-loops/broadwell.txt @@ -0,0 +1,13 @@ +GPU-STREAM +Version: 2.0 +Implementation: OpenACC +Running kernels 100 times +Precision: double +Array size: 268.4 MB (=0.3 GB) +Total size: 805.3 MB (=0.8 GB) +Function MBytes/sec Min (sec) Max Average +Copy 34338.639 0.01563 0.01574 0.01569 +Mul 38249.125 0.01404 0.01413 0.01408 +Add 40155.137 0.02005 0.02015 0.02009 +Triad 40995.666 0.01964 0.01971 0.01967 +Application 1396691 resources: utime ~357s, stime ~1s, Rss ~789348, inblocks ~365, outblocks ~350 diff --git a/results/v2.0/acc-pgi-loops/haswell.txt b/results/v2.0/acc-pgi-loops/haswell.txt new file mode 100644 index 0000000..8d46a72 --- /dev/null +++ b/results/v2.0/acc-pgi-loops/haswell.txt @@ -0,0 +1,13 @@ +GPU-STREAM +Version: 2.0 +Implementation: OpenACC +Running kernels 100 times +Precision: double +Array size: 268.4 MB (=0.3 GB) +Total size: 805.3 MB (=0.8 GB) +Function MBytes/sec Min (sec) Max Average +Copy 46401.864 0.01157 0.01163 0.01160 +Mul 40767.924 0.01317 0.01326 0.01321 +Add 42062.027 0.01915 0.01925 0.01920 +Triad 42666.377 0.01887 0.01900 0.01893 +Application 1396695 resources: utime ~240s, stime ~1s, Rss ~789084, inblocks ~307, outblocks ~350 diff --git a/results/v2.0/cuda-cray/K40.txt b/results/v2.0/cuda-cray/K40.txt new file mode 100644 index 0000000..a274786 --- /dev/null +++ b/results/v2.0/cuda-cray/K40.txt @@ -0,0 +1,14 @@ +GPU-STREAM +Version: 2.0 +Implementation: CUDA +Running kernels 100 times +Precision: double +Array size: 268.4 MB (=0.3 GB) +Total size: 805.3 MB (=0.8 GB) +Using OpenCL device Tesla K40m +Driver: 7050 +Function MBytes/sec Min (sec) Max Average +Copy 194135.310 0.00277 0.00278 0.00277 +Mul 194049.073 0.00277 0.00280 0.00278 +Add 190956.372 0.00422 0.00423 0.00422 +Triad 190822.844 0.00422 0.00423 0.00422 diff --git a/results/v2.0/cuda-cray/K80.txt b/results/v2.0/cuda-cray/K80.txt new file mode 100644 index 0000000..bde088e --- /dev/null +++ b/results/v2.0/cuda-cray/K80.txt @@ -0,0 +1,14 @@ +GPU-STREAM +Version: 2.0 +Implementation: CUDA +Running kernels 100 times +Precision: double +Array size: 268.4 MB (=0.3 GB) +Total size: 805.3 MB (=0.8 GB) +Using OpenCL device Tesla K80 +Driver: 7050 +Function MBytes/sec Min (sec) Max Average +Copy 179851.070 0.00299 0.00357 0.00303 +Mul 179720.423 0.00299 0.00358 0.00303 +Add 176265.359 0.00457 0.00539 0.00461 +Triad 176116.986 0.00457 0.00540 0.00461 diff --git a/results/v2.0/cuda-gnu/980ti.txt b/results/v2.0/cuda-gnu/980ti.txt new file mode 100644 index 0000000..40e1c7b --- /dev/null +++ b/results/v2.0/cuda-gnu/980ti.txt @@ -0,0 +1,14 @@ +GPU-STREAM +Version: 2.0 +Implementation: CUDA +Running kernels 100 times +Precision: double +Array size: 268.4 MB (=0.3 GB) +Total size: 805.3 MB (=0.8 GB) +Using OpenCL device GeForce GTX 980 Ti +Driver: 7050 +Function MBytes/sec Min (sec) Max Average +Copy 263497.383 0.00204 0.00204 0.00204 +Mul 263283.395 0.00204 0.00204 0.00204 +Add 269113.982 0.00299 0.00300 0.00300 +Triad 269153.828 0.00299 0.00300 0.00300 diff --git a/results/v2.0/cuda-gnu/K20X.txt b/results/v2.0/cuda-gnu/K20X.txt new file mode 100644 index 0000000..2749262 --- /dev/null +++ b/results/v2.0/cuda-gnu/K20X.txt @@ -0,0 +1,15 @@ +GPU-STREAM +Version: 2.0 +Implementation: CUDA +Running kernels 100 times +Precision: double +Array size: 268.4 MB (=0.3 GB) +Total size: 805.3 MB (=0.8 GB) +Using OpenCL device Tesla K20X +Driver: 7050 +Function MBytes/sec Min (sec) Max Average +Copy 181833.763 0.00295 0.00298 0.00297 +Mul 181354.354 0.00296 0.00305 0.00297 +Add 179955.484 0.00448 0.00449 0.00448 +Triad 179798.066 0.00448 0.00450 0.00449 +Application 1396457 resources: utime ~3s, stime ~1s, Rss ~871996, inblocks ~690, outblocks ~1373 diff --git a/results/v2.0/cuda-x86/Haswell.txt b/results/v2.0/cuda-x86/Haswell.txt new file mode 100644 index 0000000..e352c56 --- /dev/null +++ b/results/v2.0/cuda-x86/Haswell.txt @@ -0,0 +1,14 @@ +GPU-STREAM +Version: 2.0 +Implementation: CUDA +Running kernels 100 times +Precision: double +Array size: 268.4 MB (=0.3 GB) +Total size: 805.3 MB (=0.8 GB) +Using OpenCL device DEVICE EMULATION MODE +Driver: PGI +Function MBytes/sec Min (sec) Max Average +Copy 38778.163 0.01384 0.01391 0.01388 +Mul 38124.361 0.01408 0.01412 0.01410 +Add 41817.646 0.01926 0.01934 0.01930 +Triad 42446.352 0.01897 0.01906 0.01901 diff --git a/results/v2.0/cuda-x86/IvyBridge.txt b/results/v2.0/cuda-x86/IvyBridge.txt new file mode 100644 index 0000000..e0c0ffa --- /dev/null +++ b/results/v2.0/cuda-x86/IvyBridge.txt @@ -0,0 +1,14 @@ +GPU-STREAM +Version: 2.0 +Implementation: CUDA +Running kernels 100 times +Precision: double +Array size: 268.4 MB (=0.3 GB) +Total size: 805.3 MB (=0.8 GB) +Using OpenCL device DEVICE EMULATION MODE +Driver: PGI +Function MBytes/sec Min (sec) Max Average +Copy 57308.251 0.00937 0.02134 0.01109 +Mul 55999.151 0.00959 0.02233 0.01134 +Add 63534.754 0.01268 0.02962 0.01492 +Triad 64546.130 0.01248 0.02873 0.01492 diff --git a/results/v2.0/cuda-x86/SandyBridge b/results/v2.0/cuda-x86/SandyBridge new file mode 100644 index 0000000..e7d77df --- /dev/null +++ b/results/v2.0/cuda-x86/SandyBridge @@ -0,0 +1,14 @@ +GPU-STREAM +Version: 2.0 +Implementation: CUDA +Running kernels 100 times +Precision: double +Array size: 268.4 MB (=0.3 GB) +Total size: 805.3 MB (=0.8 GB) +Using OpenCL device DEVICE EMULATION MODE +Driver: PGI +Function MBytes/sec Min (sec) Max Average +Copy 21626.429 0.02482 0.03784 0.02526 +Mul 21321.415 0.02518 0.02603 0.02551 +Add 23394.375 0.03442 0.03588 0.03506 +Triad 23527.878 0.03423 0.03550 0.03486 diff --git a/results/v2.0/gcc-power8/raja.txt b/results/v2.0/gcc-power8/raja.txt new file mode 100644 index 0000000..e00998e --- /dev/null +++ b/results/v2.0/gcc-power8/raja.txt @@ -0,0 +1,12 @@ +GPU-STREAM +Version: 2.0 +Implementation: RAJA +Running kernels 100 times +Precision: double +Array size: 268.4 MB (=0.3 GB) +Total size: 805.3 MB (=0.8 GB) +Function MBytes/sec Min (sec) Max Average +Copy 227063.854 0.00236 0.01007 0.00449 +Mul 218404.455 0.00246 0.01080 0.00449 +Add 257927.160 0.00312 0.01815 0.00647 +Triad 253019.962 0.00318 0.01535 0.00635 diff --git a/results/v2.0/knl/kokkos-128.txt b/results/v2.0/knl/kokkos-128.txt new file mode 100644 index 0000000..d707a17 --- /dev/null +++ b/results/v2.0/knl/kokkos-128.txt @@ -0,0 +1,13 @@ + OMP_NUM_THREADS=128 numactl -m 1 ./gpu-stream-kokkos +GPU-STREAM +Version: 2.0 +Implementation: KOKKOS +Running kernels 100 times +Precision: double +Array size: 268.4 MB (=0.3 GB) +Total size: 805.3 MB (=0.8 GB) +Function MBytes/sec Min (sec) Max Average +Copy 284255.707 0.00189 0.00209 0.00199 +Mul 259925.621 0.00207 0.00483 0.00426 +Add 301882.418 0.00267 0.00295 0.00279 +Triad 293037.412 0.00275 0.00314 0.00293 diff --git a/results/v2.0/knl/mccalpin.txt b/results/v2.0/knl/mccalpin.txt new file mode 100644 index 0000000..3b7597f --- /dev/null +++ b/results/v2.0/knl/mccalpin.txt @@ -0,0 +1,33 @@ +------------------------------------------------------------- +STREAM version $Revision: 5.10 $ +------------------------------------------------------------- +This system uses 8 bytes per array element. +------------------------------------------------------------- +Array size = 33554432 (elements), Offset = 0 (elements) +Memory per array = 256.0 MiB (= 0.2 GiB). +Total memory required = 768.0 MiB (= 0.8 GiB). +Each kernel will be executed 100 times. + The *best* time for each kernel (excluding the first iteration) + will be used to compute the reported bandwidth. +------------------------------------------------------------- +Number of Threads requested = 64 +Number of Threads counted = 64 +------------------------------------------------------------- +Your clock granularity/precision appears to be 1 microseconds. +Each test below will take on the order of 1507 microseconds. + (= 1507 clock ticks) +Increase the size of the arrays if this shows that +you are not getting at least 20 clock ticks per test. +------------------------------------------------------------- +WARNING -- The above is only a rough guideline. +For best results, please be sure you know the +precision of your system timer. +------------------------------------------------------------- +Function Best Rate MB/s Avg time Min time Max time +Copy: 387306.5 0.001418 0.001386 0.001456 +Scale: 414238.4 0.001321 0.001296 0.001368 +Add: 444668.2 0.001849 0.001811 0.001875 +Triad: 447436.7 0.001855 0.001800 0.001949 +------------------------------------------------------------- +Solution Validates: avg error less than 1.000000e-13 on all three arrays +------------------------------------------------------------- diff --git a/results/v2.0/knl/omp3.txt b/results/v2.0/knl/omp3.txt new file mode 100644 index 0000000..3d5e843 --- /dev/null +++ b/results/v2.0/knl/omp3.txt @@ -0,0 +1,12 @@ +GPU-STREAM +Version: 2.0 +Implementation: Reference OpenMP +Running kernels 100 times +Precision: double +Array size: 268.4 MB (=0.3 GB) +Total size: 805.3 MB (=0.8 GB) +Function MBytes/sec Min (sec) Max Average +Copy 355432.548 0.00151 0.00159 0.00154 +Mul 303409.465 0.00177 0.00420 0.00214 +Add 317176.372 0.00254 0.00266 0.00259 +Triad 296841.725 0.00271 0.00307 0.00288 diff --git a/results/v2.0/knl/openacc-pgi.txt b/results/v2.0/knl/openacc-pgi.txt new file mode 100644 index 0000000..0749aca --- /dev/null +++ b/results/v2.0/knl/openacc-pgi.txt @@ -0,0 +1,12 @@ +GPU-STREAM +Version: 2.0 +Implementation: OpenACC +Running kernels 100 times +Precision: double +Array size: 268.4 MB (=0.3 GB) +Total size: 805.3 MB (=0.8 GB) +Function MBytes/sec Min (sec) Max Average +Copy 323365.370 0.00166 0.00176 0.00172 +Mul 162148.544 0.00331 0.00590 0.00432 +Add 425349.852 0.00189 0.00203 0.00193 +Triad 239556.020 0.00336 0.00664 0.00549 diff --git a/results/v2.0/knl/raja.txt b/results/v2.0/knl/raja.txt new file mode 100644 index 0000000..d8a0b3e --- /dev/null +++ b/results/v2.0/knl/raja.txt @@ -0,0 +1,12 @@ +GPU-STREAM +Version: 2.0 +Implementation: RAJA +Running kernels 100 times +Precision: double +Array size: 268.4 MB (=0.3 GB) +Total size: 805.3 MB (=0.8 GB) +Function MBytes/sec Min (sec) Max Average +Copy 358479.622 0.00150 0.00166 0.00157 +Mul 301367.207 0.00178 0.00369 0.00201 +Add 317005.071 0.00254 0.00271 0.00261 +Triad 298105.168 0.00270 0.00303 0.00286 diff --git a/results/v2.0/kokkos/980ti.txt b/results/v2.0/kokkos/980ti.txt new file mode 100644 index 0000000..da94519 --- /dev/null +++ b/results/v2.0/kokkos/980ti.txt @@ -0,0 +1,12 @@ +GPU-STREAM +Version: 2.0 +Implementation: KOKKOS +Running kernels 100 times +Precision: double +Array size: 268.4 MB (=0.3 GB) +Total size: 805.3 MB (=0.8 GB) +Function MBytes/sec Min (sec) Max Average +Copy 263006.605 0.00204 0.00205 0.00204 +Mul 262996.298 0.00204 0.00205 0.00204 +Add 268536.157 0.00300 0.00301 0.00300 +Triad 268594.912 0.00300 0.00301 0.00300 diff --git a/results/v2.0/kokkos/K20X.txt b/results/v2.0/kokkos/K20X.txt new file mode 100644 index 0000000..afb6d00 --- /dev/null +++ b/results/v2.0/kokkos/K20X.txt @@ -0,0 +1,12 @@ +GPU-STREAM +Version: 2.0 +Implementation: KOKKOS +Running kernels 100 times +Precision: double +Array size: 268.4 MB (=0.3 GB) +Total size: 805.3 MB (=0.8 GB) +Function MBytes/sec Min (sec) Max Average +Copy 182239.282 0.00295 0.00299 0.00296 +Mul 182179.668 0.00295 0.00298 0.00296 +Add 182333.793 0.00442 0.00451 0.00443 +Triad 182213.531 0.00442 0.00444 0.00443 diff --git a/results/v2.0/kokkos/K40.txt b/results/v2.0/kokkos/K40.txt new file mode 100644 index 0000000..cc59201 --- /dev/null +++ b/results/v2.0/kokkos/K40.txt @@ -0,0 +1,12 @@ +GPU-STREAM +Version: 2.0 +Implementation: KOKKOS +Running kernels 100 times +Precision: double +Array size: 268.4 MB (=0.3 GB) +Total size: 805.3 MB (=0.8 GB) +Function MBytes/sec Min (sec) Max Average +Copy 192464.515 0.00279 0.00283 0.00281 +Mul 192505.853 0.00279 0.00283 0.00281 +Add 193303.390 0.00417 0.00419 0.00417 +Triad 193249.349 0.00417 0.00419 0.00417 diff --git a/results/v2.0/kokkos/K80.txt b/results/v2.0/kokkos/K80.txt new file mode 100644 index 0000000..cb10f57 --- /dev/null +++ b/results/v2.0/kokkos/K80.txt @@ -0,0 +1,12 @@ +GPU-STREAM +Version: 2.0 +Implementation: KOKKOS +Running kernels 100 times +Precision: double +Array size: 268.4 MB (=0.3 GB) +Total size: 805.3 MB (=0.8 GB) +Function MBytes/sec Min (sec) Max Average +Copy 181880.210 0.00295 0.00352 0.00301 +Mul 181906.524 0.00295 0.00353 0.00301 +Add 179304.662 0.00449 0.00532 0.00456 +Triad 179172.535 0.00449 0.00531 0.00456 diff --git a/results/v2.0/kokkos/broadwell.txt b/results/v2.0/kokkos/broadwell.txt new file mode 100644 index 0000000..dfab2a2 --- /dev/null +++ b/results/v2.0/kokkos/broadwell.txt @@ -0,0 +1,12 @@ +GPU-STREAM +Version: 2.0 +Implementation: KOKKOS +Running kernels 100 times +Precision: double +Array size: 268.4 MB (=0.3 GB) +Total size: 805.3 MB (=0.8 GB) +Function MBytes/sec Min (sec) Max Average +Copy 86500.072 0.00621 0.00627 0.00622 +Mul 84168.782 0.00638 0.00643 0.00640 +Add 94162.571 0.00855 0.00867 0.00858 +Triad 96282.261 0.00836 0.00843 0.00838 diff --git a/results/v2.0/kokkos/haswell.txt b/results/v2.0/kokkos/haswell.txt new file mode 100644 index 0000000..a109a1f --- /dev/null +++ b/results/v2.0/kokkos/haswell.txt @@ -0,0 +1,12 @@ +GPU-STREAM +Version: 2.0 +Implementation: KOKKOS +Running kernels 100 times +Precision: double +Array size: 268.4 MB (=0.3 GB) +Total size: 805.3 MB (=0.8 GB) +Function MBytes/sec Min (sec) Max Average +Copy 79740.484 0.00673 0.00688 0.00678 +Mul 77939.661 0.00689 0.00701 0.00694 +Add 87154.341 0.00924 0.00954 0.00935 +Triad 88503.861 0.00910 0.00945 0.00919 diff --git a/results/v2.0/kokkos/ivybridge.txt b/results/v2.0/kokkos/ivybridge.txt new file mode 100644 index 0000000..1686a3b --- /dev/null +++ b/results/v2.0/kokkos/ivybridge.txt @@ -0,0 +1,12 @@ +GPU-STREAM +Version: 2.0 +Implementation: KOKKOS +Running kernels 100 times +Precision: double +Array size: 268.4 MB (=0.3 GB) +Total size: 805.3 MB (=0.8 GB) +Function MBytes/sec Min (sec) Max Average +Copy 56720.246 0.00947 0.01173 0.00999 +Mul 55256.554 0.00972 0.03029 0.01052 +Add 62562.358 0.01287 0.03304 0.01384 +Triad 62965.518 0.01279 0.02534 0.01364 diff --git a/results/v2.0/kokkos/sandybridge.txt b/results/v2.0/kokkos/sandybridge.txt new file mode 100644 index 0000000..8bbfbad --- /dev/null +++ b/results/v2.0/kokkos/sandybridge.txt @@ -0,0 +1,12 @@ +GPU-STREAM +Version: 2.0 +Implementation: KOKKOS +Running kernels 100 times +Precision: double +Array size: 268.4 MB (=0.3 GB) +Total size: 805.3 MB (=0.8 GB) +Function MBytes/sec Min (sec) Max Average +Copy 51337.144 0.01046 0.01126 0.01063 +Mul 51613.662 0.01040 0.01464 0.01064 +Add 54473.120 0.01478 0.01544 0.01506 +Triad 54461.048 0.01479 0.01568 0.01523 diff --git a/results/v2.0/ocl-gnu/980ti.txt b/results/v2.0/ocl-gnu/980ti.txt new file mode 100644 index 0000000..ac8c288 --- /dev/null +++ b/results/v2.0/ocl-gnu/980ti.txt @@ -0,0 +1,14 @@ +GPU-STREAM +Version: 2.0 +Implementation: OpenCL +Running kernels 100 times +Precision: double +Array size: 268.4 MB (=0.3 GB) +Total size: 805.3 MB (=0.8 GB) +Using OpenCL device GeForce GTX 980 Ti +Driver: 352.21 +Function MBytes/sec Min (sec) Max Average +Copy 262864.310 0.00204 0.00206 0.00205 +Mul 262886.578 0.00204 0.00205 0.00205 +Add 268781.736 0.00300 0.00300 0.00300 +Triad 268727.204 0.00300 0.00301 0.00300 diff --git a/results/v2.0/ocl-gnu/IvyBridge.txt b/results/v2.0/ocl-gnu/IvyBridge.txt new file mode 100644 index 0000000..b9b71a8 --- /dev/null +++ b/results/v2.0/ocl-gnu/IvyBridge.txt @@ -0,0 +1,14 @@ +GPU-STREAM +Version: 2.0 +Implementation: OpenCL +Running kernels 100 times +Precision: double +Array size: 268.4 MB (=0.3 GB) +Total size: 805.3 MB (=0.8 GB) +Using OpenCL device Intel(R) Xeon(R) CPU E5-2697 v2 @ 2.70GHz +Driver: 1.2.0.92 +Function MBytes/sec Min (sec) Max Average +Copy 47971.490 0.01119 0.02317 0.01306 +Mul 46385.194 0.01157 0.02247 0.01341 +Add 53319.761 0.01510 0.02831 0.01769 +Triad 53374.243 0.01509 0.02794 0.01707 diff --git a/results/v2.0/ocl-gnu/K20X.txt b/results/v2.0/ocl-gnu/K20X.txt new file mode 100644 index 0000000..1110793 --- /dev/null +++ b/results/v2.0/ocl-gnu/K20X.txt @@ -0,0 +1,15 @@ +GPU-STREAM +Version: 2.0 +Implementation: OpenCL +Running kernels 100 times +Precision: double +Array size: 268.4 MB (=0.3 GB) +Total size: 805.3 MB (=0.8 GB) +Using OpenCL device Tesla K20X +Driver: 352.68 +Function MBytes/sec Min (sec) Max Average +Copy 182194.259 0.00295 0.00298 0.00296 +Mul 182081.736 0.00295 0.00296 0.00296 +Add 182723.055 0.00441 0.00443 0.00442 +Triad 182719.573 0.00441 0.00443 0.00442 +Application 1396458 resources: utime ~3s, stime ~1s, Rss ~1670780, inblocks ~2549, outblocks ~464 diff --git a/results/v2.0/ocl-gnu/K40.txt b/results/v2.0/ocl-gnu/K40.txt new file mode 100644 index 0000000..2788ac7 --- /dev/null +++ b/results/v2.0/ocl-gnu/K40.txt @@ -0,0 +1,14 @@ +GPU-STREAM +Version: 2.0 +Implementation: OpenCL +Running kernels 100 times +Precision: double +Array size: 268.4 MB (=0.3 GB) +Total size: 805.3 MB (=0.8 GB) +Using OpenCL device Tesla K40m +Driver: 352.79 +Function MBytes/sec Min (sec) Max Average +Copy 190876.305 0.00281 0.00285 0.00283 +Mul 190558.963 0.00282 0.00284 0.00283 +Add 191437.004 0.00421 0.00422 0.00421 +Triad 191420.077 0.00421 0.00423 0.00421 diff --git a/results/v2.0/ocl-gnu/K80.txt b/results/v2.0/ocl-gnu/K80.txt new file mode 100644 index 0000000..8066b24 --- /dev/null +++ b/results/v2.0/ocl-gnu/K80.txt @@ -0,0 +1,14 @@ +GPU-STREAM +Version: 2.0 +Implementation: OpenCL +Running kernels 100 times +Precision: double +Array size: 268.4 MB (=0.3 GB) +Total size: 805.3 MB (=0.8 GB) +Using OpenCL device Tesla K80 +Driver: 352.79 +Function MBytes/sec Min (sec) Max Average +Copy 181392.835 0.00296 0.00357 0.00303 +Mul 181350.127 0.00296 0.00356 0.00303 +Add 181786.662 0.00443 0.00532 0.00453 +Triad 181670.318 0.00443 0.00533 0.00454 diff --git a/results/v2.0/ocl-gnu/S9150.txt b/results/v2.0/ocl-gnu/S9150.txt new file mode 100644 index 0000000..1f619a7 --- /dev/null +++ b/results/v2.0/ocl-gnu/S9150.txt @@ -0,0 +1,14 @@ +GPU-STREAM +Version: 2.0 +Implementation: OpenCL +Running kernels 100 times +Precision: double +Array size: 268.4 MB (=0.3 GB) +Total size: 805.3 MB (=0.8 GB) +Using OpenCL device Hawaii +Driver: 1912.5 (VM) +Function MBytes/sec Min (sec) Max Average +Copy 259805.874 0.00207 0.00214 0.00211 +Mul 260956.698 0.00206 0.00214 0.00211 +Add 268425.077 0.00300 0.00306 0.00304 +Triad 267506.939 0.00301 0.00307 0.00305 diff --git a/results/v2.0/ocl-gnu/SandyBridge.txt b/results/v2.0/ocl-gnu/SandyBridge.txt new file mode 100644 index 0000000..4c2428b --- /dev/null +++ b/results/v2.0/ocl-gnu/SandyBridge.txt @@ -0,0 +1,14 @@ +GPU-STREAM +Version: 2.0 +Implementation: OpenCL +Running kernels 100 times +Precision: double +Array size: 268.4 MB (=0.3 GB) +Total size: 805.3 MB (=0.8 GB) +Using OpenCL device Intel(R) Xeon(R) CPU E5-2670 0 @ 2.60GHz +Driver: 1.2.0.8 +Function MBytes/sec Min (sec) Max Average +Copy 27240.466 0.01971 0.02092 0.02015 +Mul 26733.421 0.02008 0.02266 0.02043 +Add 29405.722 0.02739 0.03030 0.02808 +Triad 29734.601 0.02708 0.02990 0.02783 diff --git a/results/v2.0/ocl-gnu/broadwell.txt b/results/v2.0/ocl-gnu/broadwell.txt new file mode 100644 index 0000000..a620f0a --- /dev/null +++ b/results/v2.0/ocl-gnu/broadwell.txt @@ -0,0 +1,15 @@ +GPU-STREAM +Version: 2.0 +Implementation: OpenCL +Running kernels 100 times +Precision: double +Array size: 268.4 MB (=0.3 GB) +Total size: 805.3 MB (=0.8 GB) +Using OpenCL device Intel(R) Xeon(R) CPU E5-2699 v4 @ 2.20GHz +Driver: 1.2.0.57 +Function MBytes/sec Min (sec) Max Average +Copy 43567.760 0.01232 0.01257 0.01247 +Mul 42995.296 0.01249 0.01257 0.01253 +Add 48537.031 0.01659 0.01672 0.01664 +Triad 49238.925 0.01636 0.01652 0.01645 +Application 1407386 resources: utime ~130s, stime ~1s, Rss ~1647432, inblocks ~817, outblocks ~464 diff --git a/results/v2.0/ocl-gnu/fury.txt b/results/v2.0/ocl-gnu/fury.txt new file mode 100644 index 0000000..9e75ad5 --- /dev/null +++ b/results/v2.0/ocl-gnu/fury.txt @@ -0,0 +1,14 @@ +GPU-STREAM +Version: 2.0 +Implementation: OpenCL +Running kernels 100 times +Precision: double +Array size: 268.4 MB (=0.3 GB) +Total size: 805.3 MB (=0.8 GB) +Using OpenCL device Fiji +Driver: 1912.5 (VM) +Function MBytes/sec Min (sec) Max Average +Copy 429587.115 0.00125 0.00141 0.00128 +Mul 429295.476 0.00125 0.00135 0.00128 +Add 442443.451 0.00182 0.00192 0.00186 +Triad 442069.177 0.00182 0.00194 0.00186 diff --git a/results/v2.0/ocl-gnu/haswell.txt b/results/v2.0/ocl-gnu/haswell.txt new file mode 100644 index 0000000..1ad91fb --- /dev/null +++ b/results/v2.0/ocl-gnu/haswell.txt @@ -0,0 +1,15 @@ +GPU-STREAM +Version: 2.0 +Implementation: OpenCL +Running kernels 100 times +Precision: double +Array size: 268.4 MB (=0.3 GB) +Total size: 805.3 MB (=0.8 GB) +Using OpenCL device Intel(R) Xeon(R) CPU E5-2698 v3 @ 2.30GHz +Driver: 1.2.0.57 +Function MBytes/sec Min (sec) Max Average +Copy 39316.996 0.01365 0.01413 0.01388 +Mul 39127.564 0.01372 0.01418 0.01392 +Add 43768.720 0.01840 0.01955 0.01871 +Triad 44121.647 0.01825 0.01892 0.01847 +Application 1407392 resources: utime ~106s, stime ~1s, Rss ~1642860, inblocks ~459, outblocks ~464 diff --git a/results/v2.0/ocl-gnu/knl.txt b/results/v2.0/ocl-gnu/knl.txt new file mode 100644 index 0000000..cf3e5a9 --- /dev/null +++ b/results/v2.0/ocl-gnu/knl.txt @@ -0,0 +1,14 @@ +GPU-STREAM +Version: 2.0 +Implementation: OpenCL +Running kernels 100 times +Precision: double +Array size: 268.4 MB (=0.3 GB) +Total size: 805.3 MB (=0.8 GB) +Using OpenCL device Intel(R) Xeon Phi(TM) CPU 7210 @ 1.30GHz +Driver: 1.2.0.10002 +Function MBytes/sec Min (sec) Max Average +Copy 218490.851 0.00246 0.00431 0.00257 +Mul 216827.480 0.00248 0.00276 0.00258 +Add 233472.011 0.00345 0.00365 0.00354 +Triad 236852.515 0.00340 0.00365 0.00351 diff --git a/results/v2.0/omp3-cray/broadwell.txt b/results/v2.0/omp3-cray/broadwell.txt new file mode 100644 index 0000000..2c31b89 --- /dev/null +++ b/results/v2.0/omp3-cray/broadwell.txt @@ -0,0 +1,13 @@ +GPU-STREAM +Version: 2.0 +Implementation: Reference OpenMP +Running kernels 100 times +Precision: double +Array size: 268.4 MB (=0.3 GB) +Total size: 805.3 MB (=0.8 GB) +Function MBytes/sec Min (sec) Max Average +Copy 88171.606 0.00609 0.00614 0.00611 +Mul 85751.031 0.00626 0.00632 0.00628 +Add 96048.112 0.00838 0.00845 0.00842 +Triad 98169.628 0.00820 0.00827 0.00824 +Application 1396470 resources: utime ~133s, stime ~3s, Rss ~1576044, inblocks ~6345, outblocks ~16023 diff --git a/results/v2.0/omp3-cray/haswell.txt b/results/v2.0/omp3-cray/haswell.txt new file mode 100644 index 0000000..2849610 --- /dev/null +++ b/results/v2.0/omp3-cray/haswell.txt @@ -0,0 +1,13 @@ +GPU-STREAM +Version: 2.0 +Implementation: Reference OpenMP +Running kernels 100 times +Precision: double +Array size: 268.4 MB (=0.3 GB) +Total size: 805.3 MB (=0.8 GB) +Function MBytes/sec Min (sec) Max Average +Copy 81012.503 0.00663 0.00668 0.00665 +Mul 79344.699 0.00677 0.00683 0.00678 +Add 89615.218 0.00899 0.00907 0.00901 +Triad 90999.378 0.00885 0.00893 0.00887 +Application 1396725 resources: utime ~104s, stime ~2s, Rss ~1578772, inblocks ~544, outblocks ~213 diff --git a/results/v2.0/omp3-intel/IvyBridge.txt b/results/v2.0/omp3-intel/IvyBridge.txt new file mode 100644 index 0000000..67c112e --- /dev/null +++ b/results/v2.0/omp3-intel/IvyBridge.txt @@ -0,0 +1,12 @@ +GPU-STREAM +Version: 2.0 +Implementation: Reference OpenMP +Running kernels 100 times +Precision: double +Array size: 268.4 MB (=0.3 GB) +Total size: 805.3 MB (=0.8 GB) +Function MBytes/sec Min (sec) Max Average +Copy 56676.385 0.00947 0.01567 0.01040 +Mul 55505.151 0.00967 0.01513 0.01075 +Add 61874.931 0.01302 0.01930 0.01435 +Triad 62073.488 0.01297 0.01899 0.01420 diff --git a/results/v2.0/omp3-intel/SandyBridge.txt b/results/v2.0/omp3-intel/SandyBridge.txt new file mode 100644 index 0000000..fd4b19b --- /dev/null +++ b/results/v2.0/omp3-intel/SandyBridge.txt @@ -0,0 +1,12 @@ +GPU-STREAM +Version: 2.0 +Implementation: Reference OpenMP +Running kernels 100 times +Precision: double +Array size: 268.4 MB (=0.3 GB) +Total size: 805.3 MB (=0.8 GB) +Function MBytes/sec Min (sec) Max Average +Copy 54991.394 0.00976 0.01134 0.01002 +Mul 48003.655 0.01118 0.01986 0.01146 +Add 52111.180 0.01545 0.01621 0.01575 +Triad 52985.444 0.01520 0.01608 0.01557 diff --git a/results/v2.0/omp40-cray/K40.txt b/results/v2.0/omp40-cray/K40.txt new file mode 100644 index 0000000..31f1b33 --- /dev/null +++ b/results/v2.0/omp40-cray/K40.txt @@ -0,0 +1,12 @@ +GPU-STREAM +Version: 2.0 +Implementation: OpenMP 4.0 +Running kernels 100 times +Precision: double +Array size: 268.4 MB (=0.3 GB) +Total size: 805.3 MB (=0.8 GB) +Function MBytes/sec Min (sec) Max Average +Copy 183090.545 0.00293 0.00295 0.00294 +Mul 182133.500 0.00295 0.00298 0.00295 +Add 180897.478 0.00445 0.00447 0.00446 +Triad 180637.056 0.00446 0.00447 0.00446 diff --git a/results/v2.0/omp40-cray/K80.txt b/results/v2.0/omp40-cray/K80.txt new file mode 100644 index 0000000..c69f76e --- /dev/null +++ b/results/v2.0/omp40-cray/K80.txt @@ -0,0 +1,12 @@ +GPU-STREAM +Version: 2.0 +Implementation: OpenMP 4.0 +Running kernels 100 times +Precision: double +Array size: 268.4 MB (=0.3 GB) +Total size: 805.3 MB (=0.8 GB) +Function MBytes/sec Min (sec) Max Average +Copy 169214.022 0.00317 0.00437 0.00323 +Mul 168803.444 0.00318 0.00435 0.00323 +Add 167171.006 0.00482 0.00571 0.00486 +Triad 166943.598 0.00482 0.00710 0.00489 diff --git a/results/v2.0/omp45-clang/980ti.txt b/results/v2.0/omp45-clang/980ti.txt new file mode 100644 index 0000000..621a33d --- /dev/null +++ b/results/v2.0/omp45-clang/980ti.txt @@ -0,0 +1,12 @@ +GPU-STREAM +Version: 2.0 +Implementation: OpenMP 4.5 +Running kernels 100 times +Precision: double +Array size: 268.4 MB (=0.3 GB) +Total size: 805.3 MB (=0.8 GB) +Function MBytes/sec Min (sec) Max Average +Copy 232637.036 0.00231 0.00715 0.00238 +Mul 227777.705 0.00236 0.00259 0.00247 +Add 246187.342 0.00327 0.00343 0.00332 +Triad 239670.377 0.00336 0.00362 0.00347 diff --git a/results/v2.0/omp45-cray/K20X.txt b/results/v2.0/omp45-cray/K20X.txt new file mode 100644 index 0000000..d640a14 --- /dev/null +++ b/results/v2.0/omp45-cray/K20X.txt @@ -0,0 +1,13 @@ +GPU-STREAM +Version: 2.0 +Implementation: OpenMP 4.5 +Running kernels 100 times +Precision: double +Array size: 268.4 MB (=0.3 GB) +Total size: 805.3 MB (=0.8 GB) +Function MBytes/sec Min (sec) Max Average +Copy 174380.925 0.00308 0.00310 0.00309 +Mul 174416.162 0.00308 0.00310 0.00309 +Add 175158.103 0.00460 0.00462 0.00461 +Triad 175104.249 0.00460 0.00462 0.00461 +Application 1396463 resources: utime ~2s, stime ~1s, Rss ~876708, inblocks ~600, outblocks ~327 diff --git a/results/v2.0/original-cray/broadwell.txt b/results/v2.0/original-cray/broadwell.txt new file mode 100644 index 0000000..5da860e --- /dev/null +++ b/results/v2.0/original-cray/broadwell.txt @@ -0,0 +1,34 @@ +------------------------------------------------------------- +STREAM version $Revision: 5.10 $ +------------------------------------------------------------- +This system uses 8 bytes per array element. +------------------------------------------------------------- +Array size = 33554432 (elements), Offset = 0 (elements) +Memory per array = 256.0 MiB (= 0.2 GiB). +Total memory required = 768.0 MiB (= 0.8 GiB). +Each kernel will be executed 100 times. + The *best* time for each kernel (excluding the first iteration) + will be used to compute the reported bandwidth. +------------------------------------------------------------- +Number of Threads requested = 44 +Number of Threads counted = 44 +------------------------------------------------------------- +Your clock granularity/precision appears to be 1 microseconds. +Each test below will take on the order of 4125 microseconds. + (= 4125 clock ticks) +Increase the size of the arrays if this shows that +you are not getting at least 20 clock ticks per test. +------------------------------------------------------------- +WARNING -- The above is only a rough guideline. +For best results, please be sure you know the +precision of your system timer. +------------------------------------------------------------- +Function Best Rate MB/s Avg time Min time Max time +Copy: 92980.4 0.005803 0.005774 0.005843 +Scale: 97951.2 0.005527 0.005481 0.005586 +Add: 123058.1 0.006592 0.006544 0.006655 +Triad: 124799.5 0.006492 0.006453 0.006544 +------------------------------------------------------------- +Solution Validates: avg error less than 1.000000e-13 on all three arrays +------------------------------------------------------------- +Application 1396471 resources: utime ~111s, stime ~3s, Rss ~788804, inblocks ~5778, outblocks ~14259 diff --git a/results/v2.0/original-cray/haswell.txt b/results/v2.0/original-cray/haswell.txt new file mode 100644 index 0000000..c80746c --- /dev/null +++ b/results/v2.0/original-cray/haswell.txt @@ -0,0 +1,34 @@ +------------------------------------------------------------- +STREAM version $Revision: 5.10 $ +------------------------------------------------------------- +This system uses 8 bytes per array element. +------------------------------------------------------------- +Array size = 33554432 (elements), Offset = 0 (elements) +Memory per array = 256.0 MiB (= 0.2 GiB). +Total memory required = 768.0 MiB (= 0.8 GiB). +Each kernel will be executed 100 times. + The *best* time for each kernel (excluding the first iteration) + will be used to compute the reported bandwidth. +------------------------------------------------------------- +Number of Threads requested = 32 +Number of Threads counted = 32 +------------------------------------------------------------- +Your clock granularity/precision appears to be 1 microseconds. +Each test below will take on the order of 4685 microseconds. + (= 4685 clock ticks) +Increase the size of the arrays if this shows that +you are not getting at least 20 clock ticks per test. +------------------------------------------------------------- +WARNING -- The above is only a rough guideline. +For best results, please be sure you know the +precision of your system timer. +------------------------------------------------------------- +Function Best Rate MB/s Avg time Min time Max time +Copy: 84400.3 0.006386 0.006361 0.006415 +Scale: 99272.6 0.005457 0.005408 0.005546 +Add: 118080.7 0.006854 0.006820 0.006892 +Triad: 116271.9 0.006969 0.006926 0.007042 +------------------------------------------------------------- +Solution Validates: avg error less than 1.000000e-13 on all three arrays +------------------------------------------------------------- +Application 1396734 resources: utime ~84s, stime ~2s, Rss ~791400, inblocks ~490, outblocks ~54 diff --git a/results/v2.0/original-icc/ivybridge.txt b/results/v2.0/original-icc/ivybridge.txt new file mode 100644 index 0000000..fa96905 --- /dev/null +++ b/results/v2.0/original-icc/ivybridge.txt @@ -0,0 +1,33 @@ +------------------------------------------------------------- +STREAM version $Revision: 5.10 $ +------------------------------------------------------------- +This system uses 8 bytes per array element. +------------------------------------------------------------- +Array size = 33554432 (elements), Offset = 0 (elements) +Memory per array = 256.0 MiB (= 0.2 GiB). +Total memory required = 768.0 MiB (= 0.8 GiB). +Each kernel will be executed 100 times. + The *best* time for each kernel (excluding the first iteration) + will be used to compute the reported bandwidth. +------------------------------------------------------------- +Number of Threads requested = 24 +Number of Threads counted = 24 +------------------------------------------------------------- +Your clock granularity/precision appears to be 2 microseconds. +Each test below will take on the order of 7369 microseconds. + (= 3684 clock ticks) +Increase the size of the arrays if this shows that +you are not getting at least 20 clock ticks per test. +------------------------------------------------------------- +WARNING -- The above is only a rough guideline. +For best results, please be sure you know the +precision of your system timer. +------------------------------------------------------------- +Function Best Rate MB/s Avg time Min time Max time +Copy: 57739.9 0.009647 0.009298 0.010455 +Scale: 74390.5 0.008626 0.007217 0.010923 +Add: 83859.7 0.010991 0.009603 0.013830 +Triad: 82738.1 0.011216 0.009733 0.015579 +------------------------------------------------------------- +Solution Validates: avg error less than 1.000000e-13 on all three arrays +------------------------------------------------------------- diff --git a/results/v2.0/original-icc/sandybridge.txt b/results/v2.0/original-icc/sandybridge.txt new file mode 100644 index 0000000..aefff1e --- /dev/null +++ b/results/v2.0/original-icc/sandybridge.txt @@ -0,0 +1,33 @@ +------------------------------------------------------------- +STREAM version $Revision: 5.10 $ +------------------------------------------------------------- +This system uses 8 bytes per array element. +------------------------------------------------------------- +Array size = 33554432 (elements), Offset = 0 (elements) +Memory per array = 256.0 MiB (= 0.2 GiB). +Total memory required = 768.0 MiB (= 0.8 GiB). +Each kernel will be executed 100 times. + The *best* time for each kernel (excluding the first iteration) + will be used to compute the reported bandwidth. +------------------------------------------------------------- +Number of Threads requested = 16 +Number of Threads counted = 16 +------------------------------------------------------------- +Your clock granularity/precision appears to be 1 microseconds. +Each test below will take on the order of 6841 microseconds. + (= 6841 clock ticks) +Increase the size of the arrays if this shows that +you are not getting at least 20 clock ticks per test. +------------------------------------------------------------- +WARNING -- The above is only a rough guideline. +For best results, please be sure you know the +precision of your system timer. +------------------------------------------------------------- +Function Best Rate MB/s Avg time Min time Max time +Copy: 57290.4 0.009506 0.009371 0.009757 +Scale: 66890.4 0.008211 0.008026 0.008573 +Add: 65196.5 0.012791 0.012352 0.023453 +Triad: 64351.8 0.012753 0.012514 0.013659 +------------------------------------------------------------- +Solution Validates: avg error less than 1.000000e-13 on all three arrays +------------------------------------------------------------- diff --git a/results/v2.0/raja/980ti.txt b/results/v2.0/raja/980ti.txt new file mode 100644 index 0000000..96c311d --- /dev/null +++ b/results/v2.0/raja/980ti.txt @@ -0,0 +1,12 @@ +GPU-STREAM +Version: 2.0 +Implementation: RAJA +Running kernels 100 times +Precision: double +Array size: 268.4 MB (=0.3 GB) +Total size: 805.3 MB (=0.8 GB) +Function MBytes/sec Min (sec) Max Average +Copy 262849.638 0.00204 0.00205 0.00204 +Mul 262842.175 0.00204 0.00205 0.00205 +Add 268802.910 0.00300 0.00300 0.00300 +Triad 268830.368 0.00300 0.00300 0.00300 diff --git a/results/v2.0/raja/K20X.txt b/results/v2.0/raja/K20X.txt new file mode 100644 index 0000000..0ab977c --- /dev/null +++ b/results/v2.0/raja/K20X.txt @@ -0,0 +1,12 @@ +GPU-STREAM +Version: 2.0 +Implementation: RAJA +Running kernels 100 times +Precision: double +Array size: 268.4 MB (=0.3 GB) +Total size: 805.3 MB (=0.8 GB) +Function MBytes/sec Min (sec) Max Average +Copy 182375.663 0.00294 0.00296 0.00295 +Mul 182515.909 0.00294 0.00296 0.00295 +Add 181695.649 0.00443 0.00445 0.00444 +Triad 181436.686 0.00444 0.00445 0.00445 diff --git a/results/v2.0/raja/K40.txt b/results/v2.0/raja/K40.txt new file mode 100644 index 0000000..dbe3173 --- /dev/null +++ b/results/v2.0/raja/K40.txt @@ -0,0 +1,12 @@ +GPU-STREAM +Version: 2.0 +Implementation: RAJA +Running kernels 100 times +Precision: double +Array size: 268.4 MB (=0.3 GB) +Total size: 805.3 MB (=0.8 GB) +Function MBytes/sec Min (sec) Max Average +Copy 191811.130 0.00280 0.00282 0.00281 +Mul 191720.029 0.00280 0.00282 0.00281 +Add 192768.490 0.00418 0.00419 0.00418 +Triad 192718.253 0.00418 0.00419 0.00418 diff --git a/results/v2.0/raja/K80.txt b/results/v2.0/raja/K80.txt new file mode 100644 index 0000000..7ef9889 --- /dev/null +++ b/results/v2.0/raja/K80.txt @@ -0,0 +1,12 @@ +GPU-STREAM +Version: 2.0 +Implementation: RAJA +Running kernels 100 times +Precision: double +Array size: 268.4 MB (=0.3 GB) +Total size: 805.3 MB (=0.8 GB) +Function MBytes/sec Min (sec) Max Average +Copy 182711.634 0.00294 0.00354 0.00302 +Mul 182628.907 0.00294 0.00354 0.00302 +Add 178679.382 0.00451 0.00533 0.00462 +Triad 178467.177 0.00451 0.00534 0.00462 diff --git a/results/v2.0/raja/broadwell.txt b/results/v2.0/raja/broadwell.txt new file mode 100644 index 0000000..4fc8a7b --- /dev/null +++ b/results/v2.0/raja/broadwell.txt @@ -0,0 +1,12 @@ +GPU-STREAM +Version: 2.0 +Implementation: RAJA +Running kernels 100 times +Precision: double +Array size: 268.4 MB (=0.3 GB) +Total size: 805.3 MB (=0.8 GB) +Function MBytes/sec Min (sec) Max Average +Copy 88094.593 0.00609 0.00619 0.00615 +Mul 85183.815 0.00630 0.00643 0.00633 +Add 95834.438 0.00840 0.00850 0.00846 +Triad 97943.551 0.00822 0.00836 0.00827 diff --git a/results/v2.0/raja/haswell.txt b/results/v2.0/raja/haswell.txt new file mode 100644 index 0000000..0e2be49 --- /dev/null +++ b/results/v2.0/raja/haswell.txt @@ -0,0 +1,12 @@ +GPU-STREAM +Version: 2.0 +Implementation: RAJA +Running kernels 100 times +Precision: double +Array size: 268.4 MB (=0.3 GB) +Total size: 805.3 MB (=0.8 GB) +Function MBytes/sec Min (sec) Max Average +Copy 80888.406 0.00664 0.00671 0.00667 +Mul 79024.295 0.00679 0.00686 0.00682 +Add 89360.767 0.00901 0.00910 0.00905 +Triad 90744.543 0.00887 0.00893 0.00890 diff --git a/results/v2.0/raja/ivybridge.txt b/results/v2.0/raja/ivybridge.txt new file mode 100644 index 0000000..f22349b --- /dev/null +++ b/results/v2.0/raja/ivybridge.txt @@ -0,0 +1,12 @@ +GPU-STREAM +Version: 2.0 +Implementation: RAJA +Running kernels 100 times +Precision: double +Array size: 268.4 MB (=0.3 GB) +Total size: 805.3 MB (=0.8 GB) +Function MBytes/sec Min (sec) Max Average +Copy 55500.859 0.00967 0.01992 0.01139 +Mul 55232.718 0.00972 0.01978 0.01151 +Add 62568.702 0.01287 0.02616 0.01523 +Triad 64105.913 0.01256 0.02570 0.01497 diff --git a/results/v2.0/raja/sandybridge.txt b/results/v2.0/raja/sandybridge.txt new file mode 100644 index 0000000..e14d419 --- /dev/null +++ b/results/v2.0/raja/sandybridge.txt @@ -0,0 +1,12 @@ +GPU-STREAM +Version: 2.0 +Implementation: RAJA +Running kernels 100 times +Precision: double +Array size: 268.4 MB (=0.3 GB) +Total size: 805.3 MB (=0.8 GB) +Function MBytes/sec Min (sec) Max Average +Copy 51576.355 0.01041 0.01123 0.01064 +Mul 50943.953 0.01054 0.01359 0.01074 +Add 53535.927 0.01504 0.01598 0.01535 +Triad 53928.576 0.01493 0.01579 0.01534 diff --git a/results/v2.0/sycl/IvyBridge.txt b/results/v2.0/sycl/IvyBridge.txt new file mode 100644 index 0000000..70285d5 --- /dev/null +++ b/results/v2.0/sycl/IvyBridge.txt @@ -0,0 +1,14 @@ +GPU-STREAM +Version: 2.0 +Implementation: SYCL +Running kernels 100 times +Precision: double +Array size: 268.4 MB (=0.3 GB) +Total size: 805.3 MB (=0.8 GB) +Using SYCL device Intel(R) Xeon(R) CPU E5-2697 v2 @ 2.70GHz +Driver: 1.2.0.92 +Function MBytes/sec Min (sec) Max Average +Copy 39756.124 0.01350 0.02534 0.01567 +Mul 38899.994 0.01380 0.02237 0.01554 +Add 46878.810 0.01718 0.02802 0.01919 +Triad 51324.819 0.01569 0.02555 0.01748 diff --git a/results/v2.0/sycl/S9150.txt b/results/v2.0/sycl/S9150.txt new file mode 100644 index 0000000..9b342e0 --- /dev/null +++ b/results/v2.0/sycl/S9150.txt @@ -0,0 +1,14 @@ +GPU-STREAM +Version: 2.0 +Implementation: SYCL +Running kernels 100 times +Precision: double +Array size: 268.4 MB (=0.3 GB) +Total size: 805.3 MB (=0.8 GB) +Using SYCL device Hawaii +Driver: 1912.5 (VM) +Function MBytes/sec Min (sec) Max Average +Copy 222466.991 0.00241 0.00251 0.00245 +Mul 224827.470 0.00239 0.00247 0.00244 +Add 271092.068 0.00297 0.00305 0.00301 +Triad 269725.824 0.00299 0.00304 0.00302 diff --git a/results/v2.0/sycl/fury.txt b/results/v2.0/sycl/fury.txt new file mode 100644 index 0000000..2d1831b --- /dev/null +++ b/results/v2.0/sycl/fury.txt @@ -0,0 +1,14 @@ +GPU-STREAM +Version: 2.0 +Implementation: SYCL +Running kernels 100 times +Precision: double +Array size: 268.4 MB (=0.3 GB) +Total size: 805.3 MB (=0.8 GB) +Using SYCL device Fiji +Driver: 1912.5 (VM) +Function MBytes/sec Min (sec) Max Average +Copy 419830.223 0.00128 0.00141 0.00130 +Mul 419811.510 0.00128 0.00136 0.00130 +Add 432957.387 0.00186 0.00193 0.00188 +Triad 430761.906 0.00187 0.00194 0.00189 diff --git a/results/v2.0/sycl/knl.txt b/results/v2.0/sycl/knl.txt new file mode 100644 index 0000000..ccb699b --- /dev/null +++ b/results/v2.0/sycl/knl.txt @@ -0,0 +1,14 @@ +GPU-STREAM +Version: 2.0 +Implementation: SYCL +Running kernels 100 times +Precision: double +Array size: 268.4 MB (=0.3 GB) +Total size: 805.3 MB (=0.8 GB) +Using SYCL device Intel(R) Xeon Phi(TM) CPU 7210 @ 1.30GHz +Driver: 1.2.0.10002 +Function MBytes/sec Min (sec) Max Average +Copy 188284.193 0.00285 0.00447 0.00298 +Mul 185567.824 0.00289 0.00762 0.00307 +Add 207104.230 0.00389 0.00614 0.00404 +Triad 207078.189 0.00389 0.01483 0.00415 diff --git a/results/v2.0/xl-power8/kokkos.txt b/results/v2.0/xl-power8/kokkos.txt new file mode 100644 index 0000000..96e368c --- /dev/null +++ b/results/v2.0/xl-power8/kokkos.txt @@ -0,0 +1,12 @@ +GPU-STREAM +Version: 2.0 +Implementation: KOKKOS +Running kernels 100 times +Precision: double +Array size: 268.4 MB (=0.3 GB) +Total size: 805.3 MB (=0.8 GB) +Function MBytes/sec Min (sec) Max Average +Copy 260801.154 0.00206 0.00955 0.00234 +Mul 231963.990 0.00231 0.01149 0.00264 +Add 292167.544 0.00276 0.01150 0.00309 +Triad 298266.810 0.00270 0.01533 0.00316 diff --git a/results/v2.0/xl-power8/mccalpin.txt b/results/v2.0/xl-power8/mccalpin.txt new file mode 100644 index 0000000..7623294 --- /dev/null +++ b/results/v2.0/xl-power8/mccalpin.txt @@ -0,0 +1,33 @@ +------------------------------------------------------------- +STREAM version $Revision: 5.10 $ +------------------------------------------------------------- +This system uses 8 bytes per array element. +------------------------------------------------------------- +Array size = 33554432 (elements), Offset = 0 (elements) +Memory per array = 256.0 MiB (= 0.2 GiB). +Total memory required = 768.0 MiB (= 0.8 GiB). +Each kernel will be executed 100 times. + The *best* time for each kernel (excluding the first iteration) + will be used to compute the reported bandwidth. +------------------------------------------------------------- +Number of Threads requested = 20 +Number of Threads counted = 20 +------------------------------------------------------------- +Your clock granularity/precision appears to be 1 microseconds. +Each test below will take on the order of 2250 microseconds. + (= 2250 clock ticks) +Increase the size of the arrays if this shows that +you are not getting at least 20 clock ticks per test. +------------------------------------------------------------- +WARNING -- The above is only a rough guideline. +For best results, please be sure you know the +precision of your system timer. +------------------------------------------------------------- +Function Best Rate MB/s Avg time Min time Max time +Copy: 261745.9 0.002101 0.002051 0.002517 +Scale: 253352.8 0.002188 0.002119 0.003140 +Add: 239468.3 0.003499 0.003363 0.004400 +Triad: 245151.7 0.003468 0.003285 0.004771 +------------------------------------------------------------- +Solution Validates: avg error less than 1.000000e-13 on all three arrays +------------------------------------------------------------- diff --git a/results/v2.0/xl-power8/omp3.txt b/results/v2.0/xl-power8/omp3.txt new file mode 100644 index 0000000..aa4a591 --- /dev/null +++ b/results/v2.0/xl-power8/omp3.txt @@ -0,0 +1,12 @@ +GPU-STREAM +Version: 2.0 +Implementation: Reference OpenMP +Running kernels 100 times +Precision: double +Array size: 268.4 MB (=0.3 GB) +Total size: 805.3 MB (=0.8 GB) +Function MBytes/sec Min (sec) Max Average +Copy 223266.147 0.00240 0.00271 0.00248 +Mul 196756.047 0.00273 0.00372 0.00297 +Add 210090.244 0.00383 0.00441 0.00396 +Triad 212958.097 0.00378 0.00500 0.00409 diff --git a/results/v2.0/xl-power8/raja.txt b/results/v2.0/xl-power8/raja.txt new file mode 100644 index 0000000..3cb5efd --- /dev/null +++ b/results/v2.0/xl-power8/raja.txt @@ -0,0 +1,12 @@ +GPU-STREAM +Version: 2.0 +Implementation: RAJA +Running kernels 100 times +Precision: double +Array size: 268.4 MB (=0.3 GB) +Total size: 805.3 MB (=0.8 GB) +Function MBytes/sec Min (sec) Max Average +Copy 257796.702 0.00208 0.01017 0.00423 +Mul 218399.746 0.00246 0.01163 0.00440 +Add 269553.023 0.00299 0.01575 0.00620 +Triad 279022.596 0.00289 0.01569 0.00614