From 73f122089d96485e351da02e20550d8c4ba55861 Mon Sep 17 00:00:00 2001
From: Tom Deakin <tom.deakin@bristol.ac.uk>
Date: Wed, 7 Feb 2018 15:05:06 +0000
Subject: [PATCH] [OpenMP 4.5] Remove superfluous map clauses

Fixes #41.
OpenMP 4.5 does not require the map clauses on the target region
if the data has been previously defined using unstructured data
enter/exit clauses. Removing these clauses works fine with the
Clang compiler; however, we noticed issues with the Cray compiler.
The issue is that the Cray compiler does not block the target
region "kernel calls" and so the timing is incorrect. This was
not noticed before due to the presence of the map clauses.
For now, we have had to add an update from clause of a scalar
value to ensure that the kernel blocks. It is hoped that we can
remove this in due course. But in the vein of showing how the
models work we want to keep the minimum required correct code
(which is without the map clause) but need the code to also
work correctly.
---
 OMPStream.cpp | 35 ++++++++++++++++++++++++++++++-----
 1 file changed, 30 insertions(+), 5 deletions(-)
diff --git a/OMPStream.cpp b/OMPStream.cpp
index c72ed56..e10c8c1 100644
--- a/OMPStream.cpp
+++ b/OMPStream.cpp
@@ -58,7 +58,7 @@ void OMPStream<T>::init_arrays(T initA, T initB, T initC)
   T *a = this->a;
   T *b = this->b;
   T *c = this->c;
-  #pragma omp target teams distribute parallel for simd map(to: a[0:array_size], b[0:array_size], c[0:array_size])
+  #pragma omp target teams distribute parallel for simd
 #else
   #pragma omp parallel for
 #endif
@@ -68,6 +68,11 @@ void OMPStream<T>::init_arrays(T initA, T initB, T initC)
     b[i] = initB;
     c[i] = initC;
   }
+  #if defined(OMP_TARGET_GPU) && defined(_CRAYC)
+  // If using the Cray compiler, the kernels do not block, so this update forces
+  // a small copy to ensure blocking so that timing is correct
+  #pragma omp target update from(a[0:0])
+  #endif
 }
 
 template <class T>
@@ -97,7 +102,7 @@ void OMPStream<T>::copy()
   unsigned int array_size = this->array_size;
   T *a = this->a;
   T *c = this->c;
-  #pragma omp target teams distribute parallel for simd map(to: a[0:array_size], c[0:array_size])
+  #pragma omp target teams distribute parallel for simd
 #else
   #pragma omp parallel for
 #endif
@@ -105,6 +110,11 @@ void OMPStream<T>::copy()
   {
     c[i] = a[i];
   }
+  #if defined(OMP_TARGET_GPU) && defined(_CRAYC)
+  // If using the Cray compiler, the kernels do not block, so this update forces
+  // a small copy to ensure blocking so that timing is correct
+  #pragma omp target update from(a[0:0])
+  #endif
 }
 
 template <class T>
@@ -116,7 +126,7 @@ void OMPStream<T>::mul()
   unsigned int array_size = this->array_size;
   T *b = this->b;
   T *c = this->c;
-  #pragma omp target teams distribute parallel for simd map(to: b[0:array_size], c[0:array_size])
+  #pragma omp target teams distribute parallel for simd
 #else
   #pragma omp parallel for
 #endif
@@ -124,6 +134,11 @@ void OMPStream<T>::mul()
   {
     b[i] = scalar * c[i];
   }
+  #if defined(OMP_TARGET_GPU) && defined(_CRAYC)
+  // If using the Cray compiler, the kernels do not block, so this update forces
+  // a small copy to ensure blocking so that timing is correct
+  #pragma omp target update from(c[0:0])
+  #endif
 }
 
 template <class T>
@@ -134,7 +149,7 @@ void OMPStream<T>::add()
   T *a = this->a;
   T *b = this->b;
   T *c = this->c;
-  #pragma omp target teams distribute parallel for simd map(to: a[0:array_size], b[0:array_size], c[0:array_size])
+  #pragma omp target teams distribute parallel for simd
 #else
   #pragma omp parallel for
 #endif
@@ -142,6 +157,11 @@ void OMPStream<T>::add()
   {
     c[i] = a[i] + b[i];
   }
+  #if defined(OMP_TARGET_GPU) && defined(_CRAYC)
+  // If using the Cray compiler, the kernels do not block, so this update forces
+  // a small copy to ensure blocking so that timing is correct
+  #pragma omp target update from(a[0:0])
+  #endif
 }
 
 template <class T>
@@ -154,7 +174,7 @@ void OMPStream<T>::triad()
   T *a = this->a;
   T *b = this->b;
   T *c = this->c;
-  #pragma omp target teams distribute parallel for simd map(to: a[0:array_size], b[0:array_size], c[0:array_size])
+  #pragma omp target teams distribute parallel for simd
 #else
   #pragma omp parallel for
 #endif
@@ -162,6 +182,11 @@ void OMPStream<T>::triad()
   {
     a[i] = b[i] + scalar * c[i];
   }
+  #if defined(OMP_TARGET_GPU) && defined(_CRAYC)
+  // If using the Cray compiler, the kernels do not block, so this update forces
+  // a small copy to ensure blocking so that timing is correct
+  #pragma omp target update from(a[0:0])
+  #endif
 }
 
 template <class T>