[OpenMP 4.5] Remove superfluous map clauses

Fixes #41.
OpenMP 4.5 does not require the map clauses on the target region
if the data has been previously defined using unstructured data
enter/exit clauses. Removing these clauses works fine with the
Clang compiler; however, we noticed issues with the Cray compiler.
The issue is that the Cray compiler does not block the target
region "kernel calls" and so the timing is incorrect. This was
not noticed before due to the presence of the map clauses.
For now, we have had to add an update from clause of a scalar
value to ensure that the kernel blocks. It is hoped that we can
remove this in due course. But in the vein of showing how the
models work we want to keep the minimum required correct code
(which is without the map clause) but need the code to also
work correctly.
This commit is contained in:
Tom Deakin 2018-02-07 15:05:06 +00:00
parent cfb89018f6
commit 73f122089d

View File

@ -58,7 +58,7 @@ void OMPStream<T>::init_arrays(T initA, T initB, T initC)
T *a = this->a;
T *b = this->b;
T *c = this->c;
#pragma omp target teams distribute parallel for simd map(to: a[0:array_size], b[0:array_size], c[0:array_size])
#pragma omp target teams distribute parallel for simd
#else
#pragma omp parallel for
#endif
@ -68,6 +68,11 @@ void OMPStream<T>::init_arrays(T initA, T initB, T initC)
b[i] = initB;
c[i] = initC;
}
#if defined(OMP_TARGET_GPU) && defined(_CRAYC)
// If using the Cray compiler, the kernels do not block, so this update forces
// a small copy to ensure blocking so that timing is correct
#pragma omp target update from(a[0:0])
#endif
}
template <class T>
@ -97,7 +102,7 @@ void OMPStream<T>::copy()
unsigned int array_size = this->array_size;
T *a = this->a;
T *c = this->c;
#pragma omp target teams distribute parallel for simd map(to: a[0:array_size], c[0:array_size])
#pragma omp target teams distribute parallel for simd
#else
#pragma omp parallel for
#endif
@ -105,6 +110,11 @@ void OMPStream<T>::copy()
{
c[i] = a[i];
}
#if defined(OMP_TARGET_GPU) && defined(_CRAYC)
// If using the Cray compiler, the kernels do not block, so this update forces
// a small copy to ensure blocking so that timing is correct
#pragma omp target update from(a[0:0])
#endif
}
template <class T>
@ -116,7 +126,7 @@ void OMPStream<T>::mul()
unsigned int array_size = this->array_size;
T *b = this->b;
T *c = this->c;
#pragma omp target teams distribute parallel for simd map(to: b[0:array_size], c[0:array_size])
#pragma omp target teams distribute parallel for simd
#else
#pragma omp parallel for
#endif
@ -124,6 +134,11 @@ void OMPStream<T>::mul()
{
b[i] = scalar * c[i];
}
#if defined(OMP_TARGET_GPU) && defined(_CRAYC)
// If using the Cray compiler, the kernels do not block, so this update forces
// a small copy to ensure blocking so that timing is correct
#pragma omp target update from(c[0:0])
#endif
}
template <class T>
@ -134,7 +149,7 @@ void OMPStream<T>::add()
T *a = this->a;
T *b = this->b;
T *c = this->c;
#pragma omp target teams distribute parallel for simd map(to: a[0:array_size], b[0:array_size], c[0:array_size])
#pragma omp target teams distribute parallel for simd
#else
#pragma omp parallel for
#endif
@ -142,6 +157,11 @@ void OMPStream<T>::add()
{
c[i] = a[i] + b[i];
}
#if defined(OMP_TARGET_GPU) && defined(_CRAYC)
// If using the Cray compiler, the kernels do not block, so this update forces
// a small copy to ensure blocking so that timing is correct
#pragma omp target update from(a[0:0])
#endif
}
template <class T>
@ -154,7 +174,7 @@ void OMPStream<T>::triad()
T *a = this->a;
T *b = this->b;
T *c = this->c;
#pragma omp target teams distribute parallel for simd map(to: a[0:array_size], b[0:array_size], c[0:array_size])
#pragma omp target teams distribute parallel for simd
#else
#pragma omp parallel for
#endif
@ -162,6 +182,11 @@ void OMPStream<T>::triad()
{
a[i] = b[i] + scalar * c[i];
}
#if defined(OMP_TARGET_GPU) && defined(_CRAYC)
// If using the Cray compiler, the kernels do not block, so this update forces
// a small copy to ensure blocking so that timing is correct
#pragma omp target update from(a[0:0])
#endif
}
template <class T>