From 73f122089d96485e351da02e20550d8c4ba55861 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 7 Feb 2018 15:05:06 +0000 Subject: [PATCH] [OpenMP 4.5] Remove superfluous map clauses Fixes #41. OpenMP 4.5 does not require the map clauses on the target region if the data has been previously defined using unstructured data enter/exit clauses. Removing these clauses works fine with the Clang compiler; however, we noticed issues with the Cray compiler. The issue is that the Cray compiler does not block the target region "kernel calls" and so the timing is incorrect. This was not noticed before due to the presence of the map clauses. For now, we have had to add an update from clause of a scalar value to ensure that the kernel blocks. It is hoped that we can remove this in due course. But in the vein of showing how the models work we want to keep the minimum required correct code (which is without the map clause) but need the code to also work correctly. --- OMPStream.cpp | 35 ++++++++++++++++++++++++++++++----- 1 file changed, 30 insertions(+), 5 deletions(-) diff --git a/OMPStream.cpp b/OMPStream.cpp index c72ed56..e10c8c1 100644 --- a/OMPStream.cpp +++ b/OMPStream.cpp @@ -58,7 +58,7 @@ void OMPStream::init_arrays(T initA, T initB, T initC) T *a = this->a; T *b = this->b; T *c = this->c; - #pragma omp target teams distribute parallel for simd map(to: a[0:array_size], b[0:array_size], c[0:array_size]) + #pragma omp target teams distribute parallel for simd #else #pragma omp parallel for #endif @@ -68,6 +68,11 @@ void OMPStream::init_arrays(T initA, T initB, T initC) b[i] = initB; c[i] = initC; } + #if defined(OMP_TARGET_GPU) && defined(_CRAYC) + // If using the Cray compiler, the kernels do not block, so this update forces + // a small copy to ensure blocking so that timing is correct + #pragma omp target update from(a[0:0]) + #endif } template @@ -97,7 +102,7 @@ void OMPStream::copy() unsigned int array_size = this->array_size; T *a = this->a; T *c = this->c; - #pragma
omp target teams distribute parallel for simd map(to: a[0:array_size], c[0:array_size]) + #pragma omp target teams distribute parallel for simd #else #pragma omp parallel for #endif @@ -105,6 +110,11 @@ void OMPStream::copy() { c[i] = a[i]; } + #if defined(OMP_TARGET_GPU) && defined(_CRAYC) + // If using the Cray compiler, the kernels do not block, so this update forces + // a small copy to ensure blocking so that timing is correct + #pragma omp target update from(a[0:0]) + #endif } template @@ -116,7 +126,7 @@ void OMPStream::mul() unsigned int array_size = this->array_size; T *b = this->b; T *c = this->c; - #pragma omp target teams distribute parallel for simd map(to: b[0:array_size], c[0:array_size]) + #pragma omp target teams distribute parallel for simd #else #pragma omp parallel for #endif @@ -124,6 +134,11 @@ void OMPStream::mul() { b[i] = scalar * c[i]; } + #if defined(OMP_TARGET_GPU) && defined(_CRAYC) + // If using the Cray compiler, the kernels do not block, so this update forces + // a small copy to ensure blocking so that timing is correct + #pragma omp target update from(c[0:0]) + #endif } template @@ -134,7 +149,7 @@ void OMPStream::add() T *a = this->a; T *b = this->b; T *c = this->c; - #pragma omp target teams distribute parallel for simd map(to: a[0:array_size], b[0:array_size], c[0:array_size]) + #pragma omp target teams distribute parallel for simd #else #pragma omp parallel for #endif @@ -142,6 +157,11 @@ void OMPStream::add() { c[i] = a[i] + b[i]; } + #if defined(OMP_TARGET_GPU) && defined(_CRAYC) + // If using the Cray compiler, the kernels do not block, so this update forces + // a small copy to ensure blocking so that timing is correct + #pragma omp target update from(a[0:0]) + #endif } template @@ -154,7 +174,7 @@ void OMPStream::triad() T *a = this->a; T *b = this->b; T *c = this->c; - #pragma omp target teams distribute parallel for simd map(to: a[0:array_size], b[0:array_size], c[0:array_size]) + #pragma omp target teams distribute 
parallel for simd #else #pragma omp parallel for #endif @@ -162,6 +182,11 @@ void OMPStream::triad() { a[i] = b[i] + scalar * c[i]; } + #if defined(OMP_TARGET_GPU) && defined(_CRAYC) + // If using the Cray compiler, the kernels do not block, so this update forces + // a small copy to ensure blocking so that timing is correct + #pragma omp target update from(a[0:0]) + #endif } template