diff --git a/OMPStream.cpp b/OMPStream.cpp
index c72ed56..e10c8c1 100644
--- a/OMPStream.cpp
+++ b/OMPStream.cpp
@@ -58,7 +58,7 @@ void OMPStream<T>::init_arrays(T initA, T initB, T initC)
   T *a = this->a;
   T *b = this->b;
   T *c = this->c;
-  #pragma omp target teams distribute parallel for simd map(to: a[0:array_size], b[0:array_size], c[0:array_size])
+  #pragma omp target teams distribute parallel for simd
 #else
   #pragma omp parallel for
 #endif
@@ -68,6 +68,11 @@ void OMPStream<T>::init_arrays(T initA, T initB, T initC)
     b[i] = initB;
     c[i] = initC;
   }
+  #if defined(OMP_TARGET_GPU) && defined(_CRAYC)
+  // If using the Cray compiler, the kernels do not block, so this update forces
+  // a small copy to ensure blocking so that timing is correct
+  #pragma omp target update from(a[0:0])
+  #endif
 }
 
 template <class T>
@@ -97,7 +102,7 @@ void OMPStream<T>::copy()
   unsigned int array_size = this->array_size;
   T *a = this->a;
   T *c = this->c;
-  #pragma omp target teams distribute parallel for simd map(to: a[0:array_size], c[0:array_size])
+  #pragma omp target teams distribute parallel for simd
 #else
   #pragma omp parallel for
 #endif
@@ -105,6 +110,11 @@ void OMPStream<T>::copy()
   {
     c[i] = a[i];
   }
+  #if defined(OMP_TARGET_GPU) && defined(_CRAYC)
+  // If using the Cray compiler, the kernels do not block, so this update forces
+  // a small copy to ensure blocking so that timing is correct
+  #pragma omp target update from(a[0:0])
+  #endif
 }
 
 template <class T>
@@ -116,7 +126,7 @@ void OMPStream<T>::mul()
   unsigned int array_size = this->array_size;
   T *b = this->b;
   T *c = this->c;
-  #pragma omp target teams distribute parallel for simd map(to: b[0:array_size], c[0:array_size])
+  #pragma omp target teams distribute parallel for simd
 #else
   #pragma omp parallel for
 #endif
@@ -124,6 +134,11 @@ void OMPStream<T>::mul()
   {
     b[i] = scalar * c[i];
   }
+  #if defined(OMP_TARGET_GPU) && defined(_CRAYC)
+  // If using the Cray compiler, the kernels do not block, so this update forces
+  // a small copy to ensure blocking so that timing is correct
+  #pragma omp target update from(c[0:0])
+  #endif
 }
 
 template <class T>
@@ -134,7 +149,7 @@ void OMPStream<T>::add()
   T *a = this->a;
   T *b = this->b;
   T *c = this->c;
-  #pragma omp target teams distribute parallel for simd map(to: a[0:array_size], b[0:array_size], c[0:array_size])
+  #pragma omp target teams distribute parallel for simd
 #else
   #pragma omp parallel for
 #endif
@@ -142,6 +157,11 @@ void OMPStream<T>::add()
   {
     c[i] = a[i] + b[i];
   }
+  #if defined(OMP_TARGET_GPU) && defined(_CRAYC)
+  // If using the Cray compiler, the kernels do not block, so this update forces
+  // a small copy to ensure blocking so that timing is correct
+  #pragma omp target update from(a[0:0])
+  #endif
 }
 
 template <class T>
@@ -154,7 +174,7 @@ void OMPStream<T>::triad()
   T *a = this->a;
   T *b = this->b;
   T *c = this->c;
-  #pragma omp target teams distribute parallel for simd map(to: a[0:array_size], b[0:array_size], c[0:array_size])
+  #pragma omp target teams distribute parallel for simd
 #else
   #pragma omp parallel for
 #endif
@@ -162,6 +182,11 @@ void OMPStream<T>::triad()
   {
     a[i] = b[i] + scalar * c[i];
   }
+  #if defined(OMP_TARGET_GPU) && defined(_CRAYC)
+  // If using the Cray compiler, the kernels do not block, so this update forces
+  // a small copy to ensure blocking so that timing is correct
+  #pragma omp target update from(a[0:0])
+  #endif
 }
 
 template <class T>
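To see the timing rationale in isolation: on an offload compiler whose target regions return to the host before the kernel has finished, a host-side timer stops too early, and a tiny (zero-length) target update placed right after the kernel acts as a synchronisation point, which is what the _CRAYC-guarded lines above do. The standalone sketch below illustrates that pattern; everything in it apart from the idea is an assumption, not code from OMPStream.cpp (the array names, size, enter/exit data mapping and chrono-based harness are all illustrative).

// Minimal sketch (illustrative, not part of OMPStream.cpp): time one target kernel
// and use a zero-length "target update" as a host-side synchronisation point,
// mirroring the workaround guarded by _CRAYC in the diff above.
#include <chrono>
#include <cstdio>
#include <vector>

int main()
{
  const unsigned int array_size = 1 << 20;
  std::vector<double> a(array_size, 1.0), c(array_size, 0.0);
  double *pa = a.data();
  double *pc = c.data();

  // Keep the arrays resident on the device for the duration of the benchmark.
  #pragma omp target enter data map(to: pa[0:array_size], pc[0:array_size])

  auto t0 = std::chrono::high_resolution_clock::now();

  // Copy kernel; pa and pc are already present in the device data environment.
  #pragma omp target teams distribute parallel for simd
  for (unsigned int i = 0; i < array_size; i++)
    pc[i] = pa[i];

  // On compilers where the target region above returns before the kernel finishes,
  // this zero-length copy forces the host to wait, so t1 - t0 really brackets the
  // kernel (same rationale as the _CRAYC-guarded update in the diff).
  #pragma omp target update from(pc[0:0])

  auto t1 = std::chrono::high_resolution_clock::now();

  #pragma omp target exit data map(from: pc[0:array_size]) map(release: pa[0:array_size])

  std::printf("copy kernel: %.3f ms\n",
              std::chrono::duration<double, std::milli>(t1 - t0).count());
}

The pc[0:0] section transfers no useful data; it exists only to make the host wait for the device. Keeping the arrays resident via enter data / exit data is also why no per-kernel map(to: ...) clause is needed in the sketch, and presumably why those clauses could be dropped from the kernels in the diff, though that rationale is an assumption.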