From 2234841b16d897eb5ef0eb03e2746ebe9b62be48 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 26 Apr 2016 14:08:59 +0100 Subject: [PATCH 001/117] Initial commit of new design with classes --- src/CUDAStream.cu | 29 +++++++++++++++++++++++++++++ src/CUDAStream.h | 13 +++++++++++++ src/Stream.h | 23 +++++++++++++++++++++++ src/common.h | 2 ++ src/main.cpp | 22 ++++++++++++++++++++++ 5 files changed, 89 insertions(+) create mode 100644 src/CUDAStream.cu create mode 100644 src/CUDAStream.h create mode 100644 src/Stream.h create mode 100644 src/common.h create mode 100644 src/main.cpp diff --git a/src/CUDAStream.cu b/src/CUDAStream.cu new file mode 100644 index 0000000..d95f9a1 --- /dev/null +++ b/src/CUDAStream.cu @@ -0,0 +1,29 @@ + +#include "CUDAStream.h" + +template +void CUDAStream::copy() +{ + return; +} + +template +void CUDAStream::mul() +{ + return; +} + +template +void CUDAStream::add() +{ + return; +} + +template +void CUDAStream::triad() +{ + return; +} + +template class CUDAStream; +template class CUDAStream; diff --git a/src/CUDAStream.h b/src/CUDAStream.h new file mode 100644 index 0000000..bfea7ed --- /dev/null +++ b/src/CUDAStream.h @@ -0,0 +1,13 @@ + + +#include "Stream.h" + +template +class CUDAStream : public Stream +{ + public: + void copy(); + void add(); + void mul(); + void triad(); +}; diff --git a/src/Stream.h b/src/Stream.h new file mode 100644 index 0000000..e283437 --- /dev/null +++ b/src/Stream.h @@ -0,0 +1,23 @@ + +#pragma once + +#include + +template +class Stream +{ + public: + // Kernels + // These must be blocking calls + virtual void copy() = 0; + virtual void mul() = 0; + virtual void add() = 0; + virtual void triad() = 0; + + + // Implementation specific device functions + static std::vector getDeviceList(); + static std::vector getDeviceName(); + static std::vector getDeviceDriver(); + +}; diff --git a/src/common.h b/src/common.h new file mode 100644 index 0000000..02d43a2 --- /dev/null +++ b/src/common.h @@ -0,0 +1,2 @@ + 
+#define VERSION_STRING "2.0" diff --git a/src/main.cpp b/src/main.cpp new file mode 100644 index 0000000..9e22b89 --- /dev/null +++ b/src/main.cpp @@ -0,0 +1,22 @@ + +#include + +#include "common.h" +#include "Stream.h" +#include "CUDAStream.h" + + + +int main(int argc, char *argv[]) +{ + std::cout + << "GPU-STREAM" << std::endl + << "Version:" << VERSION_STRING << std::endl + << "Implementation: " << std::endl; + + Stream *stream; + stream = new CUDAStream(); + + delete[] stream; + +} From 1a259d4fc8e43ba67767951fa5438f1c173d6f36 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 26 Apr 2016 14:24:04 +0100 Subject: [PATCH 002/117] Add a copy kernel --- src/CUDAStream.cu | 9 ++++++++- src/CUDAStream.h | 7 +++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/src/CUDAStream.cu b/src/CUDAStream.cu index d95f9a1..163a3f1 100644 --- a/src/CUDAStream.cu +++ b/src/CUDAStream.cu @@ -1,10 +1,17 @@ #include "CUDAStream.h" +template +__global__ void copy_kernel(const T * a, T * c) +{ + const int i = blockDim.x * blockIdx.x + threadIdx.x; + c[i] = a[i]; +} + template void CUDAStream::copy() { - return; + copy<<<1024, 1024>>>(a, c); } template diff --git a/src/CUDAStream.h b/src/CUDAStream.h index bfea7ed..ab9ac38 100644 --- a/src/CUDAStream.h +++ b/src/CUDAStream.h @@ -10,4 +10,11 @@ class CUDAStream : public Stream void add(); void mul(); void triad(); + + private: + // Device side pointers to arrays + T *d_a; + T *d_b; + T *d_c; + }; From 0bf68f99093d206689d00f667bc96b7010e4bc09 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 26 Apr 2016 14:34:25 +0100 Subject: [PATCH 003/117] Make a copy kernel using the private variables --- src/CUDAStream.cu | 2 +- src/CUDAStream.h | 13 +++++++------ src/main.cpp | 1 + 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/src/CUDAStream.cu b/src/CUDAStream.cu index 163a3f1..9b1b36d 100644 --- a/src/CUDAStream.cu +++ b/src/CUDAStream.cu @@ -11,7 +11,7 @@ __global__ void copy_kernel(const T * a, T * c) 
template void CUDAStream::copy() { - copy<<<1024, 1024>>>(a, c); + copy_kernel<<<1024, 1024>>>(d_a, d_c); } template diff --git a/src/CUDAStream.h b/src/CUDAStream.h index ab9ac38..9348b29 100644 --- a/src/CUDAStream.h +++ b/src/CUDAStream.h @@ -5,16 +5,17 @@ template class CUDAStream : public Stream { - public: - void copy(); - void add(); - void mul(); - void triad(); - private: // Device side pointers to arrays T *d_a; T *d_b; T *d_c; + + public: + void copy(); + void add(); + void mul(); + void triad(); + }; diff --git a/src/main.cpp b/src/main.cpp index 9e22b89..76c3da4 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -16,6 +16,7 @@ int main(int argc, char *argv[]) Stream *stream; stream = new CUDAStream(); + stream->copy(); delete[] stream; From 6169bdb7b5f887ba3ab8837f153bc00c272991b6 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 26 Apr 2016 14:40:49 +0100 Subject: [PATCH 004/117] Add some global variables --- src/main.cpp | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/main.cpp b/src/main.cpp index 76c3da4..6702e11 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -1,18 +1,28 @@ #include +#include #include "common.h" #include "Stream.h" #include "CUDAStream.h" +const unsigned int ARRAY_SIZE = 52428800; + +#define IMPLEMENTATION_STRING "CUDA" int main(int argc, char *argv[]) { std::cout << "GPU-STREAM" << std::endl << "Version:" << VERSION_STRING << std::endl - << "Implementation: " << std::endl; + << "Implementation: " << IMPLEMENTATION_STRING << std::endl; + + + // Create host vectors + std::vector a(ARRAY_SIZE, 1.0); + std::vector b(ARRAY_SIZE, 2.0); + std::vector c(ARRAY_SIZE, 0.0); Stream *stream; stream = new CUDAStream(); From 03b01e190f2d3d174870ef5704b18e2f8e81bf49 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 26 Apr 2016 14:49:04 +0100 Subject: [PATCH 005/117] Add cuda constructor declaration and error checking function --- src/CUDAStream.cu | 18 ++++++++++++++++++ src/CUDAStream.h | 4 ++++ src/main.cpp 
| 5 ++++- 3 files changed, 26 insertions(+), 1 deletion(-) diff --git a/src/CUDAStream.cu b/src/CUDAStream.cu index 9b1b36d..4305ceb 100644 --- a/src/CUDAStream.cu +++ b/src/CUDAStream.cu @@ -1,6 +1,24 @@ #include "CUDAStream.h" +void check_error(void) +{ + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Error: " << cudaGetErrorString(err) << std::endl; + exit(err); + } +} + +template +CUDAStream::CUDAStream(const unsigned int ARRAY_SIZE) +{ + // Create device buffers + cudaMalloc(&d_a, ARRAY_SIZE*sizeof(T)); + +} + template __global__ void copy_kernel(const T * a, T * c) { diff --git a/src/CUDAStream.h b/src/CUDAStream.h index 9348b29..49e76df 100644 --- a/src/CUDAStream.h +++ b/src/CUDAStream.h @@ -1,4 +1,5 @@ +#include #include "Stream.h" @@ -13,6 +14,9 @@ class CUDAStream : public Stream public: + + CUDAStream(const unsigned int); + void copy(); void add(); void mul(); diff --git a/src/main.cpp b/src/main.cpp index 6702e11..a293eb7 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -25,7 +25,10 @@ int main(int argc, char *argv[]) std::vector c(ARRAY_SIZE, 0.0); Stream *stream; - stream = new CUDAStream(); + + // Use the CUDA implementation + stream = new CUDAStream(ARRAY_SIZE); + stream->copy(); delete[] stream; From ee4820b5e4763573211146a049ed392fb5390380 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 26 Apr 2016 14:50:22 +0100 Subject: [PATCH 006/117] Create CUDA device buffers --- src/CUDAStream.cu | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/CUDAStream.cu b/src/CUDAStream.cu index 4305ceb..66304d1 100644 --- a/src/CUDAStream.cu +++ b/src/CUDAStream.cu @@ -16,7 +16,11 @@ CUDAStream::CUDAStream(const unsigned int ARRAY_SIZE) { // Create device buffers cudaMalloc(&d_a, ARRAY_SIZE*sizeof(T)); - + check_error(); + cudaMalloc(&d_b, ARRAY_SIZE*sizeof(T)); + check_error(); + cudaMalloc(&d_c, ARRAY_SIZE*sizeof(T)); + check_error(); } template From ae679a57752ee1155a7093a7be4740758a9dc6d6 Mon Sep 
17 00:00:00 2001 From: Tom Deakin Date: Tue, 26 Apr 2016 14:50:58 +0100 Subject: [PATCH 007/117] Fix indentation in Stream.h --- src/Stream.h | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/src/Stream.h b/src/Stream.h index e283437..3a3e826 100644 --- a/src/Stream.h +++ b/src/Stream.h @@ -7,17 +7,18 @@ template class Stream { public: - // Kernels - // These must be blocking calls - virtual void copy() = 0; - virtual void mul() = 0; - virtual void add() = 0; - virtual void triad() = 0; + // Kernels + // These must be blocking calls + virtual void copy() = 0; + virtual void mul() = 0; + virtual void add() = 0; + virtual void triad() = 0; - // Implementation specific device functions - static std::vector getDeviceList(); - static std::vector getDeviceName(); - static std::vector getDeviceDriver(); + + // Implementation specific device functions + static std::vector getDeviceList(); + static std::vector getDeviceName(); + static std::vector getDeviceDriver(); }; From 8e534daf8bf72c3a1f9c775b8464f1bdf8a0adf7 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 26 Apr 2016 15:02:41 +0100 Subject: [PATCH 008/117] Add methods to copy data between host and device --- src/CUDAStream.cu | 20 ++++++++++++++++++++ src/CUDAStream.h | 4 ++++ src/Stream.h | 4 +++- src/main.cpp | 2 ++ 4 files changed, 29 insertions(+), 1 deletion(-) diff --git a/src/CUDAStream.cu b/src/CUDAStream.cu index 66304d1..f325dfc 100644 --- a/src/CUDAStream.cu +++ b/src/CUDAStream.cu @@ -23,6 +23,23 @@ CUDAStream::CUDAStream(const unsigned int ARRAY_SIZE) check_error(); } + +template +void CUDAStream::write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) +{ + // Copy host memory to device + cudaMemcpy(d_a, a.data(), a.size()*sizeof(T), cudaMemcpyHostToDevice); + check_error(); + cudaMemcpy(d_b, b.data(), b.size()*sizeof(T), cudaMemcpyHostToDevice); + check_error(); + cudaMemcpy(d_c, c.data(), c.size()*sizeof(T), cudaMemcpyHostToDevice); + 
check_error(); +} + +template +void CUDAStream::read_arrays(std::vector& a, std::vector& b, std::vector& c) +{ +} template __global__ void copy_kernel(const T * a, T * c) { @@ -34,6 +51,9 @@ template void CUDAStream::copy() { copy_kernel<<<1024, 1024>>>(d_a, d_c); + check_error(); + cudaDeviceSynchronize(); + check_error(); } template diff --git a/src/CUDAStream.h b/src/CUDAStream.h index 49e76df..881811e 100644 --- a/src/CUDAStream.h +++ b/src/CUDAStream.h @@ -22,4 +22,8 @@ class CUDAStream : public Stream void mul(); void triad(); + void write_arrays(const std::vector& a, const std::vector& b, const std::vector& c); + void read_arrays(std::vector& a, std::vector& b, std::vector& c); + }; + diff --git a/src/Stream.h b/src/Stream.h index 3a3e826..bb83d69 100644 --- a/src/Stream.h +++ b/src/Stream.h @@ -14,7 +14,9 @@ class Stream virtual void add() = 0; virtual void triad() = 0; - + // Copy memory between host and device + virtual void write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) = 0; + virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) = 0; // Implementation specific device functions static std::vector getDeviceList(); diff --git a/src/main.cpp b/src/main.cpp index a293eb7..cfb8884 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -29,6 +29,8 @@ int main(int argc, char *argv[]) // Use the CUDA implementation stream = new CUDAStream(ARRAY_SIZE); + stream->write_arrays(a, b, c); + stream->copy(); delete[] stream; From c22b74ba47835656bc3fab309938ad02f46984eb Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 26 Apr 2016 15:30:37 +0100 Subject: [PATCH 009/117] Add read_arrays definition for CUDA --- src/CUDAStream.cu | 9 +++++++++ src/main.cpp | 3 +++ 2 files changed, 12 insertions(+) diff --git a/src/CUDAStream.cu b/src/CUDAStream.cu index f325dfc..5629b40 100644 --- a/src/CUDAStream.cu +++ b/src/CUDAStream.cu @@ -39,7 +39,16 @@ void CUDAStream::write_arrays(const std::vector& a, const std::vector& template void 
CUDAStream::read_arrays(std::vector& a, std::vector& b, std::vector& c) { + // Copy device memory to host + cudaMemcpy(a.data(), d_a, a.size()*sizeof(T), cudaMemcpyDeviceToHost); + check_error(); + cudaMemcpy(b.data(), d_b, b.size()*sizeof(T), cudaMemcpyDeviceToHost); + check_error(); + cudaMemcpy(c.data(), d_c, c.size()*sizeof(T), cudaMemcpyDeviceToHost); + check_error(); } + + template __global__ void copy_kernel(const T * a, T * c) { diff --git a/src/main.cpp b/src/main.cpp index cfb8884..6c2c0d8 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -33,6 +33,9 @@ int main(int argc, char *argv[]) stream->copy(); + stream->read_arrays(a, b, c); + std::cout << c[105] << std::endl; + delete[] stream; } From dec0237353617bb6957bcb8b4a25c6587935354c Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 26 Apr 2016 16:03:28 +0100 Subject: [PATCH 010/117] Add mul kernel --- src/CUDAStream.cu | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/src/CUDAStream.cu b/src/CUDAStream.cu index 5629b40..15b1b41 100644 --- a/src/CUDAStream.cu +++ b/src/CUDAStream.cu @@ -65,10 +65,21 @@ void CUDAStream::copy() check_error(); } +template +__global__ void mul_kernel(T * b, const T * c) +{ + const T scalar = 3.0; + const int i = blockDim.x * blockIdx.x + threadIdx.x; + b[i] = scalar * c[i]; +} + template void CUDAStream::mul() { - return; + mul_kernel<<<1024, 1024>>>(d_b, d_c); + check_error(); + cudaDeviceSynchronize(); + check_error(); } template @@ -85,3 +96,4 @@ void CUDAStream::triad() template class CUDAStream; template class CUDAStream; + From 7a3a546a6e208743cedd148429430e2e9337fd32 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 26 Apr 2016 16:06:17 +0100 Subject: [PATCH 011/117] Add mul CUDA kernel --- src/CUDAStream.cu | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/CUDAStream.cu b/src/CUDAStream.cu index 15b1b41..f08c40b 100644 --- a/src/CUDAStream.cu +++ b/src/CUDAStream.cu @@ -82,10 +82,20 @@ void 
CUDAStream::mul() check_error(); } +template +__global__ void add_kernel(const T * a, const T * b, T * c) +{ + const int i = blockDim.x * blockIdx.x + threadIdx.x; + c[i] = a[i] + b[i]; +} + template void CUDAStream::add() { - return; + add_kernel<<<1024, 1024>>>(d_a, d_b, d_c); + check_error(); + cudaDeviceSynchronize(); + check_error(); } template From 319e11011c896a953301b273bfdbb3bb37920d1e Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 26 Apr 2016 16:07:32 +0100 Subject: [PATCH 012/117] Add triad kernel --- src/CUDAStream.cu | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/src/CUDAStream.cu b/src/CUDAStream.cu index f08c40b..18e1a70 100644 --- a/src/CUDAStream.cu +++ b/src/CUDAStream.cu @@ -98,10 +98,21 @@ void CUDAStream::add() check_error(); } +template +__global__ void triad_kernel(T * a, const T * b, const T * c) +{ + const T scalar = 3.0; + const int i = blockDim.x * blockIdx.x + threadIdx.x; + a[i] = b[i] + scalar * c[i]; +} + template void CUDAStream::triad() { - return; + triad_kernel<<<1024, 1024>>>(d_a, d_b, d_c); + check_error(); + cudaDeviceSynchronize(); + check_error(); } template class CUDAStream; From 9c673317a7de31473ac4e4e4b720a2610ccfd4fb Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 26 Apr 2016 16:09:51 +0100 Subject: [PATCH 013/117] Store array size in class so can use it for kernel launches --- src/CUDAStream.cu | 10 ++++++---- src/CUDAStream.h | 2 ++ 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/src/CUDAStream.cu b/src/CUDAStream.cu index 18e1a70..caf5e1a 100644 --- a/src/CUDAStream.cu +++ b/src/CUDAStream.cu @@ -14,6 +14,8 @@ void check_error(void) template CUDAStream::CUDAStream(const unsigned int ARRAY_SIZE) { + array_size = ARRAY_SIZE; + // Create device buffers cudaMalloc(&d_a, ARRAY_SIZE*sizeof(T)); check_error(); @@ -59,7 +61,7 @@ __global__ void copy_kernel(const T * a, T * c) template void CUDAStream::copy() { - copy_kernel<<<1024, 1024>>>(d_a, d_c); + 
copy_kernel<<>>(d_a, d_c); check_error(); cudaDeviceSynchronize(); check_error(); @@ -76,7 +78,7 @@ __global__ void mul_kernel(T * b, const T * c) template void CUDAStream::mul() { - mul_kernel<<<1024, 1024>>>(d_b, d_c); + mul_kernel<<>>(d_b, d_c); check_error(); cudaDeviceSynchronize(); check_error(); @@ -92,7 +94,7 @@ __global__ void add_kernel(const T * a, const T * b, T * c) template void CUDAStream::add() { - add_kernel<<<1024, 1024>>>(d_a, d_b, d_c); + add_kernel<<>>(d_a, d_b, d_c); check_error(); cudaDeviceSynchronize(); check_error(); @@ -109,7 +111,7 @@ __global__ void triad_kernel(T * a, const T * b, const T * c) template void CUDAStream::triad() { - triad_kernel<<<1024, 1024>>>(d_a, d_b, d_c); + triad_kernel<<>>(d_a, d_b, d_c); check_error(); cudaDeviceSynchronize(); check_error(); diff --git a/src/CUDAStream.h b/src/CUDAStream.h index 881811e..14f2cc2 100644 --- a/src/CUDAStream.h +++ b/src/CUDAStream.h @@ -7,6 +7,8 @@ template class CUDAStream : public Stream { private: + // Size of arrays + unsigned int array_size; // Device side pointers to arrays T *d_a; T *d_b; From 9730cd071e11f1149f81f30444f7600d8991fdbe Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 27 Apr 2016 11:34:42 +0100 Subject: [PATCH 014/117] Overridden functions should have more keywords --- src/CUDAStream.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/CUDAStream.h b/src/CUDAStream.h index 14f2cc2..451cfc1 100644 --- a/src/CUDAStream.h +++ b/src/CUDAStream.h @@ -6,7 +6,7 @@ template class CUDAStream : public Stream { - private: + protected: // Size of arrays unsigned int array_size; // Device side pointers to arrays @@ -19,10 +19,10 @@ class CUDAStream : public Stream CUDAStream(const unsigned int); - void copy(); - void add(); - void mul(); - void triad(); + virtual void copy() override; + virtual void add() override; + virtual void mul() override; + virtual void triad() override; void write_arrays(const std::vector& a, const std::vector& b, 
const std::vector& c); void read_arrays(std::vector& a, std::vector& b, std::vector& c); From 6522d9114a14edb9c26c0d571a430c4fb2d63db8 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 27 Apr 2016 11:35:04 +0100 Subject: [PATCH 015/117] Add new line at end of file --- src/Stream.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Stream.h b/src/Stream.h index bb83d69..ecf043b 100644 --- a/src/Stream.h +++ b/src/Stream.h @@ -24,3 +24,4 @@ class Stream static std::vector getDeviceDriver(); }; + From 6225ae90a7007914066087bb170cc9fc5f12614d Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 27 Apr 2016 11:35:12 +0100 Subject: [PATCH 016/117] Add start of check results function --- src/main.cpp | 38 +++++++++++++++++++++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) diff --git a/src/main.cpp b/src/main.cpp index 6c2c0d8..355fac7 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -1,6 +1,8 @@ #include #include +#include +#include #include "common.h" #include "Stream.h" @@ -11,6 +13,9 @@ const unsigned int ARRAY_SIZE = 52428800; #define IMPLEMENTATION_STRING "CUDA" +template +void check_solution(const unsigned int ntimes, std::vector& a, std::vector& b, std::vector& c); + int main(int argc, char *argv[]) { std::cout @@ -32,10 +37,41 @@ int main(int argc, char *argv[]) stream->write_arrays(a, b, c); stream->copy(); + stream->mul(); + stream->add(); + stream->triad(); stream->read_arrays(a, b, c); - std::cout << c[105] << std::endl; + std::cout << a[105] << std::endl; + + check_solution(1, a, b, c); delete[] stream; } + +template +void check_solution(const unsigned int ntimes, std::vector& a, std::vector& b, std::vector& c) +{ + // Generate correct solution + T goldA = 1.0; + T goldB = 2.0; + T goldC = 0.0; + + const T scalar = 3.0; + + for (unsigned int i = 0; i < ntimes; i++) + { + // Do STREAM! 
+ goldC = goldA; + goldB = scalar * goldC; + goldC = goldA + goldB; + goldA = goldB + scalar * goldC; + } + + // Calculate the average error + double errA = std::accumulate(a.begin(), a.end(), 0.0, [&](double sum, const T val){ return sum + fabs(val - goldA); }); + + +} + From 9aa27cd91dae5e93d34b89e8e3ca8e4cd8a8a6ce Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 27 Apr 2016 11:42:23 +0100 Subject: [PATCH 017/117] Print out average error on check if there is an error --- src/main.cpp | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/src/main.cpp b/src/main.cpp index 355fac7..d45a4a5 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include "common.h" #include "Stream.h" @@ -71,7 +72,26 @@ void check_solution(const unsigned int ntimes, std::vector& a, std::vector // Calculate the average error double errA = std::accumulate(a.begin(), a.end(), 0.0, [&](double sum, const T val){ return sum + fabs(val - goldA); }); - + errA /= a.size(); + double errB = std::accumulate(b.begin(), b.end(), 0.0, [&](double sum, const T val){ return sum + fabs(val - goldB); }); + errB /= b.size(); + double errC = std::accumulate(c.begin(), c.end(), 0.0, [&](double sum, const T val){ return sum + fabs(val - goldC); }); + errC /= c.size(); + + double epsi = std::numeric_limits::epsilon() * 100.0; + + if (errA > epsi) + std::cerr + << "Validation failed on a[]. Average error " << errA + << std::endl; + if (errB > epsi) + std::cerr + << "Validation failed on b[]. Average error " << errB + << std::endl; + if (errC > epsi) + std::cerr + << "Validation failed on c[]. 
Average error " << errC + << std::endl; } From 40c787d0409d5f771cabb29105738752e8f49e2d Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 27 Apr 2016 11:52:15 +0100 Subject: [PATCH 018/117] Check bufers fit on CUDA device --- src/CUDAStream.cu | 6 ++++++ src/CUDAStream.h | 1 + 2 files changed, 7 insertions(+) diff --git a/src/CUDAStream.cu b/src/CUDAStream.cu index caf5e1a..5ec1e67 100644 --- a/src/CUDAStream.cu +++ b/src/CUDAStream.cu @@ -16,6 +16,12 @@ CUDAStream::CUDAStream(const unsigned int ARRAY_SIZE) { array_size = ARRAY_SIZE; + // Check buffers fit on the device + cudaDeviceProp props; + cudaGetDeviceProperties(&props, 0); + if (props.totalGlobalMem < 3*ARRAY_SIZE*sizeof(T)) + throw std::runtime_error("Device does not have enough memory for all 3 buffers"); + // Create device buffers cudaMalloc(&d_a, ARRAY_SIZE*sizeof(T)); check_error(); diff --git a/src/CUDAStream.h b/src/CUDAStream.h index 451cfc1..bde574e 100644 --- a/src/CUDAStream.h +++ b/src/CUDAStream.h @@ -1,5 +1,6 @@ #include +#include #include "Stream.h" From c28e70ae707abb5a3f22a8d1011c8e39caa28913 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 27 Apr 2016 12:08:49 +0100 Subject: [PATCH 019/117] Add timers and run multiple times --- src/main.cpp | 45 ++++++++++++++++++++++++++++++++++++++------- 1 file changed, 38 insertions(+), 7 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index d45a4a5..7dab5c7 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include "common.h" #include "Stream.h" @@ -11,6 +12,7 @@ const unsigned int ARRAY_SIZE = 52428800; +const unsigned int ntimes = 10; #define IMPLEMENTATION_STRING "CUDA" @@ -37,15 +39,44 @@ int main(int argc, char *argv[]) stream->write_arrays(a, b, c); - stream->copy(); - stream->mul(); - stream->add(); - stream->triad(); + // List of times + std::vector< std::vector > timings; + // Declare timers + std::chrono::high_resolution_clock::time_point t1, t2; + + // Main loop + for (unsigned 
int k = 0; k < ntimes; k++) + { + std::vector times; + + t1 = std::chrono::high_resolution_clock::now(); + stream->copy(); + t2 = std::chrono::high_resolution_clock::now(); + times.push_back(std::chrono::duration_cast >(t2 - t1).count()); + + t1 = std::chrono::high_resolution_clock::now(); + stream->mul(); + t2 = std::chrono::high_resolution_clock::now(); + times.push_back(std::chrono::duration_cast >(t2 - t1).count()); + + t1 = std::chrono::high_resolution_clock::now(); + stream->add(); + t2 = std::chrono::high_resolution_clock::now(); + times.push_back(std::chrono::duration_cast >(t2 - t1).count()); + + t1 = std::chrono::high_resolution_clock::now(); + stream->triad(); + t2 = std::chrono::high_resolution_clock::now(); + times.push_back(std::chrono::duration_cast >(t2 - t1).count()); + + timings.push_back(times); + + } + + // Check solutions stream->read_arrays(a, b, c); - std::cout << a[105] << std::endl; - - check_solution(1, a, b, c); + check_solution(ntimes, a, b, c); delete[] stream; From 3d5a49317e2ca4107bb91369d9abf5d773701134 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 27 Apr 2016 12:11:19 +0100 Subject: [PATCH 020/117] Free CUDA buffers in destructor --- src/CUDAStream.cu | 11 +++++++++++ src/CUDAStream.h | 1 + 2 files changed, 12 insertions(+) diff --git a/src/CUDAStream.cu b/src/CUDAStream.cu index 5ec1e67..7c9afb3 100644 --- a/src/CUDAStream.cu +++ b/src/CUDAStream.cu @@ -32,6 +32,17 @@ CUDAStream::CUDAStream(const unsigned int ARRAY_SIZE) } +template +CUDAStream::~CUDAStream() +{ + cudaFree(d_a); + check_error(); + cudaFree(d_b); + check_error(); + cudaFree(d_c); + check_error(); +} + template void CUDAStream::write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) { diff --git a/src/CUDAStream.h b/src/CUDAStream.h index bde574e..89ad536 100644 --- a/src/CUDAStream.h +++ b/src/CUDAStream.h @@ -19,6 +19,7 @@ class CUDAStream : public Stream public: CUDAStream(const unsigned int); + ~CUDAStream(); virtual void copy() 
override; virtual void add() override; From daa7f643b96a010cf8bc4dc63e2176205090ae78 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 27 Apr 2016 13:18:06 +0100 Subject: [PATCH 021/117] Print out timing results --- src/main.cpp | 80 ++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 72 insertions(+), 8 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 7dab5c7..50bc51f 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -5,6 +5,8 @@ #include #include #include +#include +#include #include "common.h" #include "Stream.h" @@ -40,7 +42,10 @@ int main(int argc, char *argv[]) stream->write_arrays(a, b, c); // List of times - std::vector< std::vector > timings; + std::vector copy_timings; + std::vector mul_timings; + std::vector add_timings; + std::vector triad_timings; // Declare timers std::chrono::high_resolution_clock::time_point t1, t2; @@ -48,29 +53,30 @@ int main(int argc, char *argv[]) // Main loop for (unsigned int k = 0; k < ntimes; k++) { - std::vector times; + // Execute Copy t1 = std::chrono::high_resolution_clock::now(); stream->copy(); t2 = std::chrono::high_resolution_clock::now(); - times.push_back(std::chrono::duration_cast >(t2 - t1).count()); + copy_timings.push_back(std::chrono::duration_cast >(t2 - t1).count()); + // Execute Mul t1 = std::chrono::high_resolution_clock::now(); stream->mul(); t2 = std::chrono::high_resolution_clock::now(); - times.push_back(std::chrono::duration_cast >(t2 - t1).count()); + mul_timings.push_back(std::chrono::duration_cast >(t2 - t1).count()); + // Execute Add t1 = std::chrono::high_resolution_clock::now(); stream->add(); t2 = std::chrono::high_resolution_clock::now(); - times.push_back(std::chrono::duration_cast >(t2 - t1).count()); + add_timings.push_back(std::chrono::duration_cast >(t2 - t1).count()); + // Execute Triad t1 = std::chrono::high_resolution_clock::now(); stream->triad(); t2 = std::chrono::high_resolution_clock::now(); - times.push_back(std::chrono::duration_cast >(t2 - 
t1).count()); - - timings.push_back(times); + triad_timings.push_back(std::chrono::duration_cast >(t2 - t1).count()); } @@ -78,6 +84,64 @@ int main(int argc, char *argv[]) stream->read_arrays(a, b, c); check_solution(ntimes, a, b, c); + // Crunch timing results + + // Get min/max; ignore first result + auto copy_minmax = std::minmax_element(copy_timings.begin()+1, copy_timings.end()); + auto mul_minmax = std::minmax_element(mul_timings.begin()+1, mul_timings.end()); + auto add_minmax = std::minmax_element(add_timings.begin()+1, add_timings.end()); + auto triad_minmax = std::minmax_element(triad_timings.begin()+1, triad_timings.end()); + + double copy_average = std::accumulate(copy_timings.begin()+1, copy_timings.end(), 0.0) / (double)(ntimes - 1); + double mul_average = std::accumulate(mul_timings.begin()+1, mul_timings.end(), 0.0) / (double)(ntimes - 1); + double add_average = std::accumulate(add_timings.begin()+1, add_timings.end(), 0.0) / (double)(ntimes - 1); + double triad_average = std::accumulate(triad_timings.begin()+1, triad_timings.end(), 0.0) / (double)(ntimes - 1); + + + // Display results + std::cout + << std::left << std::setw(12) << "Function" + << std::left << std::setw(12) << "MBytes/sec" + << std::left << std::setw(12) << "Min (sec)" + << std::left << std::setw(12) << "Max" + << std::left << std::setw(12) << "Average" << std::endl; + + std::cout << std::fixed; + + std::cout + << std::left << std::setw(12) << "Copy" + << std::left << std::setw(12) << std::setprecision(3) << 1.0E-06 * (2 * sizeof(double) * ARRAY_SIZE)/(*copy_minmax.first) + << std::left << std::setw(12) << std::setprecision(5) << *copy_minmax.first + << std::left << std::setw(12) << std::setprecision(5) << *copy_minmax.second + << std::left << std::setw(12) << std::setprecision(5) << copy_average + << std::endl; + + + std::cout + << std::left << std::setw(12) << "Mul" + << std::left << std::setw(12) << std::setprecision(3) << 1.0E-06 * (2 * sizeof(double) * 
ARRAY_SIZE)/(*mul_minmax.first) + << std::left << std::setw(12) << std::setprecision(5) << *mul_minmax.first + << std::left << std::setw(12) << std::setprecision(5) << *mul_minmax.second + << std::left << std::setw(12) << std::setprecision(5) << mul_average + << std::endl; + + + std::cout + << std::left << std::setw(12) << "Add" + << std::left << std::setw(12) << std::setprecision(3) << 1.0E-06 * (3 * sizeof(double) * ARRAY_SIZE)/(*add_minmax.first) + << std::left << std::setw(12) << std::setprecision(5) << *add_minmax.first + << std::left << std::setw(12) << std::setprecision(5) << *add_minmax.second + << std::left << std::setw(12) << std::setprecision(5) << add_average + << std::endl; + + std::cout + << std::left << std::setw(12) << "Triad" + << std::left << std::setw(12) << std::setprecision(3) << 1.0E-06 * (3 * sizeof(double) * ARRAY_SIZE)/(*triad_minmax.first) + << std::left << std::setw(12) << std::setprecision(5) << *triad_minmax.first + << std::left << std::setw(12) << std::setprecision(5) << *triad_minmax.second + << std::left << std::setw(12) << std::setprecision(5) << triad_average + << std::endl; + delete[] stream; } From 377b3487482f368f0d38199e2a3ca071cf60b690 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 28 Apr 2016 11:15:25 +0100 Subject: [PATCH 022/117] Move implementation string to the common header file --- src/common.h | 2 ++ src/main.cpp | 3 +-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/common.h b/src/common.h index 02d43a2..c4bdadb 100644 --- a/src/common.h +++ b/src/common.h @@ -1,2 +1,4 @@ #define VERSION_STRING "2.0" +#define IMPLEMENTATION_STRING "CUDA" + diff --git a/src/main.cpp b/src/main.cpp index 50bc51f..71cb478 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -16,7 +16,6 @@ const unsigned int ARRAY_SIZE = 52428800; const unsigned int ntimes = 10; -#define IMPLEMENTATION_STRING "CUDA" template void check_solution(const unsigned int ntimes, std::vector& a, std::vector& b, std::vector& c); @@ -25,7 +24,7 
@@ int main(int argc, char *argv[]) { std::cout << "GPU-STREAM" << std::endl - << "Version:" << VERSION_STRING << std::endl + << "Version: " << VERSION_STRING << std::endl << "Implementation: " << IMPLEMENTATION_STRING << std::endl; From 8d88afdedb52fd9a19c4a76ff6eae6fead5f0e24 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 28 Apr 2016 11:57:09 +0100 Subject: [PATCH 023/117] Tidy up timing printing to reduce code duplication --- src/main.cpp | 82 +++++++++++++++++++--------------------------------- 1 file changed, 29 insertions(+), 53 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 71cb478..42dbe28 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -41,10 +41,7 @@ int main(int argc, char *argv[]) stream->write_arrays(a, b, c); // List of times - std::vector copy_timings; - std::vector mul_timings; - std::vector add_timings; - std::vector triad_timings; + std::vector> timings(4); // Declare timers std::chrono::high_resolution_clock::time_point t1, t2; @@ -52,30 +49,29 @@ int main(int argc, char *argv[]) // Main loop for (unsigned int k = 0; k < ntimes; k++) { - // Execute Copy t1 = std::chrono::high_resolution_clock::now(); stream->copy(); t2 = std::chrono::high_resolution_clock::now(); - copy_timings.push_back(std::chrono::duration_cast >(t2 - t1).count()); + timings[0].push_back(std::chrono::duration_cast >(t2 - t1).count()); // Execute Mul t1 = std::chrono::high_resolution_clock::now(); stream->mul(); t2 = std::chrono::high_resolution_clock::now(); - mul_timings.push_back(std::chrono::duration_cast >(t2 - t1).count()); + timings[1].push_back(std::chrono::duration_cast >(t2 - t1).count()); // Execute Add t1 = std::chrono::high_resolution_clock::now(); stream->add(); t2 = std::chrono::high_resolution_clock::now(); - add_timings.push_back(std::chrono::duration_cast >(t2 - t1).count()); + timings[2].push_back(std::chrono::duration_cast >(t2 - t1).count()); // Execute Triad t1 = std::chrono::high_resolution_clock::now(); stream->triad(); t2 = 
std::chrono::high_resolution_clock::now(); - triad_timings.push_back(std::chrono::duration_cast >(t2 - t1).count()); + timings[3].push_back(std::chrono::duration_cast >(t2 - t1).count()); } @@ -83,21 +79,7 @@ int main(int argc, char *argv[]) stream->read_arrays(a, b, c); check_solution(ntimes, a, b, c); - // Crunch timing results - - // Get min/max; ignore first result - auto copy_minmax = std::minmax_element(copy_timings.begin()+1, copy_timings.end()); - auto mul_minmax = std::minmax_element(mul_timings.begin()+1, mul_timings.end()); - auto add_minmax = std::minmax_element(add_timings.begin()+1, add_timings.end()); - auto triad_minmax = std::minmax_element(triad_timings.begin()+1, triad_timings.end()); - - double copy_average = std::accumulate(copy_timings.begin()+1, copy_timings.end(), 0.0) / (double)(ntimes - 1); - double mul_average = std::accumulate(mul_timings.begin()+1, mul_timings.end(), 0.0) / (double)(ntimes - 1); - double add_average = std::accumulate(add_timings.begin()+1, add_timings.end(), 0.0) / (double)(ntimes - 1); - double triad_average = std::accumulate(triad_timings.begin()+1, triad_timings.end(), 0.0) / (double)(ntimes - 1); - - - // Display results + // Display timing results std::cout << std::left << std::setw(12) << "Function" << std::left << std::setw(12) << "MBytes/sec" @@ -107,39 +89,33 @@ int main(int argc, char *argv[]) std::cout << std::fixed; - std::cout - << std::left << std::setw(12) << "Copy" - << std::left << std::setw(12) << std::setprecision(3) << 1.0E-06 * (2 * sizeof(double) * ARRAY_SIZE)/(*copy_minmax.first) - << std::left << std::setw(12) << std::setprecision(5) << *copy_minmax.first - << std::left << std::setw(12) << std::setprecision(5) << *copy_minmax.second - << std::left << std::setw(12) << std::setprecision(5) << copy_average - << std::endl; + std::string labels[4] = {"Copy", "Mul", "Add", "Triad"}; + size_t sizes[4] = { + 2 * sizeof(double) * ARRAY_SIZE, + 2 * sizeof(double) * ARRAY_SIZE, + 3 * sizeof(double) * 
ARRAY_SIZE, + 3 * sizeof(double) * ARRAY_SIZE + }; + for (int i = 0; i < 4; i++) + { + // Get min/max; ignore the first result + auto minmax = std::minmax_element(timings[i].begin()+1, timings[i].end()); - std::cout - << std::left << std::setw(12) << "Mul" - << std::left << std::setw(12) << std::setprecision(3) << 1.0E-06 * (2 * sizeof(double) * ARRAY_SIZE)/(*mul_minmax.first) - << std::left << std::setw(12) << std::setprecision(5) << *mul_minmax.first - << std::left << std::setw(12) << std::setprecision(5) << *mul_minmax.second - << std::left << std::setw(12) << std::setprecision(5) << mul_average - << std::endl; + // Calculate average; ignore the first result + double average = std::accumulate(timings[i].begin()+1, timings[i].end(), 0.0) / (double)(ntimes - 1); + // Display results + std::cout + << std::left << std::setw(12) << labels[i] + << std::left << std::setw(12) << std::setprecision(3) << 1.0E-6 * sizes[i] / (*minmax.first) + << std::left << std::setw(12) << std::setprecision(5) << *minmax.first + << std::left << std::setw(12) << std::setprecision(5) << *minmax.second + << std::left << std::setw(12) << std::setprecision(5) << average + << std::endl; + - std::cout - << std::left << std::setw(12) << "Add" - << std::left << std::setw(12) << std::setprecision(3) << 1.0E-06 * (3 * sizeof(double) * ARRAY_SIZE)/(*add_minmax.first) - << std::left << std::setw(12) << std::setprecision(5) << *add_minmax.first - << std::left << std::setw(12) << std::setprecision(5) << *add_minmax.second - << std::left << std::setw(12) << std::setprecision(5) << add_average - << std::endl; - - std::cout - << std::left << std::setw(12) << "Triad" - << std::left << std::setw(12) << std::setprecision(3) << 1.0E-06 * (3 * sizeof(double) * ARRAY_SIZE)/(*triad_minmax.first) - << std::left << std::setw(12) << std::setprecision(5) << *triad_minmax.first - << std::left << std::setw(12) << std::setprecision(5) << *triad_minmax.second - << std::left << std::setw(12) << std::setprecision(5) << 
triad_average - << std::endl; + } delete[] stream; From 59fe9738b654b7cd2ad96e856ee7a0232a73cb76 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 28 Apr 2016 12:03:50 +0100 Subject: [PATCH 024/117] Add a templated run function to make double/float switch easy --- src/main.cpp | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 42dbe28..d09c3c9 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -20,6 +20,9 @@ const unsigned int ntimes = 10; template void check_solution(const unsigned int ntimes, std::vector& a, std::vector& b, std::vector& c); +template +void run(); + int main(int argc, char *argv[]) { std::cout @@ -27,16 +30,22 @@ int main(int argc, char *argv[]) << "Version: " << VERSION_STRING << std::endl << "Implementation: " << IMPLEMENTATION_STRING << std::endl; + run(); +} + +template +void run() +{ // Create host vectors - std::vector a(ARRAY_SIZE, 1.0); - std::vector b(ARRAY_SIZE, 2.0); - std::vector c(ARRAY_SIZE, 0.0); + std::vector a(ARRAY_SIZE, 1.0); + std::vector b(ARRAY_SIZE, 2.0); + std::vector c(ARRAY_SIZE, 0.0); - Stream *stream; + Stream *stream; // Use the CUDA implementation - stream = new CUDAStream(ARRAY_SIZE); + stream = new CUDAStream(ARRAY_SIZE); stream->write_arrays(a, b, c); @@ -77,7 +86,7 @@ int main(int argc, char *argv[]) // Check solutions stream->read_arrays(a, b, c); - check_solution(ntimes, a, b, c); + check_solution(ntimes, a, b, c); // Display timing results std::cout @@ -91,10 +100,10 @@ int main(int argc, char *argv[]) std::string labels[4] = {"Copy", "Mul", "Add", "Triad"}; size_t sizes[4] = { - 2 * sizeof(double) * ARRAY_SIZE, - 2 * sizeof(double) * ARRAY_SIZE, - 3 * sizeof(double) * ARRAY_SIZE, - 3 * sizeof(double) * ARRAY_SIZE + 2 * sizeof(T) * ARRAY_SIZE, + 2 * sizeof(T) * ARRAY_SIZE, + 3 * sizeof(T) * ARRAY_SIZE, + 3 * sizeof(T) * ARRAY_SIZE }; for (int i = 0; i < 4; i++) From a745ffc72413ae419afa997d780dda4b47266e3a Mon Sep 17 00:00:00 2001 
From: Tom Deakin Date: Thu, 28 Apr 2016 12:07:09 +0100 Subject: [PATCH 025/117] Add more keywords to CUDA header --- src/CUDAStream.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/CUDAStream.h b/src/CUDAStream.h index 89ad536..e6505c6 100644 --- a/src/CUDAStream.h +++ b/src/CUDAStream.h @@ -26,8 +26,8 @@ class CUDAStream : public Stream virtual void mul() override; virtual void triad() override; - void write_arrays(const std::vector& a, const std::vector& b, const std::vector& c); - void read_arrays(std::vector& a, std::vector& b, std::vector& c); + virtual void write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) override; + virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; }; From 38e1e3b704a68e01cfcdb552d05f2acd44133578 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 28 Apr 2016 12:59:14 +0100 Subject: [PATCH 026/117] Add starts of OpenCL implementation --- src/OCLStream.cpp | 77 +++++++++++++++++++++++++++++++++++++++++++++++ src/OCLStream.h | 43 ++++++++++++++++++++++++++ src/main.cpp | 8 +++-- 3 files changed, 126 insertions(+), 2 deletions(-) create mode 100644 src/OCLStream.cpp create mode 100644 src/OCLStream.h diff --git a/src/OCLStream.cpp b/src/OCLStream.cpp new file mode 100644 index 0000000..eac19c0 --- /dev/null +++ b/src/OCLStream.cpp @@ -0,0 +1,77 @@ + +#include "OCLStream.h" + +template +OCLStream::OCLStream(const unsigned int ARRAY_SIZE) +{ + array_size = ARRAY_SIZE; + + // Setup default OpenCL GPU + context = cl::Context::getDefault(); + //queue = cl::CommandQueue::getDefault(); + + // Create program + + std::string kernels{R"CLC( + + const double scalar = 3.0; + + kernel void copy( + global const double * restrict a, + global double * restrict c) + { + const size_t i = get_global_id(0); + c[i] = a[i]; + } + )CLC"}; + +std::cout << kernels << std::endl; + + //cl::Program program(kernels); + //program.build(); + +exit(-1); + + +} + +template +void 
OCLStream::copy() +{ + return; +} + +template +void OCLStream::mul() +{ + return; +} + +template +void OCLStream::add() +{ + return; +} + +template +void OCLStream::triad() +{ + return; +} + +template +void OCLStream::write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) +{ + return; +} + +template +void OCLStream::read_arrays(std::vector& a, std::vector& b, std::vector& c) +{ + return; +} + + +template class OCLStream; +template class OCLStream; + diff --git a/src/OCLStream.h b/src/OCLStream.h new file mode 100644 index 0000000..a5cac61 --- /dev/null +++ b/src/OCLStream.h @@ -0,0 +1,43 @@ + +#include +#include + +#define CL_HPP_ENABLE_EXCEPTIONS +#define CL_HPP_TARGET_OPENCL_VERSION 120 +#define CL_HPP_MINIMUM_OPENCL_VERSION 120 + +#include "cl2.hpp" + +#include "Stream.h" + +template +class OCLStream : public Stream +{ + protected: + // Size of arrays + unsigned int array_size; + + // Device side pointers to arrays + cl::Buffer d_a; + cl::Buffer d_b; + cl::Buffer d_c; + + // OpenCL objects + cl::Context context; + cl::CommandQueue queue; + + public: + + OCLStream(const unsigned int); + ~OCLStream(); + + virtual void copy() override; + virtual void add() override; + virtual void mul() override; + virtual void triad() override; + + virtual void write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) override; + virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; + +}; + diff --git a/src/main.cpp b/src/main.cpp index d09c3c9..39cfe86 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -10,7 +10,8 @@ #include "common.h" #include "Stream.h" -#include "CUDAStream.h" + +#include "OCLStream.h" const unsigned int ARRAY_SIZE = 52428800; @@ -45,7 +46,10 @@ void run() Stream *stream; // Use the CUDA implementation - stream = new CUDAStream(ARRAY_SIZE); + //stream = new CUDAStream(ARRAY_SIZE); + + // Use the OpenCL implementation + stream = new OCLStream(ARRAY_SIZE); stream->write_arrays(a, b, c); From 
eeaf9358ab5cb39e8437fce34c1ff4b3a2275061 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 28 Apr 2016 15:01:43 +0100 Subject: [PATCH 027/117] Create OCL kernel functors --- src/OCLStream.cpp | 80 +++++++++++++++++++++++++++++++++++------------ src/OCLStream.h | 5 +++ 2 files changed, 65 insertions(+), 20 deletions(-) diff --git a/src/OCLStream.cpp b/src/OCLStream.cpp index eac19c0..f60c161 100644 --- a/src/OCLStream.cpp +++ b/src/OCLStream.cpp @@ -1,6 +1,47 @@ #include "OCLStream.h" +std::string kernels{R"CLC( + + constant TYPE scalar = 3.0; + + kernel void copy( + global const TYPE * restrict a, + global TYPE * restrict c) + { + const size_t i = get_global_id(0); + c[i] = a[i]; + } + + kernel void mul( + global TYPE * restrict b, + global const TYPE * restrict c) + { + const size_t i = get_global_id(0); + b[i] = scalar * c[i]; + } + + kernel void add( + global const TYPE * restrict a, + global const TYPE * restrict b, + global TYPE * restrict c) + { + const size_t i = get_global_id(0); + c[i] = a[i] + b[i]; + } + + kernel void triad( + global TYPE * restrict a, + global const TYPE * restrict b, + global const TYPE * restrict c) + { + const size_t i = get_global_id(0); + a[i] = b[i] + scalar * c[i]; + } + +)CLC"}; + + template OCLStream::OCLStream(const unsigned int ARRAY_SIZE) { @@ -8,31 +49,30 @@ OCLStream::OCLStream(const unsigned int ARRAY_SIZE) // Setup default OpenCL GPU context = cl::Context::getDefault(); - //queue = cl::CommandQueue::getDefault(); + queue = cl::CommandQueue::getDefault(); // Create program + cl::Program program(kernels); + if (sizeof(T) == sizeof(double)) + program.build("-DTYPE=double"); + else if (sizeof(T) == sizeof(float)) + program.build("-DTYPE=float"); - std::string kernels{R"CLC( - - const double scalar = 3.0; - - kernel void copy( - global const double * restrict a, - global double * restrict c) - { - const size_t i = get_global_id(0); - c[i] = a[i]; - } - )CLC"}; - -std::cout << kernels << std::endl; - - //cl::Program 
program(kernels); - //program.build(); - -exit(-1); + // Create kernels + copy_kernel = new cl::KernelFunctor(program, "copy"); + mul_kernel = new cl::KernelFunctor(program, "mul"); + add_kernel = new cl::KernelFunctor(program, "add"); + triad_kernel = new cl::KernelFunctor(program, "triad"); +} +template +OCLStream::~OCLStream() +{ + delete[] copy_kernel; + delete[] mul_kernel; + delete[] add_kernel; + delete[] triad_kernel; } template diff --git a/src/OCLStream.h b/src/OCLStream.h index a5cac61..28a9be1 100644 --- a/src/OCLStream.h +++ b/src/OCLStream.h @@ -26,6 +26,11 @@ class OCLStream : public Stream cl::Context context; cl::CommandQueue queue; + cl::KernelFunctor *copy_kernel; + cl::KernelFunctor * mul_kernel; + cl::KernelFunctor *add_kernel; + cl::KernelFunctor *triad_kernel; + public: OCLStream(const unsigned int); From 77f6df856cceab0c2ac99bad1fd24669d54b84c7 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 28 Apr 2016 15:05:01 +0100 Subject: [PATCH 028/117] Call kernels in OCL --- src/OCLStream.cpp | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/src/OCLStream.cpp b/src/OCLStream.cpp index f60c161..4aa2da6 100644 --- a/src/OCLStream.cpp +++ b/src/OCLStream.cpp @@ -78,25 +78,41 @@ OCLStream::~OCLStream() template void OCLStream::copy() { - return; + (*copy_kernel)( + cl::EnqueueArgs(queue, cl::NDRange(array_size)), + d_a, d_c + ); + queue.finish(); } template void OCLStream::mul() { - return; + (*mul_kernel)( + cl::EnqueueArgs(queue, cl::NDRange(array_size)), + d_b, d_c + ); + queue.finish(); } template void OCLStream::add() { - return; + (*add_kernel)( + cl::EnqueueArgs(queue, cl::NDRange(array_size)), + d_a, d_b, d_c + ); + queue.finish(); } template void OCLStream::triad() { - return; + (*triad_kernel)( + cl::EnqueueArgs(queue, cl::NDRange(array_size)), + d_a, d_b, d_c + ); + queue.finish(); } template From b5149691935822904b38a7749e39777feddf1cd6 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 28 Apr 
2016 15:08:12 +0100 Subject: [PATCH 029/117] Create OCL device buffers --- src/OCLStream.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/OCLStream.cpp b/src/OCLStream.cpp index 4aa2da6..9e27aa9 100644 --- a/src/OCLStream.cpp +++ b/src/OCLStream.cpp @@ -64,6 +64,11 @@ OCLStream::OCLStream(const unsigned int ARRAY_SIZE) add_kernel = new cl::KernelFunctor(program, "add"); triad_kernel = new cl::KernelFunctor(program, "triad"); + // Create buffers + d_a = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(T) * ARRAY_SIZE); + d_b = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(T) * ARRAY_SIZE); + d_c = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(T) * ARRAY_SIZE); + } template @@ -118,6 +123,7 @@ void OCLStream::triad() template void OCLStream::write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) { + return; } From 088778977b3719e3658de91d1d65fdddb2a5844a Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 28 Apr 2016 15:11:02 +0100 Subject: [PATCH 030/117] Add OCL copy functions --- src/OCLStream.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/OCLStream.cpp b/src/OCLStream.cpp index 9e27aa9..e5145b4 100644 --- a/src/OCLStream.cpp +++ b/src/OCLStream.cpp @@ -123,14 +123,17 @@ void OCLStream::triad() template void OCLStream::write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) { - - return; + cl::copy(a.begin(), a.end(), d_a); + cl::copy(b.begin(), b.end(), d_b); + cl::copy(c.begin(), c.end(), d_c); } template void OCLStream::read_arrays(std::vector& a, std::vector& b, std::vector& c) { - return; + cl::copy(d_a, a.begin(), a.end()); + cl::copy(d_b, b.begin(), b.end()); + cl::copy(d_c, c.begin(), c.end()); } From b9e70e11ab34dbeed383ca145f1a79cfc0bfcc43 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 28 Apr 2016 16:58:32 +0100 Subject: [PATCH 031/117] Add CMakeLists.txt file with CUDA and OCL builds --- CMakeLists.txt | 19 +++++++++++++++++++ src/main.cpp | 10 
+++++++++- 2 files changed, 28 insertions(+), 1 deletion(-) create mode 100644 CMakeLists.txt diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..5e1abd6 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,19 @@ + +cmake_minimum_required(VERSION 2.8) + +project(gpu-stream) + +set(CMAKE_SOURCE_DIR src/) + +list(APPEND CMAKE_CXX_FLAGS --std=c++11) + +find_package(CUDA REQUIRED) + +cuda_add_executable(cuda.exe src/main.cpp src/CUDAStream.cu) +target_compile_definitions(cuda.exe PUBLIC CUDA) + +find_package(OpenCL) +add_executable(ocl.exe src/main.cpp src/OCLStream.cpp) +target_compile_definitions(ocl.exe PUBLIC OCL) +target_link_libraries(ocl.exe ${OpenCL_LIBRARY}) + diff --git a/src/main.cpp b/src/main.cpp index 39cfe86..008a6a2 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -11,7 +11,11 @@ #include "common.h" #include "Stream.h" +#if defined(CUDA) +#include "CUDAStream.h" +#elif defined(OCL) #include "OCLStream.h" +#endif const unsigned int ARRAY_SIZE = 52428800; @@ -45,12 +49,16 @@ void run() Stream *stream; +#if defined(CUDA) // Use the CUDA implementation - //stream = new CUDAStream(ARRAY_SIZE); + stream = new CUDAStream(ARRAY_SIZE); +#elif defined(OCL) // Use the OpenCL implementation stream = new OCLStream(ARRAY_SIZE); +#endif + stream->write_arrays(a, b, c); // List of times From 7006871cbe0beda6c9c444c495f34dff23af696e Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 28 Apr 2016 17:10:14 +0100 Subject: [PATCH 032/117] Get version from CMake configued header and only build implementations which have the runtime around --- CMakeLists.txt | 20 ++++++++++++++++---- src/common.h | 4 ---- src/common.h.in | 4 ++++ 3 files changed, 20 insertions(+), 8 deletions(-) delete mode 100644 src/common.h create mode 100644 src/common.h.in diff --git a/CMakeLists.txt b/CMakeLists.txt index 5e1abd6..5bb8a4b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,17 +3,29 @@ cmake_minimum_required(VERSION 2.8) project(gpu-stream) -set(CMAKE_SOURCE_DIR 
src/) +set(gpu-stream_VERSION_MAJOR 2) +set(gpu-stream_VERSION_MINOR 0) list(APPEND CMAKE_CXX_FLAGS --std=c++11) -find_package(CUDA REQUIRED) - +find_package(CUDA QUIET) +if (${CUDA_FOUND}) +set(IMPLEMENTATION CUDA) +configure_file(src/common.h.in src/common_cuda.h) cuda_add_executable(cuda.exe src/main.cpp src/CUDAStream.cu) target_compile_definitions(cuda.exe PUBLIC CUDA) +else (${CUDA_FOUND}) +message("Skipping CUDA...") +endif (${CUDA_FOUND}) -find_package(OpenCL) +find_package(OpenCL QUIET) +if (${OpenCL_FOUND}) +set(gpu-stream_IMPLEMENTATION OpenCL) +configure_file(src/common.h.in src/common_ocl.h) add_executable(ocl.exe src/main.cpp src/OCLStream.cpp) target_compile_definitions(ocl.exe PUBLIC OCL) target_link_libraries(ocl.exe ${OpenCL_LIBRARY}) +else (${OpenCL_FOUND}) +message("Skipping OpenCL...") +endif (${OpenCL_FOUND}) diff --git a/src/common.h b/src/common.h deleted file mode 100644 index c4bdadb..0000000 --- a/src/common.h +++ /dev/null @@ -1,4 +0,0 @@ - -#define VERSION_STRING "2.0" -#define IMPLEMENTATION_STRING "CUDA" - diff --git a/src/common.h.in b/src/common.h.in new file mode 100644 index 0000000..44dc040 --- /dev/null +++ b/src/common.h.in @@ -0,0 +1,4 @@ + +#define VERSION_STRING "@gpu-stream_VERSION_MAJOR@.@gpu-stream_VERSION_MINOR@" +#define IMPLEMENTATION_STRING "@gpu-stream_IMPLEMENTATION@" + From a1cab96c57ecbf9c68dd7623251b8cbd7e80417d Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 28 Apr 2016 17:20:40 +0100 Subject: [PATCH 033/117] Define the implementation strings in each implementation header --- CMakeLists.txt | 20 +++++++++----------- src/CUDAStream.h | 2 ++ src/OCLStream.h | 2 ++ src/common.h.in | 1 - 4 files changed, 13 insertions(+), 12 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5bb8a4b..8822353 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,24 +8,22 @@ set(gpu-stream_VERSION_MINOR 0) list(APPEND CMAKE_CXX_FLAGS --std=c++11) +configure_file(src/common.h.in src/common.h) + find_package(CUDA 
QUIET) if (${CUDA_FOUND}) -set(IMPLEMENTATION CUDA) -configure_file(src/common.h.in src/common_cuda.h) -cuda_add_executable(cuda.exe src/main.cpp src/CUDAStream.cu) -target_compile_definitions(cuda.exe PUBLIC CUDA) + cuda_add_executable(cuda.exe src/main.cpp src/CUDAStream.cu) + target_compile_definitions(cuda.exe PUBLIC CUDA) else (${CUDA_FOUND}) -message("Skipping CUDA...") + message("Skipping CUDA...") endif (${CUDA_FOUND}) find_package(OpenCL QUIET) if (${OpenCL_FOUND}) -set(gpu-stream_IMPLEMENTATION OpenCL) -configure_file(src/common.h.in src/common_ocl.h) -add_executable(ocl.exe src/main.cpp src/OCLStream.cpp) -target_compile_definitions(ocl.exe PUBLIC OCL) -target_link_libraries(ocl.exe ${OpenCL_LIBRARY}) + add_executable(ocl.exe src/main.cpp src/OCLStream.cpp) + target_compile_definitions(ocl.exe PUBLIC OCL) + target_link_libraries(ocl.exe ${OpenCL_LIBRARY}) else (${OpenCL_FOUND}) -message("Skipping OpenCL...") + message("Skipping OpenCL...") endif (${OpenCL_FOUND}) diff --git a/src/CUDAStream.h b/src/CUDAStream.h index e6505c6..34e0303 100644 --- a/src/CUDAStream.h +++ b/src/CUDAStream.h @@ -4,6 +4,8 @@ #include "Stream.h" +#define IMPLEMENTATION_STRING "CUDA" + template class CUDAStream : public Stream { diff --git a/src/OCLStream.h b/src/OCLStream.h index 28a9be1..f9c133e 100644 --- a/src/OCLStream.h +++ b/src/OCLStream.h @@ -10,6 +10,8 @@ #include "Stream.h" +#define IMPLEMENTATION_STRING "OpenCL" + template class OCLStream : public Stream { diff --git a/src/common.h.in b/src/common.h.in index 44dc040..fbf953c 100644 --- a/src/common.h.in +++ b/src/common.h.in @@ -1,4 +1,3 @@ #define VERSION_STRING "@gpu-stream_VERSION_MAJOR@.@gpu-stream_VERSION_MINOR@" -#define IMPLEMENTATION_STRING "@gpu-stream_IMPLEMENTATION@" From d1f8cd1b481d773790508d38545fc0f6856744a5 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 28 Apr 2016 23:06:06 +0100 Subject: [PATCH 034/117] Implement some CUDA routines for device info --- src/CUDAStream.cu | 22 
++++++++++++++++++++++ src/CUDAStream.h | 4 ++++ src/Stream.h | 7 ++++--- 3 files changed, 30 insertions(+), 3 deletions(-) diff --git a/src/CUDAStream.cu b/src/CUDAStream.cu index 7c9afb3..3a65f15 100644 --- a/src/CUDAStream.cu +++ b/src/CUDAStream.cu @@ -134,6 +134,28 @@ void CUDAStream::triad() check_error(); } +template +std::string CUDAStream::getDeviceName(const int device) +{ + cudaSetDevice(device); + check_error(); + cudaDeviceProp props; + cudaGetDeviceProperties(&props, device); + check_error(); + return std::string(props.name); +} + +template +std::string CUDAStream::getDeviceDriver(const int device) +{ + cudaSetDevice(device); + check_error(); + int driver; + cudaDriverGetVersion(&driver); + check_error(); + return std::to_string(driver); +} + template class CUDAStream; template class CUDAStream; diff --git a/src/CUDAStream.h b/src/CUDAStream.h index 34e0303..c741f5c 100644 --- a/src/CUDAStream.h +++ b/src/CUDAStream.h @@ -31,5 +31,9 @@ class CUDAStream : public Stream virtual void write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) override; virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; + static void listDevices(void); + static std::string getDeviceName(const int device); + static std::string getDeviceDriver(const int device); + }; diff --git a/src/Stream.h b/src/Stream.h index ecf043b..573af83 100644 --- a/src/Stream.h +++ b/src/Stream.h @@ -2,6 +2,7 @@ #pragma once #include +#include template class Stream @@ -19,9 +20,9 @@ class Stream virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) = 0; // Implementation specific device functions - static std::vector getDeviceList(); - static std::vector getDeviceName(); - static std::vector getDeviceDriver(); + static void listDevices(void); + static std::string getDeviceName(const int); + static std::string getDeviceDriver(const int); }; From f5ba77f4bd7fd99043eb9cf816172ac144a19aa5 Mon Sep 17 00:00:00 2001 From: Tom Deakin 
Date: Thu, 28 Apr 2016 23:20:10 +0100 Subject: [PATCH 035/117] List CUDA devices function --- src/CUDAStream.cu | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/src/CUDAStream.cu b/src/CUDAStream.cu index 3a65f15..2805c1e 100644 --- a/src/CUDAStream.cu +++ b/src/CUDAStream.cu @@ -134,11 +134,35 @@ void CUDAStream::triad() check_error(); } + +template +void CUDAStream::listDevices(void) +{ + // Get number of devices + int count; + cudaGetDeviceCount(&count); + check_error(); + + // Print device names + if (count == 0) + { + std::cout << "No devices found." << std::endl; + } + else + { + std::cout << std::endl; + std::cout << "Devices:" << std::endl; + for (int i = 0; i < count; i++) + { + std::cout << i << ": " << getDeviceName(i) << std::endl; + } + std::cout << std::endl; + } +} + template std::string CUDAStream::getDeviceName(const int device) { - cudaSetDevice(device); - check_error(); cudaDeviceProp props; cudaGetDeviceProperties(&props, device); check_error(); From 00305ba120580daec77fb8923f4cd834d5005da2 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 28 Apr 2016 23:37:53 +0100 Subject: [PATCH 036/117] Write to std err --- src/CUDAStream.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/CUDAStream.cu b/src/CUDAStream.cu index 2805c1e..7f5f25d 100644 --- a/src/CUDAStream.cu +++ b/src/CUDAStream.cu @@ -146,7 +146,7 @@ void CUDAStream::listDevices(void) // Print device names if (count == 0) { - std::cout << "No devices found." << std::endl; + std::cerr << "No devices found." 
<< std::endl; } else { From 1a96b71935b2ee38d4f84be352d3d6df8e89d551 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Fri, 29 Apr 2016 13:59:31 +0100 Subject: [PATCH 037/117] First attempt at parse args --- src/main.cpp | 39 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/src/main.cpp b/src/main.cpp index 008a6a2..cabac5f 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include "common.h" #include "Stream.h" @@ -20,6 +21,7 @@ const unsigned int ARRAY_SIZE = 52428800; const unsigned int ntimes = 10; +unsigned int deviceIndex = 0; template @@ -28,6 +30,8 @@ void check_solution(const unsigned int ntimes, std::vector& a, std::vector template void run(); +void parseArguments(int argc, char *argv[]); + int main(int argc, char *argv[]) { std::cout @@ -35,7 +39,9 @@ int main(int argc, char *argv[]) << "Version: " << VERSION_STRING << std::endl << "Implementation: " << IMPLEMENTATION_STRING << std::endl; - run(); + parseArguments(argc, argv); + + run(); } @@ -186,3 +192,34 @@ void check_solution(const unsigned int ntimes, std::vector& a, std::vector } +int parseUInt(const char *str, unsigned int *output) +{ + std::size_t next; + *output = std::stoul(str, &next); + return !next; +} + +void parseArguments(int argc, char *argv[]) +{ + for (int i = 1; i < argc; i++) + { + if (!std::string("--list").compare(argv[i])) + { + #if defined(CUDA) + CUDAStream::listDevices(); + #elif defined(OCL) + OCLStream::listDevices(); + #endif + exit(EXIT_SUCCESS); + } + else if (!std::string("--device").compare(argv[i])) + { + if (++i >= argc || !parseUInt(argv[i], &deviceIndex)) + { + std::cerr << "Invalid device index." 
<< std::endl; + exit(EXIT_FAILURE); + } + } + } +} + From 3c394b9db08f58855a6e9efea9e8f9e48ebaab08 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Fri, 29 Apr 2016 18:28:21 +0100 Subject: [PATCH 038/117] Move device functions outside class --- src/CUDAStream.cu | 11 +++++------ src/CUDAStream.h | 4 ---- src/Stream.h | 10 +++++----- src/main.cpp | 6 +----- 4 files changed, 11 insertions(+), 20 deletions(-) diff --git a/src/CUDAStream.cu b/src/CUDAStream.cu index 7f5f25d..e7ce539 100644 --- a/src/CUDAStream.cu +++ b/src/CUDAStream.cu @@ -135,8 +135,7 @@ void CUDAStream::triad() } -template -void CUDAStream::listDevices(void) +void listDevices(void) { // Get number of devices int count; @@ -160,8 +159,8 @@ void CUDAStream::listDevices(void) } } -template -std::string CUDAStream::getDeviceName(const int device) + +std::string getDeviceName(const int device) { cudaDeviceProp props; cudaGetDeviceProperties(&props, device); @@ -169,8 +168,8 @@ std::string CUDAStream::getDeviceName(const int device) return std::string(props.name); } -template -std::string CUDAStream::getDeviceDriver(const int device) + +std::string getDeviceDriver(const int device) { cudaSetDevice(device); check_error(); diff --git a/src/CUDAStream.h b/src/CUDAStream.h index c741f5c..34e0303 100644 --- a/src/CUDAStream.h +++ b/src/CUDAStream.h @@ -31,9 +31,5 @@ class CUDAStream : public Stream virtual void write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) override; virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; - static void listDevices(void); - static std::string getDeviceName(const int device); - static std::string getDeviceDriver(const int device); - }; diff --git a/src/Stream.h b/src/Stream.h index 573af83..c31d62d 100644 --- a/src/Stream.h +++ b/src/Stream.h @@ -19,10 +19,10 @@ class Stream virtual void write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) = 0; virtual void read_arrays(std::vector& a, 
std::vector& b, std::vector& c) = 0; - // Implementation specific device functions - static void listDevices(void); - static std::string getDeviceName(const int); - static std::string getDeviceDriver(const int); - }; + +// Implementation specific device functions +static void listDevices(void); +static std::string getDeviceName(const int); +static std::string getDeviceDriver(const int); diff --git a/src/main.cpp b/src/main.cpp index cabac5f..0b9bc90 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -205,11 +205,7 @@ void parseArguments(int argc, char *argv[]) { if (!std::string("--list").compare(argv[i])) { - #if defined(CUDA) - CUDAStream::listDevices(); - #elif defined(OCL) - OCLStream::listDevices(); - #endif + listDevices(); exit(EXIT_SUCCESS); } else if (!std::string("--device").compare(argv[i])) From d557915007e358b1df55e21f6ee71bfd26b87fe4 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Fri, 29 Apr 2016 18:36:47 +0100 Subject: [PATCH 039/117] Remove static keyword --- src/Stream.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Stream.h b/src/Stream.h index c31d62d..527fe6a 100644 --- a/src/Stream.h +++ b/src/Stream.h @@ -23,6 +23,6 @@ class Stream // Implementation specific device functions -static void listDevices(void); -static std::string getDeviceName(const int); -static std::string getDeviceDriver(const int); +void listDevices(void); +std::string getDeviceName(const int); +std::string getDeviceDriver(const int); From 2cb4fe74b15017acdf67c65e0c6b574f7f738eee Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Fri, 29 Apr 2016 18:38:49 +0100 Subject: [PATCH 040/117] Use original parseUInt function --- src/main.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 0b9bc90..28b5ab5 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -7,7 +7,7 @@ #include #include #include -#include +#include #include "common.h" #include "Stream.h" @@ -194,9 +194,9 @@ void check_solution(const 
unsigned int ntimes, std::vector& a, std::vector int parseUInt(const char *str, unsigned int *output) { - std::size_t next; - *output = std::stoul(str, &next); - return !next; + char *next; + *output = strtoul(str, &next, 10); + return !strlen(next); } void parseArguments(int argc, char *argv[]) From 72ddd05f61507f303143826d3e7db2ca395694d8 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Fri, 29 Apr 2016 18:45:57 +0100 Subject: [PATCH 041/117] Add parse arguments code --- src/main.cpp | 55 +++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 50 insertions(+), 5 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 28b5ab5..11d11bb 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -19,9 +19,10 @@ #endif -const unsigned int ARRAY_SIZE = 52428800; -const unsigned int ntimes = 10; +unsigned int ARRAY_SIZE = 52428800; +unsigned int num_times = 10; unsigned int deviceIndex = 0; +bool use_float = false; template @@ -74,7 +75,7 @@ void run() std::chrono::high_resolution_clock::time_point t1, t2; // Main loop - for (unsigned int k = 0; k < ntimes; k++) + for (unsigned int k = 0; k < num_times; k++) { // Execute Copy t1 = std::chrono::high_resolution_clock::now(); @@ -104,7 +105,7 @@ void run() // Check solutions stream->read_arrays(a, b, c); - check_solution(ntimes, a, b, c); + check_solution(num_times, a, b, c); // Display timing results std::cout @@ -130,7 +131,7 @@ void run() auto minmax = std::minmax_element(timings[i].begin()+1, timings[i].end()); // Calculate average; ignore the first result - double average = std::accumulate(timings[i].begin()+1, timings[i].end(), 0.0) / (double)(ntimes - 1); + double average = std::accumulate(timings[i].begin()+1, timings[i].end(), 0.0) / (double)(num_times - 1); // Display results std::cout @@ -216,6 +217,50 @@ void parseArguments(int argc, char *argv[]) exit(EXIT_FAILURE); } } + else if (!std::string("--arraysize").compare(argv[i]) || + !std::string("-s").compare(argv[i])) + { + if (++i >= argc || 
!parseUInt(argv[i], &ARRAY_SIZE)) + { + std::cerr << "Invalid array size." << std::endl; + exit(EXIT_FAILURE); + } + } + else if (!std::string("--numtimes").compare(argv[i]) || + !std::string("-n").compare(argv[i])) + { + if (++i >= argc || !parseUInt(argv[i], &num_times)) + { + std::cerr << "Invalid number of times." << std::endl; + exit(EXIT_FAILURE); + } + } + else if (!std::string("--float").compare(argv[i])) + { + use_float = true; + std::cout << "Warning: If number of iterations set >= 8, expect rounding errors with single precision" << std::endl; + } + else if (!std::string("--help").compare(argv[i]) || + !std::string("-h").compare(argv[i])) + { + std::cout << std::endl; + std::cout << "Usage: " << argv[0] << " [OPTIONS]" << std::endl << std::endl; + std::cout << "Options:" << std::endl; + std::cout << " -h --help Print the message" << std::endl; + std::cout << " --list List available devices" << std::endl; + std::cout << " --device INDEX Select device at INDEX" << std::endl; + std::cout << " -s --arraysize SIZE Use SIZE elements in the array" << std::endl; + std::cout << " -n --numtimes NUM Run the test NUM times (NUM >= 2)" << std::endl; + std::cout << " --float Use floats (rather than doubles)" << std::endl; + std::cout << std::endl; + exit(EXIT_SUCCESS); + } + else + { + std::cerr << "Unrecognized argument '" << argv[i] << "' (try '--help')" + << std::endl; + exit(EXIT_FAILURE); + } } } From ac553589646602b0c70f55abe3423734bf2c6b3e Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 3 May 2016 10:51:16 +0100 Subject: [PATCH 042/117] Implement device info functions --- src/OCLStream.cpp | 83 +++++++++++++++++++++++++++++++++++++++++++++++ src/OCLStream.h | 4 +++ 2 files changed, 87 insertions(+) diff --git a/src/OCLStream.cpp b/src/OCLStream.cpp index e5145b4..34b7933 100644 --- a/src/OCLStream.cpp +++ b/src/OCLStream.cpp @@ -136,6 +136,89 @@ void OCLStream::read_arrays(std::vector& a, std::vector& b, std::vector cl::copy(d_c, c.begin(), c.end()); } +// 
Cache list of devices +bool cached = false; +std::vector devices; + +void getDeviceList(void) +{ + // Get list of platforms + std::vector platforms; + cl::Platform::get(&platforms); + + // Enumerate devices + for (unsigned i = 0; i < platforms.size(); i++) + { + std::vector plat_devices; + platforms[i].getDevices(CL_DEVICE_TYPE_ALL, &plat_devices); + devices.insert(devices.end(), plat_devices.begin(), plat_devices.end()); + } + cached = true; +} + +void listDevices(void) +{ + getDeviceList(); + + // Print device names + if (devices.size() == 0) + { + std::cerr << "No devices found." << std::endl; + } + else + { + std::cout << std::endl; + std::cout << "Devices:" << std::endl; + for (int i = 0; i < devices.size(); i++) + { + std::cout << i << ": " << getDeviceName(i) << std::endl; + } + std::cout << std::endl; + } + + +} + +std::string getDeviceName(const int device) +{ + if (!cached) + getDeviceList(); + + std::string name; + cl_device_info info = CL_DEVICE_NAME; + + if (device < devices.size()) + { + devices[device].getInfo(info, &name); + } + else + { + throw std::runtime_error("Error asking for name for non-existant device"); + } + + return name; + +} + +std::string getDeviceDriver(const int device) +{ + if (!cached) + getDeviceList(); + + std::string driver; + + if (device < devices.size()) + { + devices[device].getInfo(CL_DRIVER_VERSION, &driver); + } + else + { + throw std::runtime_error("Error asking for driver for non-existant device"); + } + + return driver; +} + template class OCLStream; template class OCLStream; diff --git a/src/OCLStream.h b/src/OCLStream.h index f9c133e..551cf28 100644 --- a/src/OCLStream.h +++ b/src/OCLStream.h @@ -1,4 +1,6 @@ +#pragma once + #include #include @@ -48,3 +50,5 @@ class OCLStream : public Stream }; +// Populate the devices list +void getDeviceList(void); From 77b521f5f0441f4d0bdd9f7bb3f73a03e210a83a Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 3 May 2016 10:52:27 +0100 Subject: [PATCH 043/117] Use float or double 
from CLI --- src/main.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/main.cpp b/src/main.cpp index 11d11bb..7ec7873 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -42,7 +42,10 @@ int main(int argc, char *argv[]) parseArguments(argc, argv); - run(); + if (use_float) + run(); + else + run(); } From d7c17d72d56247a04bcf8c58ca4586282d1e234b Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 3 May 2016 11:02:33 +0100 Subject: [PATCH 044/117] Use device index from CLI in OpenCL --- src/OCLStream.cpp | 40 ++++++++++++++++++++++++++-------------- src/OCLStream.h | 3 ++- src/main.cpp | 5 ++--- 3 files changed, 30 insertions(+), 18 deletions(-) diff --git a/src/OCLStream.cpp b/src/OCLStream.cpp index 34b7933..422b409 100644 --- a/src/OCLStream.cpp +++ b/src/OCLStream.cpp @@ -1,6 +1,11 @@ #include "OCLStream.h" +// Cache list of devices +bool cached = false; +std::vector devices; +void getDeviceList(void); + std::string kernels{R"CLC( constant TYPE scalar = 3.0; @@ -43,16 +48,27 @@ std::string kernels{R"CLC( template -OCLStream::OCLStream(const unsigned int ARRAY_SIZE) +OCLStream::OCLStream(const unsigned int ARRAY_SIZE, const int device_index) { + if (!cached) + getDeviceList(); + array_size = ARRAY_SIZE; // Setup default OpenCL GPU - context = cl::Context::getDefault(); - queue = cl::CommandQueue::getDefault(); + if (device_index >= devices.size()) + throw std::runtime_error("Invalid device index"); + device = devices[device_index]; + + // Print out device information + std::cout << "Using OpenCL device " << getDeviceName(device_index) << std::endl; + std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl; + + context = cl::Context(device); + queue = cl::CommandQueue(context); // Create program - cl::Program program(kernels); + cl::Program program(context, kernels); if (sizeof(T) == sizeof(double)) program.build("-DTYPE=double"); else if (sizeof(T) == sizeof(float)) @@ -123,23 +139,19 @@ void OCLStream::triad() template void 
OCLStream::write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) { - cl::copy(a.begin(), a.end(), d_a); - cl::copy(b.begin(), b.end(), d_b); - cl::copy(c.begin(), c.end(), d_c); + cl::copy(queue, a.begin(), a.end(), d_a); + cl::copy(queue, b.begin(), b.end(), d_b); + cl::copy(queue, c.begin(), c.end(), d_c); } template void OCLStream::read_arrays(std::vector& a, std::vector& b, std::vector& c) { - cl::copy(d_a, a.begin(), a.end()); - cl::copy(d_b, b.begin(), b.end()); - cl::copy(d_c, c.begin(), c.end()); + cl::copy(queue, d_a, a.begin(), a.end()); + cl::copy(queue, d_b, b.begin(), b.end()); + cl::copy(queue, d_c, c.begin(), c.end()); } -// Cache list of devices -bool cached = false; -std::vector devices; - void getDeviceList(void) { // Get list of platforms diff --git a/src/OCLStream.h b/src/OCLStream.h index 551cf28..3fd144a 100644 --- a/src/OCLStream.h +++ b/src/OCLStream.h @@ -27,6 +27,7 @@ class OCLStream : public Stream cl::Buffer d_c; // OpenCL objects + cl::Device device; cl::Context context; cl::CommandQueue queue; @@ -37,7 +38,7 @@ class OCLStream : public Stream public: - OCLStream(const unsigned int); + OCLStream(const unsigned int, const int); ~OCLStream(); virtual void copy() override; diff --git a/src/main.cpp b/src/main.cpp index 7ec7873..1171180 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -61,11 +61,11 @@ void run() #if defined(CUDA) // Use the CUDA implementation - stream = new CUDAStream(ARRAY_SIZE); + stream = new CUDAStream(ARRAY_SIZE, deviceIndex); #elif defined(OCL) // Use the OpenCL implementation - stream = new OCLStream(ARRAY_SIZE); + stream = new OCLStream(ARRAY_SIZE, deviceIndex); #endif @@ -144,7 +144,6 @@ void run() << std::left << std::setw(12) << std::setprecision(5) << *minmax.second << std::left << std::setw(12) << std::setprecision(5) << average << std::endl; - } From 3462e61c162becfb38f46e684a70badc21eae604 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 3 May 2016 11:05:21 +0100 Subject: [PATCH 
045/117] Check device support float --- src/OCLStream.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/OCLStream.cpp b/src/OCLStream.cpp index 422b409..ed419b5 100644 --- a/src/OCLStream.cpp +++ b/src/OCLStream.cpp @@ -70,7 +70,12 @@ OCLStream::OCLStream(const unsigned int ARRAY_SIZE, const int device_index) // Create program cl::Program program(context, kernels); if (sizeof(T) == sizeof(double)) + { + // Check device can do double + if (!device.getInfo()) + throw std::runtime_error("Device does not support double precision, please use --float"); program.build("-DTYPE=double"); + } else if (sizeof(T) == sizeof(float)) program.build("-DTYPE=float"); From fd121c2467ef0c3554f3b8e292d2b06559905b00 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 3 May 2016 11:15:38 +0100 Subject: [PATCH 046/117] Use device info to select CUDA device --- src/CUDAStream.cu | 16 +++++++++++++++- src/CUDAStream.h | 2 +- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/src/CUDAStream.cu b/src/CUDAStream.cu index e7ce539..956be7d 100644 --- a/src/CUDAStream.cu +++ b/src/CUDAStream.cu @@ -12,8 +12,22 @@ void check_error(void) } template -CUDAStream::CUDAStream(const unsigned int ARRAY_SIZE) +CUDAStream::CUDAStream(const unsigned int ARRAY_SIZE, const int device_index) { + + // Set device + int count; + cudaGetDeviceCount(&count); + check_error(); + if (device_index >= count) + throw std::runtime_error("Invalid device index"); + cudaSetDevice(device_index); + check_error(); + + // Print out device information + std::cout << "Using OpenCL device " << getDeviceName(device_index) << std::endl; + std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl; + array_size = ARRAY_SIZE; // Check buffers fit on the device diff --git a/src/CUDAStream.h b/src/CUDAStream.h index 34e0303..61e4882 100644 --- a/src/CUDAStream.h +++ b/src/CUDAStream.h @@ -20,7 +20,7 @@ class CUDAStream : public Stream public: - CUDAStream(const unsigned int); + CUDAStream(const 
unsigned int, const int); ~CUDAStream(); virtual void copy() override; From 2738e75b0400da523b094edd7491701c6a38545e Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 3 May 2016 11:20:39 +0100 Subject: [PATCH 047/117] Print out array sizes --- src/main.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/main.cpp b/src/main.cpp index 1171180..a0fbaf9 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -56,6 +56,13 @@ void run() std::vector a(ARRAY_SIZE, 1.0); std::vector b(ARRAY_SIZE, 2.0); std::vector c(ARRAY_SIZE, 0.0); + std::streamsize ss = std::cout.precision(); + std::cout << std::setprecision(1) << std::fixed + << "Array size: " << ARRAY_SIZE*sizeof(T)*1.0E-6 << " MB" + << " (=" << ARRAY_SIZE*sizeof(T)*1.0E-9 << " GB)" << std::endl; + std::cout << "Total size: " << 3.0*ARRAY_SIZE*sizeof(T)*1.0E-6 << " MB" + << " (=" << 3.0*ARRAY_SIZE*sizeof(T)*1.0E-9 << " GB)" << std::endl; + std::cout.precision(ss); Stream *stream; From 26bb9126466be72504e4710f6530e2d43112d5fb Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 3 May 2016 11:23:36 +0100 Subject: [PATCH 048/117] Check OCL device has enough memory for buffers --- src/OCLStream.cpp | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/OCLStream.cpp b/src/OCLStream.cpp index ed419b5..e962ea5 100644 --- a/src/OCLStream.cpp +++ b/src/OCLStream.cpp @@ -53,8 +53,6 @@ OCLStream::OCLStream(const unsigned int ARRAY_SIZE, const int device_index) if (!cached) getDeviceList(); - array_size = ARRAY_SIZE; - // Setup default OpenCL GPU if (device_index >= devices.size()) throw std::runtime_error("Invalid device index"); @@ -85,6 +83,16 @@ OCLStream::OCLStream(const unsigned int ARRAY_SIZE, const int device_index) add_kernel = new cl::KernelFunctor(program, "add"); triad_kernel = new cl::KernelFunctor(program, "triad"); + array_size = ARRAY_SIZE; + + // Check buffers fit on the device + cl_ulong totalmem = device.getInfo(); + cl_ulong maxbuffer = device.getInfo(); + if (maxbuffer < 
sizeof(T)*ARRAY_SIZE) + throw std::runtime_error("Device cannot allocate a buffer big enough"); + if (totalmem < 3*sizeof(T)*ARRAY_SIZE) + throw std::runtime_error("Device does not have enough memory for all 3 buffers"); + // Create buffers d_a = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(T) * ARRAY_SIZE); d_b = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(T) * ARRAY_SIZE); From e91c31b44af503f45ac554c01ec25ac4348163d2 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 3 May 2016 11:37:35 +0100 Subject: [PATCH 049/117] Tidy up delete of object with correct deconstructors and delete --- src/OCLStream.cpp | 8 ++++---- src/Stream.h | 3 +++ src/main.cpp | 2 +- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/src/OCLStream.cpp b/src/OCLStream.cpp index e962ea5..5a72b3c 100644 --- a/src/OCLStream.cpp +++ b/src/OCLStream.cpp @@ -103,10 +103,10 @@ OCLStream::OCLStream(const unsigned int ARRAY_SIZE, const int device_index) template OCLStream::~OCLStream() { - delete[] copy_kernel; - delete[] mul_kernel; - delete[] add_kernel; - delete[] triad_kernel; + delete copy_kernel; + delete mul_kernel; + delete add_kernel; + delete triad_kernel; } template diff --git a/src/Stream.h b/src/Stream.h index 527fe6a..e02c953 100644 --- a/src/Stream.h +++ b/src/Stream.h @@ -8,6 +8,9 @@ template class Stream { public: + + virtual ~Stream(){} + // Kernels // These must be blocking calls virtual void copy() = 0; diff --git a/src/main.cpp b/src/main.cpp index a0fbaf9..457ca9a 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -154,7 +154,7 @@ void run() } - delete[] stream; + delete stream; } From 95f9efb7d92a7ed741f2ce20d4c949fe89363a8b Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 3 May 2016 11:40:46 +0100 Subject: [PATCH 050/117] Remove old version --- Makefile | 27 --- common.cpp | 115 ---------- common.h | 112 ---------- cuda-stream.cu | 397 ---------------------------------- ocl-stream-kernels.cl | 70 ------ ocl-stream.cpp | 488 
------------------------------------------ 6 files changed, 1209 deletions(-) delete mode 100644 Makefile delete mode 100644 common.cpp delete mode 100644 common.h delete mode 100644 cuda-stream.cu delete mode 100644 ocl-stream-kernels.cl delete mode 100644 ocl-stream.cpp diff --git a/Makefile b/Makefile deleted file mode 100644 index 4fb5f7a..0000000 --- a/Makefile +++ /dev/null @@ -1,27 +0,0 @@ -LDLIBS = -l OpenCL -CXXFLAGS = -std=c++11 -O3 - -PLATFORM = $(shell uname -s) -ifeq ($(PLATFORM), Darwin) - LDLIBS = -framework OpenCL -endif - -all: gpu-stream-ocl gpu-stream-cuda - -gpu-stream-ocl: ocl-stream.cpp common.o Makefile - $(CXX) $(CXXFLAGS) -Wno-deprecated-declarations common.o $< -o $@ $(LDLIBS) - -common.o: common.cpp common.h Makefile - -gpu-stream-cuda: cuda-stream.cu common.o Makefile -ifeq ($(shell which nvcc > /dev/null; echo $$?), 0) - nvcc $(CXXFLAGS) common.o $< -o $@ -else - $(error "Cannot find nvcc, please install CUDA toolkit") -endif - -.PHONY: clean - -clean: - rm -f gpu-stream-ocl gpu-stream-cuda *.o - diff --git a/common.cpp b/common.cpp deleted file mode 100644 index 781d70e..0000000 --- a/common.cpp +++ /dev/null @@ -1,115 +0,0 @@ -/*============================================================================= -*------------------------------------------------------------------------------ -* Copyright 2015: Tom Deakin, Simon McIntosh-Smith, University of Bristol HPC -* Based on John D. McCalpin’s original STREAM benchmark for CPUs -*------------------------------------------------------------------------------ -* License: -* 1. You are free to use this program and/or to redistribute -* this program. -* 2. You are free to modify this program for your own use, -* including commercial use, subject to the publication -* restrictions in item 3. -* 3. You are free to publish results obtained from running this -* program, or from works that you derive from this program, -* with the following limitations: -* 3a. 
In order to be referred to as "GPU-STREAM benchmark results", -* published results must be in conformance to the GPU-STREAM -* Run Rules published at -* http://github.com/UoB-HPC/GPU-STREAM/wiki/Run-Rules -* and incorporated herein by reference. -* The copyright holders retain the -* right to determine conformity with the Run Rules. -* 3b. Results based on modified source code or on runs not in -* accordance with the GPU-STREAM Run Rules must be clearly -* labelled whenever they are published. Examples of -* proper labelling include: -* "tuned GPU-STREAM benchmark results" -* "based on a variant of the GPU-STREAM benchmark code" -* Other comparable, clear and reasonable labelling is -* acceptable. -* 3c. Submission of results to the GPU-STREAM benchmark web site -* is encouraged, but not required. -* 4. Use of this program or creation of derived works based on this -* program constitutes acceptance of these licensing restrictions. -* 5. Absolutely no warranty is expressed or implied. -*———————————————————————————————————-----------------------------------------*/ - -#include "common.h" - -// Default array size 50 * 2^20 (50*8 Mebibytes double precision) -// Use binary powers of two so divides 1024 -unsigned int ARRAY_SIZE = 52428800; - -unsigned int NTIMES = 10; - -bool useFloat = false; - -unsigned int deviceIndex = 0; - -int parseUInt(const char *str, unsigned int *output) -{ - char *next; - *output = strtoul(str, &next, 10); - return !strlen(next); -} - -void parseArguments(int argc, char *argv[]) -{ - for (int i = 1; i < argc; i++) - { - if (!strcmp(argv[i], "--list")) - { - listDevices(); - exit(0); - } - else if (!strcmp(argv[i], "--device")) - { - if (++i >= argc || !parseUInt(argv[i], &deviceIndex)) - { - std::cout << "Invalid device index" << std::endl; - exit(1); - } - } - else if (!strcmp(argv[i], "--arraysize") || !strcmp(argv[i], "-s")) - { - if (++i >= argc || !parseUInt(argv[i], &ARRAY_SIZE)) - { - std::cout << "Invalid array size" << std::endl; - 
exit(1); - } - } - else if (!strcmp(argv[i], "--numtimes") || !strcmp(argv[i], "-n")) - { - if (++i >= argc || !parseUInt(argv[i], &NTIMES)) - { - std::cout << "Invalid number of times" << std::endl; - exit(1); - } - } - else if (!strcmp(argv[i], "--float")) - { - useFloat = true; - std::cout << "Warning: If number of iterations set >= 8, expect rounding errors with single precision" << std::endl; - } - else if (!strcmp(argv[i], "--help") || !strcmp(argv[i], "-h")) - { - std::cout << std::endl; - std::cout << "Usage: ./gpu-stream-cuda [OPTIONS]" << std::endl << std::endl; - std::cout << "Options:" << std::endl; - std::cout << " -h --help Print the message" << std::endl; - std::cout << " --list List available devices" << std::endl; - std::cout << " --device INDEX Select device at INDEX" << std::endl; - std::cout << " -s --arraysize SIZE Use SIZE elements in the array" << std::endl; - std::cout << " -n --numtimes NUM Run the test NUM times (NUM >= 2)" << std::endl; - std::cout << " --float Use floats (rather than doubles)" << std::endl; - std::cout << std::endl; - exit(0); - } - else - { - std::cout << "Unrecognized argument '" << argv[i] << "' (try '--help')" - << std::endl; - exit(1); - } - } -} diff --git a/common.h b/common.h deleted file mode 100644 index a4dd886..0000000 --- a/common.h +++ /dev/null @@ -1,112 +0,0 @@ -/*============================================================================= -*------------------------------------------------------------------------------ -* Copyright 2015: Tom Deakin, Simon McIntosh-Smith, University of Bristol HPC -* Based on John D. McCalpin’s original STREAM benchmark for CPUs -*------------------------------------------------------------------------------ -* License: -* 1. You are free to use this program and/or to redistribute -* this program. -* 2. You are free to modify this program for your own use, -* including commercial use, subject to the publication -* restrictions in item 3. -* 3. 
You are free to publish results obtained from running this -* program, or from works that you derive from this program, -* with the following limitations: -* 3a. In order to be referred to as "GPU-STREAM benchmark results", -* published results must be in conformance to the GPU-STREAM -* Run Rules published at -* http://github.com/UoB-HPC/GPU-STREAM/wiki/Run-Rules -* and incorporated herein by reference. -* The copyright holders retain the -* right to determine conformity with the Run Rules. -* 3b. Results based on modified source code or on runs not in -* accordance with the GPU-STREAM Run Rules must be clearly -* labelled whenever they are published. Examples of -* proper labelling include: -* "tuned GPU-STREAM benchmark results" -* "based on a variant of the GPU-STREAM benchmark code" -* Other comparable, clear and reasonable labelling is -* acceptable. -* 3c. Submission of results to the GPU-STREAM benchmark web site -* is encouraged, but not required. -* 4. Use of this program or creation of derived works based on this -* program constitutes acceptance of these licensing restrictions. -* 5. Absolutely no warranty is expressed or implied. 
-*———————————————————————————————————-----------------------------------------*/ - -#include -#include -#include -#include -#include -#include - -#define VERSION_STRING "1.0" - -extern void parseArguments(int argc, char *argv[]); - -extern void listDevices(void); - -extern unsigned int ARRAY_SIZE; -extern unsigned int NTIMES; - -extern bool useFloat; - -extern unsigned int deviceIndex; - - -template < typename T > -void check_solution(void* a_in, void* b_in, void* c_in) -{ - // Generate correct solution - T golda = 1.0; - T goldb = 2.0; - T goldc = 0.0; - - T * a = static_cast(a_in); - T * b = static_cast(b_in); - T * c = static_cast(c_in); - - const T scalar = 3.0; - - for (unsigned int i = 0; i < NTIMES; i++) - { - // Double - goldc = golda; - goldb = scalar * goldc; - goldc = golda + goldb; - golda = goldb + scalar * goldc; - } - - // Calculate average error - double erra = 0.0; - double errb = 0.0; - double errc = 0.0; - - for (unsigned int i = 0; i < ARRAY_SIZE; i++) - { - erra += fabs(a[i] - golda); - errb += fabs(b[i] - goldb); - errc += fabs(c[i] - goldc); - } - - erra /= ARRAY_SIZE; - errb /= ARRAY_SIZE; - errc /= ARRAY_SIZE; - - double epsi = std::numeric_limits::epsilon() * 100; - - if (erra > epsi) - std::cout - << "Validation failed on a[]. Average error " << erra - << std::endl; - if (errb > epsi) - std::cout - << "Validation failed on b[]. Average error " << errb - << std::endl; - if (errc > epsi) - std::cout - << "Validation failed on c[]. Average error " << errc - << std::endl; -} - diff --git a/cuda-stream.cu b/cuda-stream.cu deleted file mode 100644 index 2ab3adb..0000000 --- a/cuda-stream.cu +++ /dev/null @@ -1,397 +0,0 @@ -/*============================================================================= -*------------------------------------------------------------------------------ -* Copyright 2015: Tom Deakin, Simon McIntosh-Smith, University of Bristol HPC -* Based on John D. 
McCalpin’s original STREAM benchmark for CPUs -*------------------------------------------------------------------------------ -* License: -* 1. You are free to use this program and/or to redistribute -* this program. -* 2. You are free to modify this program for your own use, -* including commercial use, subject to the publication -* restrictions in item 3. -* 3. You are free to publish results obtained from running this -* program, or from works that you derive from this program, -* with the following limitations: -* 3a. In order to be referred to as "GPU-STREAM benchmark results", -* published results must be in conformance to the GPU-STREAM -* Run Rules published at -* http://github.com/UoB-HPC/GPU-STREAM/wiki/Run-Rules -* and incorporated herein by reference. -* The copyright holders retain the -* right to determine conformity with the Run Rules. -* 3b. Results based on modified source code or on runs not in -* accordance with the GPU-STREAM Run Rules must be clearly -* labelled whenever they are published. Examples of -* proper labelling include: -* "tuned GPU-STREAM benchmark results" -* "based on a variant of the GPU-STREAM benchmark code" -* Other comparable, clear and reasonable labelling is -* acceptable. -* 3c. Submission of results to the GPU-STREAM benchmark web site -* is encouraged, but not required. -* 4. Use of this program or creation of derived works based on this -* program constitutes acceptance of these licensing restrictions. -* 5. Absolutely no warranty is expressed or implied. 
-*———————————————————————————————————-----------------------------------------*/ - - -#include -#include -#include -#include -#include -#include - -#include -#include "common.h" - -std::string getDeviceName(int device); -int getDriver(void); - -// Code to check CUDA errors -void check_cuda_error(void) -{ - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - std::cerr - << "Error: " - << cudaGetErrorString(err) - << std::endl; - exit(err); - } -} - -template -__global__ void copy(const T * a, T * c) -{ - const int i = blockDim.x * blockIdx.x + threadIdx.x; - c[i] = a[i]; -} - -template -__global__ void mul(T * b, const T * c) -{ - const T scalar = 3.0; - const int i = blockDim.x * blockIdx.x + threadIdx.x; - b[i] = scalar * c[i]; -} - -template -__global__ void add(const T * a, const T * b, T * c) -{ - const int i = blockDim.x * blockIdx.x + threadIdx.x; - c[i] = a[i] + b[i]; -} - -template -__global__ void triad(T * a, const T * b, const T * c) -{ - const T scalar = 3.0; - const int i = blockDim.x * blockIdx.x + threadIdx.x; - a[i] = b[i] + scalar * c[i]; -} - -int main(int argc, char *argv[]) -{ - - // Print out run information - std::cout - << "GPU-STREAM" << std::endl - << "Version: " << VERSION_STRING << std::endl - << "Implementation: CUDA" << std::endl; - - parseArguments(argc, argv); - - if (NTIMES < 2) - throw std::runtime_error("Chosen number of times is invalid, must be >= 2"); - - std::cout << "Precision: "; - if (useFloat) std::cout << "float"; - else std::cout << "double"; - std::cout << std::endl << std::endl; - - std::cout << "Running kernels " << NTIMES << " times" << std::endl; - - if (ARRAY_SIZE % 1024 != 0) - { - unsigned int OLD_ARRAY_SIZE = ARRAY_SIZE; - ARRAY_SIZE -= ARRAY_SIZE % 1024; - std::cout - << "Warning: array size must divide 1024" << std::endl - << "Resizing array from " << OLD_ARRAY_SIZE - << " to " << ARRAY_SIZE << std::endl; - if (ARRAY_SIZE == 0) - throw std::runtime_error("Array size must be >= 1024"); - } - - 
// Get precision (used to reset later) - std::streamsize ss = std::cout.precision(); - - size_t DATATYPE_SIZE; - - if (useFloat) - { - DATATYPE_SIZE = sizeof(float); - } - else - { - DATATYPE_SIZE = sizeof(double); - } - - // Display number of bytes in array - std::cout << std::setprecision(1) << std::fixed - << "Array size: " << ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0 << " MB" - << " (=" << ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0/1024.0 << " GB)" - << std::endl; - std::cout << "Total size: " << 3.0*ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0 << " MB" - << " (=" << 3.0*ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0/1024.0 << " GB)" - << std::endl; - - // Reset precision - std::cout.precision(ss); - - // Check device index is in range - int count; - cudaGetDeviceCount(&count); - check_cuda_error(); - if (deviceIndex >= count) - throw std::runtime_error("Chosen device index is invalid"); - cudaSetDevice(deviceIndex); - check_cuda_error(); - - // Print out device name - std::cout << "Using CUDA device " << getDeviceName(deviceIndex) << std::endl; - - // Print out device CUDA driver version - std::cout << "Driver: " << getDriver() << std::endl; - - // Check buffers fit on the device - cudaDeviceProp props; - cudaGetDeviceProperties(&props, deviceIndex); - if (props.totalGlobalMem < 3*DATATYPE_SIZE*ARRAY_SIZE) - throw std::runtime_error("Device does not have enough memory for all 3 buffers"); - - // Create host vectors - void * h_a = malloc(ARRAY_SIZE*DATATYPE_SIZE); - void * h_b = malloc(ARRAY_SIZE*DATATYPE_SIZE); - void * h_c = malloc(ARRAY_SIZE*DATATYPE_SIZE); - - // Initilise arrays - for (unsigned int i = 0; i < ARRAY_SIZE; i++) - { - if (useFloat) - { - ((float*)h_a)[i] = 1.0f; - ((float*)h_b)[i] = 2.0f; - ((float*)h_c)[i] = 0.0f; - } - else - { - ((double*)h_a)[i] = 1.0; - ((double*)h_b)[i] = 2.0; - ((double*)h_c)[i] = 0.0; - } - } - - // Create device buffers - void * d_a, * d_b, *d_c; - cudaMalloc(&d_a, ARRAY_SIZE*DATATYPE_SIZE); - check_cuda_error(); - cudaMalloc(&d_b, 
ARRAY_SIZE*DATATYPE_SIZE); - check_cuda_error(); - cudaMalloc(&d_c, ARRAY_SIZE*DATATYPE_SIZE); - check_cuda_error(); - - // Copy host memory to device - cudaMemcpy(d_a, h_a, ARRAY_SIZE*DATATYPE_SIZE, cudaMemcpyHostToDevice); - check_cuda_error(); - cudaMemcpy(d_b, h_b, ARRAY_SIZE*DATATYPE_SIZE, cudaMemcpyHostToDevice); - check_cuda_error(); - cudaMemcpy(d_c, h_c, ARRAY_SIZE*DATATYPE_SIZE, cudaMemcpyHostToDevice); - check_cuda_error(); - - // Make sure the copies are finished - cudaDeviceSynchronize(); - check_cuda_error(); - - // List of times - std::vector< std::vector > timings; - - // Declare timers - std::chrono::high_resolution_clock::time_point t1, t2; - - // Main loop - for (unsigned int k = 0; k < NTIMES; k++) - { - std::vector times; - t1 = std::chrono::high_resolution_clock::now(); - if (useFloat) - copy<<>>((float*)d_a, (float*)d_c); - else - copy<<>>((double*)d_a, (double*)d_c); - check_cuda_error(); - cudaDeviceSynchronize(); - check_cuda_error(); - t2 = std::chrono::high_resolution_clock::now(); - times.push_back(std::chrono::duration_cast >(t2 - t1).count()); - - - t1 = std::chrono::high_resolution_clock::now(); - if (useFloat) - mul<<>>((float*)d_b, (float*)d_c); - else - mul<<>>((double*)d_b, (double*)d_c); - check_cuda_error(); - cudaDeviceSynchronize(); - check_cuda_error(); - t2 = std::chrono::high_resolution_clock::now(); - times.push_back(std::chrono::duration_cast >(t2 - t1).count()); - - - t1 = std::chrono::high_resolution_clock::now(); - if (useFloat) - add<<>>((float*)d_a, (float*)d_b, (float*)d_c); - else - add<<>>((double*)d_a, (double*)d_b, (double*)d_c); - check_cuda_error(); - cudaDeviceSynchronize(); - check_cuda_error(); - t2 = std::chrono::high_resolution_clock::now(); - times.push_back(std::chrono::duration_cast >(t2 - t1).count()); - - - t1 = std::chrono::high_resolution_clock::now(); - if (useFloat) - triad<<>>((float*)d_a, (float*)d_b, (float*)d_c); - else - triad<<>>((double*)d_a, (double*)d_b, (double*)d_c); - 
check_cuda_error(); - cudaDeviceSynchronize(); - check_cuda_error(); - t2 = std::chrono::high_resolution_clock::now(); - times.push_back(std::chrono::duration_cast >(t2 - t1).count()); - - timings.push_back(times); - - } - - // Check solutions - cudaMemcpy(h_a, d_a, ARRAY_SIZE*DATATYPE_SIZE, cudaMemcpyDeviceToHost); - check_cuda_error(); - cudaMemcpy(h_b, d_b, ARRAY_SIZE*DATATYPE_SIZE, cudaMemcpyDeviceToHost); - check_cuda_error(); - cudaMemcpy(h_c, d_c, ARRAY_SIZE*DATATYPE_SIZE, cudaMemcpyDeviceToHost); - check_cuda_error(); - - if (useFloat) - { - check_solution(h_a, h_b, h_c); - } - else - { - check_solution(h_a, h_b, h_c); - } - - // Crunch results - size_t sizes[4] = { - 2 * DATATYPE_SIZE * ARRAY_SIZE, - 2 * DATATYPE_SIZE * ARRAY_SIZE, - 3 * DATATYPE_SIZE * ARRAY_SIZE, - 3 * DATATYPE_SIZE * ARRAY_SIZE - }; - double min[4] = {DBL_MAX, DBL_MAX, DBL_MAX, DBL_MAX}; - double max[4] = {0.0, 0.0, 0.0, 0.0}; - double avg[4] = {0.0, 0.0, 0.0, 0.0}; - - // Ignore first result - for (unsigned int i = 1; i < NTIMES; i++) - { - for (int j = 0; j < 4; j++) - { - avg[j] += timings[i][j]; - min[j] = std::min(min[j], timings[i][j]); - max[j] = std::max(max[j], timings[i][j]); - } - } - - for (int j = 0; j < 4; j++) - avg[j] /= (double)(NTIMES-1); - - // Display results - std::string labels[] = {"Copy", "Mul", "Add", "Triad"}; - std::cout - << std::left << std::setw(12) << "Function" - << std::left << std::setw(12) << "MBytes/sec" - << std::left << std::setw(12) << "Min (sec)" - << std::left << std::setw(12) << "Max" - << std::left << std::setw(12) << "Average" - << std::endl; - - for (int j = 0; j < 4; j++) - { - std::cout - << std::left << std::setw(12) << labels[j] - << std::left << std::setw(12) << std::setprecision(3) << 1.0E-06 * sizes[j]/min[j] - << std::left << std::setw(12) << std::setprecision(5) << min[j] - << std::left << std::setw(12) << std::setprecision(5) << max[j] - << std::left << std::setw(12) << std::setprecision(5) << avg[j] - << std::endl; - } - - // Free 
host vectors - free(h_a); - free(h_b); - free(h_c); - - // Free cuda buffers - cudaFree(d_a); - check_cuda_error(); - cudaFree(d_b); - check_cuda_error(); - cudaFree(d_c); - check_cuda_error(); - -} - -std::string getDeviceName(int device) -{ - struct cudaDeviceProp prop; - cudaGetDeviceProperties(&prop, device); - check_cuda_error(); - return std::string(prop.name); -} - -int getDriver(void) -{ - int driver; - cudaDriverGetVersion(&driver); - check_cuda_error(); - return driver; -} - -void listDevices(void) -{ - // Get number of devices - int count; - cudaGetDeviceCount(&count); - check_cuda_error(); - - // Print device names - if (count == 0) - { - std::cout << "No devices found." << std::endl; - } - else - { - std::cout << std::endl; - std::cout << "Devices:" << std::endl; - for (int i = 0; i < count; i++) - { - std::cout << i << ": " << getDeviceName(i) << std::endl; - check_cuda_error(); - } - std::cout << std::endl; - } -} - diff --git a/ocl-stream-kernels.cl b/ocl-stream-kernels.cl deleted file mode 100644 index e5af7ce..0000000 --- a/ocl-stream-kernels.cl +++ /dev/null @@ -1,70 +0,0 @@ -/*============================================================================= -*------------------------------------------------------------------------------ -* Copyright 2015: Tom Deakin, Simon McIntosh-Smith, University of Bristol HPC -* Based on John D. McCalpin’s original STREAM benchmark for CPUs -*------------------------------------------------------------------------------ -* License: -* 1. You are free to use this program and/or to redistribute -* this program. -* 2. You are free to modify this program for your own use, -* including commercial use, subject to the publication -* restrictions in item 3. -* 3. You are free to publish results obtained from running this -* program, or from works that you derive from this program, -* with the following limitations: -* 3a. 
In order to be referred to as "GPU-STREAM benchmark results", -* published results must be in conformance to the GPU-STREAM -* Run Rules published at -* http://github.com/UoB-HPC/GPU-STREAM/wiki/Run-Rules -* and incorporated herein by reference. -* The copyright holders retain the -* right to determine conformity with the Run Rules. -* 3b. Results based on modified source code or on runs not in -* accordance with the GPU-STREAM Run Rules must be clearly -* labelled whenever they are published. Examples of -* proper labelling include: -* "tuned GPU-STREAM benchmark results" -* "based on a variant of the GPU-STREAM benchmark code" -* Other comparable, clear and reasonable labelling is -* acceptable. -* 3c. Submission of results to the GPU-STREAM benchmark web site -* is encouraged, but not required. -* 4. Use of this program or creation of derived works based on this -* program constitutes acceptance of these licensing restrictions. -* 5. Absolutely no warranty is expressed or implied. -*———————————————————————————————————-----------------------------------------*/ - - -#ifdef FLOAT - #define DATATYPE float - constant DATATYPE scalar = 3.0f; -#else - #pragma OPENCL EXTENSION cl_khr_fp64 : enable - #define DATATYPE double - constant DATATYPE scalar = 3.0; -#endif - - -kernel void copy(global const DATATYPE * restrict a, global DATATYPE * restrict c) -{ - const size_t i = get_global_id(0); - c[i] = a[i]; -} - -kernel void mul(global DATATYPE * restrict b, global const DATATYPE * restrict c) -{ - const size_t i = get_global_id(0); - b[i] = scalar * c[i]; -} - -kernel void add(global const DATATYPE * restrict a, global const DATATYPE * restrict b, global DATATYPE * restrict c) -{ - const size_t i = get_global_id(0); - c[i] = a[i] + b[i]; -} - -kernel void triad(global DATATYPE * restrict a, global const DATATYPE * restrict b, global const DATATYPE * restrict c) -{ - const size_t i = get_global_id(0); - a[i] = b[i] + scalar * c[i]; -} diff --git a/ocl-stream.cpp 
b/ocl-stream.cpp deleted file mode 100644 index 1a46295..0000000 --- a/ocl-stream.cpp +++ /dev/null @@ -1,488 +0,0 @@ -/*============================================================================= -*------------------------------------------------------------------------------ -* Copyright 2015: Tom Deakin, Simon McIntosh-Smith, University of Bristol HPC -* Based on John D. McCalpin’s original STREAM benchmark for CPUs -*------------------------------------------------------------------------------ -* License: -* 1. You are free to use this program and/or to redistribute -* this program. -* 2. You are free to modify this program for your own use, -* including commercial use, subject to the publication -* restrictions in item 3. -* 3. You are free to publish results obtained from running this -* program, or from works that you derive from this program, -* with the following limitations: -* 3a. In order to be referred to as "GPU-STREAM benchmark results", -* published results must be in conformance to the GPU-STREAM -* Run Rules published at -* http://github.com/UoB-HPC/GPU-STREAM/wiki/Run-Rules -* and incorporated herein by reference. -* The copyright holders retain the -* right to determine conformity with the Run Rules. -* 3b. Results based on modified source code or on runs not in -* accordance with the GPU-STREAM Run Rules must be clearly -* labelled whenever they are published. Examples of -* proper labelling include: -* "tuned GPU-STREAM benchmark results" -* "based on a variant of the GPU-STREAM benchmark code" -* Other comparable, clear and reasonable labelling is -* acceptable. -* 3c. Submission of results to the GPU-STREAM benchmark web site -* is encouraged, but not required. -* 4. Use of this program or creation of derived works based on this -* program constitutes acceptance of these licensing restrictions. -* 5. Absolutely no warranty is expressed or implied. 
-*———————————————————————————————————-----------------------------------------*/ - - -#include -#include -#include -#include -#include -#include - -#define CL_HPP_ENABLE_EXCEPTIONS -#define CL_HPP_MINIMUM_OPENCL_VERSION 110 -#define CL_HPP_TARGET_OPENCL_VERSION 110 -#include "CL/cl2.hpp" -#include "common.h" - -std::string getDeviceName(const cl::Device& device); -std::string getDeviceDriver(const cl::Device& device); -unsigned getDeviceList(std::vector& devices); - - -// Print error and exit -void die(std::string msg, cl::Error& e) -{ - std::cerr - << "Error: " - << msg - << ": " << e.what() - << "(" << e.err() << ")" - << std::endl; - exit(e.err()); -} - - -int main(int argc, char *argv[]) -{ - - // Print out run information - std::cout - << "GPU-STREAM" << std::endl - << "Version: " << VERSION_STRING << std::endl - << "Implementation: OpenCL" << std::endl; - - std::string status; - - try - { - parseArguments(argc, argv); - if (NTIMES < 2) - throw std::runtime_error("Chosen number of times is invalid, must be >= 2"); - - - std::cout << "Precision: "; - if (useFloat) std::cout << "float"; - else std::cout << "double"; - std::cout << std::endl << std::endl; - - std::cout << "Running kernels " << NTIMES << " times" << std::endl; - - if (ARRAY_SIZE % 1024 != 0) - { - unsigned int OLD_ARRAY_SIZE = ARRAY_SIZE; - ARRAY_SIZE -= ARRAY_SIZE % 1024; - std::cout - << "Warning: array size must divide 1024" << std::endl - << "Resizing array from " << OLD_ARRAY_SIZE - << " to " << ARRAY_SIZE << std::endl; - if (ARRAY_SIZE == 0) - throw std::runtime_error("Array size must be >= 1024"); - } - - // Get precision (used to reset later) - std::streamsize ss = std::cout.precision(); - - size_t DATATYPE_SIZE; - - if (useFloat) - { - DATATYPE_SIZE = sizeof(float); - } - else - { - DATATYPE_SIZE = sizeof(double); - } - - // Display number of bytes in array - std::cout << std::setprecision(1) << std::fixed - << "Array size: " << ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0 << " MB" - << " (=" 
<< ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0/1024.0 << " GB)" - << std::endl; - std::cout << "Total size: " << 3.0*ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0 << " MB" - << " (=" << 3.0*ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0/1024.0 << " GB)" - << std::endl; - - // Reset precision - std::cout.precision(ss); - - // Open the Kernel source - std::string kernels; - { - std::ifstream in("ocl-stream-kernels.cl"); - if (!in.is_open()) - throw std::runtime_error("Cannot open kernel file"); - kernels = std::string (std::istreambuf_iterator(in), (std::istreambuf_iterator())); - } - - - // Setup OpenCL - - // Get list of devices - std::vector devices; - getDeviceList(devices); - - // Check device index is in range - if (deviceIndex >= devices.size()) - throw std::runtime_error("Chosen device index is invalid"); - - cl::Device device = devices[deviceIndex]; - - status = "Creating context"; - cl::Context context(device); - - status = "Creating queue"; - cl::CommandQueue queue(context); - - status = "Creating program"; - cl::Program program(context, kernels); - - // Print out device name - std::string name = getDeviceName(device); - std::cout << "Using OpenCL device " << name << std::endl; - - // Print out OpenCL driver version for this device - std::string driver = getDeviceDriver(device); - std::cout << "Driver: " << driver << std::endl; - - // Check device can do double precision if requested - if (!useFloat && !device.getInfo()) - throw std::runtime_error("Device does not support double precision, please use --float"); - - // Check buffers fit on the device - status = "Getting device memory sizes"; - cl_ulong totalmem = device.getInfo(); - cl_ulong maxbuffer = device.getInfo(); - if (maxbuffer < DATATYPE_SIZE*ARRAY_SIZE) - throw std::runtime_error("Device cannot allocate a buffer big enough"); - if (totalmem < 3*DATATYPE_SIZE*ARRAY_SIZE) - throw std::runtime_error("Device does not have enough memory for all 3 buffers"); - - try - { - std::string options = ""; - if (useFloat) - 
options = "-DFLOAT"; - program.build(options.c_str()); - } - catch (cl::Error& e) - { - std::vector devices = context.getInfo(); - std::string buildlog = program.getBuildInfo(devices[0]); - std::cerr - << "Build error:" - << buildlog - << std::endl; - throw e; - } - - status = "Making kernel copy"; - auto copy = cl::KernelFunctor(program, "copy"); - status = "Making kernel mul"; - auto mul = cl::KernelFunctor(program, "mul"); - status = "Making kernel add"; - auto add = cl::KernelFunctor(program, "add"); - status = "Making kernel triad"; - auto triad = cl::KernelFunctor(program, "triad"); - - // Create host vectors - void *h_a = malloc(ARRAY_SIZE * DATATYPE_SIZE); - void *h_b = malloc(ARRAY_SIZE * DATATYPE_SIZE); - void *h_c = malloc(ARRAY_SIZE * DATATYPE_SIZE); - - // Initilise arrays - for (unsigned int i = 0; i < ARRAY_SIZE; i++) - { - if (useFloat) - { - ((float*)h_a)[i] = 1.0f; - ((float*)h_b)[i] = 2.0f; - ((float*)h_c)[i] = 0.0f; - } - else - { - ((double*)h_a)[i] = 1.0; - ((double*)h_b)[i] = 2.0; - ((double*)h_c)[i] = 0.0; - } - } - - // Create device buffers - status = "Creating buffers"; - cl::Buffer d_a(context, CL_MEM_READ_WRITE, DATATYPE_SIZE * ARRAY_SIZE); - cl::Buffer d_b(context, CL_MEM_READ_WRITE, DATATYPE_SIZE * ARRAY_SIZE); - cl::Buffer d_c(context, CL_MEM_READ_WRITE, DATATYPE_SIZE * ARRAY_SIZE); - - - // Copy host memory to device - status = "Copying buffers"; - queue.enqueueWriteBuffer(d_a, CL_FALSE, 0, ARRAY_SIZE*DATATYPE_SIZE, h_a); - queue.enqueueWriteBuffer(d_b, CL_FALSE, 0, ARRAY_SIZE*DATATYPE_SIZE, h_b); - queue.enqueueWriteBuffer(d_c, CL_FALSE, 0, ARRAY_SIZE*DATATYPE_SIZE, h_c); - - // Make sure the copies are finished - queue.finish(); - - - // List of times - std::vector< std::vector > timings; - - // Declare timers - std::chrono::high_resolution_clock::time_point t1, t2; - - // Main loop - for (unsigned int k = 0; k < NTIMES; k++) - { - status = "Executing copy"; - std::vector times; - t1 = std::chrono::high_resolution_clock::now(); - 
copy( - cl::EnqueueArgs( - queue, - cl::NDRange(ARRAY_SIZE)), - d_a, d_c); - queue.finish(); - t2 = std::chrono::high_resolution_clock::now(); - times.push_back(std::chrono::duration_cast >(t2 - t1).count()); - - - status = "Executing mul"; - t1 = std::chrono::high_resolution_clock::now(); - mul( - cl::EnqueueArgs( - queue, - cl::NDRange(ARRAY_SIZE)), - d_b, d_c); - queue.finish(); - t2 = std::chrono::high_resolution_clock::now(); - times.push_back(std::chrono::duration_cast >(t2 - t1).count()); - - - status = "Executing add"; - t1 = std::chrono::high_resolution_clock::now(); - add( - cl::EnqueueArgs( - queue, - cl::NDRange(ARRAY_SIZE)), - d_a, d_b, d_c); - queue.finish(); - t2 = std::chrono::high_resolution_clock::now(); - times.push_back(std::chrono::duration_cast >(t2 - t1).count()); - - - status = "Executing triad"; - t1 = std::chrono::high_resolution_clock::now(); - triad( - cl::EnqueueArgs( - queue, - cl::NDRange(ARRAY_SIZE)), - d_a, d_b, d_c); - queue.finish(); - t2 = std::chrono::high_resolution_clock::now(); - times.push_back(std::chrono::duration_cast >(t2 - t1).count()); - - timings.push_back(times); - - } - - // Check solutions - status = "Copying back buffers"; - queue.enqueueReadBuffer(d_a, CL_FALSE, 0, ARRAY_SIZE*DATATYPE_SIZE, h_a); - queue.enqueueReadBuffer(d_b, CL_FALSE, 0, ARRAY_SIZE*DATATYPE_SIZE, h_b); - queue.enqueueReadBuffer(d_c, CL_FALSE, 0, ARRAY_SIZE*DATATYPE_SIZE, h_c); - queue.finish(); - - - if (useFloat) - { - check_solution(h_a, h_b, h_c); - } - else - { - check_solution(h_a, h_b, h_c); - } - - // Crunch results - size_t sizes[4] = { - 2 * DATATYPE_SIZE * ARRAY_SIZE, - 2 * DATATYPE_SIZE * ARRAY_SIZE, - 3 * DATATYPE_SIZE * ARRAY_SIZE, - 3 * DATATYPE_SIZE * ARRAY_SIZE - }; - double min[4] = {DBL_MAX, DBL_MAX, DBL_MAX, DBL_MAX}; - double max[4] = {0.0, 0.0, 0.0, 0.0}; - double avg[4] = {0.0, 0.0, 0.0, 0.0}; - // Ignore first result - for (unsigned int i = 1; i < NTIMES; i++) - { - for (int j = 0; j < 4; j++) - { - avg[j] += 
timings[i][j]; - min[j] = std::min(min[j], timings[i][j]); - max[j] = std::max(max[j], timings[i][j]); - } - } - for (int j = 0; j < 4; j++) - avg[j] /= (double)(NTIMES-1); - - // Display results - std::string labels[] = {"Copy", "Mul", "Add", "Triad"}; - std::cout - << std::left << std::setw(12) << "Function" - << std::left << std::setw(12) << "MBytes/sec" - << std::left << std::setw(12) << "Min (sec)" - << std::left << std::setw(12) << "Max" - << std::left << std::setw(12) << "Average" - << std::endl; - for (int j = 0; j < 4; j++) - { - std::cout - << std::left << std::setw(12) << labels[j] - << std::left << std::setw(12) << std::setprecision(3) << 1.0E-06 * sizes[j]/min[j] - << std::left << std::setw(12) << std::setprecision(5) << min[j] - << std::left << std::setw(12) << std::setprecision(5) << max[j] - << std::left << std::setw(12) << std::setprecision(5) << avg[j] - << std::endl; - } - - // Free host vectors - free(h_a); - free(h_b); - free(h_c); - - } - catch (cl::Error &e) - { - die(status, e); - } - catch (std::exception& e) - { - std::cerr - << "Error: " - << e.what() - << std::endl; - exit(EXIT_FAILURE); - } - -} - - -unsigned getDeviceList(std::vector& devices) -{ - // Get list of platforms - std::vector platforms; - try - { - cl::Platform::get(&platforms); - } - catch (cl::Error &e) - { - die("Getting platforms", e); - } - - // Enumerate devices - for (unsigned int i = 0; i < platforms.size(); i++) - { - std::vector plat_devices; - try - { - platforms[i].getDevices(CL_DEVICE_TYPE_ALL, &plat_devices); - } - catch (cl::Error &e) - { - die("Getting devices", e); - } - devices.insert(devices.end(), plat_devices.begin(), plat_devices.end()); - } - - return devices.size(); -} - - -std::string getDeviceName(const cl::Device& device) -{ - std::string name; - cl_device_info info = CL_DEVICE_NAME; - - try - { - - // Special case for AMD -#ifdef CL_DEVICE_BOARD_NAME_AMD - device.getInfo(CL_DEVICE_VENDOR, &name); - if (strstr(name.c_str(), "Advanced Micro 
Devices")) - info = CL_DEVICE_BOARD_NAME_AMD; -#endif - - device.getInfo(info, &name); - } - catch (cl::Error &e) - { - die("Getting device name", e); - } - - return name; -} - -std::string getDeviceDriver(const cl::Device& device) -{ - std::string driver; - try - { - device.getInfo(CL_DRIVER_VERSION, &driver); - } - catch (cl::Error &e) - { - die("Getting device driver", e); - } - - return driver; -} - - -void listDevices(void) -{ - // Get list of devices - std::vector devices; - getDeviceList(devices); - - // Print device names - if (devices.size() == 0) - { - std::cout << "No devices found." << std::endl; - } - else - { - std::cout << std::endl; - std::cout << "Devices:" << std::endl; - for (unsigned i = 0; i < devices.size(); i++) - { - std::cout << i << ": " << getDeviceName(devices[i]) << std::endl; - } - std::cout << std::endl; - } -} - From 83516ae35279d78a1833572b2a66c0e0a6a5942e Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 3 May 2016 11:41:00 +0100 Subject: [PATCH 051/117] Update cl2.hpp --- CL/cl2.hpp | 209 ++++++++++++++++++++++++++++++++--------------------- 1 file changed, 127 insertions(+), 82 deletions(-) diff --git a/CL/cl2.hpp b/CL/cl2.hpp index ad0c7c4..e0f55fe 100644 --- a/CL/cl2.hpp +++ b/CL/cl2.hpp @@ -28,11 +28,11 @@ /*! \file * - * \brief C++ bindings for OpenCL 1.0 (rev 48), OpenCL 1.1 (rev 33), + * \brief C++ bindings for OpenCL 1.0 (rev 48), OpenCL 1.1 (rev 33), * OpenCL 1.2 (rev 15) and OpenCL 2.0 (rev 29) * \author Lee Howes and Bruce Merry - * - * Derived from the OpenCL 1.x C++ bindings written by + * + * Derived from the OpenCL 1.x C++ bindings written by * Benedict R. 
Gaster, Laurent Morichetti and Lee Howes * With additions and fixes from: * Brian Cole, March 3rd 2010 and April 2012 @@ -52,6 +52,18 @@ * #define CL_HPP_USE_DX_INTEROP * cl_khr_sub_groups * #define CL_HPP_USE_CL_SUB_GROUPS_KHR + * + * Doxygen documentation for this header is available here: + * + * http://khronosgroup.github.io/OpenCL-CLHPP/ + * + * The latest version of this header can be found on the GitHub releases page: + * + * https://github.com/KhronosGroup/OpenCL-CLHPP/releases + * + * Bugs and patches can be submitted to the GitHub repository: + * + * https://github.com/KhronosGroup/OpenCL-CLHPP */ /*! \mainpage @@ -134,41 +146,64 @@ * * \section parameterization Parameters * This header may be parameterized by a set of preprocessor macros. - * CL_HPP_TARGET_OPENCL_VERSION - * - Defines the target OpenCL runtime version to build the header against. - * Defaults to 200, representing OpenCL 2.0. - * CL_HPP_NO_STD_STRING - * - Do not use the standard library string class. - * cl::string is not defined and may be defined by the user before - * cl2.hpp is included. - * CL_HPP_NO_STD_VECTOR - * - Do not use the standard library vector class. - * cl::vector is not defined and may be defined by the user before - * cl2.hpp is included. - * CL_HPP_NO_STD_ARRAY - * - Do not use the standard library array class. - * cl::array is not defined and may be defined by the user before - * cl2.hpp is included. - * CL_HPP_NO_STD_UNIQUE_PTR - * - Do not use the standard library unique_ptr class. - * cl::pointer and the cl::allocate_pointer function are not defined - * and may be defined by the user before cl2.hpp is included. - * CL_HPP_ENABLE_DEVICE_FISSION - * - Enables device fission for OpenCL 1.2 platforms - * CL_HPP_ENABLE_EXCEPTIONS - * - Enable exceptions for use in the C++ bindings header. - * This is the preferred error handling mechanism but is not required. 
- * CL_HPP_ENABLE_SIZE_T_COMPATIBILITY - * - Backward compatibility option to support cl.hpp-style size_t class. - * Replaces the updated std::array derived version and removal of size_t - * from the namespace. Note that in this case the new size_t class - * is placed in the cl::compatibility namespace and thus requires - * an additional using declaration for direct backward compatibility. - * CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY - * - Enable older vector of pairs interface for construction of programs. - * CL_HPP_CL_1_2_DEFAULT_BUILD - * - Default to OpenCL C 1.2 compilation rather than OpenCL C 2.0 - * - applies to use of cl::Program construction and other program build variants. + * + * - CL_HPP_TARGET_OPENCL_VERSION + * + * Defines the target OpenCL runtime version to build the header + * against. Defaults to 200, representing OpenCL 2.0. + * + * - CL_HPP_NO_STD_STRING + * + * Do not use the standard library string class. cl::string is not + * defined and may be defined by the user before cl2.hpp is + * included. + * + * - CL_HPP_NO_STD_VECTOR + * + * Do not use the standard library vector class. cl::vector is not + * defined and may be defined by the user before cl2.hpp is + * included. + * + * - CL_HPP_NO_STD_ARRAY + * + * Do not use the standard library array class. cl::array is not + * defined and may be defined by the user before cl2.hpp is + * included. + * + * - CL_HPP_NO_STD_UNIQUE_PTR + * + * Do not use the standard library unique_ptr class. cl::pointer and + * the cl::allocate_pointer functions are not defined and may be + * defined by the user before cl2.hpp is included. + * + * - CL_HPP_ENABLE_DEVICE_FISSION + * + * Enables device fission for OpenCL 1.2 platforms. + * + * - CL_HPP_ENABLE_EXCEPTIONS + * + * Enable exceptions for use in the C++ bindings header. This is the + * preferred error handling mechanism but is not required. 
+ * + * - CL_HPP_ENABLE_SIZE_T_COMPATIBILITY + * + * Backward compatibility option to support cl.hpp-style size_t + * class. Replaces the updated std::array derived version and + * removal of size_t from the namespace. Note that in this case the + * new size_t class is placed in the cl::compatibility namespace and + * thus requires an additional using declaration for direct backward + * compatibility. + * + * - CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY + * + * Enable older vector of pairs interface for construction of + * programs. + * + * - CL_HPP_CL_1_2_DEFAULT_BUILD + * + * Default to OpenCL C 1.2 compilation rather than OpenCL C 2.0 + * applies to use of cl::Program construction and other program + * build variants. * * * \section example Example @@ -177,19 +212,19 @@ * bindings, including support for the optional exception feature and * also the supplied vector and string classes, see following sections for * decriptions of these features. - * + * * \code #define CL_HPP_ENABLE_EXCEPTIONS #define CL_HPP_TARGET_OPENCL_VERSION 200 - + #include #include #include #include #include - + const int numElements = 32; - + int main(void) { // Filter for a 2.0 platform and set it as the default @@ -212,35 +247,45 @@ std::cout << "Error setting default platform."; return -1; } - - std::string kernel1{ - "global int globalA;" - "kernel void updateGlobal(){" - " globalA = 75;" - "}"}; - std::string kernel2{ - "typedef struct { global int *bar; } Foo; kernel void vectorAdd(global const Foo* aNum, global const int *inputA, global const int *inputB, global int *output, int val, write_only pipe int outPipe, queue_t childQueue){" - " output[get_global_id(0)] = inputA[get_global_id(0)] + inputB[get_global_id(0)] + val + *(aNum->bar);" - " write_pipe(outPipe, &val);" - " queue_t default_queue = get_default_queue(); " - " ndrange_t ndrange = ndrange_1D(get_global_size(0)/2, get_global_size(0)/2); " - // Have a child kernel write into third quarter of output - " 
enqueue_kernel(default_queue, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, " - " ^{" - " output[get_global_size(0)*2 + get_global_id(0)] = inputA[get_global_size(0)*2+get_global_id(0)] + inputB[get_global_size(0)*2+get_global_id(0)] + globalA;" - " });" - // Have a child kernel write into last quarter of output - " enqueue_kernel(childQueue, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, " - " ^{" - " output[get_global_size(0)*3 + get_global_id(0)] = inputA[get_global_size(0)*3 + get_global_id(0)] + inputB[get_global_size(0)*3 + get_global_id(0)] + globalA + 2;" - " });" - "}" }; + + // Use C++11 raw string literals for kernel source code + std::string kernel1{R"CLC( + global int globalA; + kernel void updateGlobal() + { + globalA = 75; + } + )CLC"}; + std::string kernel2{R"CLC( + typedef struct { global int *bar; } Foo; + kernel void vectorAdd(global const Foo* aNum, global const int *inputA, global const int *inputB, + global int *output, int val, write_only pipe int outPipe, queue_t childQueue) + { + output[get_global_id(0)] = inputA[get_global_id(0)] + inputB[get_global_id(0)] + val + *(aNum->bar); + write_pipe(outPipe, &val); + queue_t default_queue = get_default_queue(); + ndrange_t ndrange = ndrange_1D(get_global_size(0)/2, get_global_size(0)/2); + + // Have a child kernel write into third quarter of output + enqueue_kernel(default_queue, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, + ^{ + output[get_global_size(0)*2 + get_global_id(0)] = + inputA[get_global_size(0)*2 + get_global_id(0)] + inputB[get_global_size(0)*2 + get_global_id(0)] + globalA; + }); + + // Have a child kernel write into last quarter of output + enqueue_kernel(childQueue, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, + ^{ + output[get_global_size(0)*3 + get_global_id(0)] = + inputA[get_global_size(0)*3 + get_global_id(0)] + inputB[get_global_size(0)*3 + get_global_id(0)] + globalA + 2; + }); + } + )CLC"}; // New simpler string interface style std::vector programStrings {kernel1, kernel2}; - cl::Program 
vectorAddProgram( - programStrings); + cl::Program vectorAddProgram(programStrings); try { vectorAddProgram.build("-cl-std=CL2.0"); } @@ -251,7 +296,7 @@ for (auto &pair : buildInfo) { std::cerr << pair.second << std::endl << std::endl; } - + return 1; } @@ -264,17 +309,17 @@ program2Kernel( cl::EnqueueArgs( cl::NDRange(1))); - + ////////////////// // SVM allocations - - cl::pointer anSVMInt = cl::allocate_svm>(); + + auto anSVMInt = cl::allocate_svm>(); *anSVMInt = 5; - cl::SVMAllocator>> svmAllocReadOnly; + cl::SVMAllocator>> svmAllocReadOnly; auto fooPointer = cl::allocate_pointer(svmAllocReadOnly); fooPointer->bar = anSVMInt.get(); cl::SVMAllocator> svmAlloc; - std::vector>> inputA(numElements, 1, svmAlloc); + std::vector>> inputA(numElements, 1, svmAlloc); cl::coarse_svm_vector inputB(numElements, 2, svmAlloc); // @@ -284,7 +329,7 @@ std::vector output(numElements, 0xdeadbeef); cl::Buffer outputBuffer(begin(output), end(output), false); cl::Pipe aPipe(sizeof(cl_int), numElements / 2); - + // Default command queue, also passed in as a parameter cl::DeviceCommandQueue defaultDeviceQueue = cl::DeviceCommandQueue::makeDefault( cl::Context::getDefault(), cl::Device::getDefault()); @@ -339,7 +384,7 @@ return 0; } - * + * * \endcode * */ @@ -3538,7 +3583,7 @@ template cl::pointer> allocate_pointer(const Alloc &alloc_, Args&&... 
args) { Alloc alloc(alloc_); - static const size_t copies = 1; + static const size_type copies = 1; // Ensure that creation of the management block and the // object are dealt with separately such that we only provide a deleter @@ -6520,7 +6565,7 @@ inline cl_int cl::Program::getInfo(cl_program_info name, vectorresize(numBinaries); - for (int i = 0; i < numBinaries; ++i) { + for (size_type i = 0; i < numBinaries; ++i) { (*param)[i].resize(sizes[i]); } @@ -7107,7 +7152,7 @@ public: size_type buffer_slice_pitch, size_type host_row_pitch, size_type host_slice_pitch, - void *ptr, + const void *ptr, const vector* events = NULL, Event* event = NULL) const { @@ -7255,7 +7300,7 @@ public: const array& region, size_type row_pitch, size_type slice_pitch, - void* ptr, + const void* ptr, const vector* events = NULL, Event* event = NULL) const { @@ -7845,7 +7890,7 @@ public: CL_EXT_PREFIX__VERSION_1_2_DEPRECATED cl_int enqueueTask( const Kernel& kernel, const vector* events = NULL, - Event* event = NULL) CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED const + Event* event = NULL) const CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED { cl_event tmp; cl_int err = detail::errHandler( @@ -8873,7 +8918,7 @@ inline cl_int enqueueWriteBufferRect( size_type buffer_slice_pitch, size_type host_row_pitch, size_type host_slice_pitch, - void *ptr, + const void *ptr, const vector* events = NULL, Event* event = NULL) { @@ -8971,7 +9016,7 @@ inline cl_int enqueueWriteImage( const array& region, size_type row_pitch, size_type slice_pitch, - void* ptr, + const void* ptr, const vector* events = NULL, Event* event = NULL) { From fcc9588c947b0becc587d3f021a6afbaa76a8744 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 3 May 2016 11:41:40 +0100 Subject: [PATCH 052/117] Change cl2.hpp include --- src/OCLStream.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/OCLStream.h b/src/OCLStream.h index 3fd144a..79a213f 100644 --- a/src/OCLStream.h +++ b/src/OCLStream.h @@ -8,7 +8,7 @@ #define 
CL_HPP_TARGET_OPENCL_VERSION 120 #define CL_HPP_MINIMUM_OPENCL_VERSION 120 -#include "cl2.hpp" +#include "CL/cl2.hpp" #include "Stream.h" From a355acf2eee57047644ee3021557dd42d8a640b2 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 3 May 2016 11:43:25 +0100 Subject: [PATCH 053/117] Move source files to top level directory --- src/CUDAStream.cu => CUDAStream.cu | 0 src/CUDAStream.h => CUDAStream.h | 0 src/OCLStream.cpp => OCLStream.cpp | 0 src/OCLStream.h => OCLStream.h | 0 src/Stream.h => Stream.h | 0 src/common.h.in => common.h.in | 0 src/main.cpp => main.cpp | 0 7 files changed, 0 insertions(+), 0 deletions(-) rename src/CUDAStream.cu => CUDAStream.cu (100%) rename src/CUDAStream.h => CUDAStream.h (100%) rename src/OCLStream.cpp => OCLStream.cpp (100%) rename src/OCLStream.h => OCLStream.h (100%) rename src/Stream.h => Stream.h (100%) rename src/common.h.in => common.h.in (100%) rename src/main.cpp => main.cpp (100%) diff --git a/src/CUDAStream.cu b/CUDAStream.cu similarity index 100% rename from src/CUDAStream.cu rename to CUDAStream.cu diff --git a/src/CUDAStream.h b/CUDAStream.h similarity index 100% rename from src/CUDAStream.h rename to CUDAStream.h diff --git a/src/OCLStream.cpp b/OCLStream.cpp similarity index 100% rename from src/OCLStream.cpp rename to OCLStream.cpp diff --git a/src/OCLStream.h b/OCLStream.h similarity index 100% rename from src/OCLStream.h rename to OCLStream.h diff --git a/src/Stream.h b/Stream.h similarity index 100% rename from src/Stream.h rename to Stream.h diff --git a/src/common.h.in b/common.h.in similarity index 100% rename from src/common.h.in rename to common.h.in diff --git a/src/main.cpp b/main.cpp similarity index 100% rename from src/main.cpp rename to main.cpp From 8ce15a28aa4a2bb1b086ce9fdde3b0954885518a Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 3 May 2016 11:45:25 +0100 Subject: [PATCH 054/117] Update CMake with better binary name and source location --- CMakeLists.txt | 12 ++++++------ 1 file changed, 
6 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8822353..d71ea52 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,21 +8,21 @@ set(gpu-stream_VERSION_MINOR 0) list(APPEND CMAKE_CXX_FLAGS --std=c++11) -configure_file(src/common.h.in src/common.h) +configure_file(common.h.in common.h) find_package(CUDA QUIET) if (${CUDA_FOUND}) - cuda_add_executable(cuda.exe src/main.cpp src/CUDAStream.cu) - target_compile_definitions(cuda.exe PUBLIC CUDA) + cuda_add_executable(gpu-stream-cuda main.cpp CUDAStream.cu) + target_compile_definitions(gpu-stream-cuda PUBLIC CUDA) else (${CUDA_FOUND}) message("Skipping CUDA...") endif (${CUDA_FOUND}) find_package(OpenCL QUIET) if (${OpenCL_FOUND}) - add_executable(ocl.exe src/main.cpp src/OCLStream.cpp) - target_compile_definitions(ocl.exe PUBLIC OCL) - target_link_libraries(ocl.exe ${OpenCL_LIBRARY}) + add_executable(gpu-stream-ocl main.cpp OCLStream.cpp) + target_compile_definitions(gpu-stream-ocl PUBLIC OCL) + target_link_libraries(gpu-stream-ocl ${OpenCL_LIBRARY}) else (${OpenCL_FOUND}) message("Skipping OpenCL...") endif (${OpenCL_FOUND}) From 1bd27428bd421b2baef47ca94e344dabe0f824a4 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 3 May 2016 12:17:21 +0100 Subject: [PATCH 055/117] Require CUDA 7 for C++11 support --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d71ea52..a165b49 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -10,7 +10,7 @@ list(APPEND CMAKE_CXX_FLAGS --std=c++11) configure_file(common.h.in common.h) -find_package(CUDA QUIET) +find_package(CUDA 7.0 QUIET) if (${CUDA_FOUND}) cuda_add_executable(gpu-stream-cuda main.cpp CUDAStream.cu) target_compile_definitions(gpu-stream-cuda PUBLIC CUDA) From 57ea4b8caefcb7d2ffc241a31bb71d0a65f1c227 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 3 May 2016 12:17:33 +0100 Subject: [PATCH 056/117] Require CMake 3.2 so can check for C++11 nicely --- 
CMakeLists.txt | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a165b49..6aa6aee 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,13 +1,14 @@ -cmake_minimum_required(VERSION 2.8) +cmake_minimum_required(VERSION 3.2) + +set(CMAKE_CXX_STANDARD 11) +set(CMAKE_CXX_STANDARD_REQUIRED ON) project(gpu-stream) set(gpu-stream_VERSION_MAJOR 2) set(gpu-stream_VERSION_MINOR 0) -list(APPEND CMAKE_CXX_FLAGS --std=c++11) - configure_file(common.h.in common.h) find_package(CUDA 7.0 QUIET) From 662fcaf4b55bc7ea275f7a5f2ea73ed3eb830c5b Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 3 May 2016 12:18:41 +0100 Subject: [PATCH 057/117] Add CMake things to gitignore --- .gitignore | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index a989b77..0fa7243 100644 --- a/.gitignore +++ b/.gitignore @@ -1,8 +1,16 @@ +common.h + gpu-stream-cuda gpu-stream-ocl + *.o - *.tar - *.gz + +.DS_Store + +CMakeCache.txt +CMakeFiles/ +cmake_install.cmake +Makefile From 21c9022a3f5a435de8c880d1ea51eb949e921a60 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 3 May 2016 12:24:33 +0100 Subject: [PATCH 058/117] Keep C++11 flag explicitely defined in Cmake --- CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6aa6aee..9dda6cd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -9,6 +9,8 @@ project(gpu-stream) set(gpu-stream_VERSION_MAJOR 2) set(gpu-stream_VERSION_MINOR 0) +list(APPEND CMAKE_CXX_FLAGS --std=c++11) + configure_file(common.h.in common.h) find_package(CUDA 7.0 QUIET) From 95a10511ec121620244bccc19785a07ae4fb6a5f Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 3 May 2016 12:28:44 +0100 Subject: [PATCH 059/117] Update LICENSE date --- LICENSE | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/LICENSE b/LICENSE index 70cebc1..b812c0c 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ 
*============================================================================== *------------------------------------------------------------------------------ -* Copyright 2015: Tom Deakin, Simon McIntosh-Smith, University of Bristol HPC +* Copyright 2016: Tom Deakin, Simon McIntosh-Smith, University of Bristol HPC * Based on John D. McCalpin’s original STREAM benchmark for CPUs *------------------------------------------------------------------------------ * License: From 530b2adda2e0066e20850c97e7fede9795b65944 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 3 May 2016 12:32:03 +0100 Subject: [PATCH 060/117] Add License text to all files --- CUDAStream.cu | 7 +++++++ CUDAStream.h | 8 ++++++++ LICENSE | 2 +- OCLStream.cpp | 6 ++++++ OCLStream.h | 6 ++++++ Stream.h | 6 ++++++ common.h.in | 6 ++++++ main.cpp | 6 ++++++ 8 files changed, 46 insertions(+), 1 deletion(-) diff --git a/CUDAStream.cu b/CUDAStream.cu index 956be7d..3c10e8d 100644 --- a/CUDAStream.cu +++ b/CUDAStream.cu @@ -1,4 +1,11 @@ +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + + #include "CUDAStream.h" void check_error(void) diff --git a/CUDAStream.h b/CUDAStream.h index 61e4882..9c436d6 100644 --- a/CUDAStream.h +++ b/CUDAStream.h @@ -1,4 +1,12 @@ +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#pragma once + #include #include diff --git a/LICENSE b/LICENSE index b812c0c..1bc1114 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ *============================================================================== *------------------------------------------------------------------------------ -* Copyright 2016: Tom Deakin, Simon McIntosh-Smith, University of Bristol HPC +* Copyright 2015-16: Tom Deakin, Simon 
McIntosh-Smith, University of Bristol HPC * Based on John D. McCalpin’s original STREAM benchmark for CPUs *------------------------------------------------------------------------------ * License: diff --git a/OCLStream.cpp b/OCLStream.cpp index 5a72b3c..f7c538e 100644 --- a/OCLStream.cpp +++ b/OCLStream.cpp @@ -1,4 +1,10 @@ +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + #include "OCLStream.h" // Cache list of devices diff --git a/OCLStream.h b/OCLStream.h index 79a213f..cb48da5 100644 --- a/OCLStream.h +++ b/OCLStream.h @@ -1,4 +1,10 @@ +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + #pragma once #include diff --git a/Stream.h b/Stream.h index e02c953..671289e 100644 --- a/Stream.h +++ b/Stream.h @@ -1,4 +1,10 @@ +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + #pragma once #include diff --git a/common.h.in b/common.h.in index fbf953c..1b0f38b 100644 --- a/common.h.in +++ b/common.h.in @@ -1,3 +1,9 @@ +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + #define VERSION_STRING "@gpu-stream_VERSION_MAJOR@.@gpu-stream_VERSION_MINOR@" diff --git a/main.cpp b/main.cpp index 457ca9a..96f4e5c 100644 --- a/main.cpp +++ b/main.cpp @@ -1,4 +1,10 @@ +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + #include #include #include From 1a38b189542a97aff9e0b062e803ec5838364255 Mon Sep 17 00:00:00 
2001 From: Tom Deakin Date: Tue, 3 May 2016 14:36:08 +0100 Subject: [PATCH 061/117] Add OpenACC version --- ACCStream.cpp | 113 ++++++++++++++++++++++++++++++++++++++++++++++++++ ACCStream.h | 39 +++++++++++++++++ main.cpp | 7 ++++ 3 files changed, 159 insertions(+) create mode 100644 ACCStream.cpp create mode 100644 ACCStream.h diff --git a/ACCStream.cpp b/ACCStream.cpp new file mode 100644 index 0000000..ca5ef7b --- /dev/null +++ b/ACCStream.cpp @@ -0,0 +1,113 @@ + +#include "ACCStream.h" + +template +ACCStream::ACCStream(const unsigned int ARRAY_SIZE, T *a, T *b, T *c) +{ + array_size = ARRAY_SIZE; + + // Set up data region on device + this->a = a; + this->b = b; + this->c = c; + #pragma acc enter data create(a[0:array_size], b[0:array_size], c[0:array_size]) + {} +} + +template +ACCStream::~ACCStream() +{ + // End data region on device + unsigned int array_size = this->array_size; + T *a = this->a; + T *b = this->b; + T *c = this->c; + #pragma acc exit data delete(a[0:array_size], b[0:array_size], c[0:array_size]) + {} +} + +template +void ACCStream::write_arrays(const std::vector& h_a, const std::vector& h_b, const std::vector& h_c) +{ + T *a = this->a; + T *b = this->b; + T *c = this->c; + #pragma acc update device(a[0:array_size], b[0:array_size], c[0:array_size]) + {} +} + +template +void ACCStream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) +{ + T *a = this->a; + T *b = this->b; + T *c = this->c; + #pragma acc update host(a[0:array_size], b[0:array_size], c[0:array_size]) + {} +} + +template +void ACCStream::copy() +{ + unsigned int array_size = this->array_size; + T *a = this->a; + T *c = this->c; + #pragma acc kernels present(a[0:array_size], c[0:array_size]) wait + for (int i = 0; i < array_size; i++) + { + c[i] = a[i]; + } +} + +template +void ACCStream::mul() +{ + const T scalar = 3.0; + + unsigned int array_size = this->array_size; + T *b = this->b; + T *c = this->c; + #pragma acc kernels present(b[0:array_size], 
c[0:array_size]) wait + for (int i = 0; i < array_size; i++) + { + b[i] = scalar * c[i]; + } +} + +template +void ACCStream::add() +{ + unsigned int array_size = this->array_size; + T *a = this->a; + T *b = this->b; + T *c = this->c; + #pragma acc kernels present(a[0:array_size], b[0:array_size], c[0:array_size]) wait + for (int i = 0; i < array_size; i++) + { + c[i] = a[i] + b[i]; + } +} + +template +void ACCStream::triad() +{ + const T scalar = 3.0; + + unsigned int array_size = this->array_size; + T *a = this->a; + T *b = this->b; + T *c = this->c; + #pragma acc kernels present(a[0:array_size], b[0:array_size], c[0:array_size]) wait + for (int i = 0; i < array_size; i++) + { + a[i] = b[i] + scalar * c[i]; + } +} +void listDevices(void) +{ + // Get number of devices +} + +template class ACCStream; +template class ACCStream; + diff --git a/ACCStream.h b/ACCStream.h new file mode 100644 index 0000000..4c69986 --- /dev/null +++ b/ACCStream.h @@ -0,0 +1,39 @@ + +#pragma once + +#include +#include + +#include "Stream.h" + +#include + +#define IMPLEMENTATION_STRING "OpenACC" + +template +class ACCStream : public Stream +{ + protected: + // Size of arrays + unsigned int array_size; + // Device side pointers + T *a; + T *b; + T *c; + + public: + ACCStream(const unsigned int, T*, T*, T*); + ~ACCStream(); + + virtual void copy() override; + virtual void add() override; + virtual void mul() override; + virtual void triad() override; + + virtual void write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) override; + virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; + + + +}; + diff --git a/main.cpp b/main.cpp index 96f4e5c..209fc98 100644 --- a/main.cpp +++ b/main.cpp @@ -22,6 +22,9 @@ #include "CUDAStream.h" #elif defined(OCL) #include "OCLStream.h" +#elif defined(ACC) +#include "ACCStream.h" + #endif @@ -80,6 +83,10 @@ void run() // Use the OpenCL implementation stream = new OCLStream(ARRAY_SIZE, deviceIndex); +#elif 
defined(ACC) + // Use the OpenACC implementation + stream = new ACCStream(ARRAY_SIZE, a.data(), b.data(), c.data()); + #endif stream->write_arrays(a, b, c); From da4f91878834f3e2e3b907003eefcb77538e0f9b Mon Sep 17 00:00:00 2001 From: James Price Date: Tue, 3 May 2016 14:45:13 +0100 Subject: [PATCH 062/117] Add initial SYCL implementation --- CMakeLists.txt | 8 ++++++++ main.cpp | 8 ++++++-- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9dda6cd..d43a22a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -30,3 +30,11 @@ else (${OpenCL_FOUND}) message("Skipping OpenCL...") endif (${OpenCL_FOUND}) +# TODO: Find SYCL implementations somehow +if (true) + add_executable(gpu-stream-sycl main.cpp SYCLStream.cpp) + target_compile_definitions(gpu-stream-sycl PUBLIC SYCL) + set_property(TARGET gpu-stream-sycl PROPERTY CXX_STANDARD 14) +else () + message("Skipping SYCL...") +endif () diff --git a/main.cpp b/main.cpp index 209fc98..f12a7e1 100644 --- a/main.cpp +++ b/main.cpp @@ -24,7 +24,8 @@ #include "OCLStream.h" #elif defined(ACC) #include "ACCStream.h" - +#elif defined(SYCL) +#include "SYCLStream.h" #endif @@ -87,6 +88,10 @@ void run() // Use the OpenACC implementation stream = new ACCStream(ARRAY_SIZE, a.data(), b.data(), c.data()); +#elif defined(SYCL) + // Use the SYCL implementation + stream = new SYCLStream(ARRAY_SIZE, deviceIndex); + #endif stream->write_arrays(a, b, c); @@ -285,4 +290,3 @@ void parseArguments(int argc, char *argv[]) } } } - From 40a0a6551dabbe7a810639275aa60ca9e64ae343 Mon Sep 17 00:00:00 2001 From: James Price Date: Tue, 3 May 2016 14:46:08 +0100 Subject: [PATCH 063/117] Remove extra -std=c++11 from CMake build --- CMakeLists.txt | 2 -- 1 file changed, 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d43a22a..fd9a10d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -9,8 +9,6 @@ project(gpu-stream) set(gpu-stream_VERSION_MAJOR 2) set(gpu-stream_VERSION_MINOR 0) -list(APPEND 
CMAKE_CXX_FLAGS --std=c++11) - configure_file(common.h.in common.h) find_package(CUDA 7.0 QUIET) From b45f311e0d4e6f3303e78c5327d86b59f9827dea Mon Sep 17 00:00:00 2001 From: James Price Date: Tue, 3 May 2016 14:48:35 +0100 Subject: [PATCH 064/117] Add missing SYCL source files --- SYCLStream.cpp | 144 +++++++++++++++++++++++++++++++++++++++++++++++++ SYCLStream.h | 45 ++++++++++++++++ 2 files changed, 189 insertions(+) create mode 100644 SYCLStream.cpp create mode 100644 SYCLStream.h diff --git a/SYCLStream.cpp b/SYCLStream.cpp new file mode 100644 index 0000000..1314aa4 --- /dev/null +++ b/SYCLStream.cpp @@ -0,0 +1,144 @@ + +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#include "SYCLStream.h" + +#include + +using namespace cl::sycl; + +template +SYCLStream::SYCLStream(const unsigned int ARRAY_SIZE, const int device_index) +{ + array_size = ARRAY_SIZE; + + // Create buffers + d_a = buffer(array_size); + d_b = buffer(array_size); + d_c = buffer(array_size); +} + +template +SYCLStream::~SYCLStream() +{ +} + +template +void SYCLStream::copy() +{ + queue.submit([&](handler &cgh) + { + auto ka = d_a.template get_access(cgh); + auto kc = d_c.template get_access(cgh); + cgh.parallel_for(range<1>{array_size}, [=](id<1> index) + { + kc[index] = ka[index]; + }); + }); + queue.wait(); +} + +template +void SYCLStream::mul() +{ + const T scalar = 3.0; + queue.submit([&](handler &cgh) + { + auto kb = d_b.template get_access(cgh); + auto kc = d_c.template get_access(cgh); + cgh.parallel_for(range<1>{array_size}, [=](id<1> index) + { + kb[index] = scalar * kc[index]; + }); + }); + queue.wait(); +} + +template +void SYCLStream::add() +{ + queue.submit([&](handler &cgh) + { + auto ka = d_a.template get_access(cgh); + auto kb = d_b.template get_access(cgh); + auto kc = d_c.template get_access(cgh); + cgh.parallel_for(range<1>{array_size}, 
[=](id<1> index) + { + kc[index] = ka[index] + kb[index]; + }); + }); + queue.wait(); +} + +template +void SYCLStream::triad() +{ + const T scalar = 3.0; + queue.submit([&](handler &cgh) + { + auto ka = d_a.template get_access(cgh); + auto kb = d_b.template get_access(cgh); + auto kc = d_c.template get_access(cgh); + cgh.parallel_for(range<1>{array_size}, [=](id<1> index){ + ka[index] = kb[index] + scalar * kc[index]; + }); + }); + queue.wait(); +} + +template +void SYCLStream::write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) +{ + auto _a = d_a.template get_access(); + auto _b = d_b.template get_access(); + auto _c = d_c.template get_access(); + for (int i = 0; i < array_size; i++) + { + _a[i] = a[i]; + _b[i] = b[i]; + _c[i] = c[i]; + } +} + +template +void SYCLStream::read_arrays(std::vector& a, std::vector& b, std::vector& c) +{ + auto _a = d_a.template get_access(); + auto _b = d_b.template get_access(); + auto _c = d_c.template get_access(); + for (int i = 0; i < array_size; i++) + { + a[i] = _a[i]; + b[i] = _b[i]; + c[i] = _c[i]; + } +} + +void listDevices(void) +{ + // TODO: Get actual list of devices + std::cout << std::endl; + std::cout << "Devices:" << std::endl; + std::cout << "0: " << "triSYCL" << std::endl; + std::cout << std::endl; +} + +std::string getDeviceName(const int device) +{ + // TODO: Implement properly + return "triSYCL"; +} + +std::string getDeviceDriver(const int device) +{ + // TODO: Implement properly + return "triSCYL"; +} + + +template class SYCLStream; +template class SYCLStream; diff --git a/SYCLStream.h b/SYCLStream.h new file mode 100644 index 0000000..e10535e --- /dev/null +++ b/SYCLStream.h @@ -0,0 +1,45 @@ + +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#pragma once + +#include "Stream.h" + +#include "CL/sycl.hpp" + +#define IMPLEMENTATION_STRING "SYCL" + 
+template +class SYCLStream : public Stream +{ + protected: + // Size of arrays + unsigned int array_size; + + // SYCL objects + cl::sycl::queue queue; + cl::sycl::buffer d_a; + cl::sycl::buffer d_b; + cl::sycl::buffer d_c; + + public: + + SYCLStream(const unsigned int, const int); + ~SYCLStream(); + + virtual void copy() override; + virtual void add() override; + virtual void mul() override; + virtual void triad() override; + + virtual void write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) override; + virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; + +}; + +// Populate the devices list +void getDeviceList(void); From 0b0de4e0c3c17079d9b9cac3ab64ac0bb09326ff Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 3 May 2016 14:50:09 +0100 Subject: [PATCH 065/117] Implement the OpenACC device string functions, and device selector --- ACCStream.cpp | 25 ++++++++++++++++++++++++- ACCStream.h | 2 +- main.cpp | 2 +- 3 files changed, 26 insertions(+), 3 deletions(-) diff --git a/ACCStream.cpp b/ACCStream.cpp index ca5ef7b..85bf600 100644 --- a/ACCStream.cpp +++ b/ACCStream.cpp @@ -2,8 +2,11 @@ #include "ACCStream.h" template -ACCStream::ACCStream(const unsigned int ARRAY_SIZE, T *a, T *b, T *c) +ACCStream::ACCStream(const unsigned int ARRAY_SIZE, T *a, T *b, T *c, int device) { + + acc_set_device_num(device, acc_device_nvidia); + array_size = ARRAY_SIZE; // Set up data region on device @@ -106,8 +109,28 @@ void ACCStream::triad() void listDevices(void) { // Get number of devices + int count = acc_get_num_devices(acc_device_nvidia); + + // Print device list + if (count == 0) + { + std::cerr << "No devices found." << std::endl; + } + else + { + std::cout << "There are " << count << " devices." 
<< std::endl; + } } +std::string getDeviceName(const int) +{ + return std::string("Device name unavailable"); +} + +std::string getDeviceDriver(const int) +{ + return std::string("Device driver unavailable"); +} template class ACCStream; template class ACCStream; diff --git a/ACCStream.h b/ACCStream.h index 4c69986..d6e5728 100644 --- a/ACCStream.h +++ b/ACCStream.h @@ -22,7 +22,7 @@ class ACCStream : public Stream T *c; public: - ACCStream(const unsigned int, T*, T*, T*); + ACCStream(const unsigned int, T*, T*, T*, int); ~ACCStream(); virtual void copy() override; diff --git a/main.cpp b/main.cpp index f12a7e1..59411b8 100644 --- a/main.cpp +++ b/main.cpp @@ -86,7 +86,7 @@ void run() #elif defined(ACC) // Use the OpenACC implementation - stream = new ACCStream(ARRAY_SIZE, a.data(), b.data(), c.data()); + stream = new ACCStream(ARRAY_SIZE, a.data(), b.data(), c.data(), deviceIndex); #elif defined(SYCL) // Use the SYCL implementation From 31819b7778fd4fd7f99c3450b73e6b5f13630629 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 3 May 2016 15:07:51 +0100 Subject: [PATCH 066/117] Add bones of OpenACC in CMake config --- CMakeLists.txt | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index fd9a10d..1a15bc0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -28,6 +28,15 @@ else (${OpenCL_FOUND}) message("Skipping OpenCL...") endif (${OpenCL_FOUND}) +# TODO: Find OpenACC implementations somehow +if (true) + add_executable(gpu-stream-acc main.cpp ACCStream.cpp) + target_compile_definitions(gpu-stream-acc PUBLIC ACC) + target_compile_options(gpu-stream-acc PUBLIC "-hstd=c++11") +else () + message("Skipping OpenACC...") +endif () + # TODO: Find SYCL implementations somehow if (true) add_executable(gpu-stream-sycl main.cpp SYCLStream.cpp) From 48cae0cbc1bd0bc667f87f43f19978214b81efe1 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 3 May 2016 15:20:26 +0100 Subject: [PATCH 067/117] Make sure CUDA nvcc builds with C++11 --- 
CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1a15bc0..71a95d6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -13,6 +13,7 @@ configure_file(common.h.in common.h) find_package(CUDA 7.0 QUIET) if (${CUDA_FOUND}) + list(APPEND CUDA_NVCC_FLAGS --std=c++11) cuda_add_executable(gpu-stream-cuda main.cpp CUDAStream.cu) target_compile_definitions(gpu-stream-cuda PUBLIC CUDA) else (${CUDA_FOUND}) From f0afa0c1e43c43729dd26cca719d1db21844d279 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 4 May 2016 10:41:41 +0100 Subject: [PATCH 068/117] Add reference OpenMP 3.0 version --- OMP3Stream.cpp | 106 +++++++++++++++++++++++++++++++++++++++++++++++++ OMP3Stream.h | 35 ++++++++++++++++ main.cpp | 7 ++++ 3 files changed, 148 insertions(+) create mode 100644 OMP3Stream.cpp create mode 100644 OMP3Stream.h diff --git a/OMP3Stream.cpp b/OMP3Stream.cpp new file mode 100644 index 0000000..8899cff --- /dev/null +++ b/OMP3Stream.cpp @@ -0,0 +1,106 @@ + +#include "OMP3Stream.h" + +template +OMP3Stream::OMP3Stream(const unsigned int ARRAY_SIZE, T *a, T *b, T *c) +{ + array_size = ARRAY_SIZE; + this->a = (T*)malloc(sizeof(T)*array_size); + this->b = (T*)malloc(sizeof(T)*array_size); + this->c = (T*)malloc(sizeof(T)*array_size); +} + +template +OMP3Stream::~OMP3Stream() +{ + free(a); + free(b); + free(c); +} + + +template +void OMP3Stream::write_arrays(const std::vector& h_a, const std::vector& h_b, const std::vector& h_c) +{ + #pragma omp parallel for + for (int i = 0; i < array_size; i++) + { + a[i] = h_a[i]; + b[i] = h_b[i]; + c[i] = h_c[i]; + } +} + +template +void OMP3Stream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) +{ + #pragma omp parallel for + for (int i = 0; i < array_size; i++) + { + h_a[i] = a[i]; + h_b[i] = b[i]; + h_c[i] = c[i]; + } +} + +template +void OMP3Stream::copy() +{ + #pragma omp parallel for + for (int i = 0; i < array_size; i++) + { + c[i] = a[i]; + } +} + +template +void 
OMP3Stream::mul() +{ + const T scalar = 3.0; + #pragma omp parallel for + for (int i = 0; i < array_size; i++) + { + b[i] = scalar * c[i]; + } +} + +template +void OMP3Stream::add() +{ + #pragma omp parallel for + for (int i = 0; i < array_size; i++) + { + c[i] = a[i] + b[i]; + } +} + +template +void OMP3Stream::triad() +{ + const T scalar = 3.0; + #pragma omp parallel for + for (int i = 0; i < array_size; i++) + { + a[i] = b[i] + scalar * c[i]; + } +} + +void listDevices(void) +{ + std::cout << "0: CPU" << std::endl; +} + +std::string getDeviceName(const int) +{ + return std::string("Device name unavailable"); +} + +std::string getDeviceDriver(const int) +{ + return std::string("Device driver unavailable"); +} + + +template class OMP3Stream; +template class OMP3Stream; + diff --git a/OMP3Stream.h b/OMP3Stream.h new file mode 100644 index 0000000..15172c3 --- /dev/null +++ b/OMP3Stream.h @@ -0,0 +1,35 @@ + +#pragma once + +#include +#include + +#include "Stream.h" + +#define IMPLEMENTATION_STRING "Reference OpenMP" + +template +class OMP3Stream : public Stream +{ + protected: + // Size of arrays + unsigned int array_size; + // Device side pointers + T *a; + T *b; + T *c; + + public: + OMP3Stream(const unsigned int, T*, T*, T*); + ~OMP3Stream(); + + virtual void copy() override; + virtual void add() override; + virtual void mul() override; + virtual void triad() override; + + virtual void write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) override; + virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; + +}; + diff --git a/main.cpp b/main.cpp index 59411b8..18b8c72 100644 --- a/main.cpp +++ b/main.cpp @@ -26,6 +26,8 @@ #include "ACCStream.h" #elif defined(SYCL) #include "SYCLStream.h" +#elif defined(OMP3) +#include "OMP3Stream.h" #endif @@ -92,6 +94,11 @@ void run() // Use the SYCL implementation stream = new SYCLStream(ARRAY_SIZE, deviceIndex); +#elif defined(OMP3) + // Use the "reference" OpenMP 3 
implementation + stream = new OMP3Stream(ARRAY_SIZE, a.data(), b.data(), c.data()); + + #endif stream->write_arrays(a, b, c); From 7c28a6386b1905e154815afcfc0f26e1458aaa3d Mon Sep 17 00:00:00 2001 From: Matthew Martineau Date: Thu, 5 May 2016 17:22:29 +0100 Subject: [PATCH 069/117] Added the Kokkos and RAJA implementations --- main.cpp | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/main.cpp b/main.cpp index 96f4e5c..1a92ebc 100644 --- a/main.cpp +++ b/main.cpp @@ -22,9 +22,12 @@ #include "CUDAStream.h" #elif defined(OCL) #include "OCLStream.h" +#elif defined(RAJA) +#include "RAJAStream.hpp" +#elif defined(KOKKOS) +#include "KOKKOSStream.hpp" #endif - unsigned int ARRAY_SIZE = 52428800; unsigned int num_times = 10; unsigned int deviceIndex = 0; @@ -80,6 +83,14 @@ void run() // Use the OpenCL implementation stream = new OCLStream(ARRAY_SIZE, deviceIndex); +#elif defined(RAJA) + // Use the RAJA implementation + stream = new RAJAStream(ARRAY_SIZE, deviceIndex); + +#elif defined(KOKKOS) + // Use the Kokkos implementation + stream = new KOKKOSStream(ARRAY_SIZE, deviceIndex); + #endif stream->write_arrays(a, b, c); From d4b3b3533ce2594eec92d85a937081ba15a17734 Mon Sep 17 00:00:00 2001 From: James Price Date: Fri, 6 May 2016 00:38:30 +0100 Subject: [PATCH 070/117] Update SYCL version to work with ComputeCpp Still needs proper CMake rules and kernel names need to be fixed for multiple template instantiations. 
--- CMakeLists.txt | 15 ++++++++++++--- SYCLStream.cpp | 52 +++++++++++++++++++++++++++----------------------- SYCLStream.h | 6 +++--- main.cpp | 3 +++ 4 files changed, 46 insertions(+), 30 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 71a95d6..b078ab5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -40,9 +40,18 @@ endif () # TODO: Find SYCL implementations somehow if (true) - add_executable(gpu-stream-sycl main.cpp SYCLStream.cpp) - target_compile_definitions(gpu-stream-sycl PUBLIC SYCL) - set_property(TARGET gpu-stream-sycl PROPERTY CXX_STANDARD 14) + # ComputeCpp + # TODO: Sort this out properly! + add_custom_target(gpu-stream-sycl + COMMAND compute++ ${CMAKE_CURRENT_SOURCE_DIR}/SYCLStream.cpp -sycl -O2 -emit-llvm -o SYCLStream.bc -c + COMMAND /usr/bin/c++ ${CMAKE_CURRENT_SOURCE_DIR}/SYCLStream.cpp -O2 -std=c++11 -include SYCLStream.sycl -o SYCLStream.o -c + COMMAND /usr/bin/c++ ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp -O2 -std=c++11 SYCLStream.o -include SYCLStream.sycl -lSYCL -lOpenCL -o gpu-stream-sycl -DSYCL + ) + + # triSYCL + #add_executable(gpu-stream-sycl main.cpp SYCLStream.cpp) + #target_compile_definitions(gpu-stream-sycl PUBLIC SYCL) + #set_property(TARGET gpu-stream-sycl PROPERTY CXX_STANDARD 14) else () message("Skipping SYCL...") endif () diff --git a/SYCLStream.cpp b/SYCLStream.cpp index 1314aa4..1c4ed17 100644 --- a/SYCLStream.cpp +++ b/SYCLStream.cpp @@ -17,14 +17,17 @@ SYCLStream::SYCLStream(const unsigned int ARRAY_SIZE, const int device_index) array_size = ARRAY_SIZE; // Create buffers - d_a = buffer(array_size); - d_b = buffer(array_size); - d_c = buffer(array_size); + d_a = new buffer(array_size); + d_b = new buffer(array_size); + d_c = new buffer(array_size); } template SYCLStream::~SYCLStream() { + delete d_a; + delete d_b; + delete d_c; } template @@ -32,9 +35,9 @@ void SYCLStream::copy() { queue.submit([&](handler &cgh) { - auto ka = d_a.template get_access(cgh); - auto kc = d_c.template get_access(cgh); - 
cgh.parallel_for(range<1>{array_size}, [=](id<1> index) + auto ka = d_a->template get_access(cgh); + auto kc = d_c->template get_access(cgh); + cgh.parallel_for(range<1>{array_size}, [=](id<1> index) { kc[index] = ka[index]; }); @@ -48,9 +51,9 @@ void SYCLStream::mul() const T scalar = 3.0; queue.submit([&](handler &cgh) { - auto kb = d_b.template get_access(cgh); - auto kc = d_c.template get_access(cgh); - cgh.parallel_for(range<1>{array_size}, [=](id<1> index) + auto kb = d_b->template get_access(cgh); + auto kc = d_c->template get_access(cgh); + cgh.parallel_for(range<1>{array_size}, [=](id<1> index) { kb[index] = scalar * kc[index]; }); @@ -63,10 +66,10 @@ void SYCLStream::add() { queue.submit([&](handler &cgh) { - auto ka = d_a.template get_access(cgh); - auto kb = d_b.template get_access(cgh); - auto kc = d_c.template get_access(cgh); - cgh.parallel_for(range<1>{array_size}, [=](id<1> index) + auto ka = d_a->template get_access(cgh); + auto kb = d_b->template get_access(cgh); + auto kc = d_c->template get_access(cgh); + cgh.parallel_for(range<1>{array_size}, [=](id<1> index) { kc[index] = ka[index] + kb[index]; }); @@ -80,10 +83,10 @@ void SYCLStream::triad() const T scalar = 3.0; queue.submit([&](handler &cgh) { - auto ka = d_a.template get_access(cgh); - auto kb = d_b.template get_access(cgh); - auto kc = d_c.template get_access(cgh); - cgh.parallel_for(range<1>{array_size}, [=](id<1> index){ + auto ka = d_a->template get_access(cgh); + auto kb = d_b->template get_access(cgh); + auto kc = d_c->template get_access(cgh); + cgh.parallel_for(range<1>{array_size}, [=](id<1> index){ ka[index] = kb[index] + scalar * kc[index]; }); }); @@ -93,9 +96,9 @@ void SYCLStream::triad() template void SYCLStream::write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) { - auto _a = d_a.template get_access(); - auto _b = d_b.template get_access(); - auto _c = d_c.template get_access(); + auto _a = d_a->template get_access(); + auto _b = d_b->template 
get_access(); + auto _c = d_c->template get_access(); for (int i = 0; i < array_size; i++) { _a[i] = a[i]; @@ -107,9 +110,9 @@ void SYCLStream::write_arrays(const std::vector& a, const std::vector& template void SYCLStream::read_arrays(std::vector& a, std::vector& b, std::vector& c) { - auto _a = d_a.template get_access(); - auto _b = d_b.template get_access(); - auto _c = d_c.template get_access(); + auto _a = d_a->template get_access(); + auto _b = d_b->template get_access(); + auto _c = d_c->template get_access(); for (int i = 0; i < array_size; i++) { a[i] = _a[i]; @@ -140,5 +143,6 @@ std::string getDeviceDriver(const int device) } -template class SYCLStream; +// TODO: Fix kernel names to allow multiple template specializations +//template class SYCLStream; template class SYCLStream; diff --git a/SYCLStream.h b/SYCLStream.h index e10535e..f4d79d5 100644 --- a/SYCLStream.h +++ b/SYCLStream.h @@ -22,9 +22,9 @@ class SYCLStream : public Stream // SYCL objects cl::sycl::queue queue; - cl::sycl::buffer d_a; - cl::sycl::buffer d_b; - cl::sycl::buffer d_c; + cl::sycl::buffer *d_a; + cl::sycl::buffer *d_b; + cl::sycl::buffer *d_c; public: diff --git a/main.cpp b/main.cpp index 18b8c72..e0911d2 100644 --- a/main.cpp +++ b/main.cpp @@ -54,9 +54,12 @@ int main(int argc, char *argv[]) parseArguments(argc, argv); + // TODO: Fix SYCL to allow multiple template specializations +#ifndef SYCL if (use_float) run(); else +#endif run(); } From 45381da0b2cc473ad6b562394c422a3176b21f83 Mon Sep 17 00:00:00 2001 From: Matthew Martineau Date: Fri, 6 May 2016 10:46:35 +0100 Subject: [PATCH 071/117] Initial commit of in progress developments of RAJA and KOKKOS stream --- KOKKOSStream.cpp | 123 +++++++++++++++++++++++++++++++++++++++++++++++ KOKKOSStream.hpp | 53 ++++++++++++++++++++ RAJAStream.cpp | 105 ++++++++++++++++++++++++++++++++++++++++ RAJAStream.hpp | 58 ++++++++++++++++++++++ 4 files changed, 339 insertions(+) create mode 100644 KOKKOSStream.cpp create mode 100644 
KOKKOSStream.hpp create mode 100644 RAJAStream.cpp create mode 100644 RAJAStream.hpp diff --git a/KOKKOSStream.cpp b/KOKKOSStream.cpp new file mode 100644 index 0000000..c4d548b --- /dev/null +++ b/KOKKOSStream.cpp @@ -0,0 +1,123 @@ +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + + +#include "KOKKOSStream.hpp" + +using Kokkos::parallel_for; + +template +KOKKOSStream::KOKKOSStream( + const unsigned int ARRAY_SIZE, const int device_index) + : array_size(ARRAY_SIZE) +{ + Kokkos::initialize(); + + new(d_a) Kokkos::View("d_a", ARRAY_SIZE); + new(d_b) Kokkos::View("d_b", ARRAY_SIZE); + new(d_c) Kokkos::View("d_c", ARRAY_SIZE); + new(hm_a) Kokkos::View::HostMirror(); + new(hm_b) Kokkos::View::HostMirror(); + new(hm_c) Kokkos::View::HostMirror(); + hm_a = Kokkos::create_mirror_view(d_a); + hm_b = Kokkos::create_mirror_view(d_b); + hm_c = Kokkos::create_mirror_view(d_c); +} + +template +KOKKOSStream::~KOKKOSStream() +{ + Kokkos::finalize(); +} + +template +void KOKKOSStream::write_arrays( + const std::vector& a, const std::vector& b, const std::vector& c) +{ + for(int ii = 0; ii < array_size; ++ii) + { + hm_a(ii) = a[ii]; + hm_b(ii) = b[ii]; + hm_c(ii) = c[ii]; + } + Kokkos::deep_copy(hm_a, d_a); + Kokkos::deep_copy(hm_b, d_b); + Kokkos::deep_copy(hm_c, d_c); +} + +template +void KOKKOSStream::read_arrays( + std::vector& a, std::vector& b, std::vector& c) +{ + Kokkos::deep_copy(d_a, hm_a); + Kokkos::deep_copy(d_a, hm_b); + Kokkos::deep_copy(d_a, hm_c); + for(int ii = 0; ii < array_size; ++ii) + { + a[ii] = hm_a(ii); + b[ii] = hm_b(ii); + c[ii] = hm_c(ii); + } +} + +template +void KOKKOSStream::copy() +{ + Kokkos::parallel_for(array_size, KOKKOS_LAMBDA (const int index) + { + d_c[index] = d_a[index]; + }); +} + +template +void KOKKOSStream::mul() +{ + const T scalar = 3.0; + parallel_for(array_size, KOKKOS_LAMBDA (const int index) + { + 
d_b[index] = scalar*d_c[index]; + }); +} + +template +void KOKKOSStream::add() +{ + parallel_for(array_size, KOKKOS_LAMBDA (const int index) + { + d_c[index] = d_a[index] + d_b[index]; + }); +} + +template +void KOKKOSStream::triad() +{ + const T scalar = 3.0; + parallel_for(array_size, KOKKOS_LAMBDA (const int index) + { + d_a[index] = d_b[index] + scalar*d_c[index]; + }); +} + +void listDevices(void) +{ + std::cout << "This is not the device you are looking for."; +} + + +std::string getDeviceName(const int device) +{ + return "Kokkos"; +} + + +std::string getDeviceDriver(const int device) +{ + return "Kokkos"; +} + +template class KOKKOSStream; +template class KOKKOSStream; + diff --git a/KOKKOSStream.hpp b/KOKKOSStream.hpp new file mode 100644 index 0000000..632ca20 --- /dev/null +++ b/KOKKOSStream.hpp @@ -0,0 +1,53 @@ +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#pragma once + +#include +#include + +#include +#include +#include + +#include "Stream.h" + +#define IMPLEMENTATION_STRING "KOKKOS" + +#define DEVICE Kokkos::OpenMP + + +template +class KOKKOSStream : public Stream +{ + protected: + // Size of arrays + unsigned int array_size; + + // Device side pointers to arrays + Kokkos::View d_a; + Kokkos::View d_b; + Kokkos::View d_c; + Kokkos::View::HostMirror hm_a; + Kokkos::View::HostMirror hm_b; + Kokkos::View::HostMirror hm_c; + + public: + + KOKKOSStream(const unsigned int, const int); + ~KOKKOSStream(); + + virtual void copy() override; + virtual void add() override; + virtual void mul() override; + virtual void triad() override; + + virtual void write_arrays( + const std::vector& a, const std::vector& b, const std::vector& c) override; + virtual void read_arrays( + std::vector& a, std::vector& b, std::vector& c) override; +}; + diff --git a/RAJAStream.cpp b/RAJAStream.cpp new file mode 100644 index 
0000000..5b1c980 --- /dev/null +++ b/RAJAStream.cpp @@ -0,0 +1,105 @@ + +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#include "RAJAStream.hpp" + +using RAJA::forall; +using RAJA::RangeSegment; + +template +RAJAStream::RAJAStream(const unsigned int ARRAY_SIZE, const int device_index) + : array_size(ARRAY_SIZE) +{ + RangeSegment seg(0, ARRAY_SIZE); + index_set.push_back(seg); + d_a = new T[ARRAY_SIZE]; + d_b = new T[ARRAY_SIZE]; + d_c = new T[ARRAY_SIZE]; +} + +template +RAJAStream::~RAJAStream() +{ + delete[] d_a; + delete[] d_b; + delete[] d_c; +} + +template +void RAJAStream::write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) +{ + std::copy(a.begin(), a.end(), d_a); + std::copy(b.begin(), b.end(), d_b); + std::copy(c.begin(), c.end(), d_c); +} + +template +void RAJAStream::read_arrays(std::vector& a, std::vector& b, std::vector& c) +{ + std::copy(d_a, d_a + array_size - 1, a.data()); + std::copy(d_b, d_b + array_size - 1, b.data()); + std::copy(d_c, d_c + array_size - 1, c.data()); +} + +template +void RAJAStream::copy() +{ + forall(index_set, [=] RAJA_DEVICE (int index) + { + d_c[index] = d_a[index]; + }); +} + +template +void RAJAStream::mul() +{ + const T scalar = 3.0; + forall(index_set, [=] RAJA_DEVICE (int index) + { + d_b[index] = scalar*d_c[index]; + }); +} + +template +void RAJAStream::add() +{ + forall(index_set, [=] RAJA_DEVICE (int index) + { + d_c[index] = d_a[index] + d_b[index]; + }); +} + +template +void RAJAStream::triad() +{ + const T scalar = 3.0; + forall(index_set, [=] RAJA_DEVICE (int index) + { + d_a[index] = d_b[index] + scalar*d_c[index]; + }); +} + +void listDevices(void) +{ + std::cout << "This is not the device you are looking for."; +} + + +std::string getDeviceName(const int device) +{ + return "RAJA"; +} + + +std::string getDeviceDriver(const int device) +{ + 
return "RAJA"; +} + +template class RAJAStream; +template class RAJAStream; + diff --git a/RAJAStream.hpp b/RAJAStream.hpp new file mode 100644 index 0000000..a41c60e --- /dev/null +++ b/RAJAStream.hpp @@ -0,0 +1,58 @@ +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#pragma once + +#include +#include +#include "RAJA/RAJA.hxx" + +#include "Stream.h" + +#define IMPLEMENTATION_STRING "RAJA" + +#ifdef RAJA_USE_CUDA +const size_t block_size = 128; +typedef RAJA::IndexSet::ExecPolicy< + RAJA::seq_segit, + RAJA::cuda_exec_async> policy; +#else +typedef RAJA::IndexSet::ExecPolicy< + RAJA::seq_segit, + RAJA::omp_parallel_for_exec> policy; +#endif + +template +class RAJAStream : public Stream +{ + protected: + // Size of arrays + unsigned int array_size; + + // Contains iteration space + RAJA::IndexSet index_set; + + // Device side pointers to arrays + T* d_a; + T* d_b; + T* d_c; + + public: + + RAJAStream(const unsigned int, const int); + ~RAJAStream(); + + virtual void copy() override; + virtual void add() override; + virtual void mul() override; + virtual void triad() override; + + virtual void write_arrays( + const std::vector& a, const std::vector& b, const std::vector& c) override; + virtual void read_arrays( + std::vector& a, std::vector& b, std::vector& c) override; +}; + From 3b266b826610abbc8dd3ea3f94bf076a30a758ef Mon Sep 17 00:00:00 2001 From: Matthew Martineau Date: Fri, 6 May 2016 10:51:35 +0100 Subject: [PATCH 072/117] Fix for namespace collision with #define RAJA --- main.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/main.cpp b/main.cpp index 5d3a559..f67fdd6 100644 --- a/main.cpp +++ b/main.cpp @@ -22,7 +22,7 @@ #include "CUDAStream.h" #elif defined(OCL) #include "OCLStream.h" -#elif defined(RAJA) +#elif defined(USE_RAJA) #include "RAJAStream.hpp" #elif defined(KOKKOS) #include 
"KOKKOSStream.hpp" @@ -89,7 +89,7 @@ void run() // Use the OpenCL implementation stream = new OCLStream(ARRAY_SIZE, deviceIndex); -#elif defined(RAJA) +#elif defined(USE_RAJA) // Use the RAJA implementation stream = new RAJAStream(ARRAY_SIZE, deviceIndex); From 1a60f130eba160091f5cd5915caeca9ed4e536f9 Mon Sep 17 00:00:00 2001 From: Matthew Martineau Date: Fri, 6 May 2016 13:17:04 +0100 Subject: [PATCH 073/117] Fixed memory management for GPU, now working with OpenMP and CUDA --- RAJAStream.cpp | 44 +++++++++++++++++++++++++++++++++++--------- RAJAStream.hpp | 12 ++++++------ 2 files changed, 41 insertions(+), 15 deletions(-) diff --git a/RAJAStream.cpp b/RAJAStream.cpp index 5b1c980..eb98d54 100644 --- a/RAJAStream.cpp +++ b/RAJAStream.cpp @@ -16,21 +16,36 @@ RAJAStream::RAJAStream(const unsigned int ARRAY_SIZE, const int device_index) { RangeSegment seg(0, ARRAY_SIZE); index_set.push_back(seg); + +#ifdef RAJA_TARGET_CPU d_a = new T[ARRAY_SIZE]; d_b = new T[ARRAY_SIZE]; d_c = new T[ARRAY_SIZE]; +#else + cudaMallocManaged((void**)&d_a, sizeof(T)*ARRAY_SIZE, cudaMemAttachGlobal); + cudaMallocManaged((void**)&d_b, sizeof(T)*ARRAY_SIZE, cudaMemAttachGlobal); + cudaMallocManaged((void**)&d_c, sizeof(T)*ARRAY_SIZE, cudaMemAttachGlobal); + cudaDeviceSynchronize(); +#endif } template RAJAStream::~RAJAStream() { +#ifdef RAJA_TARGET_CPU delete[] d_a; delete[] d_b; delete[] d_c; +#else + cudaFree(d_a); + cudaFree(d_b); + cudaFree(d_c); +#endif } template -void RAJAStream::write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) +void RAJAStream::write_arrays( + const std::vector& a, const std::vector& b, const std::vector& c) { std::copy(a.begin(), a.end(), d_a); std::copy(b.begin(), b.end(), d_b); @@ -38,48 +53,59 @@ void RAJAStream::write_arrays(const std::vector& a, const std::vector& } template -void RAJAStream::read_arrays(std::vector& a, std::vector& b, std::vector& c) +void RAJAStream::read_arrays( + std::vector& a, std::vector& b, std::vector& c) 
{ - std::copy(d_a, d_a + array_size - 1, a.data()); - std::copy(d_b, d_b + array_size - 1, b.data()); - std::copy(d_c, d_c + array_size - 1, c.data()); + std::copy(d_a, d_a + array_size, a.data()); + std::copy(d_b, d_b + array_size, b.data()); + std::copy(d_c, d_c + array_size, c.data()); } template void RAJAStream::copy() { + T* a = d_a; + T* c = d_c; forall(index_set, [=] RAJA_DEVICE (int index) { - d_c[index] = d_a[index]; + c[index] = a[index]; }); } template void RAJAStream::mul() { + T* b = d_b; + T* c = d_c; const T scalar = 3.0; forall(index_set, [=] RAJA_DEVICE (int index) { - d_b[index] = scalar*d_c[index]; + b[index] = scalar*c[index]; }); } template void RAJAStream::add() { + T* a = d_a; + T* b = d_b; + T* c = d_c; forall(index_set, [=] RAJA_DEVICE (int index) { - d_c[index] = d_a[index] + d_b[index]; + c[index] = a[index] + b[index]; }); } template void RAJAStream::triad() { + T* a = d_a; + T* b = d_b; + T* c = d_c; const T scalar = 3.0; forall(index_set, [=] RAJA_DEVICE (int index) { - d_a[index] = d_b[index] + scalar*d_c[index]; + a[index] = b[index] + scalar*c[index]; }); } diff --git a/RAJAStream.hpp b/RAJAStream.hpp index a41c60e..454e20e 100644 --- a/RAJAStream.hpp +++ b/RAJAStream.hpp @@ -14,15 +14,15 @@ #define IMPLEMENTATION_STRING "RAJA" -#ifdef RAJA_USE_CUDA -const size_t block_size = 128; -typedef RAJA::IndexSet::ExecPolicy< - RAJA::seq_segit, - RAJA::cuda_exec_async> policy; -#else +#ifdef RAJA_TARGET_CPU typedef RAJA::IndexSet::ExecPolicy< RAJA::seq_segit, RAJA::omp_parallel_for_exec> policy; +#else +const size_t block_size = 128; +typedef RAJA::IndexSet::ExecPolicy< + RAJA::seq_segit, + RAJA::cuda_exec> policy; #endif template From 894829cb05143c908923a831e9f0d13072ef640a Mon Sep 17 00:00:00 2001 From: Matthew Martineau Date: Fri, 6 May 2016 21:02:44 +0100 Subject: [PATCH 074/117] Adjusted the Kokkos implementation to fix view initialisation, and store local copies of views for lambda scoping --- KOKKOSStream.cpp | 74 
+++++++++++++++++++++++++++++------------------- KOKKOSStream.hpp | 19 +++++++------ main.cpp | 2 ++ 3 files changed, 58 insertions(+), 37 deletions(-) diff --git a/KOKKOSStream.cpp b/KOKKOSStream.cpp index c4d548b..3834081 100644 --- a/KOKKOSStream.cpp +++ b/KOKKOSStream.cpp @@ -7,7 +7,7 @@ #include "KOKKOSStream.hpp" -using Kokkos::parallel_for; +using namespace Kokkos; template KOKKOSStream::KOKKOSStream( @@ -16,21 +16,21 @@ KOKKOSStream::KOKKOSStream( { Kokkos::initialize(); - new(d_a) Kokkos::View("d_a", ARRAY_SIZE); - new(d_b) Kokkos::View("d_b", ARRAY_SIZE); - new(d_c) Kokkos::View("d_c", ARRAY_SIZE); - new(hm_a) Kokkos::View::HostMirror(); - new(hm_b) Kokkos::View::HostMirror(); - new(hm_c) Kokkos::View::HostMirror(); - hm_a = Kokkos::create_mirror_view(d_a); - hm_b = Kokkos::create_mirror_view(d_b); - hm_c = Kokkos::create_mirror_view(d_c); + d_a = new View("d_a", ARRAY_SIZE); + d_b = new View("d_b", ARRAY_SIZE); + d_c = new View("d_c", ARRAY_SIZE); + hm_a = new View::HostMirror(); + hm_b = new View::HostMirror(); + hm_c = new View::HostMirror(); + *hm_a = create_mirror_view(*d_a); + *hm_b = create_mirror_view(*d_b); + *hm_c = create_mirror_view(*d_c); } template KOKKOSStream::~KOKKOSStream() { - Kokkos::finalize(); + finalize(); } template @@ -39,65 +39,81 @@ void KOKKOSStream::write_arrays( { for(int ii = 0; ii < array_size; ++ii) { - hm_a(ii) = a[ii]; - hm_b(ii) = b[ii]; - hm_c(ii) = c[ii]; + (*hm_a)(ii) = a[ii]; + (*hm_b)(ii) = b[ii]; + (*hm_c)(ii) = c[ii]; } - Kokkos::deep_copy(hm_a, d_a); - Kokkos::deep_copy(hm_b, d_b); - Kokkos::deep_copy(hm_c, d_c); + deep_copy(*hm_a, *d_a); + deep_copy(*hm_b, *d_b); + deep_copy(*hm_c, *d_c); } template void KOKKOSStream::read_arrays( std::vector& a, std::vector& b, std::vector& c) { - Kokkos::deep_copy(d_a, hm_a); - Kokkos::deep_copy(d_a, hm_b); - Kokkos::deep_copy(d_a, hm_c); + deep_copy(*d_a, *hm_a); + deep_copy(*d_b, *hm_b); + deep_copy(*d_c, *hm_c); for(int ii = 0; ii < array_size; ++ii) { - a[ii] = hm_a(ii); 
- b[ii] = hm_b(ii); - c[ii] = hm_c(ii); + a[ii] = (*hm_a)(ii); + b[ii] = (*hm_b)(ii); + c[ii] = (*hm_c)(ii); } } template void KOKKOSStream::copy() { - Kokkos::parallel_for(array_size, KOKKOS_LAMBDA (const int index) + View a(*d_a); + View b(*d_b); + View c(*d_c); + + parallel_for(array_size, KOKKOS_LAMBDA (const int index) { - d_c[index] = d_a[index]; + c[index] = a[index]; }); } template void KOKKOSStream::mul() { + View a(*d_a); + View b(*d_b); + View c(*d_c); + const T scalar = 3.0; parallel_for(array_size, KOKKOS_LAMBDA (const int index) { - d_b[index] = scalar*d_c[index]; + b[index] = scalar*c[index]; }); } template void KOKKOSStream::add() { + View a(*d_a); + View b(*d_b); + View c(*d_c); + parallel_for(array_size, KOKKOS_LAMBDA (const int index) { - d_c[index] = d_a[index] + d_b[index]; + c[index] = a[index] + b[index]; }); } template void KOKKOSStream::triad() { + View a(*d_a); + View b(*d_b); + View c(*d_c); + const T scalar = 3.0; parallel_for(array_size, KOKKOS_LAMBDA (const int index) { - d_a[index] = d_b[index] + scalar*d_c[index]; + a[index] = b[index] + scalar*c[index]; }); } @@ -118,6 +134,6 @@ std::string getDeviceDriver(const int device) return "Kokkos"; } -template class KOKKOSStream; +//template class KOKKOSStream; template class KOKKOSStream; diff --git a/KOKKOSStream.hpp b/KOKKOSStream.hpp index 632ca20..d2b9665 100644 --- a/KOKKOSStream.hpp +++ b/KOKKOSStream.hpp @@ -17,8 +17,11 @@ #define IMPLEMENTATION_STRING "KOKKOS" -#define DEVICE Kokkos::OpenMP - +#ifdef KOKKOS_TARGET_CPU + #define DEVICE Kokkos::OpenMP +#else + #define DEVICE Kokkos::Cuda +#endif template class KOKKOSStream : public Stream @@ -28,12 +31,12 @@ class KOKKOSStream : public Stream unsigned int array_size; // Device side pointers to arrays - Kokkos::View d_a; - Kokkos::View d_b; - Kokkos::View d_c; - Kokkos::View::HostMirror hm_a; - Kokkos::View::HostMirror hm_b; - Kokkos::View::HostMirror hm_c; + Kokkos::View* d_a; + Kokkos::View* d_b; + Kokkos::View* d_c; + 
Kokkos::View::HostMirror* hm_a; + Kokkos::View::HostMirror* hm_b; + Kokkos::View::HostMirror* hm_c; public: diff --git a/main.cpp b/main.cpp index 4794f9b..007ab7f 100644 --- a/main.cpp +++ b/main.cpp @@ -59,9 +59,11 @@ int main(int argc, char *argv[]) // TODO: Fix SYCL to allow multiple template specializations #ifndef SYCL +#ifndef KOKKOS if (use_float) run(); else +#endif #endif run(); From 0f0454ec295b3c9fe2ada150534fb5d2e332ccce Mon Sep 17 00:00:00 2001 From: Matthew Martineau Date: Fri, 6 May 2016 21:05:20 +0100 Subject: [PATCH 075/117] Added CUDA device syncs to force proper timing --- KOKKOSStream.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/KOKKOSStream.cpp b/KOKKOSStream.cpp index 3834081..0c3f44c 100644 --- a/KOKKOSStream.cpp +++ b/KOKKOSStream.cpp @@ -74,6 +74,7 @@ void KOKKOSStream::copy() { c[index] = a[index]; }); + cudaDeviceSynchronize(); } template @@ -88,6 +89,7 @@ void KOKKOSStream::mul() { b[index] = scalar*c[index]; }); + cudaDeviceSynchronize(); } template @@ -101,6 +103,8 @@ void KOKKOSStream::add() { c[index] = a[index] + b[index]; }); + + cudaDeviceSynchronize(); } template @@ -115,6 +119,8 @@ void KOKKOSStream::triad() { a[index] = b[index] + scalar*c[index]; }); + + cudaDeviceSynchronize(); } void listDevices(void) From 6e9b85bb26221114c18f53606aed5ae6fd387c0a Mon Sep 17 00:00:00 2001 From: Matthew Martineau Date: Fri, 6 May 2016 21:08:23 +0100 Subject: [PATCH 076/117] Fixed deep copy ordering, which was reversed --- KOKKOSStream.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/KOKKOSStream.cpp b/KOKKOSStream.cpp index 0c3f44c..d93b6d7 100644 --- a/KOKKOSStream.cpp +++ b/KOKKOSStream.cpp @@ -43,18 +43,18 @@ void KOKKOSStream::write_arrays( (*hm_b)(ii) = b[ii]; (*hm_c)(ii) = c[ii]; } - deep_copy(*hm_a, *d_a); - deep_copy(*hm_b, *d_b); - deep_copy(*hm_c, *d_c); + deep_copy(*d_a, *hm_a); + deep_copy(*d_b, *hm_b); + deep_copy(*d_c, *hm_c); } template void KOKKOSStream::read_arrays( 
std::vector& a, std::vector& b, std::vector& c) { - deep_copy(*d_a, *hm_a); - deep_copy(*d_b, *hm_b); - deep_copy(*d_c, *hm_c); + deep_copy(*hm_a, *d_a); + deep_copy(*hm_b, *d_b); + deep_copy(*hm_c, *d_c); for(int ii = 0; ii < array_size; ++ii) { a[ii] = (*hm_a)(ii); From fb8f06e683ab995e686e8300b17e55fad57ceb51 Mon Sep 17 00:00:00 2001 From: James Price Date: Fri, 6 May 2016 22:33:18 +0100 Subject: [PATCH 077/117] [SYCL] Pass -no-serial-memop to compute++ to squelch warning --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index b078ab5..64bfd2e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -43,7 +43,7 @@ if (true) # ComputeCpp # TODO: Sort this out properly! add_custom_target(gpu-stream-sycl - COMMAND compute++ ${CMAKE_CURRENT_SOURCE_DIR}/SYCLStream.cpp -sycl -O2 -emit-llvm -o SYCLStream.bc -c + COMMAND compute++ ${CMAKE_CURRENT_SOURCE_DIR}/SYCLStream.cpp -sycl -no-serial-memop -O2 -emit-llvm -o SYCLStream.bc -c COMMAND /usr/bin/c++ ${CMAKE_CURRENT_SOURCE_DIR}/SYCLStream.cpp -O2 -std=c++11 -include SYCLStream.sycl -o SYCLStream.o -c COMMAND /usr/bin/c++ ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp -O2 -std=c++11 SYCLStream.o -include SYCLStream.sycl -lSYCL -lOpenCL -o gpu-stream-sycl -DSYCL ) From 54834e05f4b855451b3f266aafd21420441f17c7 Mon Sep 17 00:00:00 2001 From: James Price Date: Fri, 6 May 2016 22:41:10 +0100 Subject: [PATCH 078/117] [SYCL] Use nd_range instead of range to specify work-group size --- SYCLStream.cpp | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/SYCLStream.cpp b/SYCLStream.cpp index 1c4ed17..fc76a82 100644 --- a/SYCLStream.cpp +++ b/SYCLStream.cpp @@ -11,6 +11,8 @@ using namespace cl::sycl; +#define WGSIZE 64 + template SYCLStream::SYCLStream(const unsigned int ARRAY_SIZE, const int device_index) { @@ -37,9 +39,9 @@ void SYCLStream::copy() { auto ka = d_a->template get_access(cgh); auto kc = d_c->template get_access(cgh); - 
cgh.parallel_for(range<1>{array_size}, [=](id<1> index) + cgh.parallel_for(nd_range<1>{array_size, WGSIZE}, [=](nd_item<1> item) { - kc[index] = ka[index]; + kc[item.get_global()] = ka[item.get_global()]; }); }); queue.wait(); @@ -53,9 +55,9 @@ void SYCLStream::mul() { auto kb = d_b->template get_access(cgh); auto kc = d_c->template get_access(cgh); - cgh.parallel_for(range<1>{array_size}, [=](id<1> index) + cgh.parallel_for(nd_range<1>{array_size, WGSIZE}, [=](nd_item<1> item) { - kb[index] = scalar * kc[index]; + kb[item.get_global()] = scalar * kc[item.get_global()]; }); }); queue.wait(); @@ -69,9 +71,9 @@ void SYCLStream::add() auto ka = d_a->template get_access(cgh); auto kb = d_b->template get_access(cgh); auto kc = d_c->template get_access(cgh); - cgh.parallel_for(range<1>{array_size}, [=](id<1> index) + cgh.parallel_for(nd_range<1>{array_size, WGSIZE}, [=](nd_item<1> item) { - kc[index] = ka[index] + kb[index]; + kc[item.get_global()] = ka[item.get_global()] + kb[item.get_global()]; }); }); queue.wait(); @@ -86,8 +88,9 @@ void SYCLStream::triad() auto ka = d_a->template get_access(cgh); auto kb = d_b->template get_access(cgh); auto kc = d_c->template get_access(cgh); - cgh.parallel_for(range<1>{array_size}, [=](id<1> index){ - ka[index] = kb[index] + scalar * kc[index]; + cgh.parallel_for(nd_range<1>{array_size, WGSIZE}, [=](nd_item<1> item) + { + ka[item.get_global()] = kb[item.get_global()] + scalar * kc[item.get_global()]; }); }); queue.wait(); From 3b3f6dfc260f9fb0928753dd699435ae5f45cc80 Mon Sep 17 00:00:00 2001 From: James Price Date: Sun, 8 May 2016 19:22:09 +0100 Subject: [PATCH 079/117] [SYCL] Implement device list/selection functionality --- SYCLStream.cpp | 80 ++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 71 insertions(+), 9 deletions(-) diff --git a/SYCLStream.cpp b/SYCLStream.cpp index fc76a82..c6947e7 100644 --- a/SYCLStream.cpp +++ b/SYCLStream.cpp @@ -13,11 +13,20 @@ using namespace cl::sycl; #define WGSIZE 64 +// Cache 
list of devices +bool cached = false; +std::vector devices; +void getDeviceList(void); + template SYCLStream::SYCLStream(const unsigned int ARRAY_SIZE, const int device_index) { array_size = ARRAY_SIZE; + // Print out device information + std::cout << "Using SYCL device " << getDeviceName(device_index) << std::endl; + std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl; + // Create buffers d_a = new buffer(array_size); d_b = new buffer(array_size); @@ -124,25 +133,78 @@ void SYCLStream::read_arrays(std::vector& a, std::vector& b, std::vecto } } +void getDeviceList(void) +{ + // Get list of platforms + std::vector platforms = platform::get_platforms(); + + // Enumerate devices + for (unsigned i = 0; i < platforms.size(); i++) + { + std::vector plat_devices = platforms[i].get_devices(); + devices.insert(devices.end(), plat_devices.begin(), plat_devices.end()); + } + cached = true; +} + void listDevices(void) { - // TODO: Get actual list of devices - std::cout << std::endl; - std::cout << "Devices:" << std::endl; - std::cout << "0: " << "triSYCL" << std::endl; - std::cout << std::endl; + getDeviceList(); + + // Print device names + if (devices.size() == 0) + { + std::cerr << "No devices found." 
<< std::endl; + } + else + { + std::cout << std::endl; + std::cout << "Devices:" << std::endl; + for (int i = 0; i < devices.size(); i++) + { + std::cout << i << ": " << getDeviceName(i) << std::endl; + } + std::cout << std::endl; + } } std::string getDeviceName(const int device) { - // TODO: Implement properly - return "triSYCL"; + if (!cached) + getDeviceList(); + + std::string name; + cl_device_info info = CL_DEVICE_NAME; + + if (device < devices.size()) + { + name = devices[device].get_info(); + } + else + { + throw std::runtime_error("Error asking for name for non-existant device"); + } + + return name; } std::string getDeviceDriver(const int device) { - // TODO: Implement properly - return "triSCYL"; + if (!cached) + getDeviceList(); + + std::string driver; + + if (device < devices.size()) + { + driver = devices[device].get_info(); + } + else + { + throw std::runtime_error("Error asking for driver for non-existant device"); + } + + return driver; } From 6d913bab4b16c688fbe3c3cf804af4a374810543 Mon Sep 17 00:00:00 2001 From: James Price Date: Sun, 8 May 2016 21:35:24 +0100 Subject: [PATCH 080/117] [SYCL] Actually use device_index to select device --- SYCLStream.cpp | 27 +++++++++++++++++++-------- SYCLStream.h | 2 +- 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/SYCLStream.cpp b/SYCLStream.cpp index c6947e7..0e00c53 100644 --- a/SYCLStream.cpp +++ b/SYCLStream.cpp @@ -21,12 +21,21 @@ void getDeviceList(void); template SYCLStream::SYCLStream(const unsigned int ARRAY_SIZE, const int device_index) { + if (!cached) + getDeviceList(); + array_size = ARRAY_SIZE; + if (device_index >= devices.size()) + throw std::runtime_error("Invalid device index"); + device dev = devices[device_index]; + // Print out device information std::cout << "Using SYCL device " << getDeviceName(device_index) << std::endl; std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl; + queue = new cl::sycl::queue(dev); + // Create buffers d_a = new 
buffer(array_size); d_b = new buffer(array_size); @@ -39,12 +48,14 @@ SYCLStream::~SYCLStream() delete d_a; delete d_b; delete d_c; + + delete queue; } template void SYCLStream::copy() { - queue.submit([&](handler &cgh) + queue->submit([&](handler &cgh) { auto ka = d_a->template get_access(cgh); auto kc = d_c->template get_access(cgh); @@ -53,14 +64,14 @@ void SYCLStream::copy() kc[item.get_global()] = ka[item.get_global()]; }); }); - queue.wait(); + queue->wait(); } template void SYCLStream::mul() { const T scalar = 3.0; - queue.submit([&](handler &cgh) + queue->submit([&](handler &cgh) { auto kb = d_b->template get_access(cgh); auto kc = d_c->template get_access(cgh); @@ -69,13 +80,13 @@ void SYCLStream::mul() kb[item.get_global()] = scalar * kc[item.get_global()]; }); }); - queue.wait(); + queue->wait(); } template void SYCLStream::add() { - queue.submit([&](handler &cgh) + queue->submit([&](handler &cgh) { auto ka = d_a->template get_access(cgh); auto kb = d_b->template get_access(cgh); @@ -85,14 +96,14 @@ void SYCLStream::add() kc[item.get_global()] = ka[item.get_global()] + kb[item.get_global()]; }); }); - queue.wait(); + queue->wait(); } template void SYCLStream::triad() { const T scalar = 3.0; - queue.submit([&](handler &cgh) + queue->submit([&](handler &cgh) { auto ka = d_a->template get_access(cgh); auto kb = d_b->template get_access(cgh); @@ -102,7 +113,7 @@ void SYCLStream::triad() ka[item.get_global()] = kb[item.get_global()] + scalar * kc[item.get_global()]; }); }); - queue.wait(); + queue->wait(); } template diff --git a/SYCLStream.h b/SYCLStream.h index f4d79d5..4c0c681 100644 --- a/SYCLStream.h +++ b/SYCLStream.h @@ -21,7 +21,7 @@ class SYCLStream : public Stream unsigned int array_size; // SYCL objects - cl::sycl::queue queue; + cl::sycl::queue *queue; cl::sycl::buffer *d_a; cl::sycl::buffer *d_b; cl::sycl::buffer *d_c; From 084d7417b9cd961683a5c9aae608366c4421b234 Mon Sep 17 00:00:00 2001 From: James Price Date: Mon, 9 May 2016 15:20:11 +0100 
Subject: [PATCH 081/117] [SYCL] Remove unneeded cl_device_info line --- SYCLStream.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/SYCLStream.cpp b/SYCLStream.cpp index 0e00c53..e8eee42 100644 --- a/SYCLStream.cpp +++ b/SYCLStream.cpp @@ -185,7 +185,6 @@ std::string getDeviceName(const int device) getDeviceList(); std::string name; - cl_device_info info = CL_DEVICE_NAME; if (device < devices.size()) { From 5c8b07262bb8c1813903846cb4a211c37b6d3dd9 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 11 May 2016 11:49:44 +0100 Subject: [PATCH 082/117] Default to 100 iterations to get over any warm up times --- main.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.cpp b/main.cpp index 007ab7f..4fb35c3 100644 --- a/main.cpp +++ b/main.cpp @@ -35,7 +35,7 @@ #endif unsigned int ARRAY_SIZE = 52428800; -unsigned int num_times = 10; +unsigned int num_times = 100; unsigned int deviceIndex = 0; bool use_float = false; From 3227e5dbf00377f6c104bd5c8e5d7af3d4820ef0 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 11 May 2016 11:52:17 +0100 Subject: [PATCH 083/117] Print out data type for float or double --- main.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/main.cpp b/main.cpp index 4fb35c3..c31df9c 100644 --- a/main.cpp +++ b/main.cpp @@ -72,6 +72,11 @@ int main(int argc, char *argv[]) template void run() { + if (sizeof(T) == sizeof(float)) + std::cout << "Precision: float" << std::endl; + else + std::cout << "Precision: double" << std::endl; + // Create host vectors std::vector a(ARRAY_SIZE, 1.0); std::vector b(ARRAY_SIZE, 2.0); From 75ef78495c6c7d2b872a2e3ea4a60b3e58e4d796 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 11 May 2016 11:53:51 +0100 Subject: [PATCH 084/117] Add print out of number of iterations --- main.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/main.cpp b/main.cpp index c31df9c..3a516de 100644 --- a/main.cpp +++ b/main.cpp @@ -72,6 +72,8 @@ int main(int argc, char *argv[]) template void run() { 
+ std::cout << "Running kernels " << num_times << " times" << std::endl; + if (sizeof(T) == sizeof(float)) std::cout << "Precision: float" << std::endl; else From 0f8f191d0e573d3f453b9d46b4c8363e5bb52659 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 11 May 2016 11:55:33 +0100 Subject: [PATCH 085/117] Require number of iterations to be at least 2 --- main.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/main.cpp b/main.cpp index 3a516de..43cfb7c 100644 --- a/main.cpp +++ b/main.cpp @@ -290,6 +290,11 @@ void parseArguments(int argc, char *argv[]) std::cerr << "Invalid number of times." << std::endl; exit(EXIT_FAILURE); } + if (num_times < 2) + { + std::cerr << "Number of times must be 2 or more" << std::endl; + exit(EXIT_FAILURE); + } } else if (!std::string("--float").compare(argv[i])) { From 207fd8f784072e137dccd9e3ea8ac435dd9d907c Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 11 May 2016 12:04:19 +0100 Subject: [PATCH 086/117] Default to power of two array size --- main.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/main.cpp b/main.cpp index 43cfb7c..832c657 100644 --- a/main.cpp +++ b/main.cpp @@ -34,7 +34,8 @@ #include "OMP3Stream.h" #endif -unsigned int ARRAY_SIZE = 52428800; +// Default size of 2^26 +unsigned int ARRAY_SIZE = 67108864; unsigned int num_times = 100; unsigned int deviceIndex = 0; bool use_float = false; From 2462023ed9f8ecdbda1c70074a6b18337693ee51 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 11 May 2016 12:21:29 +0100 Subject: [PATCH 087/117] Set thread block size in CUDA with a #define, and check that array size is multiple of it --- CUDAStream.cu | 19 ++++++++++++++----- CUDAStream.h | 2 +- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/CUDAStream.cu b/CUDAStream.cu index 3c10e8d..2a28f9c 100644 --- a/CUDAStream.cu +++ b/CUDAStream.cu @@ -8,6 +8,8 @@ #include "CUDAStream.h" +#define TBSIZE 1024 + void check_error(void) { cudaError_t err = cudaGetLastError(); @@ -22,6 
+24,14 @@ template CUDAStream::CUDAStream(const unsigned int ARRAY_SIZE, const int device_index) { + // The array size must be divisible by TBSIZE for kernel launches + if (ARRAY_SIZE % TBSIZE != 0) + { + std::stringstream ss; + ss << "Array size must be a multiple of " << TBSIZE; + throw std::runtime_error(ss.str()); + } + // Set device int count; cudaGetDeviceCount(&count); @@ -99,7 +109,7 @@ __global__ void copy_kernel(const T * a, T * c) template void CUDAStream::copy() { - copy_kernel<<>>(d_a, d_c); + copy_kernel<<>>(d_a, d_c); check_error(); cudaDeviceSynchronize(); check_error(); @@ -116,7 +126,7 @@ __global__ void mul_kernel(T * b, const T * c) template void CUDAStream::mul() { - mul_kernel<<>>(d_b, d_c); + mul_kernel<<>>(d_b, d_c); check_error(); cudaDeviceSynchronize(); check_error(); @@ -132,7 +142,7 @@ __global__ void add_kernel(const T * a, const T * b, T * c) template void CUDAStream::add() { - add_kernel<<>>(d_a, d_b, d_c); + add_kernel<<>>(d_a, d_b, d_c); check_error(); cudaDeviceSynchronize(); check_error(); @@ -149,7 +159,7 @@ __global__ void triad_kernel(T * a, const T * b, const T * c) template void CUDAStream::triad() { - triad_kernel<<>>(d_a, d_b, d_c); + triad_kernel<<>>(d_a, d_b, d_c); check_error(); cudaDeviceSynchronize(); check_error(); @@ -202,4 +212,3 @@ std::string getDeviceDriver(const int device) template class CUDAStream; template class CUDAStream; - diff --git a/CUDAStream.h b/CUDAStream.h index 9c436d6..6904a86 100644 --- a/CUDAStream.h +++ b/CUDAStream.h @@ -9,6 +9,7 @@ #include #include +#include #include "Stream.h" @@ -40,4 +41,3 @@ class CUDAStream : public Stream virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; }; - From 81fa9e19222592d420bc5af847dd7bc440d72ffe Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 11 May 2016 12:23:21 +0100 Subject: [PATCH 088/117] Require SYCL array size to be multiple of WGSIZE --- SYCLStream.cpp | 8 ++++++++ SYCLStream.h | 2 ++ 2 files changed, 10 
insertions(+) diff --git a/SYCLStream.cpp b/SYCLStream.cpp index e8eee42..4f14590 100644 --- a/SYCLStream.cpp +++ b/SYCLStream.cpp @@ -24,6 +24,14 @@ SYCLStream::SYCLStream(const unsigned int ARRAY_SIZE, const int device_index) if (!cached) getDeviceList(); + // The array size must be divisible by WGSIZE + if (ARRAY_SIZE % WGSIZE != 0) + { + std::stringstream ss; + ss << "Array size must be a multiple of " << WGSIZE; + throw std::runtime_error(ss.str()); + } + array_size = ARRAY_SIZE; if (device_index >= devices.size()) diff --git a/SYCLStream.h b/SYCLStream.h index 4c0c681..8bc515d 100644 --- a/SYCLStream.h +++ b/SYCLStream.h @@ -7,6 +7,8 @@ #pragma once +#include + #include "Stream.h" #include "CL/sycl.hpp" From 1a9225ca9557569e01278c8cafe3c48b6fafebed Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 11 May 2016 12:54:12 +0100 Subject: [PATCH 089/117] If building CUDA on Darwin with Xcode 7.3.1 skip becuase CUDA doesn't work this version --- CMakeLists.txt | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 64bfd2e..964668e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -12,13 +12,21 @@ set(gpu-stream_VERSION_MINOR 0) configure_file(common.h.in common.h) find_package(CUDA 7.0 QUIET) -if (${CUDA_FOUND}) +set(FLAG True) +if ("${CMAKE_SYSTEM_NAME}" MATCHES "Darwin") + execute_process(COMMAND xcodebuild -version COMMAND head -n 1 OUTPUT_VARIABLE XCODE_VERSION) + if ("${XCODE_VERSION}" MATCHES "Xcode 7.3.1") + message("Xcode version not supported by CUDA") + set(FLAG False) + endif ("${XCODE_VERSION}" MATCHES "Xcode 7.3.1") +endif ("${CMAKE_SYSTEM_NAME}" MATCHES "Darwin") +if (${FLAG} AND ${CUDA_FOUND}) list(APPEND CUDA_NVCC_FLAGS --std=c++11) cuda_add_executable(gpu-stream-cuda main.cpp CUDAStream.cu) target_compile_definitions(gpu-stream-cuda PUBLIC CUDA) -else (${CUDA_FOUND}) +else (${FLAG} AND ${CUDA_FOUND}) message("Skipping CUDA...") -endif (${CUDA_FOUND}) +endif (${FLAG} AND 
${CUDA_FOUND}) find_package(OpenCL QUIET) if (${OpenCL_FOUND}) From 8d45e61f6c421d8e8f7514cd10ed7c979d205b1a Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 11 May 2016 13:20:15 +0100 Subject: [PATCH 090/117] Check for OpenACC support by checking the various compiler flags --- CMakeLists.txt | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 964668e..cd8988d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -37,11 +37,20 @@ else (${OpenCL_FOUND}) message("Skipping OpenCL...") endif (${OpenCL_FOUND}) -# TODO: Find OpenACC implementations somehow -if (true) +# Check compiler supports an OpenACC flag +include(CheckCXXCompilerFlag) +message("${CMAKE_CXX_COMPILER_ID}") +if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "GNU") + CHECK_CXX_COMPILER_FLAG(-fopenacc OPENACC) +elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "PGI") + CHECK_CXX_COMPILER_FLAG(-acc OPENACC) +elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Cray") + CHECK_CXX_COMPILER_FLAG(-hacc=openacc OPENACC) +endif() +if (OPENACC) add_executable(gpu-stream-acc main.cpp ACCStream.cpp) target_compile_definitions(gpu-stream-acc PUBLIC ACC) - target_compile_options(gpu-stream-acc PUBLIC "-hstd=c++11") + #target_compile_options(gpu-stream-acc PUBLIC "-hstd=c++11") else () message("Skipping OpenACC...") endif () From e0ca56bd67ff7444da9c62ec078bce1c0d123387 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 11 May 2016 13:33:01 +0100 Subject: [PATCH 091/117] Set the C++11 flag when using the Cray compiler --- CMakeLists.txt | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index cd8988d..dbf8c64 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -11,6 +11,13 @@ set(gpu-stream_VERSION_MINOR 0) configure_file(common.h.in common.h) +# If using the Cray compiler, manually add the C++11 flag because setting the +# standard through CMake as above doesn't set this flag with Cray +if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Cray") 
+ list(APPEND CMAKE_CXX_FLAGS -hstd=c++11) +endif() + + find_package(CUDA 7.0 QUIET) set(FLAG True) if ("${CMAKE_SYSTEM_NAME}" MATCHES "Darwin") From 8a195b64164c5e80c4c0833cad9361ccb9cb3fed Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 11 May 2016 13:35:12 +0100 Subject: [PATCH 092/117] Remove printout of compiler id in cmake --- CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index dbf8c64..a7437e5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -46,7 +46,6 @@ endif (${OpenCL_FOUND}) # Check compiler supports an OpenACC flag include(CheckCXXCompilerFlag) -message("${CMAKE_CXX_COMPILER_ID}") if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "GNU") CHECK_CXX_COMPILER_FLAG(-fopenacc OPENACC) elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "PGI") From bf29b02d357842a8cfe15fd50533d6047ab98a8d Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 11 May 2016 13:35:24 +0100 Subject: [PATCH 093/117] Add banners in CMakeLists file so easy to spot build rules for versions --- CMakeLists.txt | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a7437e5..c37b270 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -17,7 +17,9 @@ if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Cray") list(APPEND CMAKE_CXX_FLAGS -hstd=c++11) endif() - +#------------------------------------------------------------------------------- +# CUDA +#------------------------------------------------------------------------------- find_package(CUDA 7.0 QUIET) set(FLAG True) if ("${CMAKE_SYSTEM_NAME}" MATCHES "Darwin") @@ -35,6 +37,9 @@ else (${FLAG} AND ${CUDA_FOUND}) message("Skipping CUDA...") endif (${FLAG} AND ${CUDA_FOUND}) +#------------------------------------------------------------------------------- +# OpenCL +#------------------------------------------------------------------------------- find_package(OpenCL QUIET) if (${OpenCL_FOUND}) add_executable(gpu-stream-ocl main.cpp OCLStream.cpp) @@ -44,6 
+49,9 @@ else (${OpenCL_FOUND}) message("Skipping OpenCL...") endif (${OpenCL_FOUND}) +#------------------------------------------------------------------------------- +# OpenACC +#------------------------------------------------------------------------------- # Check compiler supports an OpenACC flag include(CheckCXXCompilerFlag) if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "GNU") @@ -61,6 +69,9 @@ else () message("Skipping OpenACC...") endif () +#------------------------------------------------------------------------------- +# SYCL +#------------------------------------------------------------------------------- # TODO: Find SYCL implementations somehow if (true) # ComputeCpp From e095cb67f8a65135a58d9cfc622738951fc163df Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 11 May 2016 13:37:12 +0100 Subject: [PATCH 094/117] Remove ugly CMake endif text in parenthesis --- CMakeLists.txt | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c37b270..7196066 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -15,7 +15,7 @@ configure_file(common.h.in common.h) # standard through CMake as above doesn't set this flag with Cray if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Cray") list(APPEND CMAKE_CXX_FLAGS -hstd=c++11) -endif() +endif () #------------------------------------------------------------------------------- # CUDA @@ -27,15 +27,15 @@ if ("${CMAKE_SYSTEM_NAME}" MATCHES "Darwin") if ("${XCODE_VERSION}" MATCHES "Xcode 7.3.1") message("Xcode version not supported by CUDA") set(FLAG False) - endif ("${XCODE_VERSION}" MATCHES "Xcode 7.3.1") -endif ("${CMAKE_SYSTEM_NAME}" MATCHES "Darwin") + endif () +endif () if (${FLAG} AND ${CUDA_FOUND}) list(APPEND CUDA_NVCC_FLAGS --std=c++11) cuda_add_executable(gpu-stream-cuda main.cpp CUDAStream.cu) target_compile_definitions(gpu-stream-cuda PUBLIC CUDA) -else (${FLAG} AND ${CUDA_FOUND}) +else () message("Skipping CUDA...") -endif (${FLAG} AND ${CUDA_FOUND}) +endif () 
#------------------------------------------------------------------------------- # OpenCL @@ -45,9 +45,9 @@ if (${OpenCL_FOUND}) add_executable(gpu-stream-ocl main.cpp OCLStream.cpp) target_compile_definitions(gpu-stream-ocl PUBLIC OCL) target_link_libraries(gpu-stream-ocl ${OpenCL_LIBRARY}) -else (${OpenCL_FOUND}) +else () message("Skipping OpenCL...") -endif (${OpenCL_FOUND}) +endif () #------------------------------------------------------------------------------- # OpenACC @@ -60,7 +60,8 @@ elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "PGI") CHECK_CXX_COMPILER_FLAG(-acc OPENACC) elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Cray") CHECK_CXX_COMPILER_FLAG(-hacc=openacc OPENACC) -endif() +endif () + if (OPENACC) add_executable(gpu-stream-acc main.cpp ACCStream.cpp) target_compile_definitions(gpu-stream-acc PUBLIC ACC) From eb10c716f2b25caec7b3442ce0df98b7c8d596df Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 11 May 2016 15:08:08 +0100 Subject: [PATCH 095/117] First attempt at OpenMP 4.5 --- OMP45Stream.cpp | 135 ++++++++++++++++++++++++++++++++++++++++++++++++ OMP45Stream.h | 40 ++++++++++++++ main.cpp | 6 +++ 3 files changed, 181 insertions(+) create mode 100644 OMP45Stream.cpp create mode 100644 OMP45Stream.h diff --git a/OMP45Stream.cpp b/OMP45Stream.cpp new file mode 100644 index 0000000..779b217 --- /dev/null +++ b/OMP45Stream.cpp @@ -0,0 +1,135 @@ + +#include "OMP45Stream.h" + +template +OMP45Stream::OMP45Stream(const unsigned int ARRAY_SIZE, T *a, T *b, T *c, int device) +{ + omp_set_default_device(device); + + array_size = ARRAY_SIZE; + + // Set up data region on device + this->a = a; + this->b = b; + this->c = c; + #pragma omp target enter data map(alloc: a[0:array_size], b[0:array_size], c[0:array_size]) + {} +} + +template +OMP45Stream::~OMP45Stream() +{ + // End data region on device + unsigned int array_size = this->array_size; + T *a = this->a; + T *b = this->b; + T *c = this->c; + #pragma omp target exit data map(release: a[0:array_size], 
b[0:array_size], c[0:array_size]) + {} +} + +template +void OMP45Stream::write_arrays(const std::vector& h_a, const std::vector& h_b, const std::vector& h_c) +{ + T *a = this->a; + T *b = this->b; + T *c = this->c; + #pragma omp target update to(a[0:array_size], b[0:array_size], c[0:array_size]) + {} +} + +template +void OMP45Stream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) +{ + T *a = this->a; + T *b = this->b; + T *c = this->c; + #pragma omp target update from(a[0:array_size], b[0:array_size], c[0:array_size]) + {} +} + +template +void OMP45Stream::copy() +{ + unsigned int array_size = this->array_size; + T *a = this->a; + T *c = this->c; + #pragma omp target teams distribute parallel for simd + for (int i = 0; i < array_size; i++) + { + c[i] = a[i]; + } +} + +template +void OMP45Stream::mul() +{ + const T scalar = 3.0; + + unsigned int array_size = this->array_size; + T *b = this->b; + T *c = this->c; + #pragma omp target teams distribute parallel for simd + for (int i = 0; i < array_size; i++) + { + b[i] = scalar * c[i]; + } +} + +template +void OMP45Stream::add() +{ + unsigned int array_size = this->array_size; + T *a = this->a; + T *b = this->b; + T *c = this->c; + #pragma omp target teams distribute parallel for simd + for (int i = 0; i < array_size; i++) + { + c[i] = a[i] + b[i]; + } +} + +template +void OMP45Stream::triad() +{ + const T scalar = 3.0; + + unsigned int array_size = this->array_size; + T *a = this->a; + T *b = this->b; + T *c = this->c; + #pragma omp target teams distribute parallel for simd + for (int i = 0; i < array_size; i++) + { + a[i] = b[i] + scalar * c[i]; + } +} +void listDevices(void) +{ + // Get number of devices + int count = omp_get_num_devices(); + + // Print device list + if (count == 0) + { + std::cerr << "No devices found." << std::endl; + } + else + { + std::cout << "There are " << count << " devices." 
<< std::endl; + } +} + +std::string getDeviceName(const int) +{ + return std::string("Device name unavailable"); +} + +std::string getDeviceDriver(const int) +{ + return std::string("Device driver unavailable"); +} +template class OMP45Stream; +template class OMP45Stream; + diff --git a/OMP45Stream.h b/OMP45Stream.h new file mode 100644 index 0000000..98c9705 --- /dev/null +++ b/OMP45Stream.h @@ -0,0 +1,40 @@ + +#pragma once + +#include +#include + +#include "Stream.h" + +#include + +#define IMPLEMENTATION_STRING "OpenMP 4.5" + +template +class OMP45Stream : public Stream +{ + protected: + // Size of arrays + unsigned int array_size; + + // Device side pointers + T *a; + T *b; + T *c; + + public: + OMP45Stream(const unsigned int, T*, T*, T*, int); + ~OMP45Stream(); + + virtual void copy() override; + virtual void add() override; + virtual void mul() override; + virtual void triad() override; + + virtual void write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) override; + virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; + + + +}; + diff --git a/main.cpp b/main.cpp index 832c657..2f0d6fc 100644 --- a/main.cpp +++ b/main.cpp @@ -32,6 +32,8 @@ #include "SYCLStream.h" #elif defined(OMP3) #include "OMP3Stream.h" +#elif defined(OMP45) +#include "OMP45Stream.h" #endif // Default size of 2^26 @@ -122,6 +124,10 @@ void run() // Use the "reference" OpenMP 3 implementation stream = new OMP3Stream(ARRAY_SIZE, a.data(), b.data(), c.data()); +#elif defined(OMP45) + // Use the "reference" OpenMP 3 implementation + stream = new OMP45Stream(ARRAY_SIZE, a.data(), b.data(), c.data(), deviceIndex); + #endif stream->write_arrays(a, b, c); From 4954ef7cf00e0f09065146b82e8bfcbef749a461 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 11 May 2016 15:17:06 +0100 Subject: [PATCH 096/117] Add map clauses to OpenMP 4.5 kernels --- OMP45Stream.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git 
a/OMP45Stream.cpp b/OMP45Stream.cpp index 779b217..f44f66f 100644 --- a/OMP45Stream.cpp +++ b/OMP45Stream.cpp @@ -12,7 +12,7 @@ OMP45Stream::OMP45Stream(const unsigned int ARRAY_SIZE, T *a, T *b, T *c, int this->a = a; this->b = b; this->c = c; - #pragma omp target enter data map(alloc: a[0:array_size], b[0:array_size], c[0:array_size]) + #pragma omp target enter data map(to: a[0:array_size], b[0:array_size], c[0:array_size]) {} } @@ -54,7 +54,7 @@ void OMP45Stream::copy() unsigned int array_size = this->array_size; T *a = this->a; T *c = this->c; - #pragma omp target teams distribute parallel for simd + #pragma omp target teams distribute parallel for simd map(to: a[0:array_size], c[0:array_size]) for (int i = 0; i < array_size; i++) { c[i] = a[i]; @@ -69,7 +69,7 @@ void OMP45Stream::mul() unsigned int array_size = this->array_size; T *b = this->b; T *c = this->c; - #pragma omp target teams distribute parallel for simd + #pragma omp target teams distribute parallel for simd map(to: b[0:array_size], c[0:array_size]) for (int i = 0; i < array_size; i++) { b[i] = scalar * c[i]; @@ -83,7 +83,7 @@ void OMP45Stream::add() T *a = this->a; T *b = this->b; T *c = this->c; - #pragma omp target teams distribute parallel for simd + #pragma omp target teams distribute parallel for simd map(to: a[0:array_size], b[0:array_size], c[0:array_size]) for (int i = 0; i < array_size; i++) { c[i] = a[i] + b[i]; @@ -99,7 +99,7 @@ void OMP45Stream::triad() T *a = this->a; T *b = this->b; T *c = this->c; - #pragma omp target teams distribute parallel for simd + #pragma omp target teams distribute parallel for simd map(to: a[0:array_size], b[0:array_size], c[0:array_size]) for (int i = 0; i < array_size; i++) { a[i] = b[i] + scalar * c[i]; From 55a858e0c0a0bc0f30fdb222d1f8961cabc009c3 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 11 May 2016 15:43:52 +0100 Subject: [PATCH 097/117] Use 2^25 as default size because 2^26 gives too many thread blocks for CUDA --- main.cpp | 4 ++-- 1 file 
changed, 2 insertions(+), 2 deletions(-) diff --git a/main.cpp b/main.cpp index 2f0d6fc..80791d6 100644 --- a/main.cpp +++ b/main.cpp @@ -36,8 +36,8 @@ #include "OMP45Stream.h" #endif -// Default size of 2^26 -unsigned int ARRAY_SIZE = 67108864; +// Default size of 2^25 +unsigned int ARRAY_SIZE = 33554432; unsigned int num_times = 100; unsigned int deviceIndex = 0; bool use_float = false; From 31cb567e211dc21be720d660d1792778b4231572 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 11 May 2016 15:51:19 +0100 Subject: [PATCH 098/117] Switch data from 1.0, 2.0 and 3.0 to 0.1, 0.2, and 0.3 resp. Using integers for maths gets unstable past 38 interations even in double precision. Using the original values/10 is safe up to the default 100 iterations. --- ACCStream.cpp | 5 ++--- CUDAStream.cu | 4 ++-- KOKKOSStream.cpp | 13 ++++++------- OCLStream.cpp | 3 +-- OMP3Stream.cpp | 5 ++--- OMP45Stream.cpp | 5 ++--- RAJAStream.cpp | 13 ++++++------- SYCLStream.cpp | 4 ++-- main.cpp | 10 +++++----- 9 files changed, 28 insertions(+), 34 deletions(-) diff --git a/ACCStream.cpp b/ACCStream.cpp index 85bf600..8e3bb32 100644 --- a/ACCStream.cpp +++ b/ACCStream.cpp @@ -65,7 +65,7 @@ void ACCStream::copy() template void ACCStream::mul() { - const T scalar = 3.0; + const T scalar = 0.3; unsigned int array_size = this->array_size; T *b = this->b; @@ -94,7 +94,7 @@ void ACCStream::add() template void ACCStream::triad() { - const T scalar = 3.0; + const T scalar = 0.3; unsigned int array_size = this->array_size; T *a = this->a; @@ -133,4 +133,3 @@ std::string getDeviceDriver(const int) } template class ACCStream; template class ACCStream; - diff --git a/CUDAStream.cu b/CUDAStream.cu index 2a28f9c..21d36fa 100644 --- a/CUDAStream.cu +++ b/CUDAStream.cu @@ -118,7 +118,7 @@ void CUDAStream::copy() template __global__ void mul_kernel(T * b, const T * c) { - const T scalar = 3.0; + const T scalar = 0.3; const int i = blockDim.x * blockIdx.x + threadIdx.x; b[i] = scalar * c[i]; } @@ -151,7 
+151,7 @@ void CUDAStream::add() template __global__ void triad_kernel(T * a, const T * b, const T * c) { - const T scalar = 3.0; + const T scalar = 0.3; const int i = blockDim.x * blockIdx.x + threadIdx.x; a[i] = b[i] + scalar * c[i]; } diff --git a/KOKKOSStream.cpp b/KOKKOSStream.cpp index d93b6d7..a9bfcd9 100644 --- a/KOKKOSStream.cpp +++ b/KOKKOSStream.cpp @@ -70,7 +70,7 @@ void KOKKOSStream::copy() View b(*d_b); View c(*d_c); - parallel_for(array_size, KOKKOS_LAMBDA (const int index) + parallel_for(array_size, KOKKOS_LAMBDA (const int index) { c[index] = a[index]; }); @@ -84,8 +84,8 @@ void KOKKOSStream::mul() View b(*d_b); View c(*d_c); - const T scalar = 3.0; - parallel_for(array_size, KOKKOS_LAMBDA (const int index) + const T scalar = 0.3; + parallel_for(array_size, KOKKOS_LAMBDA (const int index) { b[index] = scalar*c[index]; }); @@ -99,7 +99,7 @@ void KOKKOSStream::add() View b(*d_b); View c(*d_c); - parallel_for(array_size, KOKKOS_LAMBDA (const int index) + parallel_for(array_size, KOKKOS_LAMBDA (const int index) { c[index] = a[index] + b[index]; }); @@ -114,8 +114,8 @@ void KOKKOSStream::triad() View b(*d_b); View c(*d_c); - const T scalar = 3.0; - parallel_for(array_size, KOKKOS_LAMBDA (const int index) + const T scalar = 0.3; + parallel_for(array_size, KOKKOS_LAMBDA (const int index) { a[index] = b[index] + scalar*c[index]; }); @@ -142,4 +142,3 @@ std::string getDeviceDriver(const int device) //template class KOKKOSStream; template class KOKKOSStream; - diff --git a/OCLStream.cpp b/OCLStream.cpp index f7c538e..50ad543 100644 --- a/OCLStream.cpp +++ b/OCLStream.cpp @@ -14,7 +14,7 @@ void getDeviceList(void); std::string kernels{R"CLC( - constant TYPE scalar = 3.0; + constant TYPE scalar = 0.3; kernel void copy( global const TYPE * restrict a, @@ -253,4 +253,3 @@ std::string getDeviceDriver(const int device) template class OCLStream; template class OCLStream; - diff --git a/OMP3Stream.cpp b/OMP3Stream.cpp index 8899cff..78b0124 100644 --- 
a/OMP3Stream.cpp +++ b/OMP3Stream.cpp @@ -56,7 +56,7 @@ void OMP3Stream::copy() template void OMP3Stream::mul() { - const T scalar = 3.0; + const T scalar = 0.3; #pragma omp parallel for for (int i = 0; i < array_size; i++) { @@ -77,7 +77,7 @@ void OMP3Stream::add() template void OMP3Stream::triad() { - const T scalar = 3.0; + const T scalar = 0.3; #pragma omp parallel for for (int i = 0; i < array_size; i++) { @@ -103,4 +103,3 @@ std::string getDeviceDriver(const int) template class OMP3Stream; template class OMP3Stream; - diff --git a/OMP45Stream.cpp b/OMP45Stream.cpp index f44f66f..7f6e66a 100644 --- a/OMP45Stream.cpp +++ b/OMP45Stream.cpp @@ -64,7 +64,7 @@ void OMP45Stream::copy() template void OMP45Stream::mul() { - const T scalar = 3.0; + const T scalar = 0.3; unsigned int array_size = this->array_size; T *b = this->b; @@ -93,7 +93,7 @@ void OMP45Stream::add() template void OMP45Stream::triad() { - const T scalar = 3.0; + const T scalar = 0.3; unsigned int array_size = this->array_size; T *a = this->a; @@ -132,4 +132,3 @@ std::string getDeviceDriver(const int) } template class OMP45Stream; template class OMP45Stream; - diff --git a/RAJAStream.cpp b/RAJAStream.cpp index eb98d54..e418f09 100644 --- a/RAJAStream.cpp +++ b/RAJAStream.cpp @@ -66,7 +66,7 @@ void RAJAStream::copy() { T* a = d_a; T* c = d_c; - forall(index_set, [=] RAJA_DEVICE (int index) + forall(index_set, [=] RAJA_DEVICE (int index) { c[index] = a[index]; }); @@ -77,8 +77,8 @@ void RAJAStream::mul() { T* b = d_b; T* c = d_c; - const T scalar = 3.0; - forall(index_set, [=] RAJA_DEVICE (int index) + const T scalar = 0.3; + forall(index_set, [=] RAJA_DEVICE (int index) { b[index] = scalar*c[index]; }); @@ -90,7 +90,7 @@ void RAJAStream::add() T* a = d_a; T* b = d_b; T* c = d_c; - forall(index_set, [=] RAJA_DEVICE (int index) + forall(index_set, [=] RAJA_DEVICE (int index) { c[index] = a[index] + b[index]; }); @@ -102,8 +102,8 @@ void RAJAStream::triad() T* a = d_a; T* b = d_b; T* c = d_c; - const T 
scalar = 3.0; - forall(index_set, [=] RAJA_DEVICE (int index) + const T scalar = 0.3; + forall(index_set, [=] RAJA_DEVICE (int index) { a[index] = b[index] + scalar*c[index]; }); @@ -128,4 +128,3 @@ std::string getDeviceDriver(const int device) template class RAJAStream; template class RAJAStream; - diff --git a/SYCLStream.cpp b/SYCLStream.cpp index 4f14590..d039d70 100644 --- a/SYCLStream.cpp +++ b/SYCLStream.cpp @@ -78,7 +78,7 @@ void SYCLStream::copy() template void SYCLStream::mul() { - const T scalar = 3.0; + const T scalar = 0.3; queue->submit([&](handler &cgh) { auto kb = d_b->template get_access(cgh); @@ -110,7 +110,7 @@ void SYCLStream::add() template void SYCLStream::triad() { - const T scalar = 3.0; + const T scalar = 0.3; queue->submit([&](handler &cgh) { auto ka = d_a->template get_access(cgh); diff --git a/main.cpp b/main.cpp index 80791d6..933c9a9 100644 --- a/main.cpp +++ b/main.cpp @@ -83,8 +83,8 @@ void run() std::cout << "Precision: double" << std::endl; // Create host vectors - std::vector a(ARRAY_SIZE, 1.0); - std::vector b(ARRAY_SIZE, 2.0); + std::vector a(ARRAY_SIZE, 0.1); + std::vector b(ARRAY_SIZE, 0.2); std::vector c(ARRAY_SIZE, 0.0); std::streamsize ss = std::cout.precision(); std::cout << std::setprecision(1) << std::fixed @@ -216,11 +216,11 @@ template void check_solution(const unsigned int ntimes, std::vector& a, std::vector& b, std::vector& c) { // Generate correct solution - T goldA = 1.0; - T goldB = 2.0; + T goldA = 0.1; + T goldB = 0.2; T goldC = 0.0; - const T scalar = 3.0; + const T scalar = 0.3; for (unsigned int i = 0; i < ntimes; i++) { From eae8da57ac1216c620c66658ec8e13db4b888ed1 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 11 May 2016 15:57:20 +0100 Subject: [PATCH 099/117] Delete commented out C++ flag for OpenACC as no longer needed --- CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7196066..0441b4a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -65,7 
+65,6 @@ endif () if (OPENACC) add_executable(gpu-stream-acc main.cpp ACCStream.cpp) target_compile_definitions(gpu-stream-acc PUBLIC ACC) - #target_compile_options(gpu-stream-acc PUBLIC "-hstd=c++11") else () message("Skipping OpenACC...") endif () From 9b2a586e08fd5087d5d84091de76ea0d468f9259 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 11 May 2016 15:57:39 +0100 Subject: [PATCH 100/117] Add rule to build OMP4.5 on Cray --- CMakeLists.txt | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0441b4a..c375072 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -69,6 +69,22 @@ else () message("Skipping OpenACC...") endif () +#------------------------------------------------------------------------------- +# OpenMP 3.0 +#------------------------------------------------------------------------------- + +# TODO + +#------------------------------------------------------------------------------- +# OpenMP 4.5 +#------------------------------------------------------------------------------- +if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Cray") + if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 4.5) + add_executable(gpu-stream-omp45 main.cpp OMP45Stream.cpp) + target_compile_definitions(gpu-stream-omp45 PUBLIC OMP45) + endif () +endif () + #------------------------------------------------------------------------------- # SYCL #------------------------------------------------------------------------------- From 494e89d16bb0b5d42e8cbcce9bab18e34ff28644 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 11 May 2016 16:02:34 +0100 Subject: [PATCH 101/117] Add placeholder banners for CMake build systems to fix --- CMakeLists.txt | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index c375072..43328d0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -85,6 +85,18 @@ if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Cray") endif () endif () 
+#------------------------------------------------------------------------------- +# RAJA +#------------------------------------------------------------------------------- + +# TODO + +#------------------------------------------------------------------------------- +# Kokkos +#------------------------------------------------------------------------------- + +# TODO + #------------------------------------------------------------------------------- # SYCL #------------------------------------------------------------------------------- From 9449e0888637ba7e14262d1e11038f4cf4b81f16 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 11 May 2016 16:23:14 +0100 Subject: [PATCH 102/117] update readme --- README.md | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 5a84b50..fee2ef6 100644 --- a/README.md +++ b/README.md @@ -6,15 +6,30 @@ This benchmark is similar in spirit, and based on, the STREAM benchmark [1] for Unlike other GPU memory bandwidth benchmarks this does *not* include the PCIe transfer time. +There are multiple implementations of this benchmark in a variety of programming models. +Currently implemented are: + - OpenCL + - CUDA + - OpenACC + - OpenMP 3 and 4.5 + - Kokkos + - RAJA + - SYCL + Usage ----- -Build the OpenCL and CUDA binaries with `make` (CUDA version requires CUDA >= v6.5) +CMake 3.2 or above is required. +Drivers, compiler and software applicable to whichever implementation you would like to build against. Our build system is designed to only build implementations in programming models that your system supports. -Run the OpenCL version with `./gpu-stream-ocl` and the CUDA version with `./gpu-stream-cuda` +Generate the Makefile with `cmake .` -Android -------- +Build the various binaries with `make` + +This will generate binaries of the form `./gpu-stream-*` where `*` identifies the programming model. 
+ +Android (outdated instructions) +------------------ Assuming you have a recent Android NDK available, you can use the toolchain that it provides to build GPU-STREAM. You should first @@ -48,6 +63,7 @@ Run GPU-STREAM from an adb shell: # Use float if device doesn't support double, and reduce array size ./gpu-stream-ocl --float -n 6 -s 10000000 + Results ------- From d420032c662d8580b81be81d247b732c7366aa80 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 11 May 2016 17:15:43 +0100 Subject: [PATCH 103/117] Remove warning about iteration count when using floats as new data values work for 100 iterations --- main.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/main.cpp b/main.cpp index 933c9a9..5379bb9 100644 --- a/main.cpp +++ b/main.cpp @@ -306,7 +306,6 @@ void parseArguments(int argc, char *argv[]) else if (!std::string("--float").compare(argv[i])) { use_float = true; - std::cout << "Warning: If number of iterations set >= 8, expect rounding errors with single precision" << std::endl; } else if (!std::string("--help").compare(argv[i]) || !std::string("-h").compare(argv[i])) From bf9c6fb6cd373b7f6233deba5e6fea90bf543df3 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 11 May 2016 17:21:52 +0100 Subject: [PATCH 104/117] Add -fopenacc flag on linking with GCC compiler --- CMakeLists.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 43328d0..ed048c5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -56,6 +56,9 @@ endif () include(CheckCXXCompilerFlag) if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "GNU") CHECK_CXX_COMPILER_FLAG(-fopenacc OPENACC) + if (OPENACC) + list (APPEND CMAKE_EXE_LINKER_FLAGS -fopenacc) + endif () elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "PGI") CHECK_CXX_COMPILER_FLAG(-acc OPENACC) elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Cray") From 5638cbb2837d234419e2eae0c71f1d2ad1c2d486 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 11 May 2016 17:49:48 +0100 Subject: [PATCH 105/117] Check for 
OpenMP support and build OMP3 version --- CMakeLists.txt | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ed048c5..4bcd688 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -75,8 +75,13 @@ endif () #------------------------------------------------------------------------------- # OpenMP 3.0 #------------------------------------------------------------------------------- - -# TODO +find_package(OpenMP QUIET) +if (${OpenMP_FOUND}) + add_executable(gpu-stream-omp3 main.cpp OMP3Stream.cpp) + target_compile_definitions(gpu-stream-omp3 PUBLIC OMP3) +else () + message("Skipping OpenMP 3...") +endif () #------------------------------------------------------------------------------- # OpenMP 4.5 From d4e74a88e9b3c870256d7d1b0c513ceb92286252 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 11 May 2016 17:53:33 +0100 Subject: [PATCH 106/117] Add binary names to gitignore --- .gitignore | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.gitignore b/.gitignore index 0fa7243..4d2865c 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,10 @@ common.h gpu-stream-cuda gpu-stream-ocl +gpu-stream-acc +gpu-stream-omp3 +gpu-stream-omp45 +gpu-stream-sycl *.o *.tar From 7cd14f480d675d0b53dd38cf88eea3de7961ee49 Mon Sep 17 00:00:00 2001 From: James Price Date: Wed, 11 May 2016 22:00:04 +0100 Subject: [PATCH 107/117] [SYCL] Auto-detect presence of CL/sycl.hpp and ComputeCpp --- CMakeLists.txt | 36 +++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 4bcd688..34847ba 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,6 +6,8 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON) project(gpu-stream) +include(CheckIncludeFileCXX) + set(gpu-stream_VERSION_MAJOR 2) set(gpu-stream_VERSION_MINOR 0) @@ -108,20 +110,24 @@ endif () #------------------------------------------------------------------------------- # SYCL 
#------------------------------------------------------------------------------- -# TODO: Find SYCL implementations somehow -if (true) - # ComputeCpp - # TODO: Sort this out properly! - add_custom_target(gpu-stream-sycl - COMMAND compute++ ${CMAKE_CURRENT_SOURCE_DIR}/SYCLStream.cpp -sycl -no-serial-memop -O2 -emit-llvm -o SYCLStream.bc -c - COMMAND /usr/bin/c++ ${CMAKE_CURRENT_SOURCE_DIR}/SYCLStream.cpp -O2 -std=c++11 -include SYCLStream.sycl -o SYCLStream.o -c - COMMAND /usr/bin/c++ ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp -O2 -std=c++11 SYCLStream.o -include SYCLStream.sycl -lSYCL -lOpenCL -o gpu-stream-sycl -DSYCL - ) - - # triSYCL - #add_executable(gpu-stream-sycl main.cpp SYCLStream.cpp) - #target_compile_definitions(gpu-stream-sycl PUBLIC SYCL) - #set_property(TARGET gpu-stream-sycl PROPERTY CXX_STANDARD 14) +set(CMAKE_REQUIRED_FLAGS "-std=c++11") +check_include_file_cxx("CL/sycl.hpp" HAS_SYCL) +if (HAS_SYCL) + find_program(COMPUTECPP "compute++") + if (COMPUTECPP) + message(STATUS "Using ComputeCpp for SYCL compilation") + add_custom_target(SYCLStream.sycl COMMAND ${COMPUTECPP} ${CMAKE_CURRENT_SOURCE_DIR}/SYCLStream.cpp -sycl -no-serial-memop -O2 -emit-llvm -c) + add_executable(gpu-stream-sycl main.cpp SYCLStream.cpp) + add_dependencies(gpu-stream-sycl SYCLStream.sycl) + target_compile_definitions(gpu-stream-sycl PUBLIC SYCL) + target_compile_options(gpu-stream-sycl PUBLIC -include SYCLStream.sycl) + target_link_libraries(gpu-stream-sycl SYCL OpenCL) + else() + message(STATUS "Using header-only SYCL implementation") + add_executable(gpu-stream-sycl main.cpp SYCLStream.cpp) + target_compile_definitions(gpu-stream-sycl PUBLIC SYCL) + set_property(TARGET gpu-stream-sycl PROPERTY CXX_STANDARD 14) + endif(COMPUTECPP) else () message("Skipping SYCL...") -endif () +endif (HAS_SYCL) From 3ebad06bd4ef95530b10ea47c0b26f9c43aeceba Mon Sep 17 00:00:00 2001 From: James Price Date: Wed, 11 May 2016 22:22:20 +0100 Subject: [PATCH 108/117] [SYCL] Fix detection of CL/sycl.hpp 
for C++14 versions --- CMakeLists.txt | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 34847ba..8d10c40 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,6 +7,7 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON) project(gpu-stream) include(CheckIncludeFileCXX) +include(CheckCXXCompilerFlag) set(gpu-stream_VERSION_MAJOR 2) set(gpu-stream_VERSION_MINOR 0) @@ -110,22 +111,31 @@ endif () #------------------------------------------------------------------------------- # SYCL #------------------------------------------------------------------------------- -set(CMAKE_REQUIRED_FLAGS "-std=c++11") +if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang" OR + "${CMAKE_CXX_COMPILER_ID}" MATCHES "GNU") + # Use C++14 if available, otherwise drop back to C++11 + check_cxx_compiler_flag("-std=c++14" CXX14) + if (CXX14) + set(CMAKE_REQUIRED_FLAGS "-std=c++14") + else() + set(CMAKE_REQUIRED_FLAGS "-std=c++11") + endif() +endif() + check_include_file_cxx("CL/sycl.hpp" HAS_SYCL) if (HAS_SYCL) + add_executable(gpu-stream-sycl main.cpp SYCLStream.cpp) + target_compile_definitions(gpu-stream-sycl PUBLIC SYCL) + find_program(COMPUTECPP "compute++") if (COMPUTECPP) message(STATUS "Using ComputeCpp for SYCL compilation") add_custom_target(SYCLStream.sycl COMMAND ${COMPUTECPP} ${CMAKE_CURRENT_SOURCE_DIR}/SYCLStream.cpp -sycl -no-serial-memop -O2 -emit-llvm -c) - add_executable(gpu-stream-sycl main.cpp SYCLStream.cpp) add_dependencies(gpu-stream-sycl SYCLStream.sycl) - target_compile_definitions(gpu-stream-sycl PUBLIC SYCL) target_compile_options(gpu-stream-sycl PUBLIC -include SYCLStream.sycl) target_link_libraries(gpu-stream-sycl SYCL OpenCL) else() message(STATUS "Using header-only SYCL implementation") - add_executable(gpu-stream-sycl main.cpp SYCLStream.cpp) - target_compile_definitions(gpu-stream-sycl PUBLIC SYCL) set_property(TARGET gpu-stream-sycl PROPERTY CXX_STANDARD 14) endif(COMPUTECPP) else () From 
f6fca3ac065b0416417829a4ed48db1235bd7b8b Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 12 May 2016 12:30:06 +0100 Subject: [PATCH 109/117] Add Kokkos building to CMake config --- CMakeLists.txt | 11 +++++++++-- KokkosMakefile | 10 ++++++++++ 2 files changed, 19 insertions(+), 2 deletions(-) create mode 100644 KokkosMakefile diff --git a/CMakeLists.txt b/CMakeLists.txt index 8d10c40..8501add 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -105,8 +105,15 @@ endif () #------------------------------------------------------------------------------- # Kokkos #------------------------------------------------------------------------------- - -# TODO +if (BUILD_KOKKOS) + if ("${CMAKE_SYSTEM_NAME}" MATCHES "Linux") + add_custom_target(gpu-stream-kokkos COMMAND make -f KokkosMakefile) + else() + message("Skipping Kokkos (requires Linux)") + endif() +else() + message("Skipping Kokkos... (use -DBUILD_KOKKOS=1 to opt in)") +endif() #------------------------------------------------------------------------------- # SYCL diff --git a/KokkosMakefile b/KokkosMakefile new file mode 100644 index 0000000..2d08bc9 --- /dev/null +++ b/KokkosMakefile @@ -0,0 +1,10 @@ + +default: gpu-stream-kokkos + +include /modules/modules/kokkos/Makefile.kokkos + +gpu-stream-kokkos: main.o KOKKOSStream.o + $(CXX) $(KOKKOS_LDFLAGS) $^ $(KOKKOS_LIBS) -o $@ -DKOKKOS + +%.o:%.cpp $(KOKKOS_CPP_DEPENDS) + $(NVCC_WRAPPER) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) -c $< -DKOKKOS From 88d194b75cb7634556b49728a68912be02bdfae7 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 12 May 2016 12:30:35 +0100 Subject: [PATCH 110/117] Use a variable to get Kokkos Path --- KokkosMakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/KokkosMakefile b/KokkosMakefile index 2d08bc9..6e71e67 100644 --- a/KokkosMakefile +++ b/KokkosMakefile @@ -1,7 +1,7 @@ default: gpu-stream-kokkos -include /modules/modules/kokkos/Makefile.kokkos +include $(KOKKOS_PATH)/Makefile.kokkos gpu-stream-kokkos: main.o 
KOKKOSStream.o $(CXX) $(KOKKOS_LDFLAGS) $^ $(KOKKOS_LIBS) -o $@ -DKOKKOS From 2381f059edcf855cf099231e82763dce938b6e52 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 12 May 2016 12:31:16 +0100 Subject: [PATCH 111/117] Set KOKKOS_PATH to build Kokkos version --- CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8501add..1b46398 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -105,14 +105,14 @@ endif () #------------------------------------------------------------------------------- # Kokkos #------------------------------------------------------------------------------- -if (BUILD_KOKKOS) +if (KOKKOS_PATH) if ("${CMAKE_SYSTEM_NAME}" MATCHES "Linux") add_custom_target(gpu-stream-kokkos COMMAND make -f KokkosMakefile) else() message("Skipping Kokkos (requires Linux)") endif() else() - message("Skipping Kokkos... (use -DBUILD_KOKKOS=1 to opt in)") + message("Skipping Kokkos... (use -DKOKKOS_PATH=/path/to/kokkos to opt in)") endif() #------------------------------------------------------------------------------- From d75084b753521c11c69b264973c729f84cc9999c Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 12 May 2016 12:35:47 +0100 Subject: [PATCH 112/117] Fix Kokkos CMake so it works.. 
--- CMakeLists.txt | 2 +- KokkosMakefile | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1b46398..515b594 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -107,7 +107,7 @@ endif () #------------------------------------------------------------------------------- if (KOKKOS_PATH) if ("${CMAKE_SYSTEM_NAME}" MATCHES "Linux") - add_custom_target(gpu-stream-kokkos COMMAND make -f KokkosMakefile) + add_custom_target(gpu-stream-kokkos COMMAND make -f KokkosMakefile KOKKOS_PATH=${KOKKOS_PATH}) else() message("Skipping Kokkos (requires Linux)") endif() diff --git a/KokkosMakefile b/KokkosMakefile index 6e71e67..83e00b9 100644 --- a/KokkosMakefile +++ b/KokkosMakefile @@ -4,7 +4,8 @@ default: gpu-stream-kokkos include $(KOKKOS_PATH)/Makefile.kokkos gpu-stream-kokkos: main.o KOKKOSStream.o - $(CXX) $(KOKKOS_LDFLAGS) $^ $(KOKKOS_LIBS) -o $@ -DKOKKOS + $(CXX) $(KOKKOS_LDFLAGS) $^ $(KOKKOS_LIBS) -o $@ -DKOKKOS %.o:%.cpp $(KOKKOS_CPP_DEPENDS) - $(NVCC_WRAPPER) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) -c $< -DKOKKOS + $(NVCC_WRAPPER) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) -c $< -DKOKKOS + From 942188d836c5e5e8cb914689d2dd32ca52e00e8f Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 12 May 2016 12:53:26 +0100 Subject: [PATCH 113/117] Add copyright header to source with it missing --- ACCStream.cpp | 6 ++++++ ACCStream.h | 9 +++++++-- OMP3Stream.cpp | 6 ++++++ OMP3Stream.h | 7 ++++++- OMP45Stream.cpp | 6 ++++++ OMP45Stream.h | 9 +++++++-- 6 files changed, 38 insertions(+), 5 deletions(-) diff --git a/ACCStream.cpp b/ACCStream.cpp index 8e3bb32..48e042e 100644 --- a/ACCStream.cpp +++ b/ACCStream.cpp @@ -1,4 +1,10 @@ +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + #include "ACCStream.h" template diff --git a/ACCStream.h b/ACCStream.h index d6e5728..48fea55 100644 --- a/ACCStream.h 
+++ b/ACCStream.h @@ -1,4 +1,10 @@ +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + #pragma once #include @@ -34,6 +40,5 @@ class ACCStream : public Stream virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; - -}; +}; diff --git a/OMP3Stream.cpp b/OMP3Stream.cpp index 78b0124..fe8323a 100644 --- a/OMP3Stream.cpp +++ b/OMP3Stream.cpp @@ -1,4 +1,10 @@ +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + #include "OMP3Stream.h" template diff --git a/OMP3Stream.h b/OMP3Stream.h index 15172c3..0f14300 100644 --- a/OMP3Stream.h +++ b/OMP3Stream.h @@ -1,4 +1,10 @@ +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + #pragma once #include @@ -32,4 +38,3 @@ class OMP3Stream : public Stream virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; }; - diff --git a/OMP45Stream.cpp b/OMP45Stream.cpp index 7f6e66a..f849c39 100644 --- a/OMP45Stream.cpp +++ b/OMP45Stream.cpp @@ -1,4 +1,10 @@ +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + #include "OMP45Stream.h" template diff --git a/OMP45Stream.h b/OMP45Stream.h index 98c9705..bd812a1 100644 --- a/OMP45Stream.h +++ b/OMP45Stream.h @@ -1,4 +1,10 @@ +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + #pragma once #include @@ -35,6 +41,5 @@ class OMP45Stream : public Stream virtual void 
read_arrays(std::vector& a, std::vector& b, std::vector& c) override; - -}; +}; From 2001ab5fb1a253ad1cb0c2f483b393de0e45b2f6 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 12 May 2016 15:40:22 +0100 Subject: [PATCH 114/117] Build against a RAJA installation in the CMake build system --- CMakeLists.txt | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 515b594..1b2d28d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -99,8 +99,20 @@ endif () #------------------------------------------------------------------------------- # RAJA #------------------------------------------------------------------------------- - -# TODO +if (RAJA_PATH) + find_package(OpenMP) + find_package(CUDA 7.5) + list(APPEND CUDA_NVCC_FLAGS "-arch compute_35") + list(APPEND CUDA_NVCC_FLAGS --expt-extended-lambda) + list(APPEND CUDA_NVCC_FLAGS -Xcompiler ${OpenMP_CXX_FLAGS}) + list(APPEND CUDA_NVCC_FLAGS -DUSE_RAJA) + cuda_include_directories(${RAJA_PATH}/include) + set_source_files_properties(main.cpp RAJAStream.cpp PROPERTIES CUDA_SOURCE_PROPERTY_FORMAT OBJ) + cuda_add_executable(gpu-stream-raja main.cpp RAJAStream.cpp) + target_link_libraries(gpu-stream-raja "-L${RAJA_PATH}/lib -lRAJA") +else() + message("Skipping RAJA... (use -DRAJA_PATH=/path/to/raja to opt in)") +endif() #------------------------------------------------------------------------------- # Kokkos From 2033f3f0c6012fe1445935374ba67a67e2092c7c Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 12 May 2016 16:37:06 +0100 Subject: [PATCH 115/117] Break RAJA build but fix all the others.. 
--- CMakeLists.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1b2d28d..acaa2a1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -99,6 +99,7 @@ endif () #------------------------------------------------------------------------------- # RAJA #------------------------------------------------------------------------------- + if (RAJA_PATH) find_package(OpenMP) find_package(CUDA 7.5) @@ -107,8 +108,9 @@ if (RAJA_PATH) list(APPEND CUDA_NVCC_FLAGS -Xcompiler ${OpenMP_CXX_FLAGS}) list(APPEND CUDA_NVCC_FLAGS -DUSE_RAJA) cuda_include_directories(${RAJA_PATH}/include) - set_source_files_properties(main.cpp RAJAStream.cpp PROPERTIES CUDA_SOURCE_PROPERTY_FORMAT OBJ) + set_source_files_properties(RAJAStream.cpp PROPERTIES CUDA_SOURCE_PROPERTY_FORMAT OBJ) cuda_add_executable(gpu-stream-raja main.cpp RAJAStream.cpp) + target_compile_definitions(gpu-stream-raja PUBLIC USE_RAJA) target_link_libraries(gpu-stream-raja "-L${RAJA_PATH}/lib -lRAJA") else() message("Skipping RAJA... 
(use -DRAJA_PATH=/path/to/raja to opt in)") From 8be4b26bb18cbbba770e17f3814f071e6a5f5e94 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 30 Jun 2016 16:24:32 +0100 Subject: [PATCH 116/117] Add restrict to extra pointers in OpenACC --- ACCStream.cpp | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/ACCStream.cpp b/ACCStream.cpp index 48e042e..d3fbd6a 100644 --- a/ACCStream.cpp +++ b/ACCStream.cpp @@ -59,8 +59,8 @@ template void ACCStream::copy() { unsigned int array_size = this->array_size; - T *a = this->a; - T *c = this->c; + T * restrict a = this->a; + T * restrict c = this->c; #pragma acc kernels present(a[0:array_size], c[0:array_size]) wait for (int i = 0; i < array_size; i++) { @@ -74,8 +74,8 @@ void ACCStream::mul() const T scalar = 0.3; unsigned int array_size = this->array_size; - T *b = this->b; - T *c = this->c; + T * restrict b = this->b; + T * restrict c = this->c; #pragma acc kernels present(b[0:array_size], c[0:array_size]) wait for (int i = 0; i < array_size; i++) { @@ -87,9 +87,9 @@ template void ACCStream::add() { unsigned int array_size = this->array_size; - T *a = this->a; - T *b = this->b; - T *c = this->c; + T * restrict a = this->a; + T * restrict b = this->b; + T * restrict c = this->c; #pragma acc kernels present(a[0:array_size], b[0:array_size], c[0:array_size]) wait for (int i = 0; i < array_size; i++) { @@ -103,9 +103,9 @@ void ACCStream::triad() const T scalar = 0.3; unsigned int array_size = this->array_size; - T *a = this->a; - T *b = this->b; - T *c = this->c; + T * restrict a = this->a; + T * restrict b = this->b; + T * restrict c = this->c; #pragma acc kernels present(a[0:array_size], b[0:array_size], c[0:array_size]) wait for (int i = 0; i < array_size; i++) { From 04f321db8581a2b29a1d5766c70ed98b5011e164 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 30 Jun 2016 16:32:52 +0100 Subject: [PATCH 117/117] Replace cudaDeviceSyncronise with Kokkos::fence() function --- KOKKOSStream.cpp 
| 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/KOKKOSStream.cpp b/KOKKOSStream.cpp index a9bfcd9..d73f7d5 100644 --- a/KOKKOSStream.cpp +++ b/KOKKOSStream.cpp @@ -74,7 +74,7 @@ void KOKKOSStream::copy() { c[index] = a[index]; }); - cudaDeviceSynchronize(); + Kokkos::fence(); } template @@ -89,7 +89,7 @@ void KOKKOSStream::mul() { b[index] = scalar*c[index]; }); - cudaDeviceSynchronize(); + Kokkos::fence(); } template @@ -103,8 +103,7 @@ void KOKKOSStream::add() { c[index] = a[index] + b[index]; }); - - cudaDeviceSynchronize(); + Kokkos::fence(); } template @@ -119,8 +118,7 @@ void KOKKOSStream::triad() { a[index] = b[index] + scalar*c[index]; }); - - cudaDeviceSynchronize(); + Kokkos::fence(); } void listDevices(void)