From f2536f87647a3524af3e21327db2298ee3fbfa54 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 16 Jul 2015 18:06:35 +0100 Subject: [PATCH] Enqueue CUDA kernels --- cuda-stream.cu | 51 ++++++++++++++++++++++---------------------------- 1 file changed, 22 insertions(+), 29 deletions(-) diff --git a/cuda-stream.cu b/cuda-stream.cu index bf3d5af..19304b8 100644 --- a/cuda-stream.cu +++ b/cuda-stream.cu @@ -168,12 +168,19 @@ int main(int argc, char *argv[]) h_c[i] = 0.0; } + // Create device buffers + DATATYPE * d_a, * d_b, *d_c; + cudaMalloc(&d_a, ARRAY_SIZE*sizeof(DATATYPE)); + cudaMalloc(&d_b, ARRAY_SIZE*sizeof(DATATYPE)); + cudaMalloc(&d_c, ARRAY_SIZE*sizeof(DATATYPE)); // Copy host memory to device - + cudaMemcpy(d_a, h_a, ARRAY_SIZE*sizeof(DATATYPE), cudaMemcpyHostToDevice); + cudaMemcpy(d_b, h_b, ARRAY_SIZE*sizeof(DATATYPE), cudaMemcpyHostToDevice); + cudaMemcpy(d_c, h_c, ARRAY_SIZE*sizeof(DATATYPE), cudaMemcpyHostToDevice); // Make sure the copies are finished - + cudaDeviceSynchronize(); // List of times std::vector< std::vector > timings; @@ -184,56 +191,42 @@ int main(int argc, char *argv[]) // Main loop for (unsigned int k = 0; k < NTIMES; k++) { - /*std::vector times; + std::vector times; t1 = std::chrono::high_resolution_clock::now(); - copy( - cl::EnqueueArgs( - queue, - cl::NDRange(ARRAY_SIZE)), - d_a, d_c); - queue.finish(); + copy<<>>(d_a, d_c); + cudaDeviceSynchronize(); t2 = std::chrono::high_resolution_clock::now(); times.push_back(std::chrono::duration_cast >(t2 - t1).count()); t1 = std::chrono::high_resolution_clock::now(); - mul( - cl::EnqueueArgs( - queue, - cl::NDRange(ARRAY_SIZE)), - d_b, d_c); - queue.finish(); + mul<<>>(d_b, d_c); + cudaDeviceSynchronize(); t2 = std::chrono::high_resolution_clock::now(); times.push_back(std::chrono::duration_cast >(t2 - t1).count()); t1 = std::chrono::high_resolution_clock::now(); - add( - cl::EnqueueArgs( - queue, - cl::NDRange(ARRAY_SIZE)), - d_a, d_b, d_c); - queue.finish(); + add<<>>(d_a, d_b, d_c); + cudaDeviceSynchronize(); t2 = std::chrono::high_resolution_clock::now(); times.push_back(std::chrono::duration_cast >(t2 - t1).count()); t1 = std::chrono::high_resolution_clock::now(); - triad( - cl::EnqueueArgs( - queue, - cl::NDRange(ARRAY_SIZE)), - d_a, d_b, d_c); - queue.finish(); + triad<<>>(d_a, d_b, d_c); + cudaDeviceSynchronize(); t2 = std::chrono::high_resolution_clock::now(); times.push_back(std::chrono::duration_cast >(t2 - t1).count()); - timings.push_back(times);*/ + timings.push_back(times); } // Check solutions - + cudaMemcpy(h_a, d_a, ARRAY_SIZE*sizeof(DATATYPE), cudaMemcpyDeviceToHost); + cudaMemcpy(h_b, d_b, ARRAY_SIZE*sizeof(DATATYPE), cudaMemcpyDeviceToHost); + cudaMemcpy(h_c, d_c, ARRAY_SIZE*sizeof(DATATYPE), cudaMemcpyDeviceToHost); check_solution(h_a, h_b, h_c); // Crunch results