From b3cf9992bb7784fc7219e2edf289ffeb991a234d Mon Sep 17 00:00:00 2001 From: Simon McIntosh-Smith Date: Mon, 7 Nov 2016 23:35:00 +0000 Subject: [PATCH 01/76] Fixed broken link to new GPU-STREAM webpage --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index b024915..0d181d5 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ Currently implemented are: Website ------- -[uob-hpc.github.io/GPU-STREAM/](uob-hpc.github.io/GPU-STREAM/) +[uob-hpc.github.io/GPU-STREAM/](https://uob-hpc.github.io/GPU-STREAM/) Usage ----- From edd65dacb1c29749637c5ba1436bf84cc978c4d2 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 22 Nov 2016 20:06:54 +0000 Subject: [PATCH 02/76] Add Kokkos Makefile for CPU --- KokkosCPUMakefile | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 KokkosCPUMakefile diff --git a/KokkosCPUMakefile b/KokkosCPUMakefile new file mode 100644 index 0000000..caa8b77 --- /dev/null +++ b/KokkosCPUMakefile @@ -0,0 +1,10 @@ + +default: gpu-stream-kokkos + +include $(KOKKOS_PATH)/Makefile.kokkos + +gpu-stream-kokkos: main.o KOKKOSStream.o + $(CXX) $(KOKKOS_LDFLAGS) $^ $(KOKKOS_LIBS) -o $@ -DKOKKOS -DKOKKOS_TARGET_CPU -O3 + +%.o:%.cpp $(KOKKOS_CPP_DEPENDS) + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) -c $< -DKOKKOS -DKOKKOS_TARGET_CPU -O3 From ec2bf50e750c6f574d42e761129bdfd3f972ea99 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Mon, 30 Jan 2017 13:52:45 +0000 Subject: [PATCH 03/76] Version bump --- main.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.cpp b/main.cpp index c1ca69f..2b6e459 100644 --- a/main.cpp +++ b/main.cpp @@ -15,7 +15,7 @@ #include #include -#define VERSION_STRING "devel" +#define VERSION_STRING "3.0" #include "Stream.h" From caf367f8fbb58fa4f1bc6185269bc95f96980ec8 Mon Sep 17 00:00:00 2001 From: James Price Date: Mon, 30 Jan 2017 16:12:09 +0000 Subject: [PATCH 04/76] Remove broken common.h generation from CMake build --- CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6f3439e..5f31cf1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -22,7 +22,6 @@ include(CheckCXXCompilerFlag) set(gpu-stream_VERSION_MAJOR 2) set(gpu-stream_VERSION_MINOR 2) -configure_file(common.h.in common.h) include_directories(${CMAKE_BINARY_DIR}) # Use 'Release' if no build type specified From c320014d2808fb68ee69a8de929f885eec729194 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 23 Feb 2017 14:58:05 +0000 Subject: [PATCH 05/76] Add CUDA Makefile --- CUDA.make | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 CUDA.make diff --git a/CUDA.make b/CUDA.make new file mode 100644 index 0000000..2434754 --- /dev/null +++ b/CUDA.make @@ -0,0 +1,4 @@ + +cuda-stream: main.cpp CUDAStream.cu + nvcc -std=c++11 -O3 -DCUDA $^ -o $@ + From c904719f2b6479f4fd35bf50c811a2f6fc421ccf Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 23 Feb 2017 14:58:24 +0000 Subject: [PATCH 06/76] Add OpenCL Makefile --- OpenCL.make | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 OpenCL.make diff --git a/OpenCL.make b/OpenCL.make new file mode 100644 index 0000000..447deca --- /dev/null +++ b/OpenCL.make @@ -0,0 +1,4 @@ + +ocl-stream: main.cpp OCLStream.cpp + $(CXX) -O3 -std=c++11 -DOCL $^ -lOpenCL -o $@ + From 1d4b809b446612e0fa1c25054451e1040c3f3ff6 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 23 Feb 2017 14:58:34 +0000 Subject: [PATCH 07/76] Remove CMake from project CMake is unable to use multiple compilers within a single build. We require building multiple binaries, one for each model, and as such they often require different compilers for each. Therefore we feel it is simpler to provide a simple, sample Makefile for each model. Some common configurations will be specalised in due course. --- CMakeLists.txt | 197 ------------------------------------------------- 1 file changed, 197 deletions(-) delete mode 100644 CMakeLists.txt diff --git a/CMakeLists.txt b/CMakeLists.txt deleted file mode 100644 index 5f31cf1..0000000 --- a/CMakeLists.txt +++ /dev/null @@ -1,197 +0,0 @@ - -cmake_minimum_required(VERSION 3.2) - -if(NOT DEFINED HIP_PATH) - if(NOT DEFINED ENV{HIP_PATH}) - set(HIP_PATH "/opt/rocm/hip" CACHE PATH "Path to which HIP has been installed") - else() - set(HIP_PATH $ENV{HIP_PATH} CACHE PATH "Path to which HIP has been installed") - endif() -endif() -set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH}) - - -set(CMAKE_CXX_STANDARD 11) -set(CMAKE_CXX_STANDARD_REQUIRED ON) - -project(gpu-stream) - -include(CheckIncludeFileCXX) -include(CheckCXXCompilerFlag) - -set(gpu-stream_VERSION_MAJOR 2) -set(gpu-stream_VERSION_MINOR 2) - -include_directories(${CMAKE_BINARY_DIR}) - -# Use 'Release' if no build type specified -if (NOT CMAKE_BUILD_TYPE) - message("No CMAKE_BUILD_TYPE specified, defaulting to 'Release'") - set(CMAKE_BUILD_TYPE "Release") -endif() - -# If using the Cray compiler, manually add the C++11 flag because setting the -# standard through CMake as above doesn't set this flag with Cray -if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Cray") - list(APPEND CMAKE_CXX_FLAGS -hstd=c++11) -endif () - -#------------------------------------------------------------------------------- -# HIP -#------------------------------------------------------------------------------- -find_package(HIP QUIET) -if(${HIP_FOUND}) - list(APPEND HIP_HIPCC_FLAGS --std=c++11) - hip_add_executable(gpu-stream-hip main.cpp HIPStream.cu) - target_compile_definitions(gpu-stream-hip PUBLIC HIP) -else() - message("Skipping HIP...") -endif() - -#------------------------------------------------------------------------------- -# CUDA -#------------------------------------------------------------------------------- -find_package(CUDA 7.0 QUIET) -set(FLAG True) -if ("${CMAKE_SYSTEM_NAME}" MATCHES "Darwin") - execute_process(COMMAND xcodebuild -version COMMAND head -n 1 OUTPUT_VARIABLE XCODE_VERSION) - if ("${XCODE_VERSION}" MATCHES "Xcode 7.3.1") - message("Xcode version not supported by CUDA") - set(FLAG False) - endif () -endif () -if (${FLAG} AND ${CUDA_FOUND}) - list(APPEND CUDA_NVCC_FLAGS --std=c++11) - cuda_add_executable(gpu-stream-cuda main.cpp CUDAStream.cu) - target_compile_definitions(gpu-stream-cuda PUBLIC CUDA) -else () - message("Skipping CUDA...") -endif () - -#------------------------------------------------------------------------------- -# OpenCL -#------------------------------------------------------------------------------- -find_package(OpenCL QUIET) -if (${OpenCL_FOUND}) - add_executable(gpu-stream-ocl main.cpp OCLStream.cpp) - target_compile_definitions(gpu-stream-ocl PUBLIC OCL) - target_link_libraries(gpu-stream-ocl ${OpenCL_LIBRARY}) -else () - message("Skipping OpenCL...") -endif () - -#------------------------------------------------------------------------------- -# OpenACC -#------------------------------------------------------------------------------- -# Check compiler supports an OpenACC flag -include(CheckCXXCompilerFlag) -if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "GNU") - CHECK_CXX_COMPILER_FLAG(-fopenacc OPENACC) - if (OPENACC) - list (APPEND CMAKE_EXE_LINKER_FLAGS -fopenacc) - endif () -elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "PGI") - CHECK_CXX_COMPILER_FLAG(-acc OPENACC) -elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Cray") - CHECK_CXX_COMPILER_FLAG(-hacc=openacc OPENACC) -endif () - -if (OPENACC) - add_executable(gpu-stream-acc main.cpp ACCStream.cpp) - target_compile_definitions(gpu-stream-acc PUBLIC ACC) -else () - message("Skipping OpenACC...") -endif () - -#------------------------------------------------------------------------------- -# OpenMP 3.0 -#------------------------------------------------------------------------------- -find_package(OpenMP QUIET) -if (${OpenMP_FOUND}) - add_executable(gpu-stream-omp3 main.cpp OMP3Stream.cpp) - target_compile_definitions(gpu-stream-omp3 PUBLIC OMP3) -else () - message("Skipping OpenMP 3...") -endif () - -#------------------------------------------------------------------------------- -# OpenMP 4.5 -#------------------------------------------------------------------------------- -if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Cray") - if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 4.5) - add_executable(gpu-stream-omp45 main.cpp OMP45Stream.cpp) - target_compile_definitions(gpu-stream-omp45 PUBLIC OMP45) - endif () -endif () - -#------------------------------------------------------------------------------- -# RAJA -#------------------------------------------------------------------------------- - -if (RAJA_PATH) - find_package(OpenMP) - find_package(CUDA 7.5) - list(APPEND CUDA_NVCC_FLAGS "-arch compute_35") - list(APPEND CUDA_NVCC_FLAGS --expt-extended-lambda) - list(APPEND CUDA_NVCC_FLAGS -Xcompiler ${OpenMP_CXX_FLAGS}) - list(APPEND CUDA_NVCC_FLAGS -DUSE_RAJA) - cuda_include_directories(${RAJA_PATH}/include) - set_source_files_properties(RAJAStream.cpp PROPERTIES CUDA_SOURCE_PROPERTY_FORMAT OBJ) - cuda_add_executable(gpu-stream-raja main.cpp RAJAStream.cpp) - target_compile_definitions(gpu-stream-raja PUBLIC USE_RAJA) - target_link_libraries(gpu-stream-raja "-L${RAJA_PATH}/lib -lRAJA") -else() - message("Skipping RAJA... (use -DRAJA_PATH=/path/to/raja to opt in)") -endif() - -#------------------------------------------------------------------------------- -# Kokkos -#------------------------------------------------------------------------------- -if (KOKKOS_PATH) - if ("${CMAKE_SYSTEM_NAME}" MATCHES "Linux") - add_custom_target(gpu-stream-kokkos COMMAND make -f KokkosMakefile KOKKOS_PATH=${KOKKOS_PATH}) - else() - message("Skipping Kokkos (requires Linux)") - endif() -else() - message("Skipping Kokkos... (use -DKOKKOS_PATH=/path/to/kokkos to opt in)") -endif() - -#------------------------------------------------------------------------------- -# SYCL -#------------------------------------------------------------------------------- -if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang" OR - "${CMAKE_CXX_COMPILER_ID}" MATCHES "GNU") - # Use C++14 if available, otherwise drop back to C++11 - check_cxx_compiler_flag("-std=c++14" CXX14) - if (CXX14) - set(CMAKE_REQUIRED_FLAGS "-std=c++14") - else() - set(CMAKE_REQUIRED_FLAGS "-std=c++11") - endif() -endif() - -check_include_file_cxx("CL/sycl.hpp" HAS_SYCL) -if (HAS_SYCL) - add_executable(gpu-stream-sycl main.cpp SYCLStream.cpp) - target_compile_definitions(gpu-stream-sycl PUBLIC SYCL) - - # The user must define this in order to use FindComputeCpp - if (COMPUTECPP_PACKAGE_ROOT_DIR) - message(STATUS "Using ComputeCpp for SYCL compilation") - include(FindComputeCpp) - - include_directories(${COMPUTECPP_INCLUDE_DIRECTORY}) - - set(SOURCE_NAME "SYCLStream") - - target_compile_options(gpu-stream-sycl PUBLIC ${HOST_COMPILER_OPTIONS}) - add_sycl_to_target(gpu-stream-sycl ${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_NAME}.cpp - ${CMAKE_CURRENT_BINARY_DIR}) - else() - message(STATUS "Using header-only SYCL implementation") - set_property(TARGET gpu-stream-sycl PROPERTY CXX_STANDARD 14) - endif() -else () - message("Skipping SYCL...") -endif (HAS_SYCL) From 436c3899c777c7712aff16efa47e8cd2ff40561b Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 23 Feb 2017 15:10:23 +0000 Subject: [PATCH 08/76] Add OpenMP makefile that targets CPUs and GPUs, with common defaults. --- OpenMP.make | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 OpenMP.make diff --git a/OpenMP.make b/OpenMP.make new file mode 100644 index 0000000..115c678 --- /dev/null +++ b/OpenMP.make @@ -0,0 +1,29 @@ + +ifndef COMPILER +$(info Define a compiler to set common defaults, i.e make COMPILER=GNU) +endif + +COMPILER_ = $(CXX) +COMPILER_GNU = g++ +COMPILER_INTEL = icpc +COMPILER_CRAY = CC +CC = $(COMPILER_$(COMPILER)) + +FLAGS_ = -O3 +FLAGS_GNU = -O3 -std=c++11 +FLAGS_INTEL = -O3 -std=c++11 +FLAGS_CRAY = -O3 -hstd=c++11 +CFLAGS = $(FLAGS_$(COMPILER)) + +OMP_ = +OMP_GNU = -fopenmp +OMP_INTEL = -qopenmp +OMP_CRAY = +OMP = $(OMP_$(COMPILER)) + +omp-stream: main.cpp OMPStream.cpp + $(CC) -O3 -std=c++11 -DOMP $^ $(OMP) -o $@ + +omp-target-stream: main.cpp OMPStream.cpp + $(CC) -O3 -std=c++11 -DOMP -DOMP_TARGET_GPU $^ $(OMP) -o $@ + From 527a1edd29e7188cef8b23aa3a5f9c71449aaf9f Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 23 Feb 2017 15:14:59 +0000 Subject: [PATCH 09/76] Rename Kokkos Makefiles --- KokkosCPUMakefile => KokkosCPU.make | 0 KokkosMakefile => KokkosGPU.make | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename KokkosCPUMakefile => KokkosCPU.make (100%) rename KokkosMakefile => KokkosGPU.make (100%) diff --git a/KokkosCPUMakefile b/KokkosCPU.make similarity index 100% rename from KokkosCPUMakefile rename to KokkosCPU.make diff --git a/KokkosMakefile b/KokkosGPU.make similarity index 100% rename from KokkosMakefile rename to KokkosGPU.make From 584e1b208bf85980d41be25b588b002653bd3862 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 23 Feb 2017 15:22:14 +0000 Subject: [PATCH 10/76] Change binary name for Kokkos builds --- KokkosCPU.make | 9 +++++++-- KokkosGPU.make | 8 ++++++-- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/KokkosCPU.make b/KokkosCPU.make index caa8b77..a15e2b9 100644 --- a/KokkosCPU.make +++ b/KokkosCPU.make @@ -1,10 +1,15 @@ -default: gpu-stream-kokkos +default: kokkos-cpu-stream include $(KOKKOS_PATH)/Makefile.kokkos -gpu-stream-kokkos: main.o KOKKOSStream.o +kokkos-cpu-stream: main.o KOKKOSStream.o $(CXX) $(KOKKOS_LDFLAGS) $^ $(KOKKOS_LIBS) -o $@ -DKOKKOS -DKOKKOS_TARGET_CPU -O3 %.o:%.cpp $(KOKKOS_CPP_DEPENDS) $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) -c $< -DKOKKOS -DKOKKOS_TARGET_CPU -O3 + +.PHONY: clean +clean: + rm -f main.o KOKKOSStream.o + diff --git a/KokkosGPU.make b/KokkosGPU.make index 1104cf4..041c007 100644 --- a/KokkosGPU.make +++ b/KokkosGPU.make @@ -1,11 +1,15 @@ -default: gpu-stream-kokkos +default: kokkos-gpu-stream include $(KOKKOS_PATH)/Makefile.kokkos -gpu-stream-kokkos: main.o KOKKOSStream.o +kokkos-gpu-stream: main.o KOKKOSStream.o $(CXX) $(KOKKOS_LDFLAGS) $^ $(KOKKOS_LIBS) -o $@ -DKOKKOS -O3 %.o:%.cpp $(KOKKOS_CPP_DEPENDS) $(NVCC_WRAPPER) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) -c $< -DKOKKOS -O3 +.PHONY: clean +clean: + rm -f main.o KOKKOSStream.o + From 887ae9bf1b93c4525e1d8ebfe4fe40c7e54948e4 Mon Sep 17 00:00:00 2001 From: James Price Date: Thu, 23 Feb 2017 15:30:35 +0000 Subject: [PATCH 11/76] Add SYCL Makefile --- SYCL.make | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 SYCL.make diff --git a/SYCL.make b/SYCL.make new file mode 100644 index 0000000..5dde5c8 --- /dev/null +++ b/SYCL.make @@ -0,0 +1,11 @@ + +sycl-stream: main.cpp SYCLStream.cpp SYCLStream.sycl + $(CXX) -O3 -std=c++11 -DSYCL main.cpp SYCLStream.cpp -include SYCLStream.sycl -lComputeCpp -lOpenCL -o $@ + + +SYCLStream.sycl: SYCLStream.cpp + compute++ SYCLStream.cpp -sycl -no-serial-memop -O2 -emit-llvm -c + +.PHONY: clean +clean: + rm -f SYCLStream.sycl From ce5fde973bbb7f606713499c1476130c10eb5ddc Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 23 Feb 2017 15:31:38 +0000 Subject: [PATCH 12/76] Combine Kokkos CPU and GPU builds --- Kokkos.make | 28 ++++++++++++++++++++++++++++ KokkosCPU.make | 15 --------------- KokkosGPU.make | 15 --------------- 3 files changed, 28 insertions(+), 30 deletions(-) create mode 100644 Kokkos.make delete mode 100644 KokkosCPU.make delete mode 100644 KokkosGPU.make diff --git a/Kokkos.make b/Kokkos.make new file mode 100644 index 0000000..e30826c --- /dev/null +++ b/Kokkos.make @@ -0,0 +1,28 @@ + +default: kokkos-stream + +include $(KOKKOS_PATH)/Makefile.kokkos + +ifndef TARGET +$(info No target defined. Specify CPU or GPU. Defaulting to CPU) +TARGET=CPU +endif + +ifeq ($(TARGET), CPU) +COMPILER = $(CXX) +TARGET_DEF = -DKOKKOS_TARGET_CPU +else ifeq ($(TARGET), GPU) +COMPILER = $(NVCC_WRAPPER) +TARGET_DEF = +endif + +kokkos-stream: main.o KOKKOSStream.o + $(CXX) $(KOKKOS_LDFLAGS) $^ $(KOKKOS_LIBS) -o $@ -DKOKKOS $(TARGET_DEF) -O3 + +%.o:%.cpp $(KOKKOS_CPP_DEPENDS) + $(COMPILER) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) -c $< -DKOKKOS $(TARGET_DEF) -O3 + +.PHONY: clean +clean: + rm -f main.o KOKKOSStream.o + diff --git a/KokkosCPU.make b/KokkosCPU.make deleted file mode 100644 index a15e2b9..0000000 --- a/KokkosCPU.make +++ /dev/null @@ -1,15 +0,0 @@ - -default: kokkos-cpu-stream - -include $(KOKKOS_PATH)/Makefile.kokkos - -kokkos-cpu-stream: main.o KOKKOSStream.o - $(CXX) $(KOKKOS_LDFLAGS) $^ $(KOKKOS_LIBS) -o $@ -DKOKKOS -DKOKKOS_TARGET_CPU -O3 - -%.o:%.cpp $(KOKKOS_CPP_DEPENDS) - $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) -c $< -DKOKKOS -DKOKKOS_TARGET_CPU -O3 - -.PHONY: clean -clean: - rm -f main.o KOKKOSStream.o - diff --git a/KokkosGPU.make b/KokkosGPU.make deleted file mode 100644 index 041c007..0000000 --- a/KokkosGPU.make +++ /dev/null @@ -1,15 +0,0 @@ - -default: kokkos-gpu-stream - -include $(KOKKOS_PATH)/Makefile.kokkos - -kokkos-gpu-stream: main.o KOKKOSStream.o - $(CXX) $(KOKKOS_LDFLAGS) $^ $(KOKKOS_LIBS) -o $@ -DKOKKOS -O3 - -%.o:%.cpp $(KOKKOS_CPP_DEPENDS) - $(NVCC_WRAPPER) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) -c $< -DKOKKOS -O3 - -.PHONY: clean -clean: - rm -f main.o KOKKOSStream.o - From 07f91e605edf5f86de31b3e4c980b7c1ea33d7d0 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 23 Feb 2017 15:52:43 +0000 Subject: [PATCH 13/76] Add Makefile for RAJA --- RAJA.make | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 RAJA.make diff --git a/RAJA.make b/RAJA.make new file mode 100644 index 0000000..72cc5a6 --- /dev/null +++ b/RAJA.make @@ -0,0 +1,32 @@ + +ifndef TARGET +$(info No target defined. Specify CPU or GPU. Defaulting to CPU) +TARGET=CPU +endif + +ifeq ($(TARGET), CPU) +COMP=$(CXX) +CFLAGS = -O3 -std=c++11 -DRAJA_TARGET_CPU + +ifndef COMPILER +$(error No COMPILER defined. Specify COMPILER for correct OpenMP flag.) +endif +ifeq ($(COMPILER), INTEL) +COMP = icpc +CFLAGS += -qopenmp +else ifeq ($(COMPILER), GNU) +COMP = g++ +CFLAGS += -fopenmp +else ifeq ($(COMPILER), CRAY) +COMP = CC +CFLAGS += +endif + +else ifeq ($(TARGET), GPU) +COMP = nvcc +CFLAGS = --expt-extended-lambda -O3 -std=c++11 -x cu -Xcompiler -fopenmp +endif + +raja-stream: main.cpp RAJAStream.cpp + $(COMP) $(CFLAGS) -DUSE_RAJA -I$(RAJA_PATH)/include $^ -L$(RAJA_PATH)/lib -lRAJA -o $@ + From 1b573dbac31e162680a9d25a4755fe4bf372840b Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 23 Feb 2017 15:55:45 +0000 Subject: [PATCH 14/76] Add EXTRA_FLAGS option to build line for custom extra flags --- CUDA.make | 2 +- Kokkos.make | 4 ++-- OpenCL.make | 2 +- RAJA.make | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/CUDA.make b/CUDA.make index 2434754..9a7ec88 100644 --- a/CUDA.make +++ b/CUDA.make @@ -1,4 +1,4 @@ cuda-stream: main.cpp CUDAStream.cu - nvcc -std=c++11 -O3 -DCUDA $^ -o $@ + nvcc -std=c++11 -O3 -DCUDA $^ $(EXTRA_FLAGS) -o $@ diff --git a/Kokkos.make b/Kokkos.make index e30826c..78e2eaa 100644 --- a/Kokkos.make +++ b/Kokkos.make @@ -17,10 +17,10 @@ TARGET_DEF = endif kokkos-stream: main.o KOKKOSStream.o - $(CXX) $(KOKKOS_LDFLAGS) $^ $(KOKKOS_LIBS) -o $@ -DKOKKOS $(TARGET_DEF) -O3 + $(CXX) $(KOKKOS_LDFLAGS) $^ $(KOKKOS_LIBS) -o $@ -DKOKKOS $(TARGET_DEF) -O3 $(EXTRA_FLAGS) %.o:%.cpp $(KOKKOS_CPP_DEPENDS) - $(COMPILER) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) -c $< -DKOKKOS $(TARGET_DEF) -O3 + $(COMPILER) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) -c $< -DKOKKOS $(TARGET_DEF) -O3 $(EXTRA_FLAGS) .PHONY: clean clean: diff --git a/OpenCL.make b/OpenCL.make index 447deca..f48be82 100644 --- a/OpenCL.make +++ b/OpenCL.make @@ -1,4 +1,4 @@ ocl-stream: main.cpp OCLStream.cpp - $(CXX) -O3 -std=c++11 -DOCL $^ -lOpenCL -o $@ + $(CXX) -O3 -std=c++11 -DOCL $^ $(EXTRA_FLAGS) -lOpenCL -o $@ diff --git a/RAJA.make b/RAJA.make index 72cc5a6..0f9cd61 100644 --- a/RAJA.make +++ b/RAJA.make @@ -28,5 +28,5 @@ CFLAGS = --expt-extended-lambda -O3 -std=c++11 -x cu -Xcompiler -fopenmp endif raja-stream: main.cpp RAJAStream.cpp - $(COMP) $(CFLAGS) -DUSE_RAJA -I$(RAJA_PATH)/include $^ -L$(RAJA_PATH)/lib -lRAJA -o $@ + $(COMP) $(CFLAGS) -DUSE_RAJA -I$(RAJA_PATH)/include $^ $(EXTRA_FLAGS) -L$(RAJA_PATH)/lib -lRAJA -o $@ From d3fa52748bc3d44065a76c494ee0e1a4ceb0ec09 Mon Sep 17 00:00:00 2001 From: James Price Date: Thu, 23 Feb 2017 15:52:28 +0000 Subject: [PATCH 15/76] Add Clang support to OpenMP.make --- OpenMP.make | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/OpenMP.make b/OpenMP.make index 115c678..f111847 100644 --- a/OpenMP.make +++ b/OpenMP.make @@ -7,23 +7,30 @@ COMPILER_ = $(CXX) COMPILER_GNU = g++ COMPILER_INTEL = icpc COMPILER_CRAY = CC +COMPILER_CLANG = clang++ CC = $(COMPILER_$(COMPILER)) FLAGS_ = -O3 FLAGS_GNU = -O3 -std=c++11 FLAGS_INTEL = -O3 -std=c++11 FLAGS_CRAY = -O3 -hstd=c++11 +FLAGS_CLANG = -O3 -std=c++11 CFLAGS = $(FLAGS_$(COMPILER)) -OMP_ = +OMP_ = OMP_GNU = -fopenmp OMP_INTEL = -qopenmp OMP_CRAY = +OMP_CLANG = -fopenmp=libomp OMP = $(OMP_$(COMPILER)) +OMP_TARGET_ = +OMP_TARGET_CLANG = -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda +OMP_TARGET = $(OMP_TARGET_$(COMPILER)) + omp-stream: main.cpp OMPStream.cpp - $(CC) -O3 -std=c++11 -DOMP $^ $(OMP) -o $@ + $(CC) -O3 -std=c++11 -DOMP $^ $(OMP) $(EXTRA_FLAGS) -o $@ omp-target-stream: main.cpp OMPStream.cpp - $(CC) -O3 -std=c++11 -DOMP -DOMP_TARGET_GPU $^ $(OMP) -o $@ + $(CC) -O3 -std=c++11 -DOMP -DOMP_TARGET_GPU $^ $(OMP_TARGET) $(EXTRA_FLAGS) -o $@ From 6b26e33dcdc5883295f273eb1f1c5a64fe4f0f2c Mon Sep 17 00:00:00 2001 From: James Price Date: Thu, 23 Feb 2017 15:53:12 +0000 Subject: [PATCH 16/76] Use $(CFLAGS) in OpenMP build rules --- OpenMP.make | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/OpenMP.make b/OpenMP.make index f111847..9cb506a 100644 --- a/OpenMP.make +++ b/OpenMP.make @@ -29,8 +29,8 @@ OMP_TARGET_CLANG = -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda OMP_TARGET = $(OMP_TARGET_$(COMPILER)) omp-stream: main.cpp OMPStream.cpp - $(CC) -O3 -std=c++11 -DOMP $^ $(OMP) $(EXTRA_FLAGS) -o $@ + $(CC) $(CFLAGS) -DOMP $^ $(OMP) $(EXTRA_FLAGS) -o $@ omp-target-stream: main.cpp OMPStream.cpp - $(CC) -O3 -std=c++11 -DOMP -DOMP_TARGET_GPU $^ $(OMP_TARGET) $(EXTRA_FLAGS) -o $@ + $(CC) $(CFLAGS) -DOMP -DOMP_TARGET_GPU $^ $(OMP_TARGET) $(EXTRA_FLAGS) -o $@ From d19cf1a5afb8410cd874a84fb081c3dd43994074 Mon Sep 17 00:00:00 2001 From: James Price Date: Thu, 23 Feb 2017 15:54:36 +0000 Subject: [PATCH 17/76] Rename CC and CFLAGS to CXX and CXXFLAGS in OpenMP --- OpenMP.make | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/OpenMP.make b/OpenMP.make index 9cb506a..06d3d00 100644 --- a/OpenMP.make +++ b/OpenMP.make @@ -8,14 +8,14 @@ COMPILER_GNU = g++ COMPILER_INTEL = icpc COMPILER_CRAY = CC COMPILER_CLANG = clang++ -CC = $(COMPILER_$(COMPILER)) +CXX = $(COMPILER_$(COMPILER)) FLAGS_ = -O3 FLAGS_GNU = -O3 -std=c++11 FLAGS_INTEL = -O3 -std=c++11 FLAGS_CRAY = -O3 -hstd=c++11 FLAGS_CLANG = -O3 -std=c++11 -CFLAGS = $(FLAGS_$(COMPILER)) +CXXFLAGS = $(FLAGS_$(COMPILER)) OMP_ = OMP_GNU = -fopenmp @@ -29,8 +29,8 @@ OMP_TARGET_CLANG = -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda OMP_TARGET = $(OMP_TARGET_$(COMPILER)) omp-stream: main.cpp OMPStream.cpp - $(CC) $(CFLAGS) -DOMP $^ $(OMP) $(EXTRA_FLAGS) -o $@ + $(CXX) $(CXXFLAGS) -DOMP $^ $(OMP) $(EXTRA_FLAGS) -o $@ omp-target-stream: main.cpp OMPStream.cpp - $(CC) $(CFLAGS) -DOMP -DOMP_TARGET_GPU $^ $(OMP_TARGET) $(EXTRA_FLAGS) -o $@ + $(CXX) $(CXXFLAGS) -DOMP -DOMP_TARGET_GPU $^ $(OMP_TARGET) $(EXTRA_FLAGS) -o $@ From 2d21f69eb74ff4277a2b271e894ed22fd58e69cb Mon Sep 17 00:00:00 2001 From: James Price Date: Thu, 23 Feb 2017 15:56:39 +0000 Subject: [PATCH 18/76] Add OMP_TARGET_ definitions for other compilers --- OpenMP.make | 3 +++ 1 file changed, 3 insertions(+) diff --git a/OpenMP.make b/OpenMP.make index 06d3d00..bc4e955 100644 --- a/OpenMP.make +++ b/OpenMP.make @@ -25,6 +25,9 @@ OMP_CLANG = -fopenmp=libomp OMP = $(OMP_$(COMPILER)) OMP_TARGET_ = +OMP_TARGET_GNU = -fopenmp +OMP_TARGET_INTEL = +OMP_TARGET_CRAY = OMP_TARGET_CLANG = -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda OMP_TARGET = $(OMP_TARGET_$(COMPILER)) From f3e34a56073711c70d57e4cbdc9b8f1e9fd871ea Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 23 Feb 2017 16:04:56 +0000 Subject: [PATCH 19/76] Add OpenACC Makefile --- OpenACC.make | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 OpenACC.make diff --git a/OpenACC.make b/OpenACC.make new file mode 100644 index 0000000..061a9e0 --- /dev/null +++ b/OpenACC.make @@ -0,0 +1,30 @@ + +ifndef COMPILER +$(info Define a compiler to set common defaults, i.e make COMPILER=GNU) +endif + +COMPILER_ = $(CXX) +COMPILER_PGI = pgc++ +COMPILER_CRAY = CC +CXX = $(COMPILER_$(COMPILER)) + +FLAGS_ = -O3 + +FLAGS_PGI = -std=c++11 -O3 -acc +ifeq ($(COMPILER), PGI) +ifndef TARGET +$(info Set a TARGET to ensure PGI targets the correct offload device. i.e. TARGET=GPU or CPU) +endif +endif +ifeq ($(TARGET), GPU) +FLAGS_PGI += -ta=nvidia +else ifeq ($(TARGET), CPU) +FLAGS_PGI += -ta=multicore +endif + +FLAGS_CRAY = -hstd=c++11 +CFLAGS = $(FLAGS_$(COMPILER)) + +acc-stream: main.cpp ACCStream.cpp + $(CXX) $(CFLAGS) -DACC $^ $(EXTRA_FLAGS) -o $@ + From 488db0eae613d67ee3c9e514c0bded587d5729a5 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 23 Feb 2017 16:19:04 +0000 Subject: [PATCH 20/76] Remove CMake module --- cmake/Modules/FindComputeCpp.cmake | 248 ----------------------------- 1 file changed, 248 deletions(-) delete mode 100644 cmake/Modules/FindComputeCpp.cmake diff --git a/cmake/Modules/FindComputeCpp.cmake b/cmake/Modules/FindComputeCpp.cmake deleted file mode 100644 index 5e08902..0000000 --- a/cmake/Modules/FindComputeCpp.cmake +++ /dev/null @@ -1,248 +0,0 @@ -#.rst: -# FindComputeCpp -#--------------- -# -# Copyright 2016 Codeplay Software Ltd. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use these files except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -######################### -# FindComputeCpp.cmake -######################### -# -# Tools for finding and building with ComputeCpp. -# -# User must define COMPUTECPP_PACKAGE_ROOT_DIR pointing to the ComputeCpp -# installation. -# -# Latest version of this file can be found at: -# https://github.com/codeplaysoftware/computecpp-sdk - -# Require CMake version 3.2.2 or higher -cmake_minimum_required(VERSION 3.2.2) - -# Check that a supported host compiler can be found -if(CMAKE_COMPILER_IS_GNUCXX) - # Require at least gcc 4.8 - if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.8) - message(FATAL_ERROR - "host compiler - Not found! (gcc version must be at least 4.8)") - # Require the GCC dual ABI to be disabled for 5.1 or higher - elseif (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 5.1) - set(COMPUTECPP_DISABLE_GCC_DUAL_ABI "True") - message(STATUS - "host compiler - gcc ${CMAKE_CXX_COMPILER_VERSION} (note pre 5.1 gcc ABI enabled)") - else() - message(STATUS "host compiler - gcc ${CMAKE_CXX_COMPILER_VERSION}") - endif() -elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") - # Require at least clang 3.6 - if (${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 3.6) - message(FATAL_ERROR - "host compiler - Not found! (clang version must be at least 3.6)") - else() - message(STATUS "host compiler - clang ${CMAKE_CXX_COMPILER_VERSION}") - endif() -else() - message(WARNING - "host compiler - Not found! (ComputeCpp supports GCC and Clang, see readme)") -endif() - -set(COMPUTECPP_64_BIT_DEFAULT ON) -option(COMPUTECPP_64_BIT_CODE "Compile device code in 64 bit mode" - ${COMPUTECPP_64_BIT_DEFAULT}) -mark_as_advanced(COMPUTECPP_64_BIT_CODE) - -# Find OpenCL package -find_package(OpenCL REQUIRED) - -# Find ComputeCpp packagee -if(NOT COMPUTECPP_PACKAGE_ROOT_DIR) - message(FATAL_ERROR - "ComputeCpp package - Not found! (please set COMPUTECPP_PACKAGE_ROOT_DIR") -else() - message(STATUS "ComputeCpp package - Found") -endif() -option(COMPUTECPP_PACKAGE_ROOT_DIR "Path to the ComputeCpp Package") - -# Obtain the path to compute++ -find_program(COMPUTECPP_DEVICE_COMPILER compute++ PATHS - ${COMPUTECPP_PACKAGE_ROOT_DIR} PATH_SUFFIXES bin) -if (EXISTS ${COMPUTECPP_DEVICE_COMPILER}) - mark_as_advanced(COMPUTECPP_DEVICE_COMPILER) - message(STATUS "compute++ - Found") -else() - message(FATAL_ERROR "compute++ - Not found! (${COMPUTECPP_DEVICE_COMPILER})") -endif() - -# Obtain the path to computecpp_info -find_program(COMPUTECPP_INFO_TOOL computecpp_info PATHS - ${COMPUTECPP_PACKAGE_ROOT_DIR} PATH_SUFFIXES bin) -if (EXISTS ${COMPUTECPP_INFO_TOOL}) - mark_as_advanced(${COMPUTECPP_INFO_TOOL}) - message(STATUS "computecpp_info - Found") -else() - message(FATAL_ERROR "computecpp_info - Not found! (${COMPUTECPP_INFO_TOOL})") -endif() - -# Obtain the path to the ComputeCpp runtime library -find_library(COMPUTECPP_RUNTIME_LIBRARY ComputeCpp PATHS ${COMPUTECPP_PACKAGE_ROOT_DIR} - HINTS ${COMPUTECPP_PACKAGE_ROOT_DIR}/lib PATH_SUFFIXES lib - DOC "ComputeCpp Runtime Library" NO_DEFAULT_PATH) - -if (EXISTS ${COMPUTECPP_RUNTIME_LIBRARY}) - mark_as_advanced(COMPUTECPP_RUNTIME_LIBRARY) - message(STATUS "libComputeCpp.so - Found") -else() - message(FATAL_ERROR "libComputeCpp.so - Not found!") -endif() - -# Obtain the ComputeCpp include directory -set(COMPUTECPP_INCLUDE_DIRECTORY ${COMPUTECPP_PACKAGE_ROOT_DIR}/include/) -if (NOT EXISTS ${COMPUTECPP_INCLUDE_DIRECTORY}) - message(FATAL_ERROR "ComputeCpp includes - Not found!") -else() - message(STATUS "ComputeCpp includes - Found") -endif() - -# Obtain the package version -execute_process(COMMAND ${COMPUTECPP_INFO_TOOL} "--dump-version" - OUTPUT_VARIABLE COMPUTECPP_PACKAGE_VERSION - RESULT_VARIABLE COMPUTECPP_INFO_TOOL_RESULT OUTPUT_STRIP_TRAILING_WHITESPACE) -if(NOT COMPUTECPP_INFO_TOOL_RESULT EQUAL "0") - message(FATAL_ERROR "Package version - Error obtaining version!") -else() - mark_as_advanced(COMPUTECPP_PACKAGE_VERSION) - message(STATUS "Package version - ${COMPUTECPP_PACKAGE_VERSION}") -endif() - -# Obtain the device compiler flags -execute_process(COMMAND ${COMPUTECPP_INFO_TOOL} "--dump-device-compiler-flags" - OUTPUT_VARIABLE COMPUTECPP_DEVICE_COMPILER_FLAGS - RESULT_VARIABLE COMPUTECPP_INFO_TOOL_RESULT OUTPUT_STRIP_TRAILING_WHITESPACE) -if(NOT COMPUTECPP_INFO_TOOL_RESULT EQUAL "0") - message(FATAL_ERROR "compute++ flags - Error obtaining compute++ flags!") -else() - mark_as_advanced(COMPUTECPP_COMPILER_FLAGS) - message(STATUS "compute++ flags - ${COMPUTECPP_DEVICE_COMPILER_FLAGS}") -endif() - -# Check if the platform is supported -execute_process(COMMAND ${COMPUTECPP_INFO_TOOL} "--dump-is-supported" - OUTPUT_VARIABLE COMPUTECPP_PLATFORM_IS_SUPPORTED - RESULT_VARIABLE COMPUTECPP_INFO_TOOL_RESULT OUTPUT_STRIP_TRAILING_WHITESPACE) -if(NOT COMPUTECPP_INFO_TOOL_RESULT EQUAL "0") - message(FATAL_ERROR "platform - Error checking platform support!") -else() - mark_as_advanced(COMPUTECPP_PLATFORM_IS_SUPPORTED) - if (COMPUTECPP_PLATFORM_IS_SUPPORTED) - message(STATUS "platform - your system can support ComputeCpp") - else() - message(STATUS "platform - your system CANNOT support ComputeCpp") - endif() -endif() - -#################### -# __build_sycl -#################### -# -# Adds a custom target for running compute++ and adding a dependency for the -# resulting integration header. -# -# targetName : Name of the target. -# sourceFile : Source file to be compiled. -# binaryDir : Intermediate directory to output the integration header. -# -function(__build_spir targetName sourceFile binaryDir) - - # Retrieve source file name. - get_filename_component(sourceFileName ${sourceFile} NAME) - - # Set the path to the Sycl file. - set(outputSyclFile ${binaryDir}/${sourceFileName}.sycl) - - # Add any user-defined include to the device compiler - get_property(includeDirectories DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTY - INCLUDE_DIRECTORIES) - set(device_compiler_includes "") - foreach(directory ${includeDirectories}) - set(device_compiler_includes "-I${directory}" ${device_compiler_includes}) - endforeach() - if (CMAKE_INCLUDE_PATH) - foreach(directory ${CMAKE_INCLUDE_PATH}) - set(device_compiler_includes "-I${directory}" - ${device_compiler_includes}) - endforeach() - endif() - - # Convert argument list format - separate_arguments(COMPUTECPP_DEVICE_COMPILER_FLAGS) - - # Add custom command for running compute++ - add_custom_command( - OUTPUT ${outputSyclFile} - COMMAND ${COMPUTECPP_DEVICE_COMPILER} - ${COMPUTECPP_DEVICE_COMPILER_FLAGS} - -isystem ${COMPUTECPP_INCLUDE_DIRECTORY} - ${COMPUTECPP_PLATFORM_SPECIFIC_ARGS} - ${device_compiler_includes} - -o ${outputSyclFile} - -c ${sourceFile} - DEPENDS ${sourceFile} - WORKING_DIRECTORY ${binaryDir} - COMMENT "Building ComputeCpp integration header file ${outputSyclFile}") - - # Add a custom target for the generated integration header - add_custom_target(${targetName}_integration_header DEPENDS ${outputSyclFile}) - - # Add a dependency on the integration header - add_dependencies(${targetName} ${targetName}_integration_header) - - # Force inclusion of the integration header for the host compiler - set(compileFlags -include ${outputSyclFile} "-Wall") - target_compile_options(${targetName} PUBLIC ${compileFlags}) - - # Set the host compiler C++ standard to C++11 - set_property(TARGET ${targetName} PROPERTY CXX_STANDARD 11) - - # Disable GCC dual ABI on GCC 5.1 and higher - if(COMPUTECPP_DISABLE_GCC_DUAL_ABI) - set_property(TARGET ${targetName} APPEND PROPERTY COMPILE_DEFINITIONS - "_GLIBCXX_USE_CXX11_ABI=0") - endif() - -endfunction() - -####################### -# add_sycl_to_target -####################### -# -# Adds a SYCL compilation custom command associated with an existing -# target and sets a dependancy on that new command. -# -# targetName : Name of the target to add a SYCL to. -# sourceFile : Source file to be compiled for SYCL. -# binaryDir : Intermediate directory to output the integration header. -# -function(add_sycl_to_target targetName sourceFile binaryDir) - - # Add custom target to run compute++ and generate the integration header - __build_spir(${targetName} ${sourceFile} ${binaryDir}) - - # Link with the ComputeCpp runtime library - target_link_libraries(${targetName} PUBLIC ${COMPUTECPP_RUNTIME_LIBRARY} - PUBLIC ${OpenCL_LIBRARIES}) - -endfunction(add_sycl_to_target) - From e926cfd3bb107b83b4b6d271bda9c0c96eeb39e8 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 23 Feb 2017 16:20:55 +0000 Subject: [PATCH 21/76] Remove CMake from gitignore --- .gitignore | 3 --- 1 file changed, 3 deletions(-) diff --git a/.gitignore b/.gitignore index 6ef20a5..b73a688 100644 --- a/.gitignore +++ b/.gitignore @@ -12,7 +12,4 @@ gpu-stream-sycl .DS_Store -CMakeCache.txt -CMakeFiles/ -cmake_install.cmake Makefile From 472434d2fce01ffbffeb08438a2dc6c438519750 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 23 Feb 2017 16:21:50 +0000 Subject: [PATCH 22/76] Update gitignore binary names --- .gitignore | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/.gitignore b/.gitignore index b73a688..b239bb2 100644 --- a/.gitignore +++ b/.gitignore @@ -1,10 +1,11 @@ -gpu-stream-cuda -gpu-stream-ocl -gpu-stream-acc -gpu-stream-omp3 -gpu-stream-omp45 -gpu-stream-sycl +cuda-stream +ocl-stream +omp-stream +acc-stream +raja-stream +kokkos-stream +sycl-stream *.o *.tar From 0cbf8aa649bf9601a58b2054b93b0a410d79c042 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 23 Feb 2017 16:32:17 +0000 Subject: [PATCH 23/76] Add COMPILER option to OpenCL Makefile --- OpenCL.make | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/OpenCL.make b/OpenCL.make index f48be82..4c73b16 100644 --- a/OpenCL.make +++ b/OpenCL.make @@ -1,4 +1,17 @@ +ifndef COMPILER +$(info Define a compiler to set common defaults, i.e make COMPILER=GNU) +endif + +COMPILER_ = $(CXX) +COMPILER_GNU = g++ +COMPILER_CRAY = CC + +FLAGS_ = -O3 +FLAGS_GNU = -O3 -std=c++11 +FLAGS_CRAY = -O3 -hstd=c++11 +CFLAGS=$(FLAGS_$(COMPILER)) + ocl-stream: main.cpp OCLStream.cpp - $(CXX) -O3 -std=c++11 -DOCL $^ $(EXTRA_FLAGS) -lOpenCL -o $@ + $(COMPILER_$(COMPILER)) $(CFLAGS) -DOCL $^ $(EXTRA_FLAGS) -lOpenCL -o $@ From 1c31eedd8b29f87c968d407dc918fa4315147bf7 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 23 Feb 2017 16:41:44 +0000 Subject: [PATCH 24/76] Update README --- README.md | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 0d181d5..c55bcd3 100644 --- a/README.md +++ b/README.md @@ -23,10 +23,16 @@ Website Usage ----- -CMake 3.2 or above is required. -Drivers, compiler and software applicable to whichever implementation you would like to build against. Our build system is designed to only build implementations in programming models that your system supports. +Drivers, compiler and software applicable to whichever implementation you would like to build against is required. -Generate the Makefile with `cmake .` +We have supplied a series of Makefiles, one for each programming model, to assist with building. +The Makefiles contain common build options, and should be simple to customise for your needs too. + +General usage is `make -f .make +Common compiler flags and names can be set by passing a `COMPILER` option to Make, e.g. `make COMPILER=GNU`. +Some models allow specifying a CPU or GPU style target, and this can be set by passing a `TARGET` option to Make, e.g. `make TARGET=GPU`. + +Pass in extra flags via the `EXTRA_FLAGS` option. Android (outdated instructions) ------------------ From 67411686543e24f2b5d36ab9a4071c09733e47f6 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 23 Feb 2017 16:42:51 +0000 Subject: [PATCH 25/76] Fix README typo --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index c55bcd3..c11f694 100644 --- a/README.md +++ b/README.md @@ -28,7 +28,7 @@ Drivers, compiler and software applicable to whichever implementation you would We have supplied a series of Makefiles, one for each programming model, to assist with building. The Makefiles contain common build options, and should be simple to customise for your needs too. -General usage is `make -f .make +General usage is `make -f .make` Common compiler flags and names can be set by passing a `COMPILER` option to Make, e.g. `make COMPILER=GNU`. Some models allow specifying a CPU or GPU style target, and this can be set by passing a `TARGET` option to Make, e.g. `make TARGET=GPU`. From 2989419cb57881186cbfb448bf1cf4eff495da20 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 23 Feb 2017 16:43:56 +0000 Subject: [PATCH 26/76] Update README --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index c11f694..552051e 100644 --- a/README.md +++ b/README.md @@ -34,6 +34,9 @@ Some models allow specifying a CPU or GPU style target, and this can be set by p Pass in extra flags via the `EXTRA_FLAGS` option. +The binaries are named in the form `-stream`. + + Android (outdated instructions) ------------------ From e6ac8076cf2be6fcf3bd28bce59a48f3d7913096 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 23 Feb 2017 16:45:55 +0000 Subject: [PATCH 27/76] Move android instructions to seperate file --- README.android | 36 ++++++++++++++++++++++++++++++++++++ README.md | 36 ------------------------------------ 2 files changed, 36 insertions(+), 36 deletions(-) create mode 100644 README.android diff --git a/README.android b/README.android new file mode 100644 index 0000000..edc4a52 --- /dev/null +++ b/README.android @@ -0,0 +1,36 @@ +Android (outdated instructions) +------------------ + +Assuming you have a recent Android NDK available, you can use the +toolchain that it provides to build GPU-STREAM. You should first +use the NDK to generate a standalone toolchain: + + # Select a directory to install the toolchain to + ANDROID_NATIVE_TOOLCHAIN=/path/to/toolchain + + ${NDK}/build/tools/make-standalone-toolchain.sh \ + --platform=android-14 \ + --toolchain=arm-linux-androideabi-4.8 \ + --install-dir=${ANDROID_NATIVE_TOOLCHAIN} + +Make sure that the OpenCL headers and library (libOpenCL.so) are +available in `${ANDROID_NATIVE_TOOLCHAIN}/sysroot/usr/`. + +You should then be able to build GPU-STREAM: + + make CXX=${ANDROID_NATIVE_TOOLCHAIN}/bin/arm-linux-androideabi-g++ + +Copy the executable and OpenCL kernels to the device: + + adb push gpu-stream-ocl /data/local/tmp + adb push ocl-stream-kernels.cl /data/local/tmp + +Run GPU-STREAM from an adb shell: + + adb shell + cd /data/local/tmp + + # Use float if device doesn't support double, and reduce array size + ./gpu-stream-ocl --float -n 6 -s 10000000 + + diff --git a/README.md b/README.md index 0d181d5..df0f23c 100644 --- a/README.md +++ b/README.md @@ -28,42 +28,6 @@ Drivers, compiler and software applicable to whichever implementation you would Generate the Makefile with `cmake .` -Android (outdated instructions) ------------------- - -Assuming you have a recent Android NDK available, you can use the -toolchain that it provides to build GPU-STREAM. You should first -use the NDK to generate a standalone toolchain: - - # Select a directory to install the toolchain to - ANDROID_NATIVE_TOOLCHAIN=/path/to/toolchain - - ${NDK}/build/tools/make-standalone-toolchain.sh \ - --platform=android-14 \ - --toolchain=arm-linux-androideabi-4.8 \ - --install-dir=${ANDROID_NATIVE_TOOLCHAIN} - -Make sure that the OpenCL headers and library (libOpenCL.so) are -available in `${ANDROID_NATIVE_TOOLCHAIN}/sysroot/usr/`. - -You should then be able to build GPU-STREAM: - - make CXX=${ANDROID_NATIVE_TOOLCHAIN}/bin/arm-linux-androideabi-g++ - -Copy the executable and OpenCL kernels to the device: - - adb push gpu-stream-ocl /data/local/tmp - adb push ocl-stream-kernels.cl /data/local/tmp - -Run GPU-STREAM from an adb shell: - - adb shell - cd /data/local/tmp - - # Use float if device doesn't support double, and reduce array size - ./gpu-stream-ocl --float -n 6 -s 10000000 - - Results ------- From f3abd6657872e28e949e3c6b57d8409d210dbaf6 Mon Sep 17 00:00:00 2001 From: James Price Date: Thu, 23 Feb 2017 17:24:14 +0000 Subject: [PATCH 28/76] Make Kokkos build without intermediate objects --- Kokkos.make | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/Kokkos.make b/Kokkos.make index 78e2eaa..f0b4528 100644 --- a/Kokkos.make +++ b/Kokkos.make @@ -9,20 +9,16 @@ TARGET=CPU endif ifeq ($(TARGET), CPU) -COMPILER = $(CXX) TARGET_DEF = -DKOKKOS_TARGET_CPU else ifeq ($(TARGET), GPU) -COMPILER = $(NVCC_WRAPPER) +CXX = $(NVCC_WRAPPER) TARGET_DEF = endif -kokkos-stream: main.o KOKKOSStream.o - $(CXX) $(KOKKOS_LDFLAGS) $^ $(KOKKOS_LIBS) -o $@ -DKOKKOS $(TARGET_DEF) -O3 $(EXTRA_FLAGS) - -%.o:%.cpp $(KOKKOS_CPP_DEPENDS) - $(COMPILER) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) -c $< -DKOKKOS $(TARGET_DEF) -O3 $(EXTRA_FLAGS) +kokkos-stream: main.cpp KOKKOSStream.cpp $(KOKKOS_CPP_DEPENDS) + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(KOKKOS_LDFLAGS) main.cpp KOKKOSStream.cpp $(KOKKOS_LIBS) -o $@ -DKOKKOS $(TARGET_DEF) -O3 $(EXTRA_FLAGS) .PHONY: clean clean: - rm -f main.o KOKKOSStream.o + rm -f kokkos-stream From cdea91abc66436f1ab317f3414177b577612371e Mon Sep 17 00:00:00 2001 From: James Price Date: Thu, 23 Feb 2017 17:49:46 +0000 Subject: [PATCH 29/76] Add required ARCH flag for RAJA on GPUs --- RAJA.make | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/RAJA.make b/RAJA.make index 0f9cd61..5851e4f 100644 --- a/RAJA.make +++ b/RAJA.make @@ -24,7 +24,11 @@ endif else ifeq ($(TARGET), GPU) COMP = nvcc -CFLAGS = --expt-extended-lambda -O3 -std=c++11 -x cu -Xcompiler -fopenmp + +ifndef ARCH +$(error No ARCH defined. Specify target GPU architecture (e.g. ARCH=sm_35)) +endif +CFLAGS = --expt-extended-lambda -O3 -std=c++11 -x cu -Xcompiler -fopenmp -arch $(ARCH) endif raja-stream: main.cpp RAJAStream.cpp From 063446b6a79d4527f5dcbb9a114966171b0ae9d6 Mon Sep 17 00:00:00 2001 From: James Price Date: Thu, 23 Feb 2017 18:15:27 +0000 Subject: [PATCH 30/76] Add EXTRA_FLAGS to SYCL.make --- SYCL.make | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/SYCL.make b/SYCL.make index 5dde5c8..02f9442 100644 --- a/SYCL.make +++ b/SYCL.make @@ -1,6 +1,6 @@ sycl-stream: main.cpp SYCLStream.cpp SYCLStream.sycl - $(CXX) -O3 -std=c++11 -DSYCL main.cpp SYCLStream.cpp -include SYCLStream.sycl -lComputeCpp -lOpenCL -o $@ + $(CXX) -O3 -std=c++11 -DSYCL main.cpp SYCLStream.cpp -include SYCLStream.sycl $(EXTRA_FLAGS) -lComputeCpp -lOpenCL -o $@ SYCLStream.sycl: SYCLStream.cpp From d78baa4c2bb580979139285ec18bca8b80dfffd1 Mon Sep 17 00:00:00 2001 From: James Price Date: Thu, 23 Feb 2017 18:39:40 +0000 Subject: [PATCH 31/76] Use CXXFLAGS everywhere for consistency --- OpenACC.make | 4 ++-- OpenCL.make | 4 ++-- RAJA.make | 13 ++++++------- 3 files changed, 10 insertions(+), 11 deletions(-) diff --git a/OpenACC.make b/OpenACC.make index 061a9e0..161c20e 100644 --- a/OpenACC.make +++ b/OpenACC.make @@ -23,8 +23,8 @@ FLAGS_PGI += -ta=multicore endif FLAGS_CRAY = -hstd=c++11 -CFLAGS = $(FLAGS_$(COMPILER)) +CXXFLAGS = $(FLAGS_$(COMPILER)) acc-stream: main.cpp ACCStream.cpp - $(CXX) $(CFLAGS) -DACC $^ $(EXTRA_FLAGS) -o $@ + $(CXX) $(CXXFLAGS) -DACC $^ $(EXTRA_FLAGS) -o $@ diff --git a/OpenCL.make b/OpenCL.make index 4c73b16..ce7db6e 100644 --- a/OpenCL.make +++ b/OpenCL.make @@ -10,8 +10,8 @@ COMPILER_CRAY = CC FLAGS_ = -O3 FLAGS_GNU = -O3 -std=c++11 FLAGS_CRAY = -O3 -hstd=c++11 -CFLAGS=$(FLAGS_$(COMPILER)) +CXXFLAGS=$(FLAGS_$(COMPILER)) ocl-stream: main.cpp OCLStream.cpp - $(COMPILER_$(COMPILER)) $(CFLAGS) -DOCL $^ $(EXTRA_FLAGS) -lOpenCL -o $@ + $(COMPILER_$(COMPILER)) $(CXXFLAGS) -DOCL $^ $(EXTRA_FLAGS) -lOpenCL -o $@ diff --git a/RAJA.make b/RAJA.make index 5851e4f..3f456cb 100644 --- a/RAJA.make +++ b/RAJA.make @@ -6,20 +6,20 @@ endif ifeq ($(TARGET), CPU) COMP=$(CXX) -CFLAGS = -O3 -std=c++11 -DRAJA_TARGET_CPU +CXXFLAGS = -O3 -std=c++11 -DRAJA_TARGET_CPU ifndef COMPILER $(error No COMPILER defined. Specify COMPILER for correct OpenMP flag.) endif ifeq ($(COMPILER), INTEL) COMP = icpc -CFLAGS += -qopenmp +CXXFLAGS += -qopenmp else ifeq ($(COMPILER), GNU) COMP = g++ -CFLAGS += -fopenmp +CXXFLAGS += -fopenmp else ifeq ($(COMPILER), CRAY) COMP = CC -CFLAGS += +CXXFLAGS += endif else ifeq ($(TARGET), GPU) @@ -28,9 +28,8 @@ COMP = nvcc ifndef ARCH $(error No ARCH defined. Specify target GPU architecture (e.g. ARCH=sm_35)) endif -CFLAGS = --expt-extended-lambda -O3 -std=c++11 -x cu -Xcompiler -fopenmp -arch $(ARCH) +CXXFLAGS = --expt-extended-lambda -O3 -std=c++11 -x cu -Xcompiler -fopenmp -arch $(ARCH) endif raja-stream: main.cpp RAJAStream.cpp - $(COMP) $(CFLAGS) -DUSE_RAJA -I$(RAJA_PATH)/include $^ $(EXTRA_FLAGS) -L$(RAJA_PATH)/lib -lRAJA -o $@ - + $(COMP) $(CXXFLAGS) -DUSE_RAJA -I$(RAJA_PATH)/include $^ $(EXTRA_FLAGS) -L$(RAJA_PATH)/lib -lRAJA -o $@ From ce4f49e08b37e71fe56b5d91d3f439e7afdeb6d9 Mon Sep 17 00:00:00 2001 From: James Price Date: Thu, 23 Feb 2017 19:08:25 +0000 Subject: [PATCH 32/76] Add dot kernel to HIP implementation --- HIPStream.cu | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ HIPStream.h | 6 ++++++ 2 files changed, 55 insertions(+) diff --git a/HIPStream.cu b/HIPStream.cu index 8c02348..d14fe84 100644 --- a/HIPStream.cu +++ b/HIPStream.cu @@ -9,6 +9,7 @@ #include "hip/hip_runtime.h" #define TBSIZE 1024 +#define DOT_NUM_BLOCKS 256 void check_error(void) { @@ -47,6 +48,9 @@ HIPStream::HIPStream(const unsigned int ARRAY_SIZE, const int device_index) array_size = ARRAY_SIZE; + // Allocate the host array for partial sums for dot kernels + sums = (T*)malloc(sizeof(T) * DOT_NUM_BLOCKS); + // Check buffers fit on the device hipDeviceProp_t props; hipGetDeviceProperties(&props, 0); @@ -60,6 +64,8 @@ HIPStream::HIPStream(const unsigned int ARRAY_SIZE, const int device_index) check_error(); hipMalloc(&d_c, ARRAY_SIZE*sizeof(T)); check_error(); + hipMalloc(&d_sum, DOT_NUM_BLOCKS*sizeof(T)); + check_error(); } @@ -172,6 +178,49 @@ void HIPStream::triad() } +template +__global__ void dot_kernel(hipLaunchParm lp, const T * a, const T * b, T * sum, unsigned int array_size) +{ + + extern __shared__ __align__(sizeof(T)) unsigned char smem[]; + T *tb_sum = reinterpret_cast(smem); + + int i = blockDim.x * blockIdx.x + threadIdx.x; + const size_t local_i = threadIdx.x; + + tb_sum[local_i] = 0.0; + for (; i < array_size; i += blockDim.x*gridDim.x) + tb_sum[local_i] += a[i] * b[i]; + + for (int offset = blockDim.x / 2; offset > 0; offset /= 2) + { + __syncthreads(); + if (local_i < offset) + { + tb_sum[local_i] += tb_sum[local_i+offset]; + } + } + + if (local_i == 0) + sum[blockIdx.x] = tb_sum[local_i]; +} + +template +T HIPStream::dot() +{ + hipLaunchKernel(HIP_KERNEL_NAME(dot_kernel), dim3(DOT_NUM_BLOCKS), dim3(TBSIZE), sizeof(T)*TBSIZE, 0, d_a, d_b, d_sum, array_size); + check_error(); + + hipMemcpy(sums, d_sum, DOT_NUM_BLOCKS*sizeof(T), hipMemcpyDeviceToHost); + check_error(); + + T sum = 0.0; + for (int i = 0; i < DOT_NUM_BLOCKS; i++) + sum += sums[i]; + + return sum; +} + void listDevices(void) { // Get number of devices diff --git a/HIPStream.h b/HIPStream.h index 392080a..9209388 100644 --- a/HIPStream.h +++ b/HIPStream.h @@ -21,10 +21,15 @@ class HIPStream : public Stream protected: // Size of arrays unsigned int array_size; + + // Host array for partial sums for dot kernel + T *sums; + // Device side pointers to arrays T *d_a; T *d_b; T *d_c; + T *d_sum; public: @@ -36,6 +41,7 @@ class HIPStream : public Stream virtual void add() override; virtual void mul() override; virtual void triad() override; + virtual T dot() override; virtual void init_arrays(T initA, T initB, T initC) override; virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; From 13f5c837f690ed684d5bfce5bac536388b9c22ac Mon Sep 17 00:00:00 2001 From: James Price Date: Thu, 23 Feb 2017 19:19:47 +0000 Subject: [PATCH 33/76] Add HIP.make with CUDA support Still needs additional work for HIP via HCC. --- .gitignore | 1 + HIP.make | 13 +++++++++++++ 2 files changed, 14 insertions(+) create mode 100644 HIP.make diff --git a/.gitignore b/.gitignore index b239bb2..5dad665 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,7 @@ acc-stream raja-stream kokkos-stream sycl-stream +hip-stream *.o *.tar diff --git a/HIP.make b/HIP.make new file mode 100644 index 0000000..25460fe --- /dev/null +++ b/HIP.make @@ -0,0 +1,13 @@ + +# TODO: HIP with HCC + +HIPCC = hipcc + +ifndef CUDA_PATH +ifeq (,$(wildcard /usr/local/bin/nvcc)) +$(error /usr/local/bin/nvcc not found, set CUDA_PATH instead) +endif +endif + +hip-stream: main.cpp HIPStream.cu + $(HIPCC) $(CXXFLAGS) -std=c++11 -DHIP $^ $(EXTRA_FLAGS) -o $@ From b62c93527da6c436e79dc4c76d05b7528e729107 Mon Sep 17 00:00:00 2001 From: James Price Date: Thu, 23 Feb 2017 19:26:55 +0000 Subject: [PATCH 34/76] Use computecpp_info for SYCL device compiler flags --- SYCL.make | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/SYCL.make b/SYCL.make index 02f9442..1e1c480 100644 --- a/SYCL.make +++ b/SYCL.make @@ -1,11 +1,12 @@ +COMPUTECPP_FLAGS = $(shell computecpp_info --dump-device-compiler-flags) + sycl-stream: main.cpp SYCLStream.cpp SYCLStream.sycl $(CXX) -O3 -std=c++11 -DSYCL main.cpp SYCLStream.cpp -include SYCLStream.sycl $(EXTRA_FLAGS) -lComputeCpp -lOpenCL -o $@ - SYCLStream.sycl: SYCLStream.cpp - compute++ SYCLStream.cpp -sycl -no-serial-memop -O2 -emit-llvm -c + compute++ SYCLStream.cpp $(COMPUTECPP_FLAGS) -c .PHONY: clean clean: - rm -f SYCLStream.sycl + rm -f sycl-stream SYCLStream.sycl SYCLStream.bc From b485e77ef30190bb7c4ce4eaefcb8ae1464ff769 Mon Sep 17 00:00:00 2001 From: James Price Date: Thu, 23 Feb 2017 19:38:14 +0000 Subject: [PATCH 35/76] Add SYCL intermediate outputs to .gitignore --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 5dad665..a9748b3 100644 --- a/.gitignore +++ b/.gitignore @@ -9,6 +9,8 @@ sycl-stream hip-stream *.o +*.bc +*.sycl *.tar *.gz From e1ea01f0c2807ac4717e87bfd80f21c029856fe0 Mon Sep 17 00:00:00 2001 From: James Price Date: Fri, 24 Feb 2017 11:02:50 +0000 Subject: [PATCH 36/76] Add -xHOST to Intel for OpenMP --- OpenMP.make | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/OpenMP.make b/OpenMP.make index bc4e955..d4b7519 100644 --- a/OpenMP.make +++ b/OpenMP.make @@ -12,7 +12,7 @@ CXX = $(COMPILER_$(COMPILER)) FLAGS_ = -O3 FLAGS_GNU = -O3 -std=c++11 -FLAGS_INTEL = -O3 -std=c++11 +FLAGS_INTEL = -O3 -std=c++11 -xHOST FLAGS_CRAY = -O3 -hstd=c++11 FLAGS_CLANG = -O3 -std=c++11 CXXFLAGS = $(FLAGS_$(COMPILER)) From cbf15e8954af4d5d1e1d0e6647f60cafff28213d Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Fri, 24 Feb 2017 11:50:25 +0000 Subject: [PATCH 37/76] Have -std=c++11 as default flags when COMPILER is not set for ACC, CL and MP --- OpenACC.make | 2 +- OpenCL.make | 2 +- OpenMP.make | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/OpenACC.make b/OpenACC.make index 161c20e..2799603 100644 --- a/OpenACC.make +++ b/OpenACC.make @@ -8,7 +8,7 @@ COMPILER_PGI = pgc++ COMPILER_CRAY = CC CXX = $(COMPILER_$(COMPILER)) -FLAGS_ = -O3 +FLAGS_ = -O3 -std=c++11 FLAGS_PGI = -std=c++11 -O3 -acc ifeq ($(COMPILER), PGI) diff --git a/OpenCL.make b/OpenCL.make index ce7db6e..62c4f43 100644 --- a/OpenCL.make +++ b/OpenCL.make @@ -7,7 +7,7 @@ COMPILER_ = $(CXX) COMPILER_GNU = g++ COMPILER_CRAY = CC -FLAGS_ = -O3 +FLAGS_ = -O3 -std=c++11 FLAGS_GNU = -O3 -std=c++11 FLAGS_CRAY = -O3 -hstd=c++11 CXXFLAGS=$(FLAGS_$(COMPILER)) diff --git a/OpenMP.make b/OpenMP.make index d4b7519..2fd47e7 100644 --- a/OpenMP.make +++ b/OpenMP.make @@ -10,7 +10,7 @@ COMPILER_CRAY = CC COMPILER_CLANG = clang++ CXX = $(COMPILER_$(COMPILER)) -FLAGS_ = -O3 +FLAGS_ = -O3 -std=c++11 FLAGS_GNU = -O3 -std=c++11 FLAGS_INTEL = -O3 -std=c++11 -xHOST FLAGS_CRAY = -O3 -hstd=c++11 From 761ba2dce0e8318f46b4b8357f5427da3b0082b1 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Fri, 24 Feb 2017 11:57:34 +0000 Subject: [PATCH 38/76] Add a clean option to all Makefiles --- CUDA.make | 4 ++++ HIP.make | 5 +++++ OpenACC.make | 4 ++++ OpenCL.make | 4 ++++ OpenMP.make | 5 +++++ RAJA.make | 7 ++++++- 6 files changed, 28 insertions(+), 1 deletion(-) diff --git a/CUDA.make b/CUDA.make index 9a7ec88..ef193df 100644 --- a/CUDA.make +++ b/CUDA.make @@ -2,3 +2,7 @@ cuda-stream: main.cpp CUDAStream.cu nvcc -std=c++11 -O3 -DCUDA $^ $(EXTRA_FLAGS) -o $@ +.PHONY: clean +clean: + rm -f cuda-stream + diff --git a/HIP.make b/HIP.make index 25460fe..bbd142e 100644 --- a/HIP.make +++ b/HIP.make @@ -11,3 +11,8 @@ endif hip-stream: main.cpp HIPStream.cu $(HIPCC) $(CXXFLAGS) -std=c++11 -DHIP $^ $(EXTRA_FLAGS) -o $@ + +.PHONY: clean +clean: + rm -f hip-stream + diff --git a/OpenACC.make b/OpenACC.make index 2799603..d9fbc36 100644 --- a/OpenACC.make +++ b/OpenACC.make @@ -28,3 +28,7 @@ CXXFLAGS = $(FLAGS_$(COMPILER)) acc-stream: main.cpp ACCStream.cpp $(CXX) $(CXXFLAGS) -DACC $^ $(EXTRA_FLAGS) -o $@ +.PHONY: clean +clean: + rm -f acc-stream + diff --git a/OpenCL.make b/OpenCL.make index 62c4f43..0075424 100644 --- a/OpenCL.make +++ b/OpenCL.make @@ -15,3 +15,7 @@ CXXFLAGS=$(FLAGS_$(COMPILER)) ocl-stream: main.cpp OCLStream.cpp $(COMPILER_$(COMPILER)) $(CXXFLAGS) -DOCL $^ $(EXTRA_FLAGS) -lOpenCL -o $@ +.PHONY: clean +clean: + rm -f ocl-stream + diff --git a/OpenMP.make b/OpenMP.make index 2fd47e7..0757f0d 100644 --- a/OpenMP.make +++ b/OpenMP.make @@ -37,3 +37,8 @@ omp-stream: main.cpp OMPStream.cpp omp-target-stream: main.cpp OMPStream.cpp $(CXX) $(CXXFLAGS) -DOMP -DOMP_TARGET_GPU $^ $(OMP_TARGET) $(EXTRA_FLAGS) -o $@ +.PHONY: clean +clean: + rm -f omp-stream + rm -f omp-target-stream + diff --git a/RAJA.make b/RAJA.make index 3f456cb..bcfbba7 100644 --- a/RAJA.make +++ b/RAJA.make @@ -9,7 +9,7 @@ COMP=$(CXX) CXXFLAGS = -O3 -std=c++11 -DRAJA_TARGET_CPU ifndef COMPILER -$(error No COMPILER defined. Specify COMPILER for correct OpenMP flag.) +$(info No COMPILER defined. Specify COMPILER for correct OpenMP flag.) endif ifeq ($(COMPILER), INTEL) COMP = icpc @@ -33,3 +33,8 @@ endif raja-stream: main.cpp RAJAStream.cpp $(COMP) $(CXXFLAGS) -DUSE_RAJA -I$(RAJA_PATH)/include $^ $(EXTRA_FLAGS) -L$(RAJA_PATH)/lib -lRAJA -o $@ + +.PHONY: clean +clean: + rm -f raja-stream + From 10b2376634e512f9e0ef06ff6b0c39de4a5cddca Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Fri, 24 Feb 2017 12:24:35 +0000 Subject: [PATCH 39/76] Add TARGET option to OpenMP Makefile --- OpenMP.make | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/OpenMP.make b/OpenMP.make index 0757f0d..cbcca54 100644 --- a/OpenMP.make +++ b/OpenMP.make @@ -3,6 +3,11 @@ ifndef COMPILER $(info Define a compiler to set common defaults, i.e make COMPILER=GNU) endif +ifndef TARGET +$(info No target defined. Specify CPU or GPU. Defaulting to CPU) +TARGET=CPU +endif + COMPILER_ = $(CXX) COMPILER_GNU = g++ COMPILER_INTEL = icpc @@ -31,14 +36,17 @@ OMP_TARGET_CRAY = OMP_TARGET_CLANG = -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda OMP_TARGET = $(OMP_TARGET_$(COMPILER)) +ifeq ($(TARGET), CPU) +OMP = $(OMP_$(COMPILER)) +else ifeq ($(TARGET), GPU) +OMP = $(OMP_TARGET_$(COMPILER)) +OMP += -DOMP_TARGET_GPU +endif + omp-stream: main.cpp OMPStream.cpp $(CXX) $(CXXFLAGS) -DOMP $^ $(OMP) $(EXTRA_FLAGS) -o $@ -omp-target-stream: main.cpp OMPStream.cpp - $(CXX) $(CXXFLAGS) -DOMP -DOMP_TARGET_GPU $^ $(OMP_TARGET) $(EXTRA_FLAGS) -o $@ - .PHONY: clean clean: rm -f omp-stream - rm -f omp-target-stream From 282251f0261d2f543f075ccc2bc36807e715c98c Mon Sep 17 00:00:00 2001 From: James Price Date: Fri, 24 Feb 2017 12:48:00 +0000 Subject: [PATCH 40/76] Add options for specific targets in OpenACC.make --- OpenACC.make | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/OpenACC.make b/OpenACC.make index 161c20e..af27481 100644 --- a/OpenACC.make +++ b/OpenACC.make @@ -6,25 +6,41 @@ endif COMPILER_ = $(CXX) COMPILER_PGI = pgc++ COMPILER_CRAY = CC -CXX = $(COMPILER_$(COMPILER)) FLAGS_ = -O3 FLAGS_PGI = -std=c++11 -O3 -acc ifeq ($(COMPILER), PGI) +define target_help +Set a TARGET to ensure PGI targets the correct offload device. +Available targets are: + SNB, IVB, HSW, BDW, KNL + KEPLER, MAXWELL, PASCAL + HAWAII +endef ifndef TARGET -$(info Set a TARGET to ensure PGI targets the correct offload device. i.e. TARGET=GPU or CPU) +$(error $(target_help)) endif +TARGET_FLAGS_SNB = -ta=multicore -tp=sandybridge +TARGET_FLAGS_IVB = -ta=multicore -tp=ivybridge +TARGET_FLAGS_HSW = -ta=multicore -tp=haswell +TARGET_FLAGS_BDW = -ta=multicore -tp=haswell # Not yet supported directly +TARGET_FLAGS_KNL = -ta=multicore -tp=haswell # Not yet supported directly +TARGET_FLAGS_KEPLER = -ta=nvidia:cc35 +TARGET_FLAGS_MAXWELL = -ta=nvidia:cc50 +TARGET_FLAGS_PASCAL = -ta=nvidia:cc60 +TARGET_FLAGS_HAWAII = -ta=radeon:hawaii +ifeq ($(TARGET_FLAGS_$(TARGET)),) +$(error $(target_help)) endif -ifeq ($(TARGET), GPU) -FLAGS_PGI += -ta=nvidia -else ifeq ($(TARGET), CPU) -FLAGS_PGI += -ta=multicore + +FLAGS_PGI += $(TARGET_FLAGS_$(TARGET)) + endif FLAGS_CRAY = -hstd=c++11 CXXFLAGS = $(FLAGS_$(COMPILER)) acc-stream: main.cpp ACCStream.cpp - $(CXX) $(CXXFLAGS) -DACC $^ $(EXTRA_FLAGS) -o $@ + $(COMPILER_$(COMPILER)) $(CXXFLAGS) -DACC $^ $(EXTRA_FLAGS) -o $@ From 91131baf3d5777b13e4cd50307464fa863fde227 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Fri, 24 Feb 2017 12:52:13 +0000 Subject: [PATCH 41/76] Remove BDW and KNL targets from OpenACC as not yet supported by PGI --- OpenACC.make | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/OpenACC.make b/OpenACC.make index 6fecf1c..9ec5c4d 100644 --- a/OpenACC.make +++ b/OpenACC.make @@ -14,7 +14,7 @@ ifeq ($(COMPILER), PGI) define target_help Set a TARGET to ensure PGI targets the correct offload device. Available targets are: - SNB, IVB, HSW, BDW, KNL + SNB, IVB, HSW KEPLER, MAXWELL, PASCAL HAWAII endef @@ -24,8 +24,6 @@ endif TARGET_FLAGS_SNB = -ta=multicore -tp=sandybridge TARGET_FLAGS_IVB = -ta=multicore -tp=ivybridge TARGET_FLAGS_HSW = -ta=multicore -tp=haswell -TARGET_FLAGS_BDW = -ta=multicore -tp=haswell # Not yet supported directly -TARGET_FLAGS_KNL = -ta=multicore -tp=haswell # Not yet supported directly TARGET_FLAGS_KEPLER = -ta=nvidia:cc35 TARGET_FLAGS_MAXWELL = -ta=nvidia:cc50 TARGET_FLAGS_PASCAL = -ta=nvidia:cc60 From bbdd5b9fcba3caf81a1b64387c85034761925abb Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Fri, 24 Feb 2017 13:07:35 +0000 Subject: [PATCH 42/76] Add help message to Kokkos TARGET variable --- Kokkos.make | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/Kokkos.make b/Kokkos.make index f0b4528..4bba1c8 100644 --- a/Kokkos.make +++ b/Kokkos.make @@ -4,7 +4,13 @@ default: kokkos-stream include $(KOKKOS_PATH)/Makefile.kokkos ifndef TARGET -$(info No target defined. Specify CPU or GPU. Defaulting to CPU) +define target_help +Set TARGET to change to offload device. Defaulting to CPU. +Available targets are: + CPU (default) + GPU +endef +$(info $(target_help)) TARGET=CPU endif From 3be4ebc1a2e88c2ce95be330449b84956632fb3e Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Fri, 24 Feb 2017 13:11:07 +0000 Subject: [PATCH 43/76] Add help messages to RAJA Makefile --- RAJA.make | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/RAJA.make b/RAJA.make index bcfbba7..bba45f5 100644 --- a/RAJA.make +++ b/RAJA.make @@ -1,6 +1,12 @@ ifndef TARGET -$(info No target defined. Specify CPU or GPU. Defaulting to CPU) +define target_help +Set TARGET to change to offload device. Defaulting to CPU. +Available targets are: + CPU (default) + GPU +endef +$(info $(target_help)) TARGET=CPU endif @@ -9,7 +15,12 @@ COMP=$(CXX) CXXFLAGS = -O3 -std=c++11 -DRAJA_TARGET_CPU ifndef COMPILER -$(info No COMPILER defined. Specify COMPILER for correct OpenMP flag.) +define compiler_help +Set COMPILER to ensure correct OpenMP flags are set. +Available compilers are: + INTEL GNU CRAY +endef +$(info $(compiler_help)) endif ifeq ($(COMPILER), INTEL) COMP = icpc @@ -26,7 +37,12 @@ else ifeq ($(TARGET), GPU) COMP = nvcc ifndef ARCH -$(error No ARCH defined. Specify target GPU architecture (e.g. ARCH=sm_35)) +define arch_help +Set ARCH to ensure correct GPU architecture. +Example: + ARCH=sm_35 +endef +$(error $(arch_help)) endif CXXFLAGS = --expt-extended-lambda -O3 -std=c++11 -x cu -Xcompiler -fopenmp -arch $(ARCH) endif From c470b88deede18ece614c09420df5070570ee7be Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Fri, 24 Feb 2017 13:13:08 +0000 Subject: [PATCH 44/76] Add compiler help text to OpenACC --- OpenACC.make | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/OpenACC.make b/OpenACC.make index 9ec5c4d..504f069 100644 --- a/OpenACC.make +++ b/OpenACC.make @@ -1,6 +1,11 @@ ifndef COMPILER -$(info Define a compiler to set common defaults, i.e make COMPILER=GNU) +define compiler_help +Set COMPILER to ensure correct flags are set. +Available compilers are: + PGI CRAY +endef +$(info $(compiler_help)) endif COMPILER_ = $(CXX) From 6008f8c5364907ce567dfffcc068f362b27a0c11 Mon Sep 17 00:00:00 2001 From: James Price Date: Fri, 24 Feb 2017 13:03:06 +0000 Subject: [PATCH 45/76] Add intermediate objects to OpenACC clean rule PGI creates these, even though we don't ask for them. --- OpenACC.make | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/OpenACC.make b/OpenACC.make index 504f069..cc61f8e 100644 --- a/OpenACC.make +++ b/OpenACC.make @@ -49,5 +49,4 @@ acc-stream: main.cpp ACCStream.cpp .PHONY: clean clean: - rm -f acc-stream - + rm -f acc-stream main.o ACCStream.o From 82de8188558a0792d29f35c9c7255464937ee614 Mon Sep 17 00:00:00 2001 From: James Price Date: Fri, 24 Feb 2017 13:08:20 +0000 Subject: [PATCH 46/76] Add support for Intel as host compiler for OpenCL --- OpenCL.make | 2 ++ 1 file changed, 2 insertions(+) diff --git a/OpenCL.make b/OpenCL.make index 0075424..f43ebfa 100644 --- a/OpenCL.make +++ b/OpenCL.make @@ -5,10 +5,12 @@ endif COMPILER_ = $(CXX) COMPILER_GNU = g++ +COMPILER_INTEL = icpc COMPILER_CRAY = CC FLAGS_ = -O3 -std=c++11 FLAGS_GNU = -O3 -std=c++11 +FLAGS_INTEL = -O3 -std=c++11 FLAGS_CRAY = -O3 -hstd=c++11 CXXFLAGS=$(FLAGS_$(COMPILER)) From 8fee86a23230231dc2de5143c1c6b5c46dc6576a Mon Sep 17 00:00:00 2001 From: James Price Date: Fri, 24 Feb 2017 13:17:12 +0000 Subject: [PATCH 47/76] Add compiler help to OpenCL.make --- OpenCL.make | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/OpenCL.make b/OpenCL.make index f43ebfa..244398a 100644 --- a/OpenCL.make +++ b/OpenCL.make @@ -1,15 +1,22 @@ ifndef COMPILER -$(info Define a compiler to set common defaults, i.e make COMPILER=GNU) +define compiler_help +Set COMPILER to ensure correct flags are set. +Available compilers are: + GNU CLANG INTEL CRAY +endef +$(info $(compiler_help)) endif COMPILER_ = $(CXX) COMPILER_GNU = g++ +COMPILER_CLANG = clang++ COMPILER_INTEL = icpc COMPILER_CRAY = CC FLAGS_ = -O3 -std=c++11 FLAGS_GNU = -O3 -std=c++11 +FLAGS_CLANG = -O3 -std=c++11 FLAGS_INTEL = -O3 -std=c++11 FLAGS_CRAY = -O3 -hstd=c++11 CXXFLAGS=$(FLAGS_$(COMPILER)) From 1aec057e48b27286beb9adf94c0a85e630feeaea Mon Sep 17 00:00:00 2001 From: James Price Date: Fri, 24 Feb 2017 13:32:59 +0000 Subject: [PATCH 48/76] Add help messages to OpenMP.make and refactor --- OpenMP.make | 49 +++++++++++++++++++++++++++---------------------- 1 file changed, 27 insertions(+), 22 deletions(-) diff --git a/OpenMP.make b/OpenMP.make index cbcca54..4c2fe07 100644 --- a/OpenMP.make +++ b/OpenMP.make @@ -1,52 +1,57 @@ ifndef COMPILER -$(info Define a compiler to set common defaults, i.e make COMPILER=GNU) +define compiler_help +Set COMPILER to change flags (defaulting to GNU). +Available compilers are: + CLANG CRAY GNU INTEL + +endef +$(info $(compiler_help)) +COMPILER=GNU endif ifndef TARGET -$(info No target defined. Specify CPU or GPU. Defaulting to CPU) +define target_help +Set TARGET to change device (defaulting to CPU). +Available targets are: + CPU NVIDIA + +endef +$(info $(target_help)) TARGET=CPU endif -COMPILER_ = $(CXX) COMPILER_GNU = g++ COMPILER_INTEL = icpc COMPILER_CRAY = CC COMPILER_CLANG = clang++ CXX = $(COMPILER_$(COMPILER)) -FLAGS_ = -O3 -std=c++11 FLAGS_GNU = -O3 -std=c++11 FLAGS_INTEL = -O3 -std=c++11 -xHOST FLAGS_CRAY = -O3 -hstd=c++11 FLAGS_CLANG = -O3 -std=c++11 CXXFLAGS = $(FLAGS_$(COMPILER)) -OMP_ = -OMP_GNU = -fopenmp -OMP_INTEL = -qopenmp -OMP_CRAY = -OMP_CLANG = -fopenmp=libomp -OMP = $(OMP_$(COMPILER)) +# OpenMP flags for CPUs +OMP_GNU_CPU = -fopenmp +OMP_INTEL_CPU = -qopenmp +OMP_CRAY_CPU = +OMP_CLANG_CPU = -fopenmp=libomp -OMP_TARGET_ = -OMP_TARGET_GNU = -fopenmp -OMP_TARGET_INTEL = -OMP_TARGET_CRAY = -OMP_TARGET_CLANG = -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda -OMP_TARGET = $(OMP_TARGET_$(COMPILER)) +# OpenMP flags for NVIDIA +OMP_CRAY_NVIDIA = -DOMP_TARGET_GPU +OMP_CLANG_NVIDIA = -DOMP_TARGET_GPU -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda -ifeq ($(TARGET), CPU) -OMP = $(OMP_$(COMPILER)) -else ifeq ($(TARGET), GPU) -OMP = $(OMP_TARGET_$(COMPILER)) -OMP += -DOMP_TARGET_GPU +ifndef OMP_$(COMPILER)_$(TARGET) +$(error Targeting $(TARGET) with $(COMPILER) not supported) endif +OMP = $(OMP_$(COMPILER)_$(TARGET)) + omp-stream: main.cpp OMPStream.cpp $(CXX) $(CXXFLAGS) -DOMP $^ $(OMP) $(EXTRA_FLAGS) -o $@ .PHONY: clean clean: rm -f omp-stream - From a7d7998326b6cface59803195941758f92e942e4 Mon Sep 17 00:00:00 2001 From: James Price Date: Fri, 24 Feb 2017 13:40:54 +0000 Subject: [PATCH 49/76] Use -framework OpenCL on Darwin --- OpenCL.make | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/OpenCL.make b/OpenCL.make index 244398a..484921d 100644 --- a/OpenCL.make +++ b/OpenCL.make @@ -21,8 +21,15 @@ FLAGS_INTEL = -O3 -std=c++11 FLAGS_CRAY = -O3 -hstd=c++11 CXXFLAGS=$(FLAGS_$(COMPILER)) +PLATFORM = $(shell uname -s) +ifeq ($(PLATFORM), Darwin) + LIBS = -framework OpenCL +else + LIBS = -lOpenCL +endif + ocl-stream: main.cpp OCLStream.cpp - $(COMPILER_$(COMPILER)) $(CXXFLAGS) -DOCL $^ $(EXTRA_FLAGS) -lOpenCL -o $@ + $(COMPILER_$(COMPILER)) $(CXXFLAGS) -DOCL $^ $(EXTRA_FLAGS) $(LIBS) -o $@ .PHONY: clean clean: From 569cfa1d31a3a2a4af089514494b35b54419c609 Mon Sep 17 00:00:00 2001 From: James Price Date: Fri, 24 Feb 2017 09:02:51 -0600 Subject: [PATCH 50/76] Make Cray OpenMP flag non-empty to fix error --- OpenMP.make | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/OpenMP.make b/OpenMP.make index 4c2fe07..32f46ca 100644 --- a/OpenMP.make +++ b/OpenMP.make @@ -36,7 +36,7 @@ CXXFLAGS = $(FLAGS_$(COMPILER)) # OpenMP flags for CPUs OMP_GNU_CPU = -fopenmp OMP_INTEL_CPU = -qopenmp -OMP_CRAY_CPU = +OMP_CRAY_CPU = -homp OMP_CLANG_CPU = -fopenmp=libomp # OpenMP flags for NVIDIA From dfe5503cba4bf2b83c2bcf87f9a339ff4f625a99 Mon Sep 17 00:00:00 2001 From: James Price Date: Fri, 24 Feb 2017 09:33:59 -0600 Subject: [PATCH 51/76] Allow user to override CXX in OpenCL.make --- OpenCL.make | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/OpenCL.make b/OpenCL.make index 484921d..8ad7108 100644 --- a/OpenCL.make +++ b/OpenCL.make @@ -1,18 +1,20 @@ ifndef COMPILER define compiler_help -Set COMPILER to ensure correct flags are set. +Set COMPILER to change flags (defaulting to GNU). Available compilers are: GNU CLANG INTEL CRAY + endef $(info $(compiler_help)) +COMPILER=GNU endif -COMPILER_ = $(CXX) COMPILER_GNU = g++ COMPILER_CLANG = clang++ COMPILER_INTEL = icpc COMPILER_CRAY = CC +CXX = $(COMPILER_$(COMPILER)) FLAGS_ = -O3 -std=c++11 FLAGS_GNU = -O3 -std=c++11 @@ -29,7 +31,7 @@ else endif ocl-stream: main.cpp OCLStream.cpp - $(COMPILER_$(COMPILER)) $(CXXFLAGS) -DOCL $^ $(EXTRA_FLAGS) $(LIBS) -o $@ + $(CXX) $(CXXFLAGS) -DOCL $^ $(EXTRA_FLAGS) $(LIBS) -o $@ .PHONY: clean clean: From 050a27ca833345f73f50182d7d35c7bd1eb9a87b Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Fri, 24 Feb 2017 17:37:30 +0000 Subject: [PATCH 52/76] Add XL compiler support to OpenMP and RAJA makefiles --- OpenMP.make | 5 ++++- RAJA.make | 13 ++++++++----- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/OpenMP.make b/OpenMP.make index 32f46ca..28b5326 100644 --- a/OpenMP.make +++ b/OpenMP.make @@ -3,7 +3,7 @@ ifndef COMPILER define compiler_help Set COMPILER to change flags (defaulting to GNU). Available compilers are: - CLANG CRAY GNU INTEL + CLANG CRAY GNU INTEL XL endef $(info $(compiler_help)) @@ -25,12 +25,14 @@ COMPILER_GNU = g++ COMPILER_INTEL = icpc COMPILER_CRAY = CC COMPILER_CLANG = clang++ +COMPILER_XL = xlc++ CXX = $(COMPILER_$(COMPILER)) FLAGS_GNU = -O3 -std=c++11 FLAGS_INTEL = -O3 -std=c++11 -xHOST FLAGS_CRAY = -O3 -hstd=c++11 FLAGS_CLANG = -O3 -std=c++11 +FLAGS_XL = -O5 -qarch=pwr8 -qtune=pwr8 -std=c++11 CXXFLAGS = $(FLAGS_$(COMPILER)) # OpenMP flags for CPUs @@ -38,6 +40,7 @@ OMP_GNU_CPU = -fopenmp OMP_INTEL_CPU = -qopenmp OMP_CRAY_CPU = -homp OMP_CLANG_CPU = -fopenmp=libomp +OMP_XL_CPU = -qsmp=omp -qthreaded # OpenMP flags for NVIDIA OMP_CRAY_NVIDIA = -DOMP_TARGET_GPU diff --git a/RAJA.make b/RAJA.make index bba45f5..01e807a 100644 --- a/RAJA.make +++ b/RAJA.make @@ -12,25 +12,28 @@ endif ifeq ($(TARGET), CPU) COMP=$(CXX) -CXXFLAGS = -O3 -std=c++11 -DRAJA_TARGET_CPU +CXXFLAGS = -DRAJA_TARGET_CPU ifndef COMPILER define compiler_help Set COMPILER to ensure correct OpenMP flags are set. Available compilers are: - INTEL GNU CRAY + INTEL GNU CRAY XL endef $(info $(compiler_help)) endif ifeq ($(COMPILER), INTEL) COMP = icpc -CXXFLAGS += -qopenmp +CXXFLAGS += -O3 -std=c++11 -qopenmp else ifeq ($(COMPILER), GNU) COMP = g++ -CXXFLAGS += -fopenmp +CXXFLAGS += -O3 -std=c++11 -fopenmp else ifeq ($(COMPILER), CRAY) COMP = CC -CXXFLAGS += +CXXFLAGS += -O3 -hstd=c++11 +else ifeq ($(COMPILER), XL) +COMP = xlc++ +CXXFLAGS += -O5 -std=c++11 -qarch=pwr8 -qtune=pwr8 -qsmp=omp -qthreaded endif else ifeq ($(TARGET), GPU) From 24167272398f8b1ce5bc6cfb466660000d9ae23e Mon Sep 17 00:00:00 2001 From: James Price Date: Fri, 24 Feb 2017 22:28:16 +0000 Subject: [PATCH 53/76] Refactor compiler flag handling in RAJA Makefile --- RAJA.make | 35 +++++++++++++++++------------------ 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/RAJA.make b/RAJA.make index 01e807a..7fb821b 100644 --- a/RAJA.make +++ b/RAJA.make @@ -11,33 +11,32 @@ TARGET=CPU endif ifeq ($(TARGET), CPU) -COMP=$(CXX) -CXXFLAGS = -DRAJA_TARGET_CPU ifndef COMPILER define compiler_help -Set COMPILER to ensure correct OpenMP flags are set. +Set COMPILER to change flags (defaulting to GNU). Available compilers are: INTEL GNU CRAY XL endef $(info $(compiler_help)) -endif -ifeq ($(COMPILER), INTEL) -COMP = icpc -CXXFLAGS += -O3 -std=c++11 -qopenmp -else ifeq ($(COMPILER), GNU) -COMP = g++ -CXXFLAGS += -O3 -std=c++11 -fopenmp -else ifeq ($(COMPILER), CRAY) -COMP = CC -CXXFLAGS += -O3 -hstd=c++11 -else ifeq ($(COMPILER), XL) -COMP = xlc++ -CXXFLAGS += -O5 -std=c++11 -qarch=pwr8 -qtune=pwr8 -qsmp=omp -qthreaded +COMPILER=GNU endif +CXX_INTEL = icpc +CXX_GNU = g++ +CXX_CRAY = CC +CXX_XL = xlc++ + +CXXFLAGS_INTEL = -O3 -std=c++11 -qopenmp +CXXFLAGS_GNU = -O3 -std=c++11 -fopenmp +CXXFLAGS_CRAY = -O3 -hstd=c++11 +CXXFLAGS_XL = -O5 -std=c++11 -qarch=pwr8 -qtune=pwr8 -qsmp=omp -qthreaded + +CXX = $(CXX_$(COMPILER)) +CXXFLAGS = -DRAJA_TARGET_CPU $(CXXFLAGS_$(COMPILER)) + else ifeq ($(TARGET), GPU) -COMP = nvcc +CXX = nvcc ifndef ARCH define arch_help @@ -51,7 +50,7 @@ CXXFLAGS = --expt-extended-lambda -O3 -std=c++11 -x cu -Xcompiler -fopenmp -arch endif raja-stream: main.cpp RAJAStream.cpp - $(COMP) $(CXXFLAGS) -DUSE_RAJA -I$(RAJA_PATH)/include $^ $(EXTRA_FLAGS) -L$(RAJA_PATH)/lib -lRAJA -o $@ + $(CXX) $(CXXFLAGS) -DUSE_RAJA -I$(RAJA_PATH)/include $^ $(EXTRA_FLAGS) -L$(RAJA_PATH)/lib -lRAJA -o $@ .PHONY: clean clean: From cc90cefeeba426187a18ea49a6db15531ba308c7 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Sat, 25 Feb 2017 14:14:59 +0000 Subject: [PATCH 54/76] Minor version bump to signal build system update --- main.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.cpp b/main.cpp index 2b6e459..e6f2cbd 100644 --- a/main.cpp +++ b/main.cpp @@ -15,7 +15,7 @@ #include #include -#define VERSION_STRING "3.0" +#define VERSION_STRING "3.1" #include "Stream.h" From ee7cd066ac50a6a270abc4669a874ba44a946383 Mon Sep 17 00:00:00 2001 From: Peter Steinbach Date: Tue, 28 Feb 2017 10:03:23 +0100 Subject: [PATCH 55/76] renamed HIPStream implementation --- HIPStream.cu => HIPStream.cpp | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) rename HIPStream.cu => HIPStream.cpp (93%) diff --git a/HIPStream.cu b/HIPStream.cpp similarity index 93% rename from HIPStream.cu rename to HIPStream.cpp index d14fe84..edbcac1 100644 --- a/HIPStream.cu +++ b/HIPStream.cpp @@ -1,3 +1,5 @@ + + // Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, // University of Bristol HPC // @@ -72,14 +74,19 @@ HIPStream::HIPStream(const unsigned int ARRAY_SIZE, const int device_index) template HIPStream::~HIPStream() { + free(sums); + hipFree(d_a); check_error(); hipFree(d_b); check_error(); hipFree(d_c); check_error(); + hipFree(d_sum); + check_error(); } + template __global__ void init_kernel(hipLaunchParm lp, T * a, T * b, T * c, T initA, T initB, T initC) { @@ -177,22 +184,22 @@ void HIPStream::triad() check_error(); } - template __global__ void dot_kernel(hipLaunchParm lp, const T * a, const T * b, T * sum, unsigned int array_size) { - extern __shared__ __align__(sizeof(T)) unsigned char smem[]; + //extern __shared__ __align__(sizeof(T)) unsigned char smem[]; + HIP_DYNAMIC_SHARED(unsigned char,smem); T *tb_sum = reinterpret_cast(smem); - int i = blockDim.x * blockIdx.x + threadIdx.x; - const size_t local_i = threadIdx.x; + int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + const size_t local_i = hipThreadIdx_x; tb_sum[local_i] = 0.0; - for (; i < array_size; i += blockDim.x*gridDim.x) + for (; i < array_size; i += hipBlockDim_x*hipGridDim_x) tb_sum[local_i] += a[i] * b[i]; - for (int offset = blockDim.x / 2; offset > 0; offset /= 2) + for (int offset = hipBlockDim_x / 2; offset > 0; offset /= 2) { __syncthreads(); if (local_i < offset) @@ -202,7 +209,7 @@ __global__ void dot_kernel(hipLaunchParm lp, const T * a, const T * b, T * sum, } if (local_i == 0) - sum[blockIdx.x] = tb_sum[local_i]; + sum[hipBlockIdx_x] = tb_sum[local_i]; } template From 350a151c3b05ac251137d49adc2c18ebb3f08880 Mon Sep 17 00:00:00 2001 From: Peter Steinbach Date: Tue, 28 Feb 2017 10:04:36 +0100 Subject: [PATCH 56/76] removed CUDA_PATH sentinel from HIP.make --- HIP.make | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/HIP.make b/HIP.make index bbd142e..35b0a6a 100644 --- a/HIP.make +++ b/HIP.make @@ -3,13 +3,7 @@ HIPCC = hipcc -ifndef CUDA_PATH -ifeq (,$(wildcard /usr/local/bin/nvcc)) -$(error /usr/local/bin/nvcc not found, set CUDA_PATH instead) -endif -endif - -hip-stream: main.cpp HIPStream.cu +hip-stream: main.cpp HIPStream.cpp $(HIPCC) $(CXXFLAGS) -std=c++11 -DHIP $^ $(EXTRA_FLAGS) -o $@ .PHONY: clean From ceada6922f19449af7b64d3d0ded84dd4188afa2 Mon Sep 17 00:00:00 2001 From: Peter Steinbach Date: Tue, 28 Feb 2017 10:07:48 +0100 Subject: [PATCH 57/76] proper declaration of tb_sum with HIP_DYNAMIC_SHARED macro --- HIPStream.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/HIPStream.cpp b/HIPStream.cpp index edbcac1..648b62a 100644 --- a/HIPStream.cpp +++ b/HIPStream.cpp @@ -188,9 +188,9 @@ template __global__ void dot_kernel(hipLaunchParm lp, const T * a, const T * b, T * sum, unsigned int array_size) { - //extern __shared__ __align__(sizeof(T)) unsigned char smem[]; - HIP_DYNAMIC_SHARED(unsigned char,smem); - T *tb_sum = reinterpret_cast(smem); + HIP_DYNAMIC_SHARED(T,tb_sum); + // HIP_DYNAMIC_SHARED(unsigned char,smem); + // T *tb_sum = reinterpret_cast(smem); int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; const size_t local_i = hipThreadIdx_x; From 58773a79b747b21860101d22af1d53c4f1e19186 Mon Sep 17 00:00:00 2001 From: Peter Steinbach Date: Tue, 28 Feb 2017 13:33:21 +0100 Subject: [PATCH 58/76] removed extra lines introduced by hipify, removed obsolete commented code --- HIPStream.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/HIPStream.cpp b/HIPStream.cpp index 648b62a..dafe2cd 100644 --- a/HIPStream.cpp +++ b/HIPStream.cpp @@ -1,5 +1,3 @@ - - // Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, // University of Bristol HPC // @@ -189,8 +187,6 @@ __global__ void dot_kernel(hipLaunchParm lp, const T * a, const T * b, T * sum, { HIP_DYNAMIC_SHARED(T,tb_sum); - // HIP_DYNAMIC_SHARED(unsigned char,smem); - // T *tb_sum = reinterpret_cast(smem); int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; const size_t local_i = hipThreadIdx_x; From e570b458a604d3c44bbb9588552626a5250b5f82 Mon Sep 17 00:00:00 2001 From: Peter Steinbach Date: Tue, 28 Feb 2017 13:43:57 +0100 Subject: [PATCH 59/76] replaced - for = so that assignment takes place --- KOKKOSStream.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/KOKKOSStream.cpp b/KOKKOSStream.cpp index 9391a13..9abcf87 100644 --- a/KOKKOSStream.cpp +++ b/KOKKOSStream.cpp @@ -42,7 +42,7 @@ void KOKKOSStream::init_arrays(T initA, T initB, T initC) parallel_for(array_size, KOKKOS_LAMBDA (const int index) { a[index] = initA; - b[index] - initB; + b[index] = initB; c[index] = initC; }); Kokkos::fence(); From 94e0900377e0cd49531042dd07c9134ae91999f2 Mon Sep 17 00:00:00 2001 From: James Price Date: Tue, 28 Feb 2017 13:24:45 +0000 Subject: [PATCH 60/76] Use static shared memory in dot for CUDA and HIP --- CUDAStream.cu | 6 ++---- HIPStream.cpp | 5 ++--- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/CUDAStream.cu b/CUDAStream.cu index 7b1e0df..9588456 100644 --- a/CUDAStream.cu +++ b/CUDAStream.cu @@ -182,9 +182,7 @@ void CUDAStream::triad() template __global__ void dot_kernel(const T * a, const T * b, T * sum, unsigned int array_size) { - - extern __shared__ __align__(sizeof(T)) unsigned char smem[]; - T *tb_sum = reinterpret_cast(smem); + __shared__ T tb_sum[TBSIZE]; int i = blockDim.x * blockIdx.x + threadIdx.x; const size_t local_i = threadIdx.x; @@ -209,7 +207,7 @@ __global__ void dot_kernel(const T * a, const T * b, T * sum, unsigned int array template T CUDAStream::dot() { - dot_kernel<<>>(d_a, d_b, d_sum, array_size); + dot_kernel<<>>(d_a, d_b, d_sum, array_size); check_error(); cudaMemcpy(sums, d_sum, DOT_NUM_BLOCKS*sizeof(T), cudaMemcpyDeviceToHost); diff --git a/HIPStream.cpp b/HIPStream.cpp index dafe2cd..7bf724a 100644 --- a/HIPStream.cpp +++ b/HIPStream.cpp @@ -185,8 +185,7 @@ void HIPStream::triad() template __global__ void dot_kernel(hipLaunchParm lp, const T * a, const T * b, T * sum, unsigned int array_size) { - - HIP_DYNAMIC_SHARED(T,tb_sum); + __shared__ T tb_sum[TBSIZE]; int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; const size_t local_i = hipThreadIdx_x; @@ -211,7 +210,7 @@ __global__ void dot_kernel(hipLaunchParm lp, const T * a, const T * b, T * sum, template T HIPStream::dot() { - hipLaunchKernel(HIP_KERNEL_NAME(dot_kernel), dim3(DOT_NUM_BLOCKS), dim3(TBSIZE), sizeof(T)*TBSIZE, 0, d_a, d_b, d_sum, array_size); + hipLaunchKernel(HIP_KERNEL_NAME(dot_kernel), dim3(DOT_NUM_BLOCKS), dim3(TBSIZE), 0, 0, d_a, d_b, d_sum, array_size); check_error(); hipMemcpy(sums, d_sum, DOT_NUM_BLOCKS*sizeof(T), hipMemcpyDeviceToHost); From ea12f2a9a13041ad114b3828ef51e2e17aa12e88 Mon Sep 17 00:00:00 2001 From: Peter Steinbach Date: Mon, 13 Mar 2017 14:41:16 +0100 Subject: [PATCH 61/76] added EXTRA_FLAGS variable to CUDA Makefile to provide the freedom to specify debug flags or gencode flags --- CUDA.make | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CUDA.make b/CUDA.make index ef193df..bf376eb 100644 --- a/CUDA.make +++ b/CUDA.make @@ -1,6 +1,7 @@ +EXTRA_FLAGS?=-O3 cuda-stream: main.cpp CUDAStream.cu - nvcc -std=c++11 -O3 -DCUDA $^ $(EXTRA_FLAGS) -o $@ + nvcc -std=c++11 -DCUDA $^ $(EXTRA_FLAGS) -o $@ .PHONY: clean clean: From 8c7a801a8422a145ced2eb3448eaa016d6a553d4 Mon Sep 17 00:00:00 2001 From: Peter Steinbach Date: Mon, 13 Mar 2017 15:22:26 +0100 Subject: [PATCH 62/76] put -O3 into CXXFLAGS to comply with OpenMP.make --- CUDA.make | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CUDA.make b/CUDA.make index bf376eb..50f175c 100644 --- a/CUDA.make +++ b/CUDA.make @@ -1,7 +1,7 @@ -EXTRA_FLAGS?=-O3 +CXXFLAGS?=-O3 -std=c++11 cuda-stream: main.cpp CUDAStream.cu - nvcc -std=c++11 -DCUDA $^ $(EXTRA_FLAGS) -o $@ + nvcc $(CXXFLAGS) -DCUDA $^ $(EXTRA_FLAGS) -o $@ .PHONY: clean clean: From 4f288ddc3d171ed2b1a77c238ca7219e0301709b Mon Sep 17 00:00:00 2001 From: James Price Date: Mon, 13 Mar 2017 17:15:10 +0000 Subject: [PATCH 63/76] [OpenMP] Add -qopt-streaming-stores for Intel --- OpenMP.make | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/OpenMP.make b/OpenMP.make index 28b5326..2028c46 100644 --- a/OpenMP.make +++ b/OpenMP.make @@ -29,7 +29,7 @@ COMPILER_XL = xlc++ CXX = $(COMPILER_$(COMPILER)) FLAGS_GNU = -O3 -std=c++11 -FLAGS_INTEL = -O3 -std=c++11 -xHOST +FLAGS_INTEL = -O3 -std=c++11 -xHOST -qopt-streaming-stores=always FLAGS_CRAY = -O3 -hstd=c++11 FLAGS_CLANG = -O3 -std=c++11 FLAGS_XL = -O5 -qarch=pwr8 -qtune=pwr8 -std=c++11 From 703eb945a2ebd4bfc031b818776cab3e2a216cb0 Mon Sep 17 00:00:00 2001 From: James Price Date: Mon, 13 Mar 2017 17:17:20 +0000 Subject: [PATCH 64/76] [OpenMP] Align memory (2MB by default) --- OMPStream.cpp | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/OMPStream.cpp b/OMPStream.cpp index 189cacb..06346e7 100644 --- a/OMPStream.cpp +++ b/OMPStream.cpp @@ -7,6 +7,10 @@ #include "OMPStream.h" +#ifndef ALIGNMENT +#define ALIGNMENT (2*1024*1024) // 2MB +#endif + template OMPStream::OMPStream(const unsigned int ARRAY_SIZE, T *a, T *b, T *c, int device) { @@ -22,9 +26,9 @@ OMPStream::OMPStream(const unsigned int ARRAY_SIZE, T *a, T *b, T *c, int dev {} #else // Allocate on the host - this->a = (T*)malloc(sizeof(T)*array_size); - this->b = (T*)malloc(sizeof(T)*array_size); - this->c = (T*)malloc(sizeof(T)*array_size); + this->a = (T*)aligned_alloc(ALIGNMENT, sizeof(T)*array_size); + this->b = (T*)aligned_alloc(ALIGNMENT, sizeof(T)*array_size); + this->c = (T*)aligned_alloc(ALIGNMENT, sizeof(T)*array_size); #endif } From d8cb7494e0c8e7beddaffadad97f0c7c0238ca71 Mon Sep 17 00:00:00 2001 From: Peter Steinbach Date: Fri, 17 Mar 2017 15:18:13 +0100 Subject: [PATCH 65/76] pulled -O3 out into CXXFLAGS, refactored CUDA compiler into CUDA_CXX make variable to cope with clang as CUDA compiler as well --- CUDA.make | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/CUDA.make b/CUDA.make index 50f175c..3edf0f5 100644 --- a/CUDA.make +++ b/CUDA.make @@ -1,7 +1,8 @@ -CXXFLAGS?=-O3 -std=c++11 +CXXFLAGS=-O3 +CUDA_CXX=nvcc cuda-stream: main.cpp CUDAStream.cu - nvcc $(CXXFLAGS) -DCUDA $^ $(EXTRA_FLAGS) -o $@ + $(CUDA_CXX) -std=c++11 $(CXXFLAGS) -DCUDA $^ $(EXTRA_FLAGS) -o $@ .PHONY: clean clean: From d9dfc3f552706cf38beb25a2bf2271e03adb4340 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 5 Apr 2017 21:57:55 +0100 Subject: [PATCH 66/76] [Kokkos] Use long for iterator variable --- KOKKOSStream.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/KOKKOSStream.cpp b/KOKKOSStream.cpp index 9abcf87..45f4dff 100644 --- a/KOKKOSStream.cpp +++ b/KOKKOSStream.cpp @@ -39,7 +39,7 @@ void KOKKOSStream::init_arrays(T initA, T initB, T initC) View a(*d_a); View b(*d_b); View c(*d_c); - parallel_for(array_size, KOKKOS_LAMBDA (const int index) + parallel_for(array_size, KOKKOS_LAMBDA (const long index) { a[index] = initA; b[index] = initB; @@ -70,7 +70,7 @@ void KOKKOSStream::copy() View b(*d_b); View c(*d_c); - parallel_for(array_size, KOKKOS_LAMBDA (const int index) + parallel_for(array_size, KOKKOS_LAMBDA (const long index) { c[index] = a[index]; }); @@ -85,7 +85,7 @@ void KOKKOSStream::mul() View c(*d_c); const T scalar = startScalar; - parallel_for(array_size, KOKKOS_LAMBDA (const int index) + parallel_for(array_size, KOKKOS_LAMBDA (const long index) { b[index] = scalar*c[index]; }); @@ -99,7 +99,7 @@ void KOKKOSStream::add() View b(*d_b); View c(*d_c); - parallel_for(array_size, KOKKOS_LAMBDA (const int index) + parallel_for(array_size, KOKKOS_LAMBDA (const long index) { c[index] = a[index] + b[index]; }); @@ -114,7 +114,7 @@ void KOKKOSStream::triad() View c(*d_c); const T scalar = startScalar; - parallel_for(array_size, KOKKOS_LAMBDA (const int index) + parallel_for(array_size, KOKKOS_LAMBDA (const long index) { a[index] = b[index] + scalar*c[index]; }); @@ -129,7 +129,7 @@ T KOKKOSStream::dot() T sum = 0.0; - parallel_reduce(array_size, KOKKOS_LAMBDA (const int index, double &tmp) + parallel_reduce(array_size, KOKKOS_LAMBDA (const long index, double &tmp) { tmp += a[index] * b[index]; }, sum); From d7e38c1ca9d30e2b73cc7c4aa5b6b69f92269822 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 5 Apr 2017 22:09:58 +0100 Subject: [PATCH 67/76] Add Kokkos build instructions to README --- README.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/README.md b/README.md index 1e2792d..1efd443 100644 --- a/README.md +++ b/README.md @@ -36,6 +36,18 @@ Pass in extra flags via the `EXTRA_FLAGS` option. The binaries are named in the form `-stream`. +Building Kokkos +--------------- + +We use the following command to build Kokkos using the Intel Compiler, specifying the `arch` appropriately, e.g. `KNL`. +``` +../generate_makefile.bash --prefix= --with-openmp --with-pthread --arch= --compiler=icpc --cxxflags=-DKOKKOS_MEMORY_ALIGNMENT=2097152 +``` +For building with CUDA support, we use the following command, specifying the `arch` appropriately, e.g. `Kepler35`. +``` +../generate_makefile.bash --prefix= --with-cuda --with-openmp --with-pthread --arch= --with-cuda-options=enable_lambda +``` + Results ------- From d7a93be73972b627b68370487838501e09f31ba4 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 5 Apr 2017 22:23:27 +0100 Subject: [PATCH 68/76] [Kokkos] Add a COMPILER option to Makefile, which turns on streaming stores for Intel --- Kokkos.make | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/Kokkos.make b/Kokkos.make index 4bba1c8..1c6207d 100644 --- a/Kokkos.make +++ b/Kokkos.make @@ -3,6 +3,21 @@ default: kokkos-stream include $(KOKKOS_PATH)/Makefile.kokkos +ifndef COMPILER +define compiler_help +Set COMPILER to change flags (defaulting to GNU). +Available compilers are: + GNU INTEL + +endef +$(info $(compiler_help)) +COMPILER=GNU +endif + +COMPILER_GNU = g++ +COMPILER_INTEL = icpc -qopt-streaming-stores=always +CXX = $(COMPILER_$(COMPILER)) + ifndef TARGET define target_help Set TARGET to change to offload device. Defaulting to CPU. From 1eb75f034a8af3af53852b9e791b7cb91b6274a0 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 6 Apr 2017 10:02:25 +0100 Subject: [PATCH 69/76] [RAJA] Use xHost and streaming stores with the Intel compiler --- RAJA.make | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/RAJA.make b/RAJA.make index 7fb821b..47aeefb 100644 --- a/RAJA.make +++ b/RAJA.make @@ -27,7 +27,7 @@ CXX_GNU = g++ CXX_CRAY = CC CXX_XL = xlc++ -CXXFLAGS_INTEL = -O3 -std=c++11 -qopenmp +CXXFLAGS_INTEL = -O3 -std=c++11 -qopenmp -xHost -qopt-streaming-stores=always CXXFLAGS_GNU = -O3 -std=c++11 -fopenmp CXXFLAGS_CRAY = -O3 -hstd=c++11 CXXFLAGS_XL = -O5 -std=c++11 -qarch=pwr8 -qtune=pwr8 -qsmp=omp -qthreaded From 1bd4adfe7bad207deac51aae40f370dc4018953d Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 6 Apr 2017 10:14:51 +0100 Subject: [PATCH 70/76] [RAJA] Align the memory to 2MB pages --- RAJAStream.cpp | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/RAJAStream.cpp b/RAJAStream.cpp index 240f160..8d3f365 100644 --- a/RAJAStream.cpp +++ b/RAJAStream.cpp @@ -10,6 +10,10 @@ using RAJA::forall; using RAJA::RangeSegment; +#ifndef ALIGNMENT +#define ALIGNMENT (2*1024*1024) // 2MB +#endif + template RAJAStream::RAJAStream(const unsigned int ARRAY_SIZE, const int device_index) : array_size(ARRAY_SIZE) @@ -18,9 +22,9 @@ RAJAStream::RAJAStream(const unsigned int ARRAY_SIZE, const int device_index) index_set.push_back(seg); #ifdef RAJA_TARGET_CPU - d_a = new T[ARRAY_SIZE]; - d_b = new T[ARRAY_SIZE]; - d_c = new T[ARRAY_SIZE]; + d_a = (T*)aligned_alloc(ALIGNMENT, sizeof(T)*array_size); + d_b = (T*)aligned_alloc(ALIGNMENT, sizeof(T)*array_size); + d_c = (T*)aligned_alloc(ALIGNMENT, sizeof(T)*array_size); #else cudaMallocManaged((void**)&d_a, sizeof(T)*ARRAY_SIZE, cudaMemAttachGlobal); cudaMallocManaged((void**)&d_b, sizeof(T)*ARRAY_SIZE, cudaMemAttachGlobal); @@ -33,9 +37,9 @@ template RAJAStream::~RAJAStream() { #ifdef RAJA_TARGET_CPU - delete[] d_a; - delete[] d_b; - delete[] d_c; + free(d_a); + free(d_b); + free(d_c); #else cudaFree(d_a); cudaFree(d_b); From 5f9b288570cb148245715deb8c3e5b62ba5f72b6 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 6 Apr 2017 10:15:11 +0100 Subject: [PATCH 71/76] [RAJA] Declare pointers using RAJA_RESTRICT --- RAJAStream.cpp | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/RAJAStream.cpp b/RAJAStream.cpp index 8d3f365..6c6098e 100644 --- a/RAJAStream.cpp +++ b/RAJAStream.cpp @@ -50,9 +50,9 @@ RAJAStream::~RAJAStream() template void RAJAStream::init_arrays(T initA, T initB, T initC) { - T* a = d_a; - T* b = d_b; - T* c = d_c; + T* RAJA_RESTRICT a = d_a; + T* RAJA_RESTRICT b = d_b; + T* RAJA_RESTRICT c = d_c; forall(index_set, [=] RAJA_DEVICE (int index) { a[index] = initA; @@ -73,8 +73,8 @@ void RAJAStream::read_arrays( template void RAJAStream::copy() { - T* a = d_a; - T* c = d_c; + T* RAJA_RESTRICT a = d_a; + T* RAJA_RESTRICT c = d_c; forall(index_set, [=] RAJA_DEVICE (int index) { c[index] = a[index]; @@ -84,8 +84,8 @@ void RAJAStream::copy() template void RAJAStream::mul() { - T* b = d_b; - T* c = d_c; + T* RAJA_RESTRICT b = d_b; + T* RAJA_RESTRICT c = d_c; const T scalar = startScalar; forall(index_set, [=] RAJA_DEVICE (int index) { @@ -96,9 +96,9 @@ void RAJAStream::mul() template void RAJAStream::add() { - T* a = d_a; - T* b = d_b; - T* c = d_c; + T* RAJA_RESTRICT a = d_a; + T* RAJA_RESTRICT b = d_b; + T* RAJA_RESTRICT c = d_c; forall(index_set, [=] RAJA_DEVICE (int index) { c[index] = a[index] + b[index]; @@ -108,9 +108,9 @@ void RAJAStream::add() template void RAJAStream::triad() { - T* a = d_a; - T* b = d_b; - T* c = d_c; + T* RAJA_RESTRICT a = d_a; + T* RAJA_RESTRICT b = d_b; + T* RAJA_RESTRICT c = d_c; const T scalar = startScalar; forall(index_set, [=] RAJA_DEVICE (int index) { @@ -121,8 +121,8 @@ void RAJAStream::triad() template T RAJAStream::dot() { - T* a = d_a; - T* b = d_b; + T* RAJA_RESTRICT a = d_a; + T* RAJA_RESTRICT b = d_b; RAJA::ReduceSum sum(0.0); From 3331f62f42eb89741f5ed01a393f3492a1331ebc Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 6 Apr 2017 10:16:34 +0100 Subject: [PATCH 72/76] Add RAJA build instructions to README --- README.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/README.md b/README.md index 1efd443..938cb81 100644 --- a/README.md +++ b/README.md @@ -48,6 +48,14 @@ For building with CUDA support, we use the following command, specifying the `ar ../generate_makefile.bash --prefix= --with-cuda --with-openmp --with-pthread --arch= --with-cuda-options=enable_lambda ``` +Building RAJA +------------- + +We use the following command to build RAJA using the Intel Compiler. +``` +cmake ../../ -DCMAKE_INSTALL_PREFIX= -DCMAKE_C_COMPILER=icc -DCMAKE_CXX_COMPILER=icpc -DRAJA_PTR="RAJA_USE_RESTRICT_ALIGNED_PTR" -DCMAKE_BUILD_TYPE=ICCBuild -DRAJA_ENABLE_TESTS=Off +``` + Results ------- From c534600d04a78a3ec77e1818a291d6cae728b7c3 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 6 Apr 2017 10:36:01 +0100 Subject: [PATCH 73/76] [RAJA] Use Index_type for iterator index type instead of hardcoding int --- RAJAStream.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/RAJAStream.cpp b/RAJAStream.cpp index 6c6098e..395a6ee 100644 --- a/RAJAStream.cpp +++ b/RAJAStream.cpp @@ -53,7 +53,7 @@ void RAJAStream::init_arrays(T initA, T initB, T initC) T* RAJA_RESTRICT a = d_a; T* RAJA_RESTRICT b = d_b; T* RAJA_RESTRICT c = d_c; - forall(index_set, [=] RAJA_DEVICE (int index) + forall(index_set, [=] RAJA_DEVICE (RAJA::Index_type index) { a[index] = initA; b[index] = initB; @@ -75,7 +75,7 @@ void RAJAStream::copy() { T* RAJA_RESTRICT a = d_a; T* RAJA_RESTRICT c = d_c; - forall(index_set, [=] RAJA_DEVICE (int index) + forall(index_set, [=] RAJA_DEVICE (RAJA::Index_type index) { c[index] = a[index]; }); @@ -87,7 +87,7 @@ void RAJAStream::mul() T* RAJA_RESTRICT b = d_b; T* RAJA_RESTRICT c = d_c; const T scalar = startScalar; - forall(index_set, [=] RAJA_DEVICE (int index) + forall(index_set, [=] RAJA_DEVICE (RAJA::Index_type index) { b[index] = scalar*c[index]; }); @@ -99,7 +99,7 @@ void RAJAStream::add() T* RAJA_RESTRICT a = d_a; T* RAJA_RESTRICT b = d_b; T* RAJA_RESTRICT c = d_c; - forall(index_set, [=] RAJA_DEVICE (int index) + forall(index_set, [=] RAJA_DEVICE (RAJA::Index_type index) { c[index] = a[index] + b[index]; }); @@ -112,7 +112,7 @@ void RAJAStream::triad() T* RAJA_RESTRICT b = d_b; T* RAJA_RESTRICT c = d_c; const T scalar = startScalar; - forall(index_set, [=] RAJA_DEVICE (int index) + forall(index_set, [=] RAJA_DEVICE (RAJA::Index_type index) { a[index] = b[index] + scalar*c[index]; }); @@ -126,7 +126,7 @@ T RAJAStream::dot() RAJA::ReduceSum sum(0.0); - forall(index_set, [=] RAJA_DEVICE (int index) + forall(index_set, [=] RAJA_DEVICE (RAJA::Index_type index) { sum += a[index] * b[index]; }); From 50e3a1970fcd9712ca6fb7513233b6e4e6ef9ca5 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 6 Apr 2017 10:38:03 +0100 Subject: [PATCH 74/76] Add RAJA CUDA build instructions --- README.md | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 938cb81..e1dcfdc 100644 --- a/README.md +++ b/README.md @@ -53,9 +53,12 @@ Building RAJA We use the following command to build RAJA using the Intel Compiler. ``` -cmake ../../ -DCMAKE_INSTALL_PREFIX= -DCMAKE_C_COMPILER=icc -DCMAKE_CXX_COMPILER=icpc -DRAJA_PTR="RAJA_USE_RESTRICT_ALIGNED_PTR" -DCMAKE_BUILD_TYPE=ICCBuild -DRAJA_ENABLE_TESTS=Off +cmake .. -DCMAKE_INSTALL_PREFIX= -DCMAKE_C_COMPILER=icc -DCMAKE_CXX_COMPILER=icpc -DRAJA_PTR="RAJA_USE_RESTRICT_ALIGNED_PTR" -DCMAKE_BUILD_TYPE=ICCBuild -DRAJA_ENABLE_TESTS=Off +``` +For building with CUDA support, we use the following command. +``` +cmake .. -DCMAKE_INSTALL_PREFIX= -DRAJA_PTR="RAJA_USE_RESTRICT_ALIGNED_PTR" -DRAJA_ENABLE_CUDA=1 -DRAJA_ENABLE_TESTS=Off ``` - Results ------- From 9c08fdd18411b6adca8b2c94cae3e0ebaedc3845 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 6 Apr 2017 10:38:48 +0100 Subject: [PATCH 75/76] Minor version bump --- main.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.cpp b/main.cpp index e6f2cbd..2c33f22 100644 --- a/main.cpp +++ b/main.cpp @@ -15,7 +15,7 @@ #include #include -#define VERSION_STRING "3.1" +#define VERSION_STRING "3.2" #include "Stream.h" From dafc63030ff916b613be39916beccd20a0327583 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Sat, 8 Apr 2017 12:16:29 +0100 Subject: [PATCH 76/76] Rename to BabelStream --- LICENSE | 14 +++++++------- README.md | 14 ++++++++++---- main.cpp | 2 +- 3 files changed, 18 insertions(+), 12 deletions(-) diff --git a/LICENSE b/LICENSE index 1bc1114..1835c1a 100644 --- a/LICENSE +++ b/LICENSE @@ -12,22 +12,22 @@ * 3. You are free to publish results obtained from running this * program, or from works that you derive from this program, * with the following limitations: -* 3a. In order to be referred to as "GPU-STREAM benchmark results", -* published results must be in conformance to the GPU-STREAM +* 3a. In order to be referred to as "BabelStream benchmark results", +* published results must be in conformance to the BabelStream * Run Rules published at -* http://github.com/UoB-HPC/GPU-STREAM/wiki/Run-Rules +* http://github.com/UoB-HPC/BabelStream/wiki/Run-Rules * and incorporated herein by reference. * The copyright holders retain the * right to determine conformity with the Run Rules. * 3b. Results based on modified source code or on runs not in -* accordance with the GPU-STREAM Run Rules must be clearly +* accordance with the BabelStream Run Rules must be clearly * labelled whenever they are published. Examples of * proper labelling include: -* "tuned GPU-STREAM benchmark results" -* "based on a variant of the GPU-STREAM benchmark code" +* "tuned BabelStream benchmark results" +* "based on a variant of the BabelStream benchmark code" * Other comparable, clear and reasonable labelling is * acceptable. -* 3c. Submission of results to the GPU-STREAM benchmark web site +* 3c. Submission of results to the BabelStream benchmark web site * is encouraged, but not required. * 4. Use of this program or creation of derived works based on this * program constitutes acceptance of these licensing restrictions. diff --git a/README.md b/README.md index e1dcfdc..0f93be6 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -GPU-STREAM +BabelStream ========== Measure memory transfer rates to/from global device memory on GPUs. @@ -16,9 +16,11 @@ Currently implemented are: - RAJA - SYCL +This code was previously called GPU-STREAM. + Website ------- -[uob-hpc.github.io/GPU-STREAM/](https://uob-hpc.github.io/GPU-STREAM/) +[uob-hpc.github.io/BabelStream/](https://uob-hpc.github.io/BabelStream/) Usage ----- @@ -68,13 +70,17 @@ Sample results can be found in the `results` subdirectory. If you would like to Citing ------ -You can view the [Poster and Extended Abstract](http://sc15.supercomputing.org/sites/all/themes/SC15images/tech_poster/tech_poster_pages/post150.html) on GPU-STREAM presented at SC'15. Please cite GPU-STREAM via this reference: +Please cite BabelStream via this reference: > Deakin T, Price J, Martineau M, McIntosh-Smith S. GPU-STREAM v2.0: Benchmarking the achievable memory bandwidth of many-core processors across diverse parallel programming models. 2016. Paper presented at P^3MA Workshop at ISC High Performance, Frankfurt, Germany. -**Other GPU-STREAM publications:** +**Other BabelStream publications:** > Deakin T, McIntosh-Smith S. GPU-STREAM: Benchmarking the achievable memory bandwidth of Graphics Processing Units. 2015. Poster session presented at IEEE/ACM SuperComputing, Austin, United States. +You can view the [Poster and Extended Abstract](http://sc15.supercomputing.org/sites/all/themes/SC15images/tech_poster/tech_poster_pages/post150.html). + +> Deakin T, Price J, Martineau M, McIntosh-Smith S. GPU-STREAM: Now in 2D!. 2016. Poster session presented at IEEE/ACM SuperComputing, Salt Lake City, United States. +You can view the [Poster and Extended Abstract](http://sc16.supercomputing.org/sc-archive/tech_poster/tech_poster_pages/post139.html). diff --git a/main.cpp b/main.cpp index 2c33f22..33cef1e 100644 --- a/main.cpp +++ b/main.cpp @@ -54,7 +54,7 @@ void parseArguments(int argc, char *argv[]); int main(int argc, char *argv[]) { std::cout - << "GPU-STREAM" << std::endl + << "BabelStream" << std::endl << "Version: " << VERSION_STRING << std::endl << "Implementation: " << IMPLEMENTATION_STRING << std::endl;