Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Omp offload #165

Open
wants to merge 24 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
597c29f
create a new branch for omp target offload version
mathialakan Oct 6, 2021
6adc915
create a new branch for omp target offload version
mathialakan Oct 6, 2021
df2300e
Merge branch 'ccsb-scripps:develop' into omp_offload
mathialakan Oct 6, 2021
cff75bf
omp offload code for deepcopying and main algm
mathialakan Oct 7, 2021
ab116b9
add changes
mathialakan Oct 7, 2021
193df1c
omp offload gpu kernels
mathialakan Oct 7, 2021
62fec6a
changes on calcenergy
mathialakan Oct 9, 2021
f93ea83
pass docpars separately
mathialakan Oct 13, 2021
87f6b48
fix bugs and include compiler script
mathialakan Oct 26, 2021
51675db
set device based on user choice
mathialakan Oct 26, 2021
278927c
add a test case for file-list
mathialakan Oct 27, 2021
5b6c555
changes to use multi GPUs
mathialakan Oct 27, 2021
824096b
use 3 GPUs
mathialakan Oct 27, 2021
4bd257e
cleanup code
mathialakan Oct 28, 2021
ec90d47
cleanup code
mathialakan Oct 28, 2021
762ce82
use map_to instead of map_aloc + update
mathialakan Nov 3, 2021
3a84b7d
add task to gpu scheduling strategies
mathialakan Nov 4, 2021
e6eed15
add task loop
mathialakan Nov 4, 2021
a420f19
Use the approach of parallel construct nested to teams region
mathialakan Nov 10, 2021
6ec153d
rearrange the code
mathialakan Nov 15, 2021
17b023e
include local search method ADADELTA
mathialakan Nov 15, 2021
5bf663b
resolve warnings
mathialakan Nov 15, 2021
82a7635
fix bugs
mathialakan Nov 16, 2021
77c6a80
use parallel approach for kernel1 and 4
mathialakan Apr 6, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 16 additions & 10 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -25,21 +25,27 @@
# Valid values: CPU, GPU, CUDA, OCLGPU, OMPGPU

ifeq ($(DEVICE), $(filter $(DEVICE),GPU CUDA))
TEST_CUDA := $(shell ./test_cuda.sh nvcc "$(GPU_INCLUDE_PATH)" "$(GPU_LIBRARY_PATH)")
TEST_CUDA := $(shell ./test_cuda.sh nvcc "$(GPU_INCLUDE_PATH)" "$(GPU_LIBRARY_PATH)")
# if user specifies DEVICE=CUDA it will be used (whether the test succeeds or not)
# if user specifies DEVICE=GPU the test result determines whether CUDA will be used or not
ifeq ($(DEVICE)$(TEST_CUDA),GPUyes)
override DEVICE:=CUDA
endif
ifeq ($(DEVICE)$(TEST_CUDA),GPUyes)
override DEVICE:=CUDA
endif
endif
ifeq ($(DEVICE),CUDA)
override DEVICE:=GPU
export
include Makefile.Cuda
override DEVICE:=GPU
export
include Makefile.Cuda
else
ifeq ($(DEVICE), OMPGPU)
override DEVICE:=GPU
export
include Makefile.Ompt
else
ifeq ($(DEVICE),OCLGPU)
override DEVICE:=GPU
export
override DEVICE:=GPU
export
include Makefile.OpenCL
endif
endif
include Makefile.OpenCL
endif
215 changes: 215 additions & 0 deletions Makefile.Ompt
Original file line number Diff line number Diff line change
@@ -0,0 +1,215 @@
#
# miniMDock OpenMP Target Offloading Makefile
# ------------------------------------------------------
# Note that environment variables must be defined
# before compiling
# DEVICE?

# Output of `uname`, captured once when the makefile is read.
# Not referenced anywhere else in this file.
UNAME := $(shell uname)

#$(shell ./link_ompt.sh)
#COMPILER=llvm

# ------------------------------------------------------
# Compiler selection
# COMPILER chooses the OpenMP-offload toolchain. Each branch sets:
#   CPP  : compiler driver together with its offload/optimization flags
#   COMP : short tag that is appended to the executable name (see TARGET)
# Values tested below: llvm, xl, nvhpc, gcc, CC, ROCM.
# NOTE(review): there is no fallback branch, so an unset or unrecognized
# COMPILER leaves CPP and COMP empty -- confirm whether that should be an error.

#Using LLVM clang compiler
# GPU_PATH is handed to --cuda-path, so it has to be defined for this branch.
ifeq ($(COMPILER), llvm)
CPP=clang++ -Ofast -std=c++17 -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda --cuda-path=$(GPU_PATH) -Xcuda-ptxas --maxrregcount=120
##CPP=clang++ -Ofast -std=c++17 -fopenmp -lopmtarget -fopenmp-targets=nvptx64-nvidia-cuda --cuda-path=$(GPU_PATH)
COMP=llvm
#Using IBM XL Compiler
else ifeq ($(COMPILER), xl)
CPP=xlC -std=c++14 -qsmp=omp -qoffload -O3 -qstrict
COMP=xl
#Using NVIDIA nvc++ compiler
# The GPU target (-gpu=cc70) is hard-coded here.
else ifeq ($(COMPILER), nvhpc)
CPP=nvc++ -std=c++17 -mp=gpu -fast -gpu=cc70
COMP=nvhpc
#Using gcc
else ifeq ($(COMPILER), gcc)
CPP=g++ -fopenmp -foffload=-lm -O2 -std=c++17
COMP=gcc
# COMPILER=CC: note the name tag is "cce", not "CC".
else ifeq ($(COMPILER), CC)
CPP=CC -fopenmp
COMP=cce
# COMPILER=ROCM: clang++ targeting amdgcn; the GPU architecture (gfx908) is hard-coded.
else ifeq ($(COMPILER), ROCM)
CPP=clang++ -fopenmp -target x86_64-pc-linux-gnu -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx908
COMP=rocm
endif

# ------------------------------------------------------
# Project directories
# All locations are relative to the directory make is run from.
COMMON_DIR=./common
HOST_INC_DIR=./host/inc
HOST_SRC_DIR=./host/src
KRNL_INC_DIR=./omp_offload/inc
KRNL_SRC_DIR=./omp_offload/src
BIN_DIR=./bin

# Host sources
# These are recursive (=) assignments: each $(wildcard ...) is evaluated again
# whenever SRC is expanded (once for the odock prerequisite list, once more for
# its recipe).
# NOTE(review): link-code creates $(HOST_SRC_DIR)/performdocking.cpp while make
# is already running. Whether that freshly created file is picked up by these
# wildcards on the first build is not guaranteed -- verify with a build from a
# clean checkout before changing the assignment flavour.
HOST_SRC=$(wildcard $(HOST_SRC_DIR)/*.cpp)
KRNL_SRC=$(wildcard $(KRNL_SRC_DIR)/*.cpp)
SRC=$(HOST_SRC) $(KRNL_SRC)

# Header search path: common, host and kernel include directories.
IFLAGS=-I$(COMMON_DIR) -I$(HOST_INC_DIR) -I$(KRNL_INC_DIR)
# LFLAGS is not assigned in this file (the line below is disabled), so it
# contributes nothing to CFLAGS unless it is supplied from outside.
#LFLAGS= -lomptarget
CFLAGS=$(IFLAGS) $(LFLAGS)

# Extra defines for this backend; -DUSE_OMPT is always passed to the compiler.
OMPT_OPTS=-DUSE_OMPT
# Base name of the executable. The suffixes appended below (card, compiler tag,
# and later the work-item count) encode the build options in the file name.
TARGET := autodock_ompt

# CARD=AMD adds -DCARD_AMD and the _amd suffix; every other value (including
# an unset CARD) gets the _nv suffix and leaves CD untouched.
ifeq ($(CARD), AMD)
CD=-DCARD_AMD
TARGET:=$(TARGET)_amd
else
TARGET:=$(TARGET)_nv
endif
# Append the compiler tag (COMP) chosen by the COMPILER selection.
TARGET:=$(TARGET)_$(COMP)

# OVERLAP=ON adds -DUSE_OMP, but only when DEVICE is GPU.
ifeq ($(OVERLAP), ON)
ifeq ($(DEVICE), GPU)
OMPT_OPTS+=-DUSE_OMP
endif
endif

# TASK2GPU=YES adds -DTASKTOGPU.
ifeq ($(TASK2GPU), YES)
OMPT_OPTS+=-DTASKTOGPU
endif

# Files in the current directory whose names start with TARGET as it stands
# here (before the work-item suffix is appended).
# Not referenced anywhere else in this file.
BIN := $(wildcard $(TARGET)*)

# ------------------------------------------------------
# Number of work-items (wi)
# Valid values: 32, 64, 128, 256
# Default is empty; a NUMWI given on the make command line takes precedence
# over this assignment.
NUMWI=

# Each recognized value sets the matching -DN<n>WI define and appends _<n>wi
# to the executable name.
ifeq ($(NUMWI), 32)
NWI=-DN32WI
TARGET:=$(TARGET)_32wi
else ifeq ($(NUMWI), 64)
NWI=-DN64WI
TARGET:=$(TARGET)_64wi
else ifeq ($(NUMWI), 128)
NWI=-DN128WI
TARGET:=$(TARGET)_128wi
else ifeq ($(NUMWI), 256)
NWI=-DN256WI
TARGET:=$(TARGET)_256wi
else
# NUMWI is empty or not one of the values above: fall back to a per-device
# default (16 for CPU, 64 for GPU). For any other DEVICE value, NWI is not
# set and no suffix is appended.
ifeq ($(DEVICE), CPU)
NWI=-DN16WI
TARGET:=$(TARGET)_16wi
else ifeq ($(DEVICE), GPU)
NWI=-DN64WI
TARGET:=$(TARGET)_64wi
endif
endif

# ------------------------------------------------------
# Configuration
# FDEBUG (full) : enables debugging on both host + device
# LDEBUG (light): enables debugging on host
# RELEASE
# Default build configuration; override with CONFIG=... on the make command line.
CONFIG=RELEASE
#CONFIG=FDEBUG

# OPT is placed on the compile line after $(CPP), which already carries its
# own optimization flags for several compilers.
# NOTE(review): which of two conflicting -O levels wins is compiler-specific -- confirm.
ifeq ($(CONFIG),FDEBUG)
OPT =-O0 -g -Wall -DDOCK_DEBUG
else ifeq ($(CONFIG),LDEBUG)
OPT =-O0 -g -Wall
else ifeq ($(CONFIG),RELEASE)
OPT =-O3
else
# Any other CONFIG value: no extra optimization or debug flags.
OPT =
endif

# ------------------------------------------------------
# Reproduce results (remove randomness)
# Default is NO; override with REPRO=YES on the make command line.
REPRO=NO

# REPRO=YES passes -DREPRO to the compiler; any other value passes nothing.
ifeq ($(REPRO),YES)
REP =-DREPRO
else
REP =
endif
# ------------------------------------------------------

# None of these targets produces a file with its own name. Declaring them
# phony keeps them working even if a file or directory called e.g. "clean"
# or "odock" appears in the source tree.
.PHONY: all odock clean link-code unlink-code \
        check-env-all check-env-dev check-env-cpu check-env-gpu

# Default goal: build the OpenMP-offload docking executable.
all: odock

# Validate the DEVICE environment variable seen by the recipe shell.
#   - empty/undefined DEVICE      : report it and fail
#   - GPU                         : report the value
#   - CPU / SERIAL / anything else: report the value (or that it is invalid,
#     which is not fatal), then fail if OVERLAP=ON, because OVERLAP is only
#     supported for the GPU build
# A blank line is printed at the end of every non-failing path.
check-env-dev:
	@case "$$DEVICE" in \
		"") \
			echo "DEVICE is undefined"; \
			exit 1 ;; \
		GPU) \
			echo "DEVICE is set to $$DEVICE" ;; \
		*) \
			case "$$DEVICE" in \
				CPU|SERIAL) \
					echo "DEVICE is set to $$DEVICE" ;; \
				*) \
					echo "DEVICE value is invalid. Set DEVICE to either CPU, GPU, or SERIAL (1 thread on CPU)" ;; \
			esac; \
			if [ "$$OVERLAP" = "ON" ]; then \
				echo "OVERLAP only works with the GPU version right now"; \
				exit 1; \
			fi ;; \
	esac; \
	echo " "

# Report the CPU include and library search paths from the environment.
# Purely informational: an undefined variable is reported but never fails.
check-env-cpu:
	@if [ -n "$$CPU_INCLUDE_PATH" ]; then \
		echo "CPU_INCLUDE_PATH is set to $$CPU_INCLUDE_PATH"; \
	else \
		echo "CPU_INCLUDE_PATH is undefined"; \
	fi; \
	if [ -n "$$CPU_LIBRARY_PATH" ]; then \
		echo "CPU_LIBRARY_PATH is set to $$CPU_LIBRARY_PATH"; \
	else \
		echo "CPU_LIBRARY_PATH is undefined"; \
	fi; \
	echo " "

# Report the GPU include and library search paths from the environment.
# Purely informational: an undefined variable is reported but never fails.
check-env-gpu:
	@if [ -n "$$GPU_INCLUDE_PATH" ]; then \
		echo "GPU_INCLUDE_PATH is set to $$GPU_INCLUDE_PATH"; \
	else \
		echo "GPU_INCLUDE_PATH is undefined"; \
	fi; \
	if [ -n "$$GPU_LIBRARY_PATH" ]; then \
		echo "GPU_LIBRARY_PATH is set to $$GPU_LIBRARY_PATH"; \
	else \
		echo "GPU_LIBRARY_PATH is undefined"; \
	fi; \
	echo " "

# Aggregate target: runs the device, CPU-path and GPU-path environment checks.
# It has no recipe of its own.
check-env-all: check-env-dev check-env-cpu check-env-gpu

# ------------------------------------------------------
# Printing out its git version hash
# `git describe` is run once, when the makefile is read. Its output is passed
# to the compiler as the string macro VERSION. If the command produces no
# output, VERSION becomes an empty string.

GIT_VERSION := $(shell git describe --abbrev=40 --dirty --always --tags)

CFLAGS+=-DVERSION=\"$(GIT_VERSION)\"

# ------------------------------------------------------
# Point performdocking.h / performdocking.cpp at their .Omp variants.
# The link targets are given without a directory, so each symlink resolves to
# a file next to it (host/inc and host/src respectively). -f replaces any
# existing file of that name.
link-code:
ln -sf performdocking.h.Omp $(HOST_INC_DIR)/performdocking.h
ln -sf performdocking.cpp.Omp $(HOST_SRC_DIR)/performdocking.cpp

# Remove the two files created by link-code.
unlink-code:
rm -f $(HOST_INC_DIR)/performdocking.h $(HOST_SRC_DIR)/performdocking.cpp

# Build the OpenMP-offload docking executable $(BIN_DIR)/$(TARGET).
# Prerequisites:
#   check-env-all : prints the environment report; fails when DEVICE is
#                   undefined, or when OVERLAP=ON with a non-GPU DEVICE
#   $(SRC)        : host and kernel sources found when the makefile was read
#   link-code     : points performdocking.{h,cpp} at the .Omp variants
# All sources are compiled and linked in one compiler invocation.
# $(DD) is not assigned in this file; it expands to nothing unless it is
# supplied from outside.
odock: check-env-all $(SRC) link-code
	@mkdir -p $(BIN_DIR)
	$(CPP) \
	$(SRC) \
	$(CFLAGS) \
	-o $(BIN_DIR)/$(TARGET) \
	$(NWI) $(OPT) $(DD) $(REP) $(CD) $(OMPT_OPTS)


# Remove every regular file directly inside $(BIN_DIR) plus initpop.txt in the
# current directory. rm is not recursive here, so subdirectories are left
# alone, and -f keeps it silent when nothing matches.
clean:
rm -f $(BIN_DIR)/* initpop.txt
97 changes: 62 additions & 35 deletions common/calcenergy_basic.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,70 +28,97 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

#include "defines.h"

#define RLIST_ATOMID_MASK 0x000000FF
#define RLIST_RBONDID_MASK 0x0000FF00
#define RLIST_RBONDID_SHIFT 8
#define RLIST_FIRSTROT_MASK 0x00010000
#define RLIST_GENROT_MASK 0x00020000
#define RLIST_DUMMY_MASK 0x00040000
#define RLIST_ATOMID_MASK 0x000000FF
#define RLIST_RBONDID_MASK 0x0000FF00
#define RLIST_RBONDID_SHIFT 8
#define RLIST_FIRSTROT_MASK 0x00010000
#define RLIST_GENROT_MASK 0x00020000
#define RLIST_DUMMY_MASK 0x00040000

#define DEG_TO_RAD 0.0174533f
#define DEG_TO_RAD 0.0174533f

// LCG: linear congruential generator constants
#define RAND_A 1103515245u
#define RAND_C 12345u
#define RAND_A 1103515245u
#define RAND_C 12345u
// WARNING: it is supposed that unsigned int is 32 bit long
#define MAX_UINT 4294967296.0f
#define MAX_UINT 4294967296.0f

// Sticking to array boundaries
#define stick_to_bounds(x,a,b) x + (x <= a)*(a-x) + (x >= b)*(b-x)

// e^2/4pie0 in kcal/mol
#define ELEC_SCALE_FACTOR 332.06363f

// Constants for dielectric term of the
// Macro for capturing grid values
// Original
#define GETGRIDVALUE(mempoi,gridsize_x,gridsize_y,gridsize_z,t,z,y,x) *(mempoi + gridsize_x*(y + gridsize_y*(z + gridsize_z*t)) + x)

// Optimization 1
// #define GETGRIDVALUE_OPT(mempoi,gridsize_x,gridsize_y,mul_tmp,z,y,x) *(mempoi + gridsize_x*(y + gridsize_y*(z + mul_tmp)) + x)

// Optimization 2
// Implemented directly in the kernel code: calcenergy_fourkernels_intel.cl


// Macro for trilinear interpolation
#define TRILININTERPOL(cube, weights) (cube[idx_000]*weights[idx_000] + \
cube[idx_010]*weights[idx_010] + \
cube[idx_001]*weights[idx_001] + \
cube[idx_011]*weights[idx_011] + \
cube[idx_100]*weights[idx_100] + \
cube[idx_110]*weights[idx_110] + \
cube[idx_101]*weights[idx_101] + \
cube[idx_111]*weights[idx_111])

// Sticking to array boundaries
#define stick_to_bounds(x,a,b) x + (x <= a)*(a-x) + (x >= b)*(b-x)

// Constants for dielectric term of the
// electrostatic component of the intramolecular energy/gradient
#define DIEL_A -8.5525f
#define DIEL_WAT 78.4f
#define DIEL_B (DIEL_WAT - DIEL_A)
#define DIEL_LAMBDA 0.003627f
#define DIEL_H DIEL_LAMBDA
#define DIEL_K 7.7839f
#define DIEL_B_TIMES_H (DIEL_B * DIEL_H)
#define DIEL_B_TIMES_H_TIMES_K (DIEL_B_TIMES_H * DIEL_K)
#define DIEL_A -8.5525f
#define DIEL_WAT 78.4f
#define DIEL_B (DIEL_WAT - DIEL_A)
#define DIEL_LAMBDA 0.003627f
#define DIEL_H DIEL_LAMBDA
#define DIEL_K 7.7839f
#define DIEL_B_TIMES_H (DIEL_B * DIEL_H)
#define DIEL_B_TIMES_H_TIMES_K (DIEL_B_TIMES_H * DIEL_K)

// Used for Shoemake to quaternion transformation
#if defined(M_PI)
#define PI_FLOAT (float)(M_PI)
#define PI_FLOAT (float)(M_PI)
#else
#define PI_FLOAT 3.14159265359f
#define PI_FLOAT 3.14159265359f
#endif
#define PI_TIMES_2 2.0f*PI_FLOAT
#define PI_TIMES_2 2.0f*PI_FLOAT

// -------------------------------------------
// Gradient-related defines
// -------------------------------------------

#define INFINITESIMAL_RADIAN 1E-3f
#define HALF_INFINITESIMAL_RADIAN (float)(0.5f * INFINITESIMAL_RADIAN)
#define INV_INFINITESIMAL_RADIAN (1.0f/INFINITESIMAL_RADIAN)
#define COS_HALF_INFINITESIMAL_RADIAN cos(HALF_INFINITESIMAL_RADIAN)
#define SIN_HALF_INFINITESIMAL_RADIAN sin(HALF_INFINITESIMAL_RADIAN)
#define inv_angle_delta 500.0f / PI_FLOAT
#define INFINITESIMAL_RADIAN 1E-3
#define HALF_INFINITESIMAL_RADIAN (float)(0.5f * INFINITESIMAL_RADIAN)
#define INV_INFINITESIMAL_RADIAN (1/INFINITESIMAL_RADIAN)
#define COS_HALF_INFINITESIMAL_RADIAN cos(HALF_INFINITESIMAL_RADIAN)
#define SIN_HALF_INFINITESIMAL_RADIAN sin(HALF_INFINITESIMAL_RADIAN)
#define inv_angle_delta 500.0f / PI_FLOAT

/*
#define TRANGENE_ALPHA 1E-3
#define ROTAGENE_ALPHA 1E-8
#define TORSGENE_ALPHA 1E-13
*/

#define STEP_INCREASE 1.2f
#define STEP_DECREASE 0.2f
#define STEP_START 1000.0f // Starting step size. This might look gigantic but will cap
#define MAX_DEV_TRANSLATION 2.0f // 2 Angstrom, but must be divided by the gridspacing (store in variable)
//#define MAX_DEV_ROTATION 0.2f // Shoemake range [0, 1]
#define MAX_DEV_ROTATION 0.5f/DEG_TO_RAD // 0.5f RAD
#define MAX_DEV_TORSION 0.5f/DEG_TO_RAD // 0.5f RAD
#define STEP_INCREASE 1.2f
#define STEP_DECREASE 0.2f
#define STEP_START 1E3 // Starting step size. This might look gigantic but will cap
#define MAX_DEV_TRANSLATION 2.0f // 2 Angstrom, but must be divided by the gridspacing (store in variable)
//#define MAX_DEV_ROTATION 0.2f // Shoemake range [0, 1]
#define MAX_DEV_ROTATION 0.5f/DEG_TO_RAD // 0.5f RAD
#define MAX_DEV_TORSION 0.5f/DEG_TO_RAD // 0.5f RAD





#endif /* CALCENERGY_BASIC_H_ */
2 changes: 1 addition & 1 deletion common/defines.h
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ enum {C=0,N=1,O=2,H=3,XX=4,P=5,S=6}; // see "bond_index" in the "AD4.1_bound.da
#define LS_CONT_FACTOR 0.5f

// Improvements over Pechan's implementation
#define MAPPED_COPY
//#define MAPPED_COPY


// Coefficients for CG-G0 pairs used in
Expand Down
Loading